// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in RAM, once to create all the inodes logged in the tree, and once
 * to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created.  If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees.  Do a full commit so that we
		 * don't mix nodes from multiple log transactions when
		 * doing sequential writing.
		 */
		if (zoned && !created) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (!ctx->logging_new_name) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

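/* Wait for writeback of the tree block's pages to complete. */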
static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed.  Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin) {
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);
		if (ret)
			return ret;

		if (btrfs_buffer_uptodate(eb, gen, 0) &&
		    btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
	}
	return ret;
}

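/*
 * Core of overwrite_item(): the caller must have already searched the
 * destination tree for @key and left the result in @path.  The item at
 * @slot of the log buffer @eb is then copied over the destination item,
 * inserting a new item or resizing the existing one as needed.
 */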
static int do_overwrite_item(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct extent_buffer *eb, int slot,
			     struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* Our caller must have done a search for the key for us. */
	ASSERT(path->nodes[0] != NULL);

	/*
	 * And the slot must point to the exact key or the slot where the key
	 * should be at (the first item with a key greater than 'key')
	 */
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
		struct btrfs_key found_key;

		btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
		ret = btrfs_comp_cpu_keys(&found_key, key);
		ASSERT(ret >= 0);
	} else {
		ret = 1;
	}

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size(path->nodes[0],
					       path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size(path->nodes[0],
					     path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  struct extent_buffer *eb, int slot,
			  struct btrfs_key *key)
{
	int ret;

	/* Look for the key in the destination tree. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	return do_overwrite_item(trans, root, path, eb, slot, key);
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the dirty extent for qgroup: here we did a
		 * shallow file extent item copy and skipped the normal backref
		 * update, modifying the extent tree all by ourselves.  The
		 * owner of the file extent changed from the log tree (doesn't
		 * affect qgroup) to the fs/file tree (affects qgroup).
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset, 0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range.  We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls).  In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other.  For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.  Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher, will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				struct btrfs_root *csum_root;

				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				csum_root = btrfs_csum_root(fs_info,
							    sums->bytenr);
				if (!ret)
					ret = btrfs_del_csums(trans, csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
								     csum_root,
								     sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	iput(inode);
	return ret;
}

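/*
 * Unlink an inode from a directory during log replay.  On top of the
 * regular unlink this runs the delayed items, so that later name lookups
 * in the fs/subvolume tree during replay see the removal.
 */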
static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const char *name,
				       int name_len)
{
	int ret;

	ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
	if (ret)
		return ret;
	/*
	 * Whenever we need to check if a name exists or not, we check the
	 * fs/subvolume tree.  So after an unlink we must run delayed items, so
	 * that future checks for a name during log replay see that the name
	 * does not exist anymore.
	 */
	return btrfs_run_delayed_items(trans);
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
					  name_len);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

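/*
 * Resolve conflicts before replaying an inode reference: unlink any name
 * for this inode (old style ref or extended ref) that exists in the
 * subvolume but is not in the log, and drop any directory entries that
 * clash with the new name by index or by name.  Returns 1 if the ref
 * being replayed is a back ref for the root directory itself, 0 on
 * success and a negative errno on error.
 */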
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = unlink_inode_for_log_replay(trans, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched the root tree and checked the
		 * corresponding ref, so we don't need to check it again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (IS_ERR(extref)) {
		return PTR_ERR(extref);
	} else if (extref) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = unlink_inode_for_log_replay(trans,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

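/*
 * Pull the name, and optionally the index and parent objectid, out of an
 * extended ref at @ref_ptr.  The name is allocated with kmalloc() and must
 * be freed by the caller.
 */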
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists).  For any name that is not in the inode reference item from the
 * log tree, do a proper unlink of that name (that is, remove its entry from
 * the inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
						inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

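/*
 * Check whether an inode reference of the given type (ref or extref) with
 * the given name already exists for @inode with parent @dir in the
 * subvolume tree.  Returns 1 if it exists, 0 if not and < 0 on error.
 */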
static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}

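/*
 * Add a link to @inode in @dir during log replay.  If the dentry for the
 * name already exists and points to a different inode (one that is in the
 * log but not yet processed), that other dentry is unlinked first so that
 * btrfs_add_link() below does not fail with -EEXIST.
 */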
static int add_link(struct btrfs_trans_handle *trans,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
					  name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it.  We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		set_nlink(other_inode, 1);
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index,
				   name, namelen);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = unlink_inode_for_log_replay(trans,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it.  We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					set_nlink(inode, 1);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
			if (ret)
				goto out;
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

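/*
 * Count how many names the extended ref items of @inode contribute to its
 * link count.  Returns the number of names found or a negative errno
 * (-ENOENT from running out of extrefs is the normal end of iteration).
 */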
static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size(path->nodes[0],
						path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		if (ret)
			goto out;
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}

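/*
 * Walk all fixup entries (orphan items under BTRFS_TREE_LOG_FIXUP_OBJECTID)
 * recorded during replay and correct the link count of each inode they
 * point to, deleting the fixup entries as we go.
 */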
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}

/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	} else if (ret == -EEXIST) {
		ret = 0;
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

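/*
 * Delete a directory entry from the subvolume that conflicts with the entry
 * being replayed from the log.  Returns 1 if the existing entry already
 * matches the logged one and nothing was deleted, 0 if the conflicting
 * entry was kept because the inode for the new entry does not exist, and
 * otherwise the result of drop_one_dir_item().
 */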
1990 */ 1991 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1992 struct btrfs_root *root, 1993 struct btrfs_path *path, 1994 struct extent_buffer *eb, 1995 struct btrfs_dir_item *di, 1996 struct btrfs_key *key) 1997 { 1998 char *name; 1999 int name_len; 2000 struct btrfs_dir_item *dir_dst_di; 2001 struct btrfs_dir_item *index_dst_di; 2002 bool dir_dst_matches = false; 2003 bool index_dst_matches = false; 2004 struct btrfs_key log_key; 2005 struct btrfs_key search_key; 2006 struct inode *dir; 2007 u8 log_type; 2008 bool exists; 2009 int ret; 2010 bool update_size = true; 2011 bool name_added = false; 2012 2013 dir = read_one_inode(root, key->objectid); 2014 if (!dir) 2015 return -EIO; 2016 2017 name_len = btrfs_dir_name_len(eb, di); 2018 name = kmalloc(name_len, GFP_NOFS); 2019 if (!name) { 2020 ret = -ENOMEM; 2021 goto out; 2022 } 2023 2024 log_type = btrfs_dir_type(eb, di); 2025 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2026 name_len); 2027 2028 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 2029 ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); 2030 btrfs_release_path(path); 2031 if (ret < 0) 2032 goto out; 2033 exists = (ret == 0); 2034 ret = 0; 2035 2036 dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 2037 name, name_len, 1); 2038 if (IS_ERR(dir_dst_di)) { 2039 ret = PTR_ERR(dir_dst_di); 2040 goto out; 2041 } else if (dir_dst_di) { 2042 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 2043 dir_dst_di, &log_key, log_type, 2044 exists); 2045 if (ret < 0) 2046 goto out; 2047 dir_dst_matches = (ret == 1); 2048 } 2049 2050 btrfs_release_path(path); 2051 2052 index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, 2053 key->objectid, key->offset, 2054 name, name_len, 1); 2055 if (IS_ERR(index_dst_di)) { 2056 ret = PTR_ERR(index_dst_di); 2057 goto out; 2058 } else if (index_dst_di) { 2059 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 2060 index_dst_di, &log_key, 2061 log_type, exists); 2062 if (ret < 0) 2063 goto out; 2064 index_dst_matches = (ret == 1); 2065 } 2066 2067 btrfs_release_path(path); 2068 2069 if (dir_dst_matches && index_dst_matches) { 2070 ret = 0; 2071 update_size = false; 2072 goto out; 2073 } 2074 2075 /* 2076 * Check if the inode reference exists in the log for the given name, 2077 * inode and parent inode 2078 */ 2079 search_key.objectid = log_key.objectid; 2080 search_key.type = BTRFS_INODE_REF_KEY; 2081 search_key.offset = key->objectid; 2082 ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); 2083 if (ret < 0) { 2084 goto out; 2085 } else if (ret) { 2086 /* The dentry will be added later. */ 2087 ret = 0; 2088 update_size = false; 2089 goto out; 2090 } 2091 2092 search_key.objectid = log_key.objectid; 2093 search_key.type = BTRFS_INODE_EXTREF_KEY; 2094 search_key.offset = key->objectid; 2095 ret = backref_in_log(root->log_root, &search_key, key->objectid, name, 2096 name_len); 2097 if (ret < 0) { 2098 goto out; 2099 } else if (ret) { 2100 /* The dentry will be added later. 
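It is created when the matching inode ref item is replayed (see the add_inode_ref() call in replay_one_buffer()).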
*/ 2101 ret = 0; 2102 update_size = false; 2103 goto out; 2104 } 2105 btrfs_release_path(path); 2106 ret = insert_one_name(trans, root, key->objectid, key->offset, 2107 name, name_len, &log_key); 2108 if (ret && ret != -ENOENT && ret != -EEXIST) 2109 goto out; 2110 if (!ret) 2111 name_added = true; 2112 update_size = false; 2113 ret = 0; 2114 2115 out: 2116 if (!ret && update_size) { 2117 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); 2118 ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); 2119 } 2120 kfree(name); 2121 iput(dir); 2122 if (!ret && name_added) 2123 ret = 1; 2124 return ret; 2125 } 2126 2127 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ 2128 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 2129 struct btrfs_root *root, 2130 struct btrfs_path *path, 2131 struct extent_buffer *eb, int slot, 2132 struct btrfs_key *key) 2133 { 2134 int ret; 2135 struct btrfs_dir_item *di; 2136 2137 /* We only log dir index keys, which only contain a single dir item. */ 2138 ASSERT(key->type == BTRFS_DIR_INDEX_KEY); 2139 2140 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2141 ret = replay_one_name(trans, root, path, eb, di, key); 2142 if (ret < 0) 2143 return ret; 2144 2145 /* 2146 * If this entry refers to a non-directory (directories can not have a 2147 * link count > 1) and it was added in a transaction that was not 2148 * committed, make sure we fix up the link count of the inode the entry 2149 * points to. Otherwise something like the following would result in a 2150 * directory pointing to an inode with a wrong link count that does not 2151 * account for this dir entry: 2152 * 2153 * mkdir testdir 2154 * touch testdir/foo 2155 * touch testdir/bar 2156 * sync 2157 * 2158 * ln testdir/bar testdir/bar_link 2159 * ln testdir/foo testdir/foo_link 2160 * xfs_io -c "fsync" testdir/bar 2161 * 2162 * <power failure> 2163 * 2164 * mount fs, log replay happens 2165 * 2166 * File foo would remain with a link count of 1 when it has two entries 2167 * pointing to it in the directory testdir. This would make it impossible 2168 * to ever delete the parent directory, as it would result in stale 2169 * dentries that can never be deleted. 2170 */ 2171 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 2172 struct btrfs_path *fixup_path; 2173 struct btrfs_key di_key; 2174 2175 fixup_path = btrfs_alloc_path(); 2176 if (!fixup_path) 2177 return -ENOMEM; 2178 2179 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2180 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); 2181 btrfs_free_path(fixup_path); 2182 } 2183 2184 return ret; 2185 } 2186 2187 /* 2188 * directory replay has two parts. There are the standard directory 2189 * items in the log copied from the subvolume, and range items 2190 * created in the log while the subvolume was logged. 2191 * 2192 * The range items tell us which parts of the key space the log 2193 * is authoritative for. During replay, if a key in the subvolume 2194 * directory is in a logged range item, but not actually in the log, 2195 * that means it was deleted from the directory before the fsync 2196 * and should be removed.
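 *
 * A hypothetical example: suppose the log contains a range item
 * (dirid, BTRFS_DIR_LOG_INDEX_KEY, 10) whose dir_log_end is 20, making the
 * log authoritative for index offsets 10 through 20. If the subvolume
 * directory still has a BTRFS_DIR_INDEX_KEY item at offset 15 with no
 * matching dentry in the log, that entry was deleted before the fsync and
 * replay removes it.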
2197 */ 2198 static noinline int find_dir_range(struct btrfs_root *root, 2199 struct btrfs_path *path, 2200 u64 dirid, 2201 u64 *start_ret, u64 *end_ret) 2202 { 2203 struct btrfs_key key; 2204 u64 found_end; 2205 struct btrfs_dir_log_item *item; 2206 int ret; 2207 int nritems; 2208 2209 if (*start_ret == (u64)-1) 2210 return 1; 2211 2212 key.objectid = dirid; 2213 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2214 key.offset = *start_ret; 2215 2216 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2217 if (ret < 0) 2218 goto out; 2219 if (ret > 0) { 2220 if (path->slots[0] == 0) 2221 goto out; 2222 path->slots[0]--; 2223 } 2224 if (ret != 0) 2225 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2226 2227 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2228 ret = 1; 2229 goto next; 2230 } 2231 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2232 struct btrfs_dir_log_item); 2233 found_end = btrfs_dir_log_end(path->nodes[0], item); 2234 2235 if (*start_ret >= key.offset && *start_ret <= found_end) { 2236 ret = 0; 2237 *start_ret = key.offset; 2238 *end_ret = found_end; 2239 goto out; 2240 } 2241 ret = 1; 2242 next: 2243 /* check the next slot in the tree to see if it is a valid item */ 2244 nritems = btrfs_header_nritems(path->nodes[0]); 2245 path->slots[0]++; 2246 if (path->slots[0] >= nritems) { 2247 ret = btrfs_next_leaf(root, path); 2248 if (ret) 2249 goto out; 2250 } 2251 2252 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2253 2254 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2255 ret = 1; 2256 goto out; 2257 } 2258 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2259 struct btrfs_dir_log_item); 2260 found_end = btrfs_dir_log_end(path->nodes[0], item); 2261 *start_ret = key.offset; 2262 *end_ret = found_end; 2263 ret = 0; 2264 out: 2265 btrfs_release_path(path); 2266 return ret; 2267 } 2268 2269 /* 2270 * this looks for a given directory item in the log. If the directory 2271 * item is not in the log, the item is removed and the inode it points 2272 * to is unlinked 2273 */ 2274 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2275 struct btrfs_root *log, 2276 struct btrfs_path *path, 2277 struct btrfs_path *log_path, 2278 struct inode *dir, 2279 struct btrfs_key *dir_key) 2280 { 2281 struct btrfs_root *root = BTRFS_I(dir)->root; 2282 int ret; 2283 struct extent_buffer *eb; 2284 int slot; 2285 struct btrfs_dir_item *di; 2286 int name_len; 2287 char *name; 2288 struct inode *inode = NULL; 2289 struct btrfs_key location; 2290 2291 /* 2292 * Currently we only log dir index keys. Even if we replay a log created 2293 * by an older kernel that logged both dir index and dir item keys, all 2294 * we need to do is process the dir index keys, we (and our caller) can 2295 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 
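 * (Dir item keys get recreated as a side effect of replaying the dir index
 * keys, see replay_one_name(), so handling them here would be redundant.)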
2296 */ 2297 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); 2298 2299 eb = path->nodes[0]; 2300 slot = path->slots[0]; 2301 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2302 name_len = btrfs_dir_name_len(eb, di); 2303 name = kmalloc(name_len, GFP_NOFS); 2304 if (!name) { 2305 ret = -ENOMEM; 2306 goto out; 2307 } 2308 2309 read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); 2310 2311 if (log) { 2312 struct btrfs_dir_item *log_di; 2313 2314 log_di = btrfs_lookup_dir_index_item(trans, log, log_path, 2315 dir_key->objectid, 2316 dir_key->offset, 2317 name, name_len, 0); 2318 if (IS_ERR(log_di)) { 2319 ret = PTR_ERR(log_di); 2320 goto out; 2321 } else if (log_di) { 2322 /* The dentry exists in the log, we have nothing to do. */ 2323 ret = 0; 2324 goto out; 2325 } 2326 } 2327 2328 btrfs_dir_item_key_to_cpu(eb, di, &location); 2329 btrfs_release_path(path); 2330 btrfs_release_path(log_path); 2331 inode = read_one_inode(root, location.objectid); 2332 if (!inode) { 2333 ret = -EIO; 2334 goto out; 2335 } 2336 2337 ret = link_to_fixup_dir(trans, root, path, location.objectid); 2338 if (ret) 2339 goto out; 2340 2341 inc_nlink(inode); 2342 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), 2343 name, name_len); 2344 /* 2345 * Unlike dir item keys, dir index keys can only have one name (entry) in 2346 * them, as there are no key collisions since each key has a unique offset 2347 * (an index number), so we're done. 2348 */ 2349 out: 2350 btrfs_release_path(path); 2351 btrfs_release_path(log_path); 2352 kfree(name); 2353 iput(inode); 2354 return ret; 2355 } 2356 2357 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2358 struct btrfs_root *root, 2359 struct btrfs_root *log, 2360 struct btrfs_path *path, 2361 const u64 ino) 2362 { 2363 struct btrfs_key search_key; 2364 struct btrfs_path *log_path; 2365 int i; 2366 int nritems; 2367 int ret; 2368 2369 log_path = btrfs_alloc_path(); 2370 if (!log_path) 2371 return -ENOMEM; 2372 2373 search_key.objectid = ino; 2374 search_key.type = BTRFS_XATTR_ITEM_KEY; 2375 search_key.offset = 0; 2376 again: 2377 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2378 if (ret < 0) 2379 goto out; 2380 process_leaf: 2381 nritems = btrfs_header_nritems(path->nodes[0]); 2382 for (i = path->slots[0]; i < nritems; i++) { 2383 struct btrfs_key key; 2384 struct btrfs_dir_item *di; 2385 struct btrfs_dir_item *log_di; 2386 u32 total_size; 2387 u32 cur; 2388 2389 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2390 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2391 ret = 0; 2392 goto out; 2393 } 2394 2395 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2396 total_size = btrfs_item_size(path->nodes[0], i); 2397 cur = 0; 2398 while (cur < total_size) { 2399 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2400 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2401 u32 this_len = sizeof(*di) + name_len + data_len; 2402 char *name; 2403 2404 name = kmalloc(name_len, GFP_NOFS); 2405 if (!name) { 2406 ret = -ENOMEM; 2407 goto out; 2408 } 2409 read_extent_buffer(path->nodes[0], name, 2410 (unsigned long)(di + 1), name_len); 2411 2412 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2413 name, name_len, 0); 2414 btrfs_release_path(log_path); 2415 if (!log_di) { 2416 /* Doesn't exist in log tree, so delete it. 
*/ 2417 btrfs_release_path(path); 2418 di = btrfs_lookup_xattr(trans, root, path, ino, 2419 name, name_len, -1); 2420 kfree(name); 2421 if (IS_ERR(di)) { 2422 ret = PTR_ERR(di); 2423 goto out; 2424 } 2425 ASSERT(di); 2426 ret = btrfs_delete_one_dir_name(trans, root, 2427 path, di); 2428 if (ret) 2429 goto out; 2430 btrfs_release_path(path); 2431 search_key = key; 2432 goto again; 2433 } 2434 kfree(name); 2435 if (IS_ERR(log_di)) { 2436 ret = PTR_ERR(log_di); 2437 goto out; 2438 } 2439 cur += this_len; 2440 di = (struct btrfs_dir_item *)((char *)di + this_len); 2441 } 2442 } 2443 ret = btrfs_next_leaf(root, path); 2444 if (ret > 0) 2445 ret = 0; 2446 else if (ret == 0) 2447 goto process_leaf; 2448 out: 2449 btrfs_free_path(log_path); 2450 btrfs_release_path(path); 2451 return ret; 2452 } 2453 2454 2455 /* 2456 * deletion replay happens before we copy any new directory items 2457 * out of the log or out of backreferences from inodes. It 2458 * scans the log to find ranges of keys that log is authoritative for, 2459 * and then scans the directory to find items in those ranges that are 2460 * not present in the log. 2461 * 2462 * Anything we don't find in the log is unlinked and removed from the 2463 * directory. 2464 */ 2465 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2466 struct btrfs_root *root, 2467 struct btrfs_root *log, 2468 struct btrfs_path *path, 2469 u64 dirid, int del_all) 2470 { 2471 u64 range_start; 2472 u64 range_end; 2473 int ret = 0; 2474 struct btrfs_key dir_key; 2475 struct btrfs_key found_key; 2476 struct btrfs_path *log_path; 2477 struct inode *dir; 2478 2479 dir_key.objectid = dirid; 2480 dir_key.type = BTRFS_DIR_INDEX_KEY; 2481 log_path = btrfs_alloc_path(); 2482 if (!log_path) 2483 return -ENOMEM; 2484 2485 dir = read_one_inode(root, dirid); 2486 /* it isn't an error if the inode isn't there, that can happen 2487 * because we replay the deletes before we copy in the inode item 2488 * from the log 2489 */ 2490 if (!dir) { 2491 btrfs_free_path(log_path); 2492 return 0; 2493 } 2494 2495 range_start = 0; 2496 range_end = 0; 2497 while (1) { 2498 if (del_all) 2499 range_end = (u64)-1; 2500 else { 2501 ret = find_dir_range(log, path, dirid, 2502 &range_start, &range_end); 2503 if (ret < 0) 2504 goto out; 2505 else if (ret > 0) 2506 break; 2507 } 2508 2509 dir_key.offset = range_start; 2510 while (1) { 2511 int nritems; 2512 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2513 0, 0); 2514 if (ret < 0) 2515 goto out; 2516 2517 nritems = btrfs_header_nritems(path->nodes[0]); 2518 if (path->slots[0] >= nritems) { 2519 ret = btrfs_next_leaf(root, path); 2520 if (ret == 1) 2521 break; 2522 else if (ret < 0) 2523 goto out; 2524 } 2525 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2526 path->slots[0]); 2527 if (found_key.objectid != dirid || 2528 found_key.type != dir_key.type) { 2529 ret = 0; 2530 goto out; 2531 } 2532 2533 if (found_key.offset > range_end) 2534 break; 2535 2536 ret = check_item_in_log(trans, log, path, 2537 log_path, dir, 2538 &found_key); 2539 if (ret) 2540 goto out; 2541 if (found_key.offset == (u64)-1) 2542 break; 2543 dir_key.offset = found_key.offset + 1; 2544 } 2545 btrfs_release_path(path); 2546 if (range_end == (u64)-1) 2547 break; 2548 range_start = range_end + 1; 2549 } 2550 ret = 0; 2551 out: 2552 btrfs_release_path(path); 2553 btrfs_free_path(log_path); 2554 iput(dir); 2555 return ret; 2556 } 2557 2558 /* 2559 * the process_func used to replay items from the log tree. This 2560 * gets called in two different stages. 
The first stage just looks 2561 * for inodes and makes sure they are all copied into the subvolume. 2562 * 2563 * The second stage copies all the other item types from the log into 2564 * the subvolume. The two stage approach is slower, but gets rid of 2565 * lots of complexity around inodes referencing other inodes that exist 2566 * only in the log (references come from either directory items or inode 2567 * back refs). 2568 */ 2569 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2570 struct walk_control *wc, u64 gen, int level) 2571 { 2572 int nritems; 2573 struct btrfs_path *path; 2574 struct btrfs_root *root = wc->replay_dest; 2575 struct btrfs_key key; 2576 int i; 2577 int ret; 2578 2579 ret = btrfs_read_extent_buffer(eb, gen, level, NULL); 2580 if (ret) 2581 return ret; 2582 2583 level = btrfs_header_level(eb); 2584 2585 if (level != 0) 2586 return 0; 2587 2588 path = btrfs_alloc_path(); 2589 if (!path) 2590 return -ENOMEM; 2591 2592 nritems = btrfs_header_nritems(eb); 2593 for (i = 0; i < nritems; i++) { 2594 btrfs_item_key_to_cpu(eb, &key, i); 2595 2596 /* inode keys are done during the first stage */ 2597 if (key.type == BTRFS_INODE_ITEM_KEY && 2598 wc->stage == LOG_WALK_REPLAY_INODES) { 2599 struct btrfs_inode_item *inode_item; 2600 u32 mode; 2601 2602 inode_item = btrfs_item_ptr(eb, i, 2603 struct btrfs_inode_item); 2604 /* 2605 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2606 * and never got linked before the fsync, skip it, as 2607 * replaying it is pointless since it would be deleted 2608 * later. We skip logging tmpfiles, but it's always 2609 * possible we are replaying a log created with a kernel 2610 * that used to log tmpfiles. 2611 */ 2612 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2613 wc->ignore_cur_inode = true; 2614 continue; 2615 } else { 2616 wc->ignore_cur_inode = false; 2617 } 2618 ret = replay_xattr_deletes(wc->trans, root, log, 2619 path, key.objectid); 2620 if (ret) 2621 break; 2622 mode = btrfs_inode_mode(eb, inode_item); 2623 if (S_ISDIR(mode)) { 2624 ret = replay_dir_deletes(wc->trans, 2625 root, log, path, key.objectid, 0); 2626 if (ret) 2627 break; 2628 } 2629 ret = overwrite_item(wc->trans, root, path, 2630 eb, i, &key); 2631 if (ret) 2632 break; 2633 2634 /* 2635 * Before replaying extents, truncate the inode to its 2636 * size. We need to do it now and not after log replay 2637 * because before an fsync we can have prealloc extents 2638 * added beyond the inode's i_size. If we did it after, 2639 * through orphan cleanup for example, we would drop 2640 * those prealloc extents just after replaying them. 2641 */ 2642 if (S_ISREG(mode)) { 2643 struct btrfs_drop_extents_args drop_args = { 0 }; 2644 struct inode *inode; 2645 u64 from; 2646 2647 inode = read_one_inode(root, key.objectid); 2648 if (!inode) { 2649 ret = -EIO; 2650 break; 2651 } 2652 from = ALIGN(i_size_read(inode), 2653 root->fs_info->sectorsize); 2654 drop_args.start = from; 2655 drop_args.end = (u64)-1; 2656 drop_args.drop_cache = true; 2657 ret = btrfs_drop_extents(wc->trans, root, 2658 BTRFS_I(inode), 2659 &drop_args); 2660 if (!ret) { 2661 inode_sub_bytes(inode, 2662 drop_args.bytes_found); 2663 /* Update the inode's nbytes. 
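inode_sub_bytes() above only adjusted the in-memory byte counter; btrfs_update_inode() persists it in the inode item.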
*/ 2664 ret = btrfs_update_inode(wc->trans, 2665 root, BTRFS_I(inode)); 2666 } 2667 iput(inode); 2668 if (ret) 2669 break; 2670 } 2671 2672 ret = link_to_fixup_dir(wc->trans, root, 2673 path, key.objectid); 2674 if (ret) 2675 break; 2676 } 2677 2678 if (wc->ignore_cur_inode) 2679 continue; 2680 2681 if (key.type == BTRFS_DIR_INDEX_KEY && 2682 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2683 ret = replay_one_dir_item(wc->trans, root, path, 2684 eb, i, &key); 2685 if (ret) 2686 break; 2687 } 2688 2689 if (wc->stage < LOG_WALK_REPLAY_ALL) 2690 continue; 2691 2692 /* these keys are simply copied */ 2693 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2694 ret = overwrite_item(wc->trans, root, path, 2695 eb, i, &key); 2696 if (ret) 2697 break; 2698 } else if (key.type == BTRFS_INODE_REF_KEY || 2699 key.type == BTRFS_INODE_EXTREF_KEY) { 2700 ret = add_inode_ref(wc->trans, root, log, path, 2701 eb, i, &key); 2702 if (ret && ret != -ENOENT) 2703 break; 2704 ret = 0; 2705 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2706 ret = replay_one_extent(wc->trans, root, path, 2707 eb, i, &key); 2708 if (ret) 2709 break; 2710 } 2711 /* 2712 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the 2713 * BTRFS_DIR_INDEX_KEY items which we use to derive the 2714 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an 2715 * older kernel with such keys, ignore them. 2716 */ 2717 } 2718 btrfs_free_path(path); 2719 return ret; 2720 } 2721 2722 /* 2723 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2724 */ 2725 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2726 { 2727 struct btrfs_block_group *cache; 2728 2729 cache = btrfs_lookup_block_group(fs_info, start); 2730 if (!cache) { 2731 btrfs_err(fs_info, "unable to find block group for %llu", start); 2732 return; 2733 } 2734 2735 spin_lock(&cache->space_info->lock); 2736 spin_lock(&cache->lock); 2737 cache->reserved -= fs_info->nodesize; 2738 cache->space_info->bytes_reserved -= fs_info->nodesize; 2739 spin_unlock(&cache->lock); 2740 spin_unlock(&cache->space_info->lock); 2741 2742 btrfs_put_block_group(cache); 2743 } 2744 2745 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2746 struct btrfs_root *root, 2747 struct btrfs_path *path, int *level, 2748 struct walk_control *wc) 2749 { 2750 struct btrfs_fs_info *fs_info = root->fs_info; 2751 u64 bytenr; 2752 u64 ptr_gen; 2753 struct extent_buffer *next; 2754 struct extent_buffer *cur; 2755 u32 blocksize; 2756 int ret = 0; 2757 2758 while (*level > 0) { 2759 struct btrfs_key first_key; 2760 2761 cur = path->nodes[*level]; 2762 2763 WARN_ON(btrfs_header_level(cur) != *level); 2764 2765 if (path->slots[*level] >= 2766 btrfs_header_nritems(cur)) 2767 break; 2768 2769 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2770 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2771 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2772 blocksize = fs_info->nodesize; 2773 2774 next = btrfs_find_create_tree_block(fs_info, bytenr, 2775 btrfs_header_owner(cur), 2776 *level - 1); 2777 if (IS_ERR(next)) 2778 return PTR_ERR(next); 2779 2780 if (*level == 1) { 2781 ret = wc->process_func(root, next, wc, ptr_gen, 2782 *level - 1); 2783 if (ret) { 2784 free_extent_buffer(next); 2785 return ret; 2786 } 2787 2788 path->slots[*level]++; 2789 if (wc->free) { 2790 ret = btrfs_read_extent_buffer(next, ptr_gen, 2791 *level - 1, &first_key); 2792 if (ret) { 2793 free_extent_buffer(next); 2794 return ret; 2795 } 2796 2797 if (trans) { 2798 
btrfs_tree_lock(next); 2799 btrfs_clean_tree_block(next); 2800 btrfs_wait_tree_block_writeback(next); 2801 btrfs_tree_unlock(next); 2802 ret = btrfs_pin_reserved_extent(trans, 2803 bytenr, blocksize); 2804 if (ret) { 2805 free_extent_buffer(next); 2806 return ret; 2807 } 2808 btrfs_redirty_list_add( 2809 trans->transaction, next); 2810 } else { 2811 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2812 clear_extent_buffer_dirty(next); 2813 unaccount_log_buffer(fs_info, bytenr); 2814 } 2815 } 2816 free_extent_buffer(next); 2817 continue; 2818 } 2819 ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); 2820 if (ret) { 2821 free_extent_buffer(next); 2822 return ret; 2823 } 2824 2825 if (path->nodes[*level-1]) 2826 free_extent_buffer(path->nodes[*level-1]); 2827 path->nodes[*level-1] = next; 2828 *level = btrfs_header_level(next); 2829 path->slots[*level] = 0; 2830 cond_resched(); 2831 } 2832 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2833 2834 cond_resched(); 2835 return 0; 2836 } 2837 2838 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2839 struct btrfs_root *root, 2840 struct btrfs_path *path, int *level, 2841 struct walk_control *wc) 2842 { 2843 struct btrfs_fs_info *fs_info = root->fs_info; 2844 int i; 2845 int slot; 2846 int ret; 2847 2848 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2849 slot = path->slots[i]; 2850 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2851 path->slots[i]++; 2852 *level = i; 2853 WARN_ON(*level == 0); 2854 return 0; 2855 } else { 2856 ret = wc->process_func(root, path->nodes[*level], wc, 2857 btrfs_header_generation(path->nodes[*level]), 2858 *level); 2859 if (ret) 2860 return ret; 2861 2862 if (wc->free) { 2863 struct extent_buffer *next; 2864 2865 next = path->nodes[*level]; 2866 2867 if (trans) { 2868 btrfs_tree_lock(next); 2869 btrfs_clean_tree_block(next); 2870 btrfs_wait_tree_block_writeback(next); 2871 btrfs_tree_unlock(next); 2872 ret = btrfs_pin_reserved_extent(trans, 2873 path->nodes[*level]->start, 2874 path->nodes[*level]->len); 2875 if (ret) 2876 return ret; 2877 btrfs_redirty_list_add(trans->transaction, 2878 next); 2879 } else { 2880 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2881 clear_extent_buffer_dirty(next); 2882 2883 unaccount_log_buffer(fs_info, 2884 path->nodes[*level]->start); 2885 } 2886 } 2887 free_extent_buffer(path->nodes[*level]); 2888 path->nodes[*level] = NULL; 2889 *level = i + 1; 2890 } 2891 } 2892 return 1; 2893 } 2894 2895 /* 2896 * drop the reference count on the tree rooted at 'log'. This traverses 2897 * the tree freeing any blocks that have a ref count of zero after being 2898 * decremented.
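 *
 * When wc->free is set, each visited block is either pinned for later
 * removal (when a transaction handle is given) or merely unaccounted via
 * unaccount_log_buffer() (when called without a transaction handle).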
2899 */ 2900 static int walk_log_tree(struct btrfs_trans_handle *trans, 2901 struct btrfs_root *log, struct walk_control *wc) 2902 { 2903 struct btrfs_fs_info *fs_info = log->fs_info; 2904 int ret = 0; 2905 int wret; 2906 int level; 2907 struct btrfs_path *path; 2908 int orig_level; 2909 2910 path = btrfs_alloc_path(); 2911 if (!path) 2912 return -ENOMEM; 2913 2914 level = btrfs_header_level(log->node); 2915 orig_level = level; 2916 path->nodes[level] = log->node; 2917 atomic_inc(&log->node->refs); 2918 path->slots[level] = 0; 2919 2920 while (1) { 2921 wret = walk_down_log_tree(trans, log, path, &level, wc); 2922 if (wret > 0) 2923 break; 2924 if (wret < 0) { 2925 ret = wret; 2926 goto out; 2927 } 2928 2929 wret = walk_up_log_tree(trans, log, path, &level, wc); 2930 if (wret > 0) 2931 break; 2932 if (wret < 0) { 2933 ret = wret; 2934 goto out; 2935 } 2936 } 2937 2938 /* was the root node processed? if not, catch it here */ 2939 if (path->nodes[orig_level]) { 2940 ret = wc->process_func(log, path->nodes[orig_level], wc, 2941 btrfs_header_generation(path->nodes[orig_level]), 2942 orig_level); 2943 if (ret) 2944 goto out; 2945 if (wc->free) { 2946 struct extent_buffer *next; 2947 2948 next = path->nodes[orig_level]; 2949 2950 if (trans) { 2951 btrfs_tree_lock(next); 2952 btrfs_clean_tree_block(next); 2953 btrfs_wait_tree_block_writeback(next); 2954 btrfs_tree_unlock(next); 2955 ret = btrfs_pin_reserved_extent(trans, 2956 next->start, next->len); 2957 if (ret) 2958 goto out; 2959 btrfs_redirty_list_add(trans->transaction, next); 2960 } else { 2961 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2962 clear_extent_buffer_dirty(next); 2963 unaccount_log_buffer(fs_info, next->start); 2964 } 2965 } 2966 } 2967 2968 out: 2969 btrfs_free_path(path); 2970 return ret; 2971 } 2972 2973 /* 2974 * helper function to update the item for a given subvolumes log root 2975 * in the tree of log roots 2976 */ 2977 static int update_log_root(struct btrfs_trans_handle *trans, 2978 struct btrfs_root *log, 2979 struct btrfs_root_item *root_item) 2980 { 2981 struct btrfs_fs_info *fs_info = log->fs_info; 2982 int ret; 2983 2984 if (log->log_transid == 1) { 2985 /* insert root item on the first sync */ 2986 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2987 &log->root_key, root_item); 2988 } else { 2989 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2990 &log->root_key, root_item); 2991 } 2992 return ret; 2993 } 2994 2995 static void wait_log_commit(struct btrfs_root *root, int transid) 2996 { 2997 DEFINE_WAIT(wait); 2998 int index = transid % 2; 2999 3000 /* 3001 * we only allow two pending log transactions at a time, 3002 * so we know that if ours is more than 2 older than the 3003 * current transaction, we're done 3004 */ 3005 for (;;) { 3006 prepare_to_wait(&root->log_commit_wait[index], 3007 &wait, TASK_UNINTERRUPTIBLE); 3008 3009 if (!(root->log_transid_committed < transid && 3010 atomic_read(&root->log_commit[index]))) 3011 break; 3012 3013 mutex_unlock(&root->log_mutex); 3014 schedule(); 3015 mutex_lock(&root->log_mutex); 3016 } 3017 finish_wait(&root->log_commit_wait[index], &wait); 3018 } 3019 3020 static void wait_for_writer(struct btrfs_root *root) 3021 { 3022 DEFINE_WAIT(wait); 3023 3024 for (;;) { 3025 prepare_to_wait(&root->log_writer_wait, &wait, 3026 TASK_UNINTERRUPTIBLE); 3027 if (!atomic_read(&root->log_writers)) 3028 break; 3029 3030 mutex_unlock(&root->log_mutex); 3031 schedule(); 3032 mutex_lock(&root->log_mutex); 3033 } 3034 finish_wait(&root->log_writer_wait, 
&wait); 3035 } 3036 3037 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 3038 struct btrfs_log_ctx *ctx) 3039 { 3040 mutex_lock(&root->log_mutex); 3041 list_del_init(&ctx->list); 3042 mutex_unlock(&root->log_mutex); 3043 } 3044 3045 /* 3046 * Invoked in log mutex context, or when we are sure there is no other task 3047 * that can access the list. 3048 */ 3049 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 3050 int index, int error) 3051 { 3052 struct btrfs_log_ctx *ctx; 3053 struct btrfs_log_ctx *safe; 3054 3055 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 3056 list_del_init(&ctx->list); 3057 ctx->log_ret = error; 3058 } 3059 } 3060 3061 /* 3062 * btrfs_sync_log sends a given tree log down to the disk and 3063 * updates the super blocks to record it. When this call is done, 3064 * you know that any inodes previously logged are safely on disk only 3065 * if it returns 0. 3066 * 3067 * Any other return value means you need to call btrfs_commit_transaction. 3068 * Some of the edge cases for fsyncing directories that have had unlinks 3069 * or renames done in the past mean that sometimes the only safe 3070 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3071 * that has happened. 3072 */ 3073 int btrfs_sync_log(struct btrfs_trans_handle *trans, 3074 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3075 { 3076 int index1; 3077 int index2; 3078 int mark; 3079 int ret; 3080 struct btrfs_fs_info *fs_info = root->fs_info; 3081 struct btrfs_root *log = root->log_root; 3082 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3083 struct btrfs_root_item new_root_item; 3084 int log_transid = 0; 3085 struct btrfs_log_ctx root_log_ctx; 3086 struct blk_plug plug; 3087 u64 log_root_start; 3088 u64 log_root_level; 3089 3090 mutex_lock(&root->log_mutex); 3091 log_transid = ctx->log_transid; 3092 if (root->log_transid_committed >= log_transid) { 3093 mutex_unlock(&root->log_mutex); 3094 return ctx->log_ret; 3095 } 3096 3097 index1 = log_transid % 2; 3098 if (atomic_read(&root->log_commit[index1])) { 3099 wait_log_commit(root, log_transid); 3100 mutex_unlock(&root->log_mutex); 3101 return ctx->log_ret; 3102 } 3103 ASSERT(log_transid == root->log_transid); 3104 atomic_set(&root->log_commit[index1], 1); 3105 3106 /* wait for previous tree log sync to complete */ 3107 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3108 wait_log_commit(root, log_transid - 1); 3109 3110 while (1) { 3111 int batch = atomic_read(&root->log_batch); 3112 /* when we're on an ssd, just kick the log commit out */ 3113 if (!btrfs_test_opt(fs_info, SSD) && 3114 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3115 mutex_unlock(&root->log_mutex); 3116 schedule_timeout_uninterruptible(1); 3117 mutex_lock(&root->log_mutex); 3118 } 3119 wait_for_writer(root); 3120 if (batch == atomic_read(&root->log_batch)) 3121 break; 3122 } 3123 3124 /* bail out if we need to do a full commit */ 3125 if (btrfs_need_log_full_commit(trans)) { 3126 ret = BTRFS_LOG_FORCE_COMMIT; 3127 mutex_unlock(&root->log_mutex); 3128 goto out; 3129 } 3130 3131 if (log_transid % 2 == 0) 3132 mark = EXTENT_DIRTY; 3133 else 3134 mark = EXTENT_NEW; 3135 3136 /* we start IO on all the marked extents here, but we don't actually 3137 * wait for them until later.
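 *
 * Since log transids alternate between even and odd, the two in-flight log
 * commits tag their extents with different bits (EXTENT_DIRTY for an even
 * log_transid, EXTENT_NEW for an odd one) and can therefore be written and
 * waited on independently.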
3138 */ 3139 blk_start_plug(&plug); 3140 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3141 /* 3142 * -EAGAIN happens when someone, e.g., a concurrent transaction 3143 * commit, writes a dirty extent in this tree-log commit. This 3144 * concurrent write will create a hole writing out the extents, 3145 * and we cannot proceed on a zoned filesystem, which requires 3146 * sequential writing. While we could bail out to a full commit 3147 * here, we instead continue, hoping the concurrent writing fills 3148 * the hole. 3149 */ 3150 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 3151 ret = 0; 3152 if (ret) { 3153 blk_finish_plug(&plug); 3154 btrfs_abort_transaction(trans, ret); 3155 btrfs_set_log_full_commit(trans); 3156 mutex_unlock(&root->log_mutex); 3157 goto out; 3158 } 3159 3160 /* 3161 * We _must_ update under the root->log_mutex in order to make sure we 3162 * have a consistent view of the log root we are trying to commit at 3163 * this moment. 3164 * 3165 * We _must_ copy this into a local copy, because we are not holding the 3166 * log_root_tree->log_mutex yet. This is important because when we 3167 * commit the log_root_tree we must have a consistent view of the 3168 * log_root_tree when we update the super block to point at the 3169 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3170 * with the commit and possibly point at the new block which we may not 3171 * have written out. 3172 */ 3173 btrfs_set_root_node(&log->root_item, log->node); 3174 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3175 3176 root->log_transid++; 3177 log->log_transid = root->log_transid; 3178 root->log_start_pid = 0; 3179 /* 3180 * IO has been started, blocks of the log tree have WRITTEN flag set 3181 * in their headers. New modifications of the log will be written to 3182 * new positions, so it's safe to allow log writers to go in. 3183 */ 3184 mutex_unlock(&root->log_mutex); 3185 3186 if (btrfs_is_zoned(fs_info)) { 3187 mutex_lock(&fs_info->tree_root->log_mutex); 3188 if (!log_root_tree->node) { 3189 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 3190 if (ret) { 3191 mutex_unlock(&fs_info->tree_root->log_mutex); 3192 blk_finish_plug(&plug); 3193 goto out; 3194 } 3195 } 3196 mutex_unlock(&fs_info->tree_root->log_mutex); 3197 } 3198 3199 btrfs_init_log_ctx(&root_log_ctx, NULL); 3200 3201 mutex_lock(&log_root_tree->log_mutex); 3202 3203 index2 = log_root_tree->log_transid % 2; 3204 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3205 root_log_ctx.log_transid = log_root_tree->log_transid; 3206 3207 /* 3208 * Now we are safe to update the log_root_tree because we're under the 3209 * log_mutex, and we're a current writer so we're holding the commit 3210 * open until we drop the log_mutex.
3211 */ 3212 ret = update_log_root(trans, log, &new_root_item); 3213 if (ret) { 3214 if (!list_empty(&root_log_ctx.list)) 3215 list_del_init(&root_log_ctx.list); 3216 3217 blk_finish_plug(&plug); 3218 btrfs_set_log_full_commit(trans); 3219 3220 if (ret != -ENOSPC) { 3221 btrfs_abort_transaction(trans, ret); 3222 mutex_unlock(&log_root_tree->log_mutex); 3223 goto out; 3224 } 3225 btrfs_wait_tree_log_extents(log, mark); 3226 mutex_unlock(&log_root_tree->log_mutex); 3227 ret = BTRFS_LOG_FORCE_COMMIT; 3228 goto out; 3229 } 3230 3231 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3232 blk_finish_plug(&plug); 3233 list_del_init(&root_log_ctx.list); 3234 mutex_unlock(&log_root_tree->log_mutex); 3235 ret = root_log_ctx.log_ret; 3236 goto out; 3237 } 3238 3239 index2 = root_log_ctx.log_transid % 2; 3240 if (atomic_read(&log_root_tree->log_commit[index2])) { 3241 blk_finish_plug(&plug); 3242 ret = btrfs_wait_tree_log_extents(log, mark); 3243 wait_log_commit(log_root_tree, 3244 root_log_ctx.log_transid); 3245 mutex_unlock(&log_root_tree->log_mutex); 3246 if (!ret) 3247 ret = root_log_ctx.log_ret; 3248 goto out; 3249 } 3250 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3251 atomic_set(&log_root_tree->log_commit[index2], 1); 3252 3253 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3254 wait_log_commit(log_root_tree, 3255 root_log_ctx.log_transid - 1); 3256 } 3257 3258 /* 3259 * now that we've moved on to the tree of log tree roots, 3260 * check the full commit flag again 3261 */ 3262 if (btrfs_need_log_full_commit(trans)) { 3263 blk_finish_plug(&plug); 3264 btrfs_wait_tree_log_extents(log, mark); 3265 mutex_unlock(&log_root_tree->log_mutex); 3266 ret = BTRFS_LOG_FORCE_COMMIT; 3267 goto out_wake_log_root; 3268 } 3269 3270 ret = btrfs_write_marked_extents(fs_info, 3271 &log_root_tree->dirty_log_pages, 3272 EXTENT_DIRTY | EXTENT_NEW); 3273 blk_finish_plug(&plug); 3274 /* 3275 * As described above, -EAGAIN indicates a hole in the extents. We 3276 * cannot wait for these writeouts since waiting would cause a 3277 * deadlock. Bail out to the full commit instead.
3278 */ 3279 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3280 btrfs_set_log_full_commit(trans); 3281 btrfs_wait_tree_log_extents(log, mark); 3282 mutex_unlock(&log_root_tree->log_mutex); 3283 goto out_wake_log_root; 3284 } else if (ret) { 3285 btrfs_set_log_full_commit(trans); 3286 btrfs_abort_transaction(trans, ret); 3287 mutex_unlock(&log_root_tree->log_mutex); 3288 goto out_wake_log_root; 3289 } 3290 ret = btrfs_wait_tree_log_extents(log, mark); 3291 if (!ret) 3292 ret = btrfs_wait_tree_log_extents(log_root_tree, 3293 EXTENT_NEW | EXTENT_DIRTY); 3294 if (ret) { 3295 btrfs_set_log_full_commit(trans); 3296 mutex_unlock(&log_root_tree->log_mutex); 3297 goto out_wake_log_root; 3298 } 3299 3300 log_root_start = log_root_tree->node->start; 3301 log_root_level = btrfs_header_level(log_root_tree->node); 3302 log_root_tree->log_transid++; 3303 mutex_unlock(&log_root_tree->log_mutex); 3304 3305 /* 3306 * Here we are guaranteed that nobody is going to write the superblock 3307 * for the current transaction before us, and that we do not write 3308 * our superblock before the previous transaction finishes its commit 3309 * and writes its superblock, because: 3310 * 3311 * 1) We are holding a handle on the current transaction, so nobody 3312 * can commit it until we release the handle; 3313 * 3314 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3315 * if the previous transaction is still committing, and hasn't yet 3316 * written its superblock, we wait for it to do so, because a 3317 * transaction commit acquires the tree_log_mutex when the commit 3318 * begins and releases it only after writing its superblock. 3319 */ 3320 mutex_lock(&fs_info->tree_log_mutex); 3321 3322 /* 3323 * The previous transaction writeout phase could have failed, and thus 3324 * marked the fs in an error state. We must not commit here, as we 3325 * could have updated our generation in the super_for_commit and 3326 * writing the super here would result in transid mismatches. If there 3327 * is an error here just bail. 3328 */ 3329 if (BTRFS_FS_ERROR(fs_info)) { 3330 ret = -EIO; 3331 btrfs_set_log_full_commit(trans); 3332 btrfs_abort_transaction(trans, ret); 3333 mutex_unlock(&fs_info->tree_log_mutex); 3334 goto out_wake_log_root; 3335 } 3336 3337 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3338 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3339 ret = write_all_supers(fs_info, 1); 3340 mutex_unlock(&fs_info->tree_log_mutex); 3341 if (ret) { 3342 btrfs_set_log_full_commit(trans); 3343 btrfs_abort_transaction(trans, ret); 3344 goto out_wake_log_root; 3345 } 3346 3347 /* 3348 * We know there can only be one task here, since we have not yet set 3349 * root->log_commit[index1] to 0 and any task attempting to sync the 3350 * log must wait for the previous log transaction to commit if it's 3351 * still in progress or wait for the current log transaction commit if 3352 * someone else already started it. We use <= and not < because the 3353 * first log transaction has an ID of 0.
3354 */ 3355 ASSERT(root->last_log_commit <= log_transid); 3356 root->last_log_commit = log_transid; 3357 3358 out_wake_log_root: 3359 mutex_lock(&log_root_tree->log_mutex); 3360 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3361 3362 log_root_tree->log_transid_committed++; 3363 atomic_set(&log_root_tree->log_commit[index2], 0); 3364 mutex_unlock(&log_root_tree->log_mutex); 3365 3366 /* 3367 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3368 * all the updates above are seen by the woken threads. It might not be 3369 * necessary, but proving that seems to be hard. 3370 */ 3371 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3372 out: 3373 mutex_lock(&root->log_mutex); 3374 btrfs_remove_all_log_ctxs(root, index1, ret); 3375 root->log_transid_committed++; 3376 atomic_set(&root->log_commit[index1], 0); 3377 mutex_unlock(&root->log_mutex); 3378 3379 /* 3380 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3381 * all the updates above are seen by the woken threads. It might not be 3382 * necessary, but proving that seems to be hard. 3383 */ 3384 cond_wake_up(&root->log_commit_wait[index1]); 3385 return ret; 3386 } 3387 3388 static void free_log_tree(struct btrfs_trans_handle *trans, 3389 struct btrfs_root *log) 3390 { 3391 int ret; 3392 struct walk_control wc = { 3393 .free = 1, 3394 .process_func = process_one_buffer 3395 }; 3396 3397 if (log->node) { 3398 ret = walk_log_tree(trans, log, &wc); 3399 if (ret) { 3400 /* 3401 * We weren't able to traverse the entire log tree, the 3402 * typical scenario is getting an -EIO when reading an 3403 * extent buffer of the tree, due to a previous writeback 3404 * failure of it. 3405 */ 3406 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 3407 &log->fs_info->fs_state); 3408 3409 /* 3410 * Some extent buffers of the log tree may still be dirty 3411 * and not yet written back to storage, because we may 3412 * have updates to a log tree without syncing a log tree, 3413 * such as during rename and link operations. So flush 3414 * them out and wait for their writeback to complete, so 3415 * that we properly cleanup their state and pages. 3416 */ 3417 btrfs_write_marked_extents(log->fs_info, 3418 &log->dirty_log_pages, 3419 EXTENT_DIRTY | EXTENT_NEW); 3420 btrfs_wait_tree_log_extents(log, 3421 EXTENT_DIRTY | EXTENT_NEW); 3422 3423 if (trans) 3424 btrfs_abort_transaction(trans, ret); 3425 else 3426 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3427 } 3428 } 3429 3430 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3431 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3432 extent_io_tree_release(&log->log_csum_range); 3433 3434 btrfs_put_root(log); 3435 } 3436 3437 /* 3438 * free all the extents used by the tree log. This should be called 3439 * at commit time of the full transaction 3440 */ 3441 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3442 { 3443 if (root->log_root) { 3444 free_log_tree(trans, root->log_root); 3445 root->log_root = NULL; 3446 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3447 } 3448 return 0; 3449 } 3450 3451 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3452 struct btrfs_fs_info *fs_info) 3453 { 3454 if (fs_info->log_root_tree) { 3455 free_log_tree(trans, fs_info->log_root_tree); 3456 fs_info->log_root_tree = NULL; 3457 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3458 } 3459 return 0; 3460 } 3461 3462 /* 3463 * Check if an inode was logged in the current transaction. 
This correctly deals 3464 * with the case where the inode was logged but has a logged_trans of 0, which 3465 * happens if the inode is evicted and loaded again, as logged_trans is an in 3466 * memory only field (not persisted). 3467 * 3468 * Returns 1 if the inode was logged before in the current transaction, 0 if it 3469 * was not, and < 0 on error. 3470 */ 3471 static int inode_logged(struct btrfs_trans_handle *trans, 3472 struct btrfs_inode *inode, 3473 struct btrfs_path *path_in) 3474 { 3475 struct btrfs_path *path = path_in; 3476 struct btrfs_key key; 3477 int ret; 3478 3479 if (inode->logged_trans == trans->transid) 3480 return 1; 3481 3482 /* 3483 * If logged_trans is not 0, then we know the inode was not logged 3484 * in this transaction, so we can return 0 right away. 3485 */ 3486 if (inode->logged_trans > 0) 3487 return 0; 3488 3489 /* 3490 * If no log tree was created for this root in this transaction, then 3491 * the inode can not have been logged in this transaction. In that case 3492 * set logged_trans to anything greater than 0 and less than the current 3493 * transaction's ID, to avoid the search below in a future call in case 3494 * a log tree gets created after this. 3495 */ 3496 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { 3497 inode->logged_trans = trans->transid - 1; 3498 return 0; 3499 } 3500 3501 /* 3502 * We have a log tree and the inode's logged_trans is 0. We can't tell 3503 * for sure if the inode was logged before in this transaction by looking 3504 * only at logged_trans. We could be pessimistic and assume it was, but 3505 * that can lead to unnecessarily logging an inode during rename and link 3506 * operations, and then further updating the log in followup rename and 3507 * link operations, especially if it's a directory, which adds latency 3508 * visible to applications doing a series of rename or link operations. 3509 * 3510 * A logged_trans of 0 here can mean several things: 3511 * 3512 * 1) The inode was never logged since the filesystem was mounted, and may 3513 * or may not have been evicted and loaded again; 3514 * 3515 * 2) The inode was logged in a previous transaction, then evicted and 3516 * then loaded again; 3517 * 3518 * 3) The inode was logged in the current transaction, then evicted and 3519 * then loaded again. 3520 * 3521 * For cases 1) and 2) we don't want to return true, but we need to detect 3522 * case 3) and return true. So we do a search in the log root for the inode 3523 * item. 3524 */ 3525 key.objectid = btrfs_ino(inode); 3526 key.type = BTRFS_INODE_ITEM_KEY; 3527 key.offset = 0; 3528 3529 if (!path) { 3530 path = btrfs_alloc_path(); 3531 if (!path) 3532 return -ENOMEM; 3533 } 3534 3535 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3536 3537 if (path_in) 3538 btrfs_release_path(path); 3539 else 3540 btrfs_free_path(path); 3541 3542 /* 3543 * Logging an inode always results in logging its inode item. So if we 3544 * did not find the item we know for sure the inode was not logged. 3545 */ 3546 if (ret < 0) { 3547 return ret; 3548 } else if (ret > 0) { 3549 /* 3550 * Set logged_trans to a value greater than 0 and less than the 3551 * current transaction's ID to avoid doing the search in future calls. 3552 */ 3553 inode->logged_trans = trans->transid - 1; 3554 return 0; 3555 } 3556 3557 /* 3558 * The inode was previously logged and then evicted, set logged_trans to 3559 * the current transaction's ID, to avoid future tree searches as long as 3560 * the inode is not evicted again.
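 *
 * For example: an inode is logged in transaction N, evicted, loaded again
 * and fsynced again while N is still open. Its logged_trans is 0, but the
 * search above finds its inode item in the log root, so we return 1 and
 * cache the transid below to skip the search on future calls.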
3561 */ 3562 inode->logged_trans = trans->transid; 3563 3564 /* 3565 * If it's a directory, then we must set last_dir_index_offset to the 3566 * maximum possible value, so that the next attempt to log the inode does 3567 * not skip checking if dir index keys found in modified subvolume tree 3568 * leaves have been logged before, otherwise it would result in attempts 3569 * to insert duplicate dir index keys in the log tree. This must be done 3570 * because last_dir_index_offset is an in-memory only field, not persisted 3571 * in the inode item or any other on-disk structure, so its value is lost 3572 * once the inode is evicted. 3573 */ 3574 if (S_ISDIR(inode->vfs_inode.i_mode)) 3575 inode->last_dir_index_offset = (u64)-1; 3576 3577 return 1; 3578 } 3579 3580 /* 3581 * Delete a directory entry from the log if it exists. 3582 * 3583 * Returns < 0 on error 3584 * 1 if the entry does not exist 3585 * 0 if the entry existed and was successfully deleted 3586 */ 3587 static int del_logged_dentry(struct btrfs_trans_handle *trans, 3588 struct btrfs_root *log, 3589 struct btrfs_path *path, 3590 u64 dir_ino, 3591 const char *name, int name_len, 3592 u64 index) 3593 { 3594 struct btrfs_dir_item *di; 3595 3596 /* 3597 * We only log dir index items of a directory, so we don't need to look 3598 * for dir item keys. 3599 */ 3600 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3601 index, name, name_len, -1); 3602 if (IS_ERR(di)) 3603 return PTR_ERR(di); 3604 else if (!di) 3605 return 1; 3606 3607 /* 3608 * We do not need to update the size field of the directory's 3609 * inode item because on log replay we update the field to reflect 3610 * all existing entries in the directory (see overwrite_item()). 3611 */ 3612 return btrfs_delete_one_dir_name(trans, log, path, di); 3613 } 3614 3615 /* 3616 * If both a file and directory are logged, and unlinks or renames are 3617 * mixed in, we have a few interesting corners: 3618 * 3619 * create file X in dir Y 3620 * link file X to X.link in dir Y 3621 * fsync file X 3622 * unlink file X but leave X.link 3623 * fsync dir Y 3624 * 3625 * After a crash we would expect only X.link to exist. But file X 3626 * didn't get fsync'd again so the log has back refs for X and X.link. 3627 * 3628 * We solve this by removing directory entries and inode backrefs from the 3629 * log when a file that was logged in the current transaction is 3630 * unlinked. Any later fsync will include the updated log entries, and 3631 * we'll be able to reconstruct the proper directory items from backrefs. 3632 * 3633 * This optimization allows us to avoid relogging the entire inode 3634 * or the entire directory.
3635 */ 3636 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3637 struct btrfs_root *root, 3638 const char *name, int name_len, 3639 struct btrfs_inode *dir, u64 index) 3640 { 3641 struct btrfs_path *path; 3642 int ret; 3643 3644 ret = inode_logged(trans, dir, NULL); 3645 if (ret == 0) 3646 return; 3647 else if (ret < 0) { 3648 btrfs_set_log_full_commit(trans); 3649 return; 3650 } 3651 3652 ret = join_running_log_trans(root); 3653 if (ret) 3654 return; 3655 3656 mutex_lock(&dir->log_mutex); 3657 3658 path = btrfs_alloc_path(); 3659 if (!path) { 3660 ret = -ENOMEM; 3661 goto out_unlock; 3662 } 3663 3664 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), 3665 name, name_len, index); 3666 btrfs_free_path(path); 3667 out_unlock: 3668 mutex_unlock(&dir->log_mutex); 3669 if (ret < 0) 3670 btrfs_set_log_full_commit(trans); 3671 btrfs_end_log_trans(root); 3672 } 3673 3674 /* see comments for btrfs_del_dir_entries_in_log */ 3675 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3676 struct btrfs_root *root, 3677 const char *name, int name_len, 3678 struct btrfs_inode *inode, u64 dirid) 3679 { 3680 struct btrfs_root *log; 3681 u64 index; 3682 int ret; 3683 3684 ret = inode_logged(trans, inode, NULL); 3685 if (ret == 0) 3686 return; 3687 else if (ret < 0) { 3688 btrfs_set_log_full_commit(trans); 3689 return; 3690 } 3691 3692 ret = join_running_log_trans(root); 3693 if (ret) 3694 return; 3695 log = root->log_root; 3696 mutex_lock(&inode->log_mutex); 3697 3698 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3699 dirid, &index); 3700 mutex_unlock(&inode->log_mutex); 3701 if (ret < 0 && ret != -ENOENT) 3702 btrfs_set_log_full_commit(trans); 3703 btrfs_end_log_trans(root); 3704 } 3705 3706 /* 3707 * creates a range item in the log for 'dirid'. first_offset and 3708 * last_offset tell us which parts of the key space the log should 3709 * be considered authoritative for. 3710 */ 3711 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3712 struct btrfs_root *log, 3713 struct btrfs_path *path, 3714 u64 dirid, 3715 u64 first_offset, u64 last_offset) 3716 { 3717 int ret; 3718 struct btrfs_key key; 3719 struct btrfs_dir_log_item *item; 3720 3721 key.objectid = dirid; 3722 key.offset = first_offset; 3723 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3724 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3725 /* 3726 * -EEXIST is fine and can happen sporadically when we are logging a 3727 * directory and have concurrent insertions in the subvolume's tree for 3728 * items from other inodes and that result in pushing off some dir items 3729 * from one leaf to another in order to accommodate for the new items. 3730 * This results in logging the same dir index range key. 3731 */ 3732 if (ret && ret != -EEXIST) 3733 return ret; 3734 3735 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3736 struct btrfs_dir_log_item); 3737 if (ret == -EEXIST) { 3738 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); 3739 3740 /* 3741 * btrfs_del_dir_entries_in_log() might have been called during 3742 * an unlink between the initial insertion of this key and the 3743 * current update, or we might be logging a single entry deletion 3744 * during a rename, so set the new last_offset to the max value. 
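 * Taking the maximum of the existing and new end offsets below keeps the
 * range item authoritative for the union of both ranges instead of
 * shrinking it.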
3745 */ 3746 last_offset = max(last_offset, curr_end); 3747 } 3748 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3749 btrfs_mark_buffer_dirty(path->nodes[0]); 3750 btrfs_release_path(path); 3751 return 0; 3752 } 3753 3754 static int flush_dir_items_batch(struct btrfs_trans_handle *trans, 3755 struct btrfs_root *log, 3756 struct extent_buffer *src, 3757 struct btrfs_path *dst_path, 3758 int start_slot, 3759 int count) 3760 { 3761 char *ins_data = NULL; 3762 struct btrfs_item_batch batch; 3763 struct extent_buffer *dst; 3764 unsigned long src_offset; 3765 unsigned long dst_offset; 3766 struct btrfs_key key; 3767 u32 item_size; 3768 int ret; 3769 int i; 3770 3771 ASSERT(count > 0); 3772 batch.nr = count; 3773 3774 if (count == 1) { 3775 btrfs_item_key_to_cpu(src, &key, start_slot); 3776 item_size = btrfs_item_size(src, start_slot); 3777 batch.keys = &key; 3778 batch.data_sizes = &item_size; 3779 batch.total_data_size = item_size; 3780 } else { 3781 struct btrfs_key *ins_keys; 3782 u32 *ins_sizes; 3783 3784 ins_data = kmalloc(count * sizeof(u32) + 3785 count * sizeof(struct btrfs_key), GFP_NOFS); 3786 if (!ins_data) 3787 return -ENOMEM; 3788 3789 ins_sizes = (u32 *)ins_data; 3790 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); 3791 batch.keys = ins_keys; 3792 batch.data_sizes = ins_sizes; 3793 batch.total_data_size = 0; 3794 3795 for (i = 0; i < count; i++) { 3796 const int slot = start_slot + i; 3797 3798 btrfs_item_key_to_cpu(src, &ins_keys[i], slot); 3799 ins_sizes[i] = btrfs_item_size(src, slot); 3800 batch.total_data_size += ins_sizes[i]; 3801 } 3802 } 3803 3804 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 3805 if (ret) 3806 goto out; 3807 3808 dst = dst_path->nodes[0]; 3809 /* 3810 * Copy all the items in bulk, in a single copy operation. Item data is 3811 * organized such that it's placed at the end of a leaf and from right 3812 * to left. For example, the data for the second item ends at an offset 3813 * that matches the offset where the data for the first item starts, the 3814 * data for the third item ends at an offset that matches the offset 3815 * where the data of the second item starts, and so on. 3816 * Therefore our source and destination start offsets for copy match the 3817 * offsets of the last items (highest slots).
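 *
 * A sketch with made-up offsets: if the source leaf's data for slot N
 * occupies bytes [3900, 4000), slot N + 1 occupies [3800, 3900) and slot
 * N + 2 occupies [3700, 3800), then copying all three items is one copy of
 * the range [3700, 4000), whose start offset is that of the last item
 * (slot N + 2, the highest slot).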
3818 */ 3819 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); 3820 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); 3821 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); 3822 btrfs_release_path(dst_path); 3823 out: 3824 kfree(ins_data); 3825 3826 return ret; 3827 } 3828 3829 static int process_dir_items_leaf(struct btrfs_trans_handle *trans, 3830 struct btrfs_inode *inode, 3831 struct btrfs_path *path, 3832 struct btrfs_path *dst_path, 3833 struct btrfs_log_ctx *ctx, 3834 u64 *last_old_dentry_offset) 3835 { 3836 struct btrfs_root *log = inode->root->log_root; 3837 struct extent_buffer *src = path->nodes[0]; 3838 const int nritems = btrfs_header_nritems(src); 3839 const u64 ino = btrfs_ino(inode); 3840 bool last_found = false; 3841 int batch_start = 0; 3842 int batch_size = 0; 3843 int i; 3844 3845 for (i = path->slots[0]; i < nritems; i++) { 3846 struct btrfs_dir_item *di; 3847 struct btrfs_key key; 3848 int ret; 3849 3850 btrfs_item_key_to_cpu(src, &key, i); 3851 3852 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { 3853 last_found = true; 3854 break; 3855 } 3856 3857 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3858 ctx->last_dir_item_offset = key.offset; 3859 3860 /* 3861 * Skip ranges of items that consist only of dir item keys created 3862 * in past transactions. However if we find a gap, we must log a 3863 * dir index range item for that gap, so that index keys in that 3864 * gap are deleted during log replay. 3865 */ 3866 if (btrfs_dir_transid(src, di) < trans->transid) { 3867 if (key.offset > *last_old_dentry_offset + 1) { 3868 ret = insert_dir_log_key(trans, log, dst_path, 3869 ino, *last_old_dentry_offset + 1, 3870 key.offset - 1); 3871 if (ret < 0) 3872 return ret; 3873 } 3874 3875 *last_old_dentry_offset = key.offset; 3876 continue; 3877 } 3878 /* 3879 * We must make sure that when we log a directory entry, the 3880 * corresponding inode, after log replay, has a matching link 3881 * count. For example: 3882 * 3883 * touch foo 3884 * mkdir mydir 3885 * sync 3886 * ln foo mydir/bar 3887 * xfs_io -c "fsync" mydir 3888 * <crash> 3889 * <mount fs and log replay> 3890 * 3891 * Would result in a fsync log that when replayed, our file inode 3892 * would have a link count of 1, but we get two directory entries 3893 * pointing to the same inode. After removing one of the names, 3894 * it would not be possible to remove the other name, which 3895 * resulted always in stale file handle errors, and would not be 3896 * possible to rmdir the parent directory, since its i_size could 3897 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, 3898 * resulting in -ENOTEMPTY errors. 3899 */ 3900 if (!ctx->log_new_dentries) { 3901 struct btrfs_key di_key; 3902 3903 btrfs_dir_item_key_to_cpu(src, di, &di_key); 3904 if (di_key.type != BTRFS_ROOT_ITEM_KEY) 3905 ctx->log_new_dentries = true; 3906 } 3907 3908 if (!ctx->logged_before) 3909 goto add_to_batch; 3910 3911 /* 3912 * If we were logged before and have logged dir items, we can skip 3913 * checking if any item with a key offset larger than the last one 3914 * we logged is in the log tree, saving time and avoiding adding 3915 * contention on the log tree. We can only rely on the value of 3916 * last_dir_index_offset when we know for sure that the inode was 3917 * previously logged in the current transaction. 3918 */ 3919 if (key.offset > inode->last_dir_index_offset) 3920 goto add_to_batch; 3921 /* 3922 * Check if the key was already logged before. 
If not we can add 3923 * it to a batch for bulk insertion. 3924 */ 3925 ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); 3926 if (ret < 0) { 3927 return ret; 3928 } else if (ret > 0) { 3929 btrfs_release_path(dst_path); 3930 goto add_to_batch; 3931 } 3932 3933 /* 3934 * Item exists in the log. Overwrite the item in the log if it 3935 * has different content or do nothing if it has exactly the same 3936 * content. And then flush the current batch if any - do it after 3937 * overwriting the current item, or we would deadlock otherwise, 3938 * since we are holding a path for the existing item. 3939 */ 3940 ret = do_overwrite_item(trans, log, dst_path, src, i, &key); 3941 if (ret < 0) 3942 return ret; 3943 3944 if (batch_size > 0) { 3945 ret = flush_dir_items_batch(trans, log, src, dst_path, 3946 batch_start, batch_size); 3947 if (ret < 0) 3948 return ret; 3949 batch_size = 0; 3950 } 3951 continue; 3952 add_to_batch: 3953 if (batch_size == 0) 3954 batch_start = i; 3955 batch_size++; 3956 } 3957 3958 if (batch_size > 0) { 3959 int ret; 3960 3961 ret = flush_dir_items_batch(trans, log, src, dst_path, 3962 batch_start, batch_size); 3963 if (ret < 0) 3964 return ret; 3965 } 3966 3967 return last_found ? 1 : 0; 3968 } 3969 3970 /* 3971 * log all the items included in the current transaction for a given 3972 * directory. This also creates the range items in the log tree required 3973 * to replay anything deleted before the fsync 3974 */ 3975 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3976 struct btrfs_inode *inode, 3977 struct btrfs_path *path, 3978 struct btrfs_path *dst_path, 3979 struct btrfs_log_ctx *ctx, 3980 u64 min_offset, u64 *last_offset_ret) 3981 { 3982 struct btrfs_key min_key; 3983 struct btrfs_root *root = inode->root; 3984 struct btrfs_root *log = root->log_root; 3985 int err = 0; 3986 int ret; 3987 u64 last_old_dentry_offset = min_offset - 1; 3988 u64 last_offset = (u64)-1; 3989 u64 ino = btrfs_ino(inode); 3990 3991 min_key.objectid = ino; 3992 min_key.type = BTRFS_DIR_INDEX_KEY; 3993 min_key.offset = min_offset; 3994 3995 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3996 3997 /* 3998 * we didn't find anything from this transaction, see if there 3999 * is anything at all 4000 */ 4001 if (ret != 0 || min_key.objectid != ino || 4002 min_key.type != BTRFS_DIR_INDEX_KEY) { 4003 min_key.objectid = ino; 4004 min_key.type = BTRFS_DIR_INDEX_KEY; 4005 min_key.offset = (u64)-1; 4006 btrfs_release_path(path); 4007 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 4008 if (ret < 0) { 4009 btrfs_release_path(path); 4010 return ret; 4011 } 4012 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 4013 4014 /* if ret == 0 there are items for this type, 4015 * create a range to tell us the last key of this type. 4016 * otherwise, there are no items in this directory after 4017 * *min_offset, and we create a range to indicate that. 
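		 *
		 * For example (offsets made up): if the directory's last dir
		 * index key has offset 12 and *min_offset is 20, we log a
		 * BTRFS_DIR_LOG_INDEX_KEY range starting at offset 13, so
		 * that on log replay any stale dir index keys at offset 13
		 * or beyond are deleted.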
4018 */ 4019 if (ret == 0) { 4020 struct btrfs_key tmp; 4021 4022 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 4023 path->slots[0]); 4024 if (tmp.type == BTRFS_DIR_INDEX_KEY) 4025 last_old_dentry_offset = tmp.offset; 4026 } 4027 goto done; 4028 } 4029 4030 /* go backward to find any previous key */ 4031 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 4032 if (ret == 0) { 4033 struct btrfs_key tmp; 4034 4035 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 4036 /* 4037 * The dir index key before the first one we found that needs to 4038 * be logged might be in a previous leaf, and there might be a 4039 * gap between these keys, meaning that we had deletions that 4040 * happened. So the key range item we log (key type 4041 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the 4042 * previous key's offset plus 1, so that those deletes are replayed. 4043 */ 4044 if (tmp.type == BTRFS_DIR_INDEX_KEY) 4045 last_old_dentry_offset = tmp.offset; 4046 } 4047 btrfs_release_path(path); 4048 4049 /* 4050 * Find the first key from this transaction again. See the note for 4051 * log_new_dir_dentries, if we're logging a directory recursively we 4052 * won't be holding its i_mutex, which means we can modify the directory 4053 * while we're logging it. If we remove an entry between our first 4054 * search and this search we'll not find the key again and can just 4055 * bail. 4056 */ 4057 search: 4058 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 4059 if (ret != 0) 4060 goto done; 4061 4062 /* 4063 * we have a block from this transaction, log every item in it 4064 * from our directory 4065 */ 4066 while (1) { 4067 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, 4068 &last_old_dentry_offset); 4069 if (ret != 0) { 4070 if (ret < 0) 4071 err = ret; 4072 goto done; 4073 } 4074 path->slots[0] = btrfs_header_nritems(path->nodes[0]); 4075 4076 /* 4077 * look ahead to the next item and see if it is also 4078 * from this directory and from this transaction 4079 */ 4080 ret = btrfs_next_leaf(root, path); 4081 if (ret) { 4082 if (ret == 1) 4083 last_offset = (u64)-1; 4084 else 4085 err = ret; 4086 goto done; 4087 } 4088 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); 4089 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { 4090 last_offset = (u64)-1; 4091 goto done; 4092 } 4093 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 4094 /* 4095 * The next leaf was not changed in the current transaction 4096 * and has at least one dir index key. 4097 * We check for the next key because there might have been 4098 * one or more deletions between the last key we logged and 4099 * that next key. So the key range item we log (key type 4100 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's 4101 * offset minus 1, so that those deletes are replayed. 
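			 *
			 * As a made-up example: if the last dir index key in
			 * the current leaf has offset 20 and is from a past
			 * transaction, the first key in the next (unchanged)
			 * leaf has offset 25, and indexes 21 to 24 were
			 * deleted in this transaction, then the logged range
			 * item covers [21, 24] and log replay deletes those
			 * stale indexes.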
			 */
			last_offset = min_key.offset - 1;
			goto done;
		}
		if (need_resched()) {
			btrfs_release_path(path);
			cond_resched();
			goto search;
		}
	}
done:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);

	if (err == 0) {
		*last_offset_ret = last_offset;
		/*
		 * If the leaf was changed in the current transaction but all
		 * its dir items are from a past transaction, the last item in
		 * the leaf is a dir item and there's no gap between that last
		 * dir item and the first one on the next leaf (which did not
		 * change in the current transaction), then we don't need to
		 * log a range: last_old_dentry_offset is equal to last_offset.
		 */
		ASSERT(last_old_dentry_offset <= last_offset);
		if (last_old_dentry_offset < last_offset) {
			ret = insert_dir_log_key(trans, log, path, ino,
						 last_old_dentry_offset + 1,
						 last_offset);
			if (ret)
				err = ret;
		}
	}
	return err;
}

/*
 * Logging directories is very similar to logging inodes. We find all the items
 * from the current transaction and write them to the log.
 *
 * The recovery code scans the directory in the subvolume, and if it finds a
 * key in the range logged that is not present in the log tree, then it means
 * that dir entry was unlinked during the transaction.
 *
 * In order for that scan to work, we must include one key smaller than the
 * smallest key logged by this transaction and one key larger than the largest
 * key logged by this transaction.
 */
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
					  struct btrfs_inode *inode,
					  struct btrfs_path *path,
					  struct btrfs_path *dst_path,
					  struct btrfs_log_ctx *ctx)
{
	u64 min_key;
	u64 max_key;
	int ret;

	min_key = BTRFS_DIR_START_INDEX;
	max_key = 0;
	ctx->last_dir_item_offset = inode->last_dir_index_offset;

	while (1) {
		ret = log_dir_items(trans, inode, path, dst_path,
				    ctx, min_key, &max_key);
		if (ret)
			return ret;
		if (max_key == (u64)-1)
			break;
		min_key = max_key + 1;
	}

	inode->last_dir_index_offset = ctx->last_dir_item_offset;

	return 0;
}

/*
 * a helper function to drop items from the log before we relog an
 * inode. max_key_type indicates the highest item type to remove.
 * This cannot be run for file data extents because it does not
 * free the extents they point to.
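 *
 * For example, passing BTRFS_XATTR_ITEM_KEY as max_key_type drops a
 * previously logged inode item, inode refs and xattrs, while any file
 * extent items already in the log (a higher key type) are left alone.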
 */
static int drop_inode_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *log,
			    struct btrfs_path *path,
			    struct btrfs_inode *inode,
			    int max_key_type)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int start_slot;

	key.objectid = btrfs_ino(inode);
	key.type = max_key_type;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
		BUG_ON(ret == 0); /* Logic error */
		if (ret < 0)
			break;

		if (path->slots[0] == 0)
			break;

		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);

		if (found_key.objectid != key.objectid)
			break;

		found_key.offset = 0;
		found_key.type = 0;
		ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
		if (ret < 0)
			break;

		ret = btrfs_del_items(trans, log, path, start_slot,
				      path->slots[0] - start_slot + 1);
		/*
		 * If start slot isn't 0 then we don't need to re-search, we've
		 * found the last guy with the objectid in this tree.
		 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	if (ret > 0)
		ret = 0;
	return ret;
}

static int truncate_inode_items(struct btrfs_trans_handle *trans,
				struct btrfs_root *log_root,
				struct btrfs_inode *inode,
				u64 new_size, u32 min_type)
{
	struct btrfs_truncate_control control = {
		.new_size = new_size,
		.ino = btrfs_ino(inode),
		.min_type = min_type,
		.skip_ref_updates = true,
	};

	return btrfs_truncate_inode_items(trans, log_root, &control);
}

static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only,
			    u64 logged_isize)
{
	struct btrfs_map_token token;
	u64 flags;

	btrfs_init_map_token(&token, leaf);

	if (log_inode_only) {
		/*
		 * Set the generation to zero so the recovery code can tell
		 * the difference between logging done just to say 'this inode
		 * exists' and logging done to say 'update this inode with
		 * these values'.
		 */
		btrfs_set_token_inode_generation(&token, item, 0);
		btrfs_set_token_inode_size(&token, item, logged_isize);
	} else {
		btrfs_set_token_inode_generation(&token, item,
						 BTRFS_I(inode)->generation);
		btrfs_set_token_inode_size(&token, item, inode->i_size);
	}

	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);

	btrfs_set_token_timespec_sec(&token, &item->atime,
				     inode->i_atime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->atime,
				      inode->i_atime.tv_nsec);

	btrfs_set_token_timespec_sec(&token, &item->mtime,
				     inode->i_mtime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->mtime,
				      inode->i_mtime.tv_nsec);

	btrfs_set_token_timespec_sec(&token, &item->ctime,
				     inode->i_ctime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->ctime,
				      inode->i_ctime.tv_nsec);

	/*
	 * We do not need to set the nbytes field, in fact during a fast fsync
	 * its value may not even be correct, since a fast fsync
does not wait 4301 * for ordered extent completion, which is where we update nbytes, it 4302 * only waits for writeback to complete. During log replay as we find 4303 * file extent items and replay them, we adjust the nbytes field of the 4304 * inode item in subvolume tree as needed (see overwrite_item()). 4305 */ 4306 4307 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4308 btrfs_set_token_inode_transid(&token, item, trans->transid); 4309 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4310 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4311 BTRFS_I(inode)->ro_flags); 4312 btrfs_set_token_inode_flags(&token, item, flags); 4313 btrfs_set_token_inode_block_group(&token, item, 0); 4314 } 4315 4316 static int log_inode_item(struct btrfs_trans_handle *trans, 4317 struct btrfs_root *log, struct btrfs_path *path, 4318 struct btrfs_inode *inode, bool inode_item_dropped) 4319 { 4320 struct btrfs_inode_item *inode_item; 4321 int ret; 4322 4323 /* 4324 * If we are doing a fast fsync and the inode was logged before in the 4325 * current transaction, then we know the inode was previously logged and 4326 * it exists in the log tree. For performance reasons, in this case use 4327 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4328 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4329 * contention in case there are concurrent fsyncs for other inodes of the 4330 * same subvolume. Using btrfs_insert_empty_item() when the inode item 4331 * already exists can also result in unnecessarily splitting a leaf. 4332 */ 4333 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4334 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4335 ASSERT(ret <= 0); 4336 if (ret > 0) 4337 ret = -ENOENT; 4338 } else { 4339 /* 4340 * This means it is the first fsync in the current transaction, 4341 * so the inode item is not in the log and we need to insert it. 4342 * We can never get -EEXIST because we are only called for a fast 4343 * fsync and in case an inode eviction happens after the inode was 4344 * logged before in the current transaction, when we load again 4345 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4346 * flags and set ->logged_trans to 0. 4347 */ 4348 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4349 sizeof(*inode_item)); 4350 ASSERT(ret != -EEXIST); 4351 } 4352 if (ret) 4353 return ret; 4354 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4355 struct btrfs_inode_item); 4356 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4357 0, 0); 4358 btrfs_release_path(path); 4359 return 0; 4360 } 4361 4362 static int log_csums(struct btrfs_trans_handle *trans, 4363 struct btrfs_inode *inode, 4364 struct btrfs_root *log_root, 4365 struct btrfs_ordered_sum *sums) 4366 { 4367 const u64 lock_end = sums->bytenr + sums->len - 1; 4368 struct extent_state *cached_state = NULL; 4369 int ret; 4370 4371 /* 4372 * If this inode was not used for reflink operations in the current 4373 * transaction with new extents, then do the fast path, no need to 4374 * worry about logging checksum items with overlapping ranges. 4375 */ 4376 if (inode->last_reflink_trans < trans->transid) 4377 return btrfs_csum_file_blocks(trans, log_root, sums); 4378 4379 /* 4380 * Serialize logging for checksums. 
This is to avoid racing with the 4381 * same checksum being logged by another task that is logging another 4382 * file which happens to refer to the same extent as well. Such races 4383 * can leave checksum items in the log with overlapping ranges. 4384 */ 4385 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 4386 lock_end, &cached_state); 4387 if (ret) 4388 return ret; 4389 /* 4390 * Due to extent cloning, we might have logged a csum item that covers a 4391 * subrange of a cloned extent, and later we can end up logging a csum 4392 * item for a larger subrange of the same extent or the entire range. 4393 * This would leave csum items in the log tree that cover the same range 4394 * and break the searches for checksums in the log tree, resulting in 4395 * some checksums missing in the fs/subvolume tree. So just delete (or 4396 * trim and adjust) any existing csum items in the log for this range. 4397 */ 4398 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 4399 if (!ret) 4400 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4401 4402 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 4403 &cached_state); 4404 4405 return ret; 4406 } 4407 4408 static noinline int copy_items(struct btrfs_trans_handle *trans, 4409 struct btrfs_inode *inode, 4410 struct btrfs_path *dst_path, 4411 struct btrfs_path *src_path, 4412 int start_slot, int nr, int inode_only, 4413 u64 logged_isize) 4414 { 4415 struct btrfs_root *log = inode->root->log_root; 4416 struct btrfs_file_extent_item *extent; 4417 struct extent_buffer *src = src_path->nodes[0]; 4418 int ret = 0; 4419 struct btrfs_key *ins_keys; 4420 u32 *ins_sizes; 4421 struct btrfs_item_batch batch; 4422 char *ins_data; 4423 int i; 4424 int dst_index; 4425 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); 4426 const u64 i_size = i_size_read(&inode->vfs_inode); 4427 4428 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4429 nr * sizeof(u32), GFP_NOFS); 4430 if (!ins_data) 4431 return -ENOMEM; 4432 4433 ins_sizes = (u32 *)ins_data; 4434 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4435 batch.keys = ins_keys; 4436 batch.data_sizes = ins_sizes; 4437 batch.total_data_size = 0; 4438 batch.nr = 0; 4439 4440 dst_index = 0; 4441 for (i = 0; i < nr; i++) { 4442 const int src_slot = start_slot + i; 4443 struct btrfs_root *csum_root; 4444 struct btrfs_ordered_sum *sums; 4445 struct btrfs_ordered_sum *sums_next; 4446 LIST_HEAD(ordered_sums); 4447 u64 disk_bytenr; 4448 u64 disk_num_bytes; 4449 u64 extent_offset; 4450 u64 extent_num_bytes; 4451 bool is_old_extent; 4452 4453 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); 4454 4455 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) 4456 goto add_to_batch; 4457 4458 extent = btrfs_item_ptr(src, src_slot, 4459 struct btrfs_file_extent_item); 4460 4461 is_old_extent = (btrfs_file_extent_generation(src, extent) < 4462 trans->transid); 4463 4464 /* 4465 * Don't copy extents from past generations. That would make us 4466 * log a lot more metadata for common cases like doing only a 4467 * few random writes into a file and then fsync it for the first 4468 * time or after the full sync flag is set on the inode. We can 4469 * get leaves full of extent items, most of which are from past 4470 * generations, so we can skip them - as long as the inode has 4471 * not been the target of a reflink operation in this transaction, 4472 * as in that case it might have had file extent items with old 4473 * generations copied into it. 
We also must always log prealloc 4474 * extents that start at or beyond eof, otherwise we would lose 4475 * them on log replay. 4476 */ 4477 if (is_old_extent && 4478 ins_keys[dst_index].offset < i_size && 4479 inode->last_reflink_trans < trans->transid) 4480 continue; 4481 4482 if (skip_csum) 4483 goto add_to_batch; 4484 4485 /* Only regular extents have checksums. */ 4486 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) 4487 goto add_to_batch; 4488 4489 /* 4490 * If it's an extent created in a past transaction, then its 4491 * checksums are already accessible from the committed csum tree, 4492 * no need to log them. 4493 */ 4494 if (is_old_extent) 4495 goto add_to_batch; 4496 4497 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); 4498 /* If it's an explicit hole, there are no checksums. */ 4499 if (disk_bytenr == 0) 4500 goto add_to_batch; 4501 4502 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); 4503 4504 if (btrfs_file_extent_compression(src, extent)) { 4505 extent_offset = 0; 4506 extent_num_bytes = disk_num_bytes; 4507 } else { 4508 extent_offset = btrfs_file_extent_offset(src, extent); 4509 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); 4510 } 4511 4512 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); 4513 disk_bytenr += extent_offset; 4514 ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, 4515 disk_bytenr + extent_num_bytes - 1, 4516 &ordered_sums, 0); 4517 if (ret) 4518 goto out; 4519 4520 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { 4521 if (!ret) 4522 ret = log_csums(trans, inode, log, sums); 4523 list_del(&sums->list); 4524 kfree(sums); 4525 } 4526 if (ret) 4527 goto out; 4528 4529 add_to_batch: 4530 ins_sizes[dst_index] = btrfs_item_size(src, src_slot); 4531 batch.total_data_size += ins_sizes[dst_index]; 4532 batch.nr++; 4533 dst_index++; 4534 } 4535 4536 /* 4537 * We have a leaf full of old extent items that don't need to be logged, 4538 * so we don't need to do anything. 4539 */ 4540 if (batch.nr == 0) 4541 goto out; 4542 4543 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4544 if (ret) 4545 goto out; 4546 4547 dst_index = 0; 4548 for (i = 0; i < nr; i++) { 4549 const int src_slot = start_slot + i; 4550 const int dst_slot = dst_path->slots[0] + dst_index; 4551 struct btrfs_key key; 4552 unsigned long src_offset; 4553 unsigned long dst_offset; 4554 4555 /* 4556 * We're done, all the remaining items in the source leaf 4557 * correspond to old file extent items. 4558 */ 4559 if (dst_index >= batch.nr) 4560 break; 4561 4562 btrfs_item_key_to_cpu(src, &key, src_slot); 4563 4564 if (key.type != BTRFS_EXTENT_DATA_KEY) 4565 goto copy_item; 4566 4567 extent = btrfs_item_ptr(src, src_slot, 4568 struct btrfs_file_extent_item); 4569 4570 /* See the comment in the previous loop, same logic. 
*/ 4571 if (btrfs_file_extent_generation(src, extent) < trans->transid && 4572 key.offset < i_size && 4573 inode->last_reflink_trans < trans->transid) 4574 continue; 4575 4576 copy_item: 4577 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); 4578 src_offset = btrfs_item_ptr_offset(src, src_slot); 4579 4580 if (key.type == BTRFS_INODE_ITEM_KEY) { 4581 struct btrfs_inode_item *inode_item; 4582 4583 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4584 struct btrfs_inode_item); 4585 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4586 &inode->vfs_inode, 4587 inode_only == LOG_INODE_EXISTS, 4588 logged_isize); 4589 } else { 4590 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4591 src_offset, ins_sizes[dst_index]); 4592 } 4593 4594 dst_index++; 4595 } 4596 4597 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4598 btrfs_release_path(dst_path); 4599 out: 4600 kfree(ins_data); 4601 4602 return ret; 4603 } 4604 4605 static int extent_cmp(void *priv, const struct list_head *a, 4606 const struct list_head *b) 4607 { 4608 const struct extent_map *em1, *em2; 4609 4610 em1 = list_entry(a, struct extent_map, list); 4611 em2 = list_entry(b, struct extent_map, list); 4612 4613 if (em1->start < em2->start) 4614 return -1; 4615 else if (em1->start > em2->start) 4616 return 1; 4617 return 0; 4618 } 4619 4620 static int log_extent_csums(struct btrfs_trans_handle *trans, 4621 struct btrfs_inode *inode, 4622 struct btrfs_root *log_root, 4623 const struct extent_map *em, 4624 struct btrfs_log_ctx *ctx) 4625 { 4626 struct btrfs_ordered_extent *ordered; 4627 struct btrfs_root *csum_root; 4628 u64 csum_offset; 4629 u64 csum_len; 4630 u64 mod_start = em->mod_start; 4631 u64 mod_len = em->mod_len; 4632 LIST_HEAD(ordered_sums); 4633 int ret = 0; 4634 4635 if (inode->flags & BTRFS_INODE_NODATASUM || 4636 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4637 em->block_start == EXTENT_MAP_HOLE) 4638 return 0; 4639 4640 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4641 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4642 const u64 mod_end = mod_start + mod_len; 4643 struct btrfs_ordered_sum *sums; 4644 4645 if (mod_len == 0) 4646 break; 4647 4648 if (ordered_end <= mod_start) 4649 continue; 4650 if (mod_end <= ordered->file_offset) 4651 break; 4652 4653 /* 4654 * We are going to copy all the csums on this ordered extent, so 4655 * go ahead and adjust mod_start and mod_len in case this ordered 4656 * extent has already been logged. 4657 */ 4658 if (ordered->file_offset > mod_start) { 4659 if (ordered_end >= mod_end) 4660 mod_len = ordered->file_offset - mod_start; 4661 /* 4662 * If we have this case 4663 * 4664 * |--------- logged extent ---------| 4665 * |----- ordered extent ----| 4666 * 4667 * Just don't mess with mod_start and mod_len, we'll 4668 * just end up logging more csums than we need and it 4669 * will be ok. 4670 */ 4671 } else { 4672 if (ordered_end < mod_end) { 4673 mod_len = mod_end - ordered_end; 4674 mod_start = ordered_end; 4675 } else { 4676 mod_len = 0; 4677 } 4678 } 4679 4680 /* 4681 * To keep us from looping for the above case of an ordered 4682 * extent that falls inside of the logged extent. 4683 */ 4684 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4685 continue; 4686 4687 list_for_each_entry(sums, &ordered->list, list) { 4688 ret = log_csums(trans, inode, log_root, sums); 4689 if (ret) 4690 return ret; 4691 } 4692 } 4693 4694 /* We're done, found all csums in the ordered extents. 
*/ 4695 if (mod_len == 0) 4696 return 0; 4697 4698 /* If we're compressed we have to save the entire range of csums. */ 4699 if (em->compress_type) { 4700 csum_offset = 0; 4701 csum_len = max(em->block_len, em->orig_block_len); 4702 } else { 4703 csum_offset = mod_start - em->start; 4704 csum_len = mod_len; 4705 } 4706 4707 /* block start is already adjusted for the file extent offset. */ 4708 csum_root = btrfs_csum_root(trans->fs_info, em->block_start); 4709 ret = btrfs_lookup_csums_range(csum_root, 4710 em->block_start + csum_offset, 4711 em->block_start + csum_offset + 4712 csum_len - 1, &ordered_sums, 0); 4713 if (ret) 4714 return ret; 4715 4716 while (!list_empty(&ordered_sums)) { 4717 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4718 struct btrfs_ordered_sum, 4719 list); 4720 if (!ret) 4721 ret = log_csums(trans, inode, log_root, sums); 4722 list_del(&sums->list); 4723 kfree(sums); 4724 } 4725 4726 return ret; 4727 } 4728 4729 static int log_one_extent(struct btrfs_trans_handle *trans, 4730 struct btrfs_inode *inode, 4731 const struct extent_map *em, 4732 struct btrfs_path *path, 4733 struct btrfs_log_ctx *ctx) 4734 { 4735 struct btrfs_drop_extents_args drop_args = { 0 }; 4736 struct btrfs_root *log = inode->root->log_root; 4737 struct btrfs_file_extent_item fi = { 0 }; 4738 struct extent_buffer *leaf; 4739 struct btrfs_key key; 4740 u64 extent_offset = em->start - em->orig_start; 4741 u64 block_len; 4742 int ret; 4743 4744 btrfs_set_stack_file_extent_generation(&fi, trans->transid); 4745 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4746 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); 4747 else 4748 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); 4749 4750 block_len = max(em->block_len, em->orig_block_len); 4751 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4752 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); 4753 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4754 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4755 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - 4756 extent_offset); 4757 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4758 } 4759 4760 btrfs_set_stack_file_extent_offset(&fi, extent_offset); 4761 btrfs_set_stack_file_extent_num_bytes(&fi, em->len); 4762 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); 4763 btrfs_set_stack_file_extent_compression(&fi, em->compress_type); 4764 4765 ret = log_extent_csums(trans, inode, log, em, ctx); 4766 if (ret) 4767 return ret; 4768 4769 /* 4770 * If this is the first time we are logging the inode in the current 4771 * transaction, we can avoid btrfs_drop_extents(), which is expensive 4772 * because it does a deletion search, which always acquires write locks 4773 * for extent buffers at levels 2, 1 and 0. This not only wastes time 4774 * but also adds significant contention in a log tree, since log trees 4775 * are small, with a root at level 2 or 3 at most, due to their short 4776 * life span. 
4777 */ 4778 if (ctx->logged_before) { 4779 drop_args.path = path; 4780 drop_args.start = em->start; 4781 drop_args.end = em->start + em->len; 4782 drop_args.replace_extent = true; 4783 drop_args.extent_item_size = sizeof(fi); 4784 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4785 if (ret) 4786 return ret; 4787 } 4788 4789 if (!drop_args.extent_inserted) { 4790 key.objectid = btrfs_ino(inode); 4791 key.type = BTRFS_EXTENT_DATA_KEY; 4792 key.offset = em->start; 4793 4794 ret = btrfs_insert_empty_item(trans, log, path, &key, 4795 sizeof(fi)); 4796 if (ret) 4797 return ret; 4798 } 4799 leaf = path->nodes[0]; 4800 write_extent_buffer(leaf, &fi, 4801 btrfs_item_ptr_offset(leaf, path->slots[0]), 4802 sizeof(fi)); 4803 btrfs_mark_buffer_dirty(leaf); 4804 4805 btrfs_release_path(path); 4806 4807 return ret; 4808 } 4809 4810 /* 4811 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4812 * lose them after doing a full/fast fsync and replaying the log. We scan the 4813 * subvolume's root instead of iterating the inode's extent map tree because 4814 * otherwise we can log incorrect extent items based on extent map conversion. 4815 * That can happen due to the fact that extent maps are merged when they 4816 * are not in the extent map tree's list of modified extents. 4817 */ 4818 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4819 struct btrfs_inode *inode, 4820 struct btrfs_path *path) 4821 { 4822 struct btrfs_root *root = inode->root; 4823 struct btrfs_key key; 4824 const u64 i_size = i_size_read(&inode->vfs_inode); 4825 const u64 ino = btrfs_ino(inode); 4826 struct btrfs_path *dst_path = NULL; 4827 bool dropped_extents = false; 4828 u64 truncate_offset = i_size; 4829 struct extent_buffer *leaf; 4830 int slot; 4831 int ins_nr = 0; 4832 int start_slot; 4833 int ret; 4834 4835 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4836 return 0; 4837 4838 key.objectid = ino; 4839 key.type = BTRFS_EXTENT_DATA_KEY; 4840 key.offset = i_size; 4841 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4842 if (ret < 0) 4843 goto out; 4844 4845 /* 4846 * We must check if there is a prealloc extent that starts before the 4847 * i_size and crosses the i_size boundary. This is to ensure later we 4848 * truncate down to the end of that extent and not to the i_size, as 4849 * otherwise we end up losing part of the prealloc extent after a log 4850 * replay and with an implicit hole if there is another prealloc extent 4851 * that starts at an offset beyond i_size. 
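	 *
	 * As a sketch with made-up numbers: if i_size is 4K, one prealloc
	 * extent covers [0, 64K) and another starts at 128K, then truncating
	 * the log at 4K would drop the [4K, 64K) tail of the first extent,
	 * so we truncate at 64K (that extent's end) instead.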
4852 */ 4853 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4854 if (ret < 0) 4855 goto out; 4856 4857 if (ret == 0) { 4858 struct btrfs_file_extent_item *ei; 4859 4860 leaf = path->nodes[0]; 4861 slot = path->slots[0]; 4862 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4863 4864 if (btrfs_file_extent_type(leaf, ei) == 4865 BTRFS_FILE_EXTENT_PREALLOC) { 4866 u64 extent_end; 4867 4868 btrfs_item_key_to_cpu(leaf, &key, slot); 4869 extent_end = key.offset + 4870 btrfs_file_extent_num_bytes(leaf, ei); 4871 4872 if (extent_end > i_size) 4873 truncate_offset = extent_end; 4874 } 4875 } else { 4876 ret = 0; 4877 } 4878 4879 while (true) { 4880 leaf = path->nodes[0]; 4881 slot = path->slots[0]; 4882 4883 if (slot >= btrfs_header_nritems(leaf)) { 4884 if (ins_nr > 0) { 4885 ret = copy_items(trans, inode, dst_path, path, 4886 start_slot, ins_nr, 1, 0); 4887 if (ret < 0) 4888 goto out; 4889 ins_nr = 0; 4890 } 4891 ret = btrfs_next_leaf(root, path); 4892 if (ret < 0) 4893 goto out; 4894 if (ret > 0) { 4895 ret = 0; 4896 break; 4897 } 4898 continue; 4899 } 4900 4901 btrfs_item_key_to_cpu(leaf, &key, slot); 4902 if (key.objectid > ino) 4903 break; 4904 if (WARN_ON_ONCE(key.objectid < ino) || 4905 key.type < BTRFS_EXTENT_DATA_KEY || 4906 key.offset < i_size) { 4907 path->slots[0]++; 4908 continue; 4909 } 4910 if (!dropped_extents) { 4911 /* 4912 * Avoid logging extent items logged in past fsync calls 4913 * and leading to duplicate keys in the log tree. 4914 */ 4915 ret = truncate_inode_items(trans, root->log_root, inode, 4916 truncate_offset, 4917 BTRFS_EXTENT_DATA_KEY); 4918 if (ret) 4919 goto out; 4920 dropped_extents = true; 4921 } 4922 if (ins_nr == 0) 4923 start_slot = slot; 4924 ins_nr++; 4925 path->slots[0]++; 4926 if (!dst_path) { 4927 dst_path = btrfs_alloc_path(); 4928 if (!dst_path) { 4929 ret = -ENOMEM; 4930 goto out; 4931 } 4932 } 4933 } 4934 if (ins_nr > 0) 4935 ret = copy_items(trans, inode, dst_path, path, 4936 start_slot, ins_nr, 1, 0); 4937 out: 4938 btrfs_release_path(path); 4939 btrfs_free_path(dst_path); 4940 return ret; 4941 } 4942 4943 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4944 struct btrfs_inode *inode, 4945 struct btrfs_path *path, 4946 struct btrfs_log_ctx *ctx) 4947 { 4948 struct btrfs_ordered_extent *ordered; 4949 struct btrfs_ordered_extent *tmp; 4950 struct extent_map *em, *n; 4951 struct list_head extents; 4952 struct extent_map_tree *tree = &inode->extent_tree; 4953 int ret = 0; 4954 int num = 0; 4955 4956 INIT_LIST_HEAD(&extents); 4957 4958 write_lock(&tree->lock); 4959 4960 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4961 list_del_init(&em->list); 4962 /* 4963 * Just an arbitrary number, this can be really CPU intensive 4964 * once we start getting a lot of extents, and really once we 4965 * have a bunch of extents we just want to commit since it will 4966 * be faster. 4967 */ 4968 if (++num > 32768) { 4969 list_del_init(&tree->modified_extents); 4970 ret = -EFBIG; 4971 goto process; 4972 } 4973 4974 if (em->generation < trans->transid) 4975 continue; 4976 4977 /* We log prealloc extents beyond eof later. 
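		 * For example (a hypothetical sequence): after
		 *   xfs_io -f -c "pwrite 0 4K" -c "falloc -k 64K 64K" -c "fsync" foo
		 * the prealloc extent at 64K starts beyond the 4K i_size, so
		 * it is skipped here and logged by
		 * btrfs_log_prealloc_extents() instead.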
		 */
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
		    em->start >= i_size_read(&inode->vfs_inode))
			continue;

		/* Need a ref to keep it from getting evicted from cache */
		refcount_inc(&em->refs);
		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
		list_add_tail(&em->list, &extents);
		num++;
	}

	list_sort(NULL, &extents, extent_cmp);
process:
	while (!list_empty(&extents)) {
		em = list_entry(extents.next, struct extent_map, list);

		list_del_init(&em->list);

		/*
		 * If we had an error we just need to delete everybody from our
		 * private list.
		 */
		if (ret) {
			clear_em_logging(tree, em);
			free_extent_map(em);
			continue;
		}

		write_unlock(&tree->lock);

		ret = log_one_extent(trans, inode, em, path, ctx);
		write_lock(&tree->lock);
		clear_em_logging(tree, em);
		free_extent_map(em);
	}
	WARN_ON(!list_empty(&extents));
	write_unlock(&tree->lock);

	if (!ret)
		ret = btrfs_log_prealloc_extents(trans, inode, path);
	if (ret)
		return ret;

	/*
	 * We have logged all extents successfully, now make sure the commit of
	 * the current transaction waits for the ordered extents to complete
	 * before it commits and wipes out the log trees, otherwise we would
	 * lose data if an ordered extent completes after the transaction
	 * commits and a power failure happens after the transaction commit.
	 */
	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
		list_del_init(&ordered->log_list);
		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);

		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
			spin_lock_irq(&inode->ordered_tree.lock);
			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
				atomic_inc(&trans->transaction->pending_ordered);
			}
			spin_unlock_irq(&inode->ordered_tree.lock);
		}
		btrfs_put_ordered_extent(ordered);
	}

	return 0;
}

static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
			     struct btrfs_path *path, u64 *size_ret)
{
	struct btrfs_key key;
	int ret;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		*size_ret = 0;
	} else {
		struct btrfs_inode_item *item;

		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				      struct btrfs_inode_item);
		*size_ret = btrfs_inode_size(path->nodes[0], item);
		/*
		 * If the in-memory inode's i_size is smaller than the inode
		 * size stored in the btree, return the inode's i_size, so
		 * that we get a correct inode size after replaying the log
		 * when before a power failure we had a shrinking truncate
		 * followed by addition of a new name (rename / new hard link).
		 * Otherwise return the inode size from the btree, to avoid
		 * data loss when replaying a log due to previously doing a
		 * write that expands the inode's size and logging a new name
		 * immediately after.
		 */
		if (*size_ret > inode->vfs_inode.i_size)
			*size_ret = inode->vfs_inode.i_size;
	}

	btrfs_release_path(path);
	return 0;
}

/*
 * At the moment we always log all xattrs.
This is to figure out at log replay
 * time which xattrs must have their deletion replayed. If an xattr is missing
 * in the log tree and exists in the fs/subvol tree, we delete it. This is
 * because if an xattr is deleted, the inode is fsynced, and a power failure
 * happens, causing the log to be replayed the next time the fs is mounted,
 * then we want the xattr to not exist anymore (the same behaviour as other
 * filesystems with a journal: ext3/4, xfs, f2fs, etc).
 */
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
				struct btrfs_inode *inode,
				struct btrfs_path *path,
				struct btrfs_path *dst_path)
{
	struct btrfs_root *root = inode->root;
	int ret;
	struct btrfs_key key;
	const u64 ino = btrfs_ino(inode);
	int ins_nr = 0;
	int start_slot = 0;
	bool found_xattrs = false;

	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
		return 0;

	key.objectid = ino;
	key.type = BTRFS_XATTR_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	while (true) {
		int slot = path->slots[0];
		struct extent_buffer *leaf = path->nodes[0];
		int nritems = btrfs_header_nritems(leaf);

		if (slot >= nritems) {
			if (ins_nr > 0) {
				ret = copy_items(trans, inode, dst_path, path,
						 start_slot, ins_nr, 1, 0);
				if (ret < 0)
					return ret;
				ins_nr = 0;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
			break;

		if (ins_nr == 0)
			start_slot = slot;
		ins_nr++;
		path->slots[0]++;
		found_xattrs = true;
		cond_resched();
	}
	if (ins_nr > 0) {
		ret = copy_items(trans, inode, dst_path, path,
				 start_slot, ins_nr, 1, 0);
		if (ret < 0)
			return ret;
	}

	if (!found_xattrs)
		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);

	return 0;
}

/*
 * When using the NO_HOLES feature if we punched a hole that causes the
 * deletion of entire leaves or all the extent items of the first leaf (the one
 * that contains the inode item and references) we may end up not processing
 * any extents, because there are no leaves with a generation matching the
 * current transaction that have extent items for our inode. So we need to find
 * if any holes exist and then log them. We also need to log holes after any
 * truncate operation that changes the inode's size.
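 *
 * For example (ranges made up): with NO_HOLES, if the file has extents
 * at [0, 256K) and [512K, 768K) and i_size is 768K, the hole at
 * [256K, 512K) is logged as a file extent item with a disk_bytenr of 0,
 * so that after log replay the range still reads back as zeroes.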
5172 */ 5173 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 5174 struct btrfs_inode *inode, 5175 struct btrfs_path *path) 5176 { 5177 struct btrfs_root *root = inode->root; 5178 struct btrfs_fs_info *fs_info = root->fs_info; 5179 struct btrfs_key key; 5180 const u64 ino = btrfs_ino(inode); 5181 const u64 i_size = i_size_read(&inode->vfs_inode); 5182 u64 prev_extent_end = 0; 5183 int ret; 5184 5185 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 5186 return 0; 5187 5188 key.objectid = ino; 5189 key.type = BTRFS_EXTENT_DATA_KEY; 5190 key.offset = 0; 5191 5192 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5193 if (ret < 0) 5194 return ret; 5195 5196 while (true) { 5197 struct extent_buffer *leaf = path->nodes[0]; 5198 5199 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 5200 ret = btrfs_next_leaf(root, path); 5201 if (ret < 0) 5202 return ret; 5203 if (ret > 0) { 5204 ret = 0; 5205 break; 5206 } 5207 leaf = path->nodes[0]; 5208 } 5209 5210 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5211 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 5212 break; 5213 5214 /* We have a hole, log it. */ 5215 if (prev_extent_end < key.offset) { 5216 const u64 hole_len = key.offset - prev_extent_end; 5217 5218 /* 5219 * Release the path to avoid deadlocks with other code 5220 * paths that search the root while holding locks on 5221 * leafs from the log root. 5222 */ 5223 btrfs_release_path(path); 5224 ret = btrfs_insert_file_extent(trans, root->log_root, 5225 ino, prev_extent_end, 0, 5226 0, hole_len, 0, hole_len, 5227 0, 0, 0); 5228 if (ret < 0) 5229 return ret; 5230 5231 /* 5232 * Search for the same key again in the root. Since it's 5233 * an extent item and we are holding the inode lock, the 5234 * key must still exist. If it doesn't just emit warning 5235 * and return an error to fall back to a transaction 5236 * commit. 5237 */ 5238 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5239 if (ret < 0) 5240 return ret; 5241 if (WARN_ON(ret > 0)) 5242 return -ENOENT; 5243 leaf = path->nodes[0]; 5244 } 5245 5246 prev_extent_end = btrfs_file_extent_end(path); 5247 path->slots[0]++; 5248 cond_resched(); 5249 } 5250 5251 if (prev_extent_end < i_size) { 5252 u64 hole_len; 5253 5254 btrfs_release_path(path); 5255 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 5256 ret = btrfs_insert_file_extent(trans, root->log_root, 5257 ino, prev_extent_end, 0, 0, 5258 hole_len, 0, hole_len, 5259 0, 0, 0); 5260 if (ret < 0) 5261 return ret; 5262 } 5263 5264 return 0; 5265 } 5266 5267 /* 5268 * When we are logging a new inode X, check if it doesn't have a reference that 5269 * matches the reference from some other inode Y created in a past transaction 5270 * and that was renamed in the current transaction. If we don't do this, then at 5271 * log replay time we can lose inode Y (and all its files if it's a directory): 5272 * 5273 * mkdir /mnt/x 5274 * echo "hello world" > /mnt/x/foobar 5275 * sync 5276 * mv /mnt/x /mnt/y 5277 * mkdir /mnt/x # or touch /mnt/x 5278 * xfs_io -c fsync /mnt/x 5279 * <power fail> 5280 * mount fs, trigger log replay 5281 * 5282 * After the log replay procedure, we would lose the first directory and all its 5283 * files (file foobar). 
5284 * For the case where inode Y is not a directory we simply end up losing it: 5285 * 5286 * echo "123" > /mnt/foo 5287 * sync 5288 * mv /mnt/foo /mnt/bar 5289 * echo "abc" > /mnt/foo 5290 * xfs_io -c fsync /mnt/foo 5291 * <power fail> 5292 * 5293 * We also need this for cases where a snapshot entry is replaced by some other 5294 * entry (file or directory) otherwise we end up with an unreplayable log due to 5295 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 5296 * if it were a regular entry: 5297 * 5298 * mkdir /mnt/x 5299 * btrfs subvolume snapshot /mnt /mnt/x/snap 5300 * btrfs subvolume delete /mnt/x/snap 5301 * rmdir /mnt/x 5302 * mkdir /mnt/x 5303 * fsync /mnt/x or fsync some new file inside it 5304 * <power fail> 5305 * 5306 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 5307 * the same transaction. 5308 */ 5309 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 5310 const int slot, 5311 const struct btrfs_key *key, 5312 struct btrfs_inode *inode, 5313 u64 *other_ino, u64 *other_parent) 5314 { 5315 int ret; 5316 struct btrfs_path *search_path; 5317 char *name = NULL; 5318 u32 name_len = 0; 5319 u32 item_size = btrfs_item_size(eb, slot); 5320 u32 cur_offset = 0; 5321 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 5322 5323 search_path = btrfs_alloc_path(); 5324 if (!search_path) 5325 return -ENOMEM; 5326 search_path->search_commit_root = 1; 5327 search_path->skip_locking = 1; 5328 5329 while (cur_offset < item_size) { 5330 u64 parent; 5331 u32 this_name_len; 5332 u32 this_len; 5333 unsigned long name_ptr; 5334 struct btrfs_dir_item *di; 5335 5336 if (key->type == BTRFS_INODE_REF_KEY) { 5337 struct btrfs_inode_ref *iref; 5338 5339 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 5340 parent = key->offset; 5341 this_name_len = btrfs_inode_ref_name_len(eb, iref); 5342 name_ptr = (unsigned long)(iref + 1); 5343 this_len = sizeof(*iref) + this_name_len; 5344 } else { 5345 struct btrfs_inode_extref *extref; 5346 5347 extref = (struct btrfs_inode_extref *)(ptr + 5348 cur_offset); 5349 parent = btrfs_inode_extref_parent(eb, extref); 5350 this_name_len = btrfs_inode_extref_name_len(eb, extref); 5351 name_ptr = (unsigned long)&extref->name; 5352 this_len = sizeof(*extref) + this_name_len; 5353 } 5354 5355 if (this_name_len > name_len) { 5356 char *new_name; 5357 5358 new_name = krealloc(name, this_name_len, GFP_NOFS); 5359 if (!new_name) { 5360 ret = -ENOMEM; 5361 goto out; 5362 } 5363 name_len = this_name_len; 5364 name = new_name; 5365 } 5366 5367 read_extent_buffer(eb, name, name_ptr, this_name_len); 5368 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5369 parent, name, this_name_len, 0); 5370 if (di && !IS_ERR(di)) { 5371 struct btrfs_key di_key; 5372 5373 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 5374 di, &di_key); 5375 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 5376 if (di_key.objectid != key->objectid) { 5377 ret = 1; 5378 *other_ino = di_key.objectid; 5379 *other_parent = parent; 5380 } else { 5381 ret = 0; 5382 } 5383 } else { 5384 ret = -EAGAIN; 5385 } 5386 goto out; 5387 } else if (IS_ERR(di)) { 5388 ret = PTR_ERR(di); 5389 goto out; 5390 } 5391 btrfs_release_path(search_path); 5392 5393 cur_offset += this_len; 5394 } 5395 ret = 0; 5396 out: 5397 btrfs_free_path(search_path); 5398 kfree(name); 5399 return ret; 5400 } 5401 5402 struct btrfs_ino_list { 5403 u64 ino; 5404 u64 parent; 5405 struct list_head list; 5406 }; 5407 5408 static int log_conflicting_inodes(struct 
btrfs_trans_handle *trans, 5409 struct btrfs_root *root, 5410 struct btrfs_path *path, 5411 struct btrfs_log_ctx *ctx, 5412 u64 ino, u64 parent) 5413 { 5414 struct btrfs_ino_list *ino_elem; 5415 LIST_HEAD(inode_list); 5416 int ret = 0; 5417 5418 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5419 if (!ino_elem) 5420 return -ENOMEM; 5421 ino_elem->ino = ino; 5422 ino_elem->parent = parent; 5423 list_add_tail(&ino_elem->list, &inode_list); 5424 5425 while (!list_empty(&inode_list)) { 5426 struct btrfs_fs_info *fs_info = root->fs_info; 5427 struct btrfs_key key; 5428 struct inode *inode; 5429 5430 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 5431 list); 5432 ino = ino_elem->ino; 5433 parent = ino_elem->parent; 5434 list_del(&ino_elem->list); 5435 kfree(ino_elem); 5436 if (ret) 5437 continue; 5438 5439 btrfs_release_path(path); 5440 5441 inode = btrfs_iget(fs_info->sb, ino, root); 5442 /* 5443 * If the other inode that had a conflicting dir entry was 5444 * deleted in the current transaction, we need to log its parent 5445 * directory. 5446 */ 5447 if (IS_ERR(inode)) { 5448 ret = PTR_ERR(inode); 5449 if (ret == -ENOENT) { 5450 inode = btrfs_iget(fs_info->sb, parent, root); 5451 if (IS_ERR(inode)) { 5452 ret = PTR_ERR(inode); 5453 } else { 5454 ret = btrfs_log_inode(trans, 5455 BTRFS_I(inode), 5456 LOG_OTHER_INODE_ALL, 5457 ctx); 5458 btrfs_add_delayed_iput(inode); 5459 } 5460 } 5461 continue; 5462 } 5463 /* 5464 * If the inode was already logged skip it - otherwise we can 5465 * hit an infinite loop. Example: 5466 * 5467 * From the commit root (previous transaction) we have the 5468 * following inodes: 5469 * 5470 * inode 257 a directory 5471 * inode 258 with references "zz" and "zz_link" on inode 257 5472 * inode 259 with reference "a" on inode 257 5473 * 5474 * And in the current (uncommitted) transaction we have: 5475 * 5476 * inode 257 a directory, unchanged 5477 * inode 258 with references "a" and "a2" on inode 257 5478 * inode 259 with reference "zz_link" on inode 257 5479 * inode 261 with reference "zz" on inode 257 5480 * 5481 * When logging inode 261 the following infinite loop could 5482 * happen if we don't skip already logged inodes: 5483 * 5484 * - we detect inode 258 as a conflicting inode, with inode 261 5485 * on reference "zz", and log it; 5486 * 5487 * - we detect inode 259 as a conflicting inode, with inode 258 5488 * on reference "a", and log it; 5489 * 5490 * - we detect inode 258 as a conflicting inode, with inode 259 5491 * on reference "zz_link", and log it - again! After this we 5492 * repeat the above steps forever. 5493 */ 5494 spin_lock(&BTRFS_I(inode)->lock); 5495 /* 5496 * Check the inode's logged_trans only instead of 5497 * btrfs_inode_in_log(). This is because the last_log_commit of 5498 * the inode is not updated when we only log that it exists (see 5499 * btrfs_log_inode()). 5500 */ 5501 if (BTRFS_I(inode)->logged_trans == trans->transid) { 5502 spin_unlock(&BTRFS_I(inode)->lock); 5503 btrfs_add_delayed_iput(inode); 5504 continue; 5505 } 5506 spin_unlock(&BTRFS_I(inode)->lock); 5507 /* 5508 * We are safe logging the other inode without acquiring its 5509 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5510 * are safe against concurrent renames of the other inode as 5511 * well because during a rename we pin the log and update the 5512 * log with the new name before we unpin it. 
5513 */ 5514 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); 5515 if (ret) { 5516 btrfs_add_delayed_iput(inode); 5517 continue; 5518 } 5519 5520 key.objectid = ino; 5521 key.type = BTRFS_INODE_REF_KEY; 5522 key.offset = 0; 5523 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5524 if (ret < 0) { 5525 btrfs_add_delayed_iput(inode); 5526 continue; 5527 } 5528 5529 while (true) { 5530 struct extent_buffer *leaf = path->nodes[0]; 5531 int slot = path->slots[0]; 5532 u64 other_ino = 0; 5533 u64 other_parent = 0; 5534 5535 if (slot >= btrfs_header_nritems(leaf)) { 5536 ret = btrfs_next_leaf(root, path); 5537 if (ret < 0) { 5538 break; 5539 } else if (ret > 0) { 5540 ret = 0; 5541 break; 5542 } 5543 continue; 5544 } 5545 5546 btrfs_item_key_to_cpu(leaf, &key, slot); 5547 if (key.objectid != ino || 5548 (key.type != BTRFS_INODE_REF_KEY && 5549 key.type != BTRFS_INODE_EXTREF_KEY)) { 5550 ret = 0; 5551 break; 5552 } 5553 5554 ret = btrfs_check_ref_name_override(leaf, slot, &key, 5555 BTRFS_I(inode), &other_ino, 5556 &other_parent); 5557 if (ret < 0) 5558 break; 5559 if (ret > 0) { 5560 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5561 if (!ino_elem) { 5562 ret = -ENOMEM; 5563 break; 5564 } 5565 ino_elem->ino = other_ino; 5566 ino_elem->parent = other_parent; 5567 list_add_tail(&ino_elem->list, &inode_list); 5568 ret = 0; 5569 } 5570 path->slots[0]++; 5571 } 5572 btrfs_add_delayed_iput(inode); 5573 } 5574 5575 return ret; 5576 } 5577 5578 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5579 struct btrfs_inode *inode, 5580 struct btrfs_key *min_key, 5581 const struct btrfs_key *max_key, 5582 struct btrfs_path *path, 5583 struct btrfs_path *dst_path, 5584 const u64 logged_isize, 5585 const bool recursive_logging, 5586 const int inode_only, 5587 struct btrfs_log_ctx *ctx, 5588 bool *need_log_inode_item) 5589 { 5590 const u64 i_size = i_size_read(&inode->vfs_inode); 5591 struct btrfs_root *root = inode->root; 5592 int ins_start_slot = 0; 5593 int ins_nr = 0; 5594 int ret; 5595 5596 while (1) { 5597 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5598 if (ret < 0) 5599 return ret; 5600 if (ret > 0) { 5601 ret = 0; 5602 break; 5603 } 5604 again: 5605 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5606 if (min_key->objectid != max_key->objectid) 5607 break; 5608 if (min_key->type > max_key->type) 5609 break; 5610 5611 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5612 *need_log_inode_item = false; 5613 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5614 min_key->offset >= i_size) { 5615 /* 5616 * Extents at and beyond eof are logged with 5617 * btrfs_log_prealloc_extents(). 5618 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5619 * and no keys greater than that, so bail out. 
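			 *
			 * (As an illustration: item types sort as INODE_ITEM
			 * (1) < INODE_REF (12) < XATTR_ITEM (24) <
			 * EXTENT_DATA (108), and leaf keys sort by objectid,
			 * then type, then offset, so every remaining key for
			 * this inode can only be another extent item at or
			 * beyond eof.)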
5620 */ 5621 break; 5622 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5623 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5624 inode->generation == trans->transid && 5625 !recursive_logging) { 5626 u64 other_ino = 0; 5627 u64 other_parent = 0; 5628 5629 ret = btrfs_check_ref_name_override(path->nodes[0], 5630 path->slots[0], min_key, inode, 5631 &other_ino, &other_parent); 5632 if (ret < 0) { 5633 return ret; 5634 } else if (ret > 0 && 5635 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5636 if (ins_nr > 0) { 5637 ins_nr++; 5638 } else { 5639 ins_nr = 1; 5640 ins_start_slot = path->slots[0]; 5641 } 5642 ret = copy_items(trans, inode, dst_path, path, 5643 ins_start_slot, ins_nr, 5644 inode_only, logged_isize); 5645 if (ret < 0) 5646 return ret; 5647 ins_nr = 0; 5648 5649 ret = log_conflicting_inodes(trans, root, path, 5650 ctx, other_ino, other_parent); 5651 if (ret) 5652 return ret; 5653 btrfs_release_path(path); 5654 goto next_key; 5655 } 5656 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5657 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5658 if (ins_nr == 0) 5659 goto next_slot; 5660 ret = copy_items(trans, inode, dst_path, path, 5661 ins_start_slot, 5662 ins_nr, inode_only, logged_isize); 5663 if (ret < 0) 5664 return ret; 5665 ins_nr = 0; 5666 goto next_slot; 5667 } 5668 5669 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5670 ins_nr++; 5671 goto next_slot; 5672 } else if (!ins_nr) { 5673 ins_start_slot = path->slots[0]; 5674 ins_nr = 1; 5675 goto next_slot; 5676 } 5677 5678 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5679 ins_nr, inode_only, logged_isize); 5680 if (ret < 0) 5681 return ret; 5682 ins_nr = 1; 5683 ins_start_slot = path->slots[0]; 5684 next_slot: 5685 path->slots[0]++; 5686 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5687 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5688 path->slots[0]); 5689 goto again; 5690 } 5691 if (ins_nr) { 5692 ret = copy_items(trans, inode, dst_path, path, 5693 ins_start_slot, ins_nr, inode_only, 5694 logged_isize); 5695 if (ret < 0) 5696 return ret; 5697 ins_nr = 0; 5698 } 5699 btrfs_release_path(path); 5700 next_key: 5701 if (min_key->offset < (u64)-1) { 5702 min_key->offset++; 5703 } else if (min_key->type < max_key->type) { 5704 min_key->type++; 5705 min_key->offset = 0; 5706 } else { 5707 break; 5708 } 5709 5710 /* 5711 * We may process many leaves full of items for our inode, so 5712 * avoid monopolizing a cpu for too long by rescheduling while 5713 * not holding locks on any tree. 5714 */ 5715 cond_resched(); 5716 } 5717 if (ins_nr) { 5718 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5719 ins_nr, inode_only, logged_isize); 5720 if (ret) 5721 return ret; 5722 } 5723 5724 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 5725 /* 5726 * Release the path because otherwise we might attempt to double 5727 * lock the same leaf with btrfs_log_prealloc_extents() below. 5728 */ 5729 btrfs_release_path(path); 5730 ret = btrfs_log_prealloc_extents(trans, inode, dst_path); 5731 } 5732 5733 return ret; 5734 } 5735 5736 /* log a single inode in the tree log. 5737 * At least one parent directory for this inode must exist in the tree 5738 * or be logged already. 5739 * 5740 * Any items from this inode changed by the current transaction are copied 5741 * to the log tree. 
An extra reference is taken on any extents in this 5742 * file, allowing us to avoid a whole pile of corner cases around logging 5743 * blocks that have been removed from the tree. 5744 * 5745 * See LOG_INODE_ALL and related defines for a description of what inode_only 5746 * does. 5747 * 5748 * This handles both files and directories. 5749 */ 5750 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5751 struct btrfs_inode *inode, 5752 int inode_only, 5753 struct btrfs_log_ctx *ctx) 5754 { 5755 struct btrfs_path *path; 5756 struct btrfs_path *dst_path; 5757 struct btrfs_key min_key; 5758 struct btrfs_key max_key; 5759 struct btrfs_root *log = inode->root->log_root; 5760 int ret; 5761 bool fast_search = false; 5762 u64 ino = btrfs_ino(inode); 5763 struct extent_map_tree *em_tree = &inode->extent_tree; 5764 u64 logged_isize = 0; 5765 bool need_log_inode_item = true; 5766 bool xattrs_logged = false; 5767 bool recursive_logging = false; 5768 bool inode_item_dropped = true; 5769 const bool orig_logged_before = ctx->logged_before; 5770 5771 path = btrfs_alloc_path(); 5772 if (!path) 5773 return -ENOMEM; 5774 dst_path = btrfs_alloc_path(); 5775 if (!dst_path) { 5776 btrfs_free_path(path); 5777 return -ENOMEM; 5778 } 5779 5780 min_key.objectid = ino; 5781 min_key.type = BTRFS_INODE_ITEM_KEY; 5782 min_key.offset = 0; 5783 5784 max_key.objectid = ino; 5785 5786 5787 /* today the code can only do partial logging of directories */ 5788 if (S_ISDIR(inode->vfs_inode.i_mode) || 5789 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5790 &inode->runtime_flags) && 5791 inode_only >= LOG_INODE_EXISTS)) 5792 max_key.type = BTRFS_XATTR_ITEM_KEY; 5793 else 5794 max_key.type = (u8)-1; 5795 max_key.offset = (u64)-1; 5796 5797 /* 5798 * Only run delayed items if we are a directory. We want to make sure 5799 * all directory indexes hit the fs/subvolume tree so we can find them 5800 * and figure out which index ranges have to be logged. 5801 */ 5802 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5803 ret = btrfs_commit_inode_delayed_items(trans, inode); 5804 if (ret) 5805 goto out; 5806 } 5807 5808 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { 5809 recursive_logging = true; 5810 if (inode_only == LOG_OTHER_INODE) 5811 inode_only = LOG_INODE_EXISTS; 5812 else 5813 inode_only = LOG_INODE_ALL; 5814 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 5815 } else { 5816 mutex_lock(&inode->log_mutex); 5817 } 5818 5819 /* 5820 * For symlinks, we must always log their content, which is stored in an 5821 * inline extent, otherwise we could end up with an empty symlink after 5822 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if 5823 * one attempts to create an empty symlink). 5824 * We don't need to worry about flushing delalloc, because when we create 5825 * the inline extent when the symlink is created (we never have delalloc 5826 * for symlinks). 5827 */ 5828 if (S_ISLNK(inode->vfs_inode.i_mode)) 5829 inode_only = LOG_INODE_ALL; 5830 5831 /* 5832 * Before logging the inode item, cache the value returned by 5833 * inode_logged(), because after that we have the need to figure out if 5834 * the inode was previously logged in this transaction. 5835 */ 5836 ret = inode_logged(trans, inode, path); 5837 if (ret < 0) 5838 goto out_unlock; 5839 ctx->logged_before = (ret == 1); 5840 ret = 0; 5841 5842 /* 5843 * This is for cases where logging a directory could result in losing a 5844 * a file after replaying the log. 
For example, if we move a file from a 5845 * directory A to a directory B, then fsync directory A, we have no way 5846 * to know the file was moved from A to B, so logging just A would 5847 * result in losing the file after a log replay. 5848 */ 5849 if (S_ISDIR(inode->vfs_inode.i_mode) && 5850 inode_only == LOG_INODE_ALL && 5851 inode->last_unlink_trans >= trans->transid) { 5852 btrfs_set_log_full_commit(trans); 5853 ret = BTRFS_LOG_FORCE_COMMIT; 5854 goto out_unlock; 5855 } 5856 5857 /* 5858 * a brute force approach to making sure we get the most up-to-date 5859 * copies of everything. 5860 */ 5861 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5862 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 5863 5864 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); 5865 if (inode_only == LOG_INODE_EXISTS) 5866 max_key_type = BTRFS_XATTR_ITEM_KEY; 5867 if (ctx->logged_before) 5868 ret = drop_inode_items(trans, log, path, inode, 5869 max_key_type); 5870 } else { 5871 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { 5872 /* 5873 * Make sure the new inode item we write to the log has 5874 * the same isize as the current one (if it exists). 5875 * This is necessary to prevent data loss after log 5876 * replay, and also to prevent doing a wrong expanding 5877 * truncate - e.g. create a file, write 4K into offset 5878 * 0, fsync, write 4K into offset 4096, add hard link, 5879 * fsync some other file (to sync log), power fail - if 5880 * we use the inode's current i_size, after log replay 5881 * we get an 8Kb file, with the last 4Kb extent as a hole 5882 * (zeroes), as if an expanding truncate happened, 5883 * instead of getting a file of 4Kb only. 5884 */ 5885 ret = logged_inode_size(log, inode, path, &logged_isize); 5886 if (ret) 5887 goto out_unlock; 5888 } 5889 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5890 &inode->runtime_flags)) { 5891 if (inode_only == LOG_INODE_EXISTS) { 5892 max_key.type = BTRFS_XATTR_ITEM_KEY; 5893 if (ctx->logged_before) 5894 ret = drop_inode_items(trans, log, path, 5895 inode, max_key.type); 5896 } else { 5897 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5898 &inode->runtime_flags); 5899 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5900 &inode->runtime_flags); 5901 if (ctx->logged_before) 5902 ret = truncate_inode_items(trans, log, 5903 inode, 0, 0); 5904 } 5905 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5906 &inode->runtime_flags) || 5907 inode_only == LOG_INODE_EXISTS) { 5908 if (inode_only == LOG_INODE_ALL) 5909 fast_search = true; 5910 max_key.type = BTRFS_XATTR_ITEM_KEY; 5911 if (ctx->logged_before) 5912 ret = drop_inode_items(trans, log, path, inode, 5913 max_key.type); 5914 } else { 5915 if (inode_only == LOG_INODE_ALL) 5916 fast_search = true; 5917 inode_item_dropped = false; 5918 goto log_extents; 5919 } 5920 5921 } 5922 if (ret) 5923 goto out_unlock; 5924 5925 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5926 path, dst_path, logged_isize, 5927 recursive_logging, inode_only, ctx, 5928 &need_log_inode_item); 5929 if (ret) 5930 goto out_unlock; 5931 5932 btrfs_release_path(path); 5933 btrfs_release_path(dst_path); 5934 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5935 if (ret) 5936 goto out_unlock; 5937 xattrs_logged = true; 5938 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5939 btrfs_release_path(path); 5940 btrfs_release_path(dst_path); 5941 ret = btrfs_log_holes(trans, inode, path); 5942 if (ret) 5943 goto out_unlock; 5944 } 5945 log_extents: 5946 btrfs_release_path(path); 5947
btrfs_release_path(dst_path); 5948 if (need_log_inode_item) { 5949 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 5950 if (ret) 5951 goto out_unlock; 5952 /* 5953 * If we are doing a fast fsync and the inode was logged before 5954 * in this transaction, we don't need to log the xattrs because 5955 * they were logged before. If xattrs were added, changed or 5956 * deleted since the last time we logged the inode, then we have 5957 * already logged them because the inode had the runtime flag 5958 * BTRFS_INODE_COPY_EVERYTHING set. 5959 */ 5960 if (!xattrs_logged && inode->logged_trans < trans->transid) { 5961 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5962 if (ret) 5963 goto out_unlock; 5964 btrfs_release_path(path); 5965 } 5966 } 5967 if (fast_search) { 5968 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 5969 if (ret) 5970 goto out_unlock; 5971 } else if (inode_only == LOG_INODE_ALL) { 5972 struct extent_map *em, *n; 5973 5974 write_lock(&em_tree->lock); 5975 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 5976 list_del_init(&em->list); 5977 write_unlock(&em_tree->lock); 5978 } 5979 5980 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5981 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 5982 if (ret) 5983 goto out_unlock; 5984 } 5985 5986 spin_lock(&inode->lock); 5987 inode->logged_trans = trans->transid; 5988 /* 5989 * Don't update last_log_commit if we logged that an inode exists. 5990 * We do this for three reasons: 5991 * 5992 * 1) We might have had buffered writes to this inode that were 5993 * flushed and had their ordered extents completed in this 5994 * transaction, but we did not previously log the inode with 5995 * LOG_INODE_ALL. Later the inode was evicted and after that 5996 * it was loaded again and this LOG_INODE_EXISTS log operation 5997 * happened. We must make sure that if an explicit fsync against 5998 * the inode is performed later, it logs the new extents, an 5999 * updated inode item, etc, and syncs the log. The same logic 6000 * applies to direct IO writes instead of buffered writes. 6001 * 6002 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 6003 * is logged with an i_size of 0 or whatever value was logged 6004 * before. If later the i_size of the inode is increased by a 6005 * truncate operation, the log is synced through an fsync of 6006 * some other inode and then finally an explicit fsync against 6007 * this inode is made, we must make sure this fsync logs the 6008 * inode with the new i_size, the hole between old i_size and 6009 * the new i_size, and syncs the log. 6010 * 6011 * 3) If we are logging that an ancestor inode exists as part of 6012 * logging a new name from a link or rename operation, don't update 6013 * its last_log_commit - otherwise if an explicit fsync is made 6014 * against an ancestor, the fsync considers the inode in the log 6015 * and doesn't sync the log, resulting in the ancestor missing after 6016 * a power failure unless the log was synced as part of an fsync 6017 * against any other unrelated inode. 6018 */ 6019 if (inode_only != LOG_INODE_EXISTS) 6020 inode->last_log_commit = inode->last_sub_trans; 6021 spin_unlock(&inode->lock); 6022 6023 /* 6024 * Reset the last_reflink_trans so that the next fsync does not need to 6025 * go through the slower path when logging extents and their checksums. 
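*
* As a rough sketch of the scenario this matters for (hypothetical
* commands, assuming no transaction commit in between):
*
*   cp --reflink=always foo bar
*   xfs_io -c fsync foo
*
* While last_reflink_trans is from the current transaction, logging
* foo's extents must also consider checksums of extents that may now
* be shared with bar. Once a LOG_INODE_ALL pass has logged everything,
* resetting the field lets the next fsync take the fast path again.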
6026 */ 6027 if (inode_only == LOG_INODE_ALL) 6028 inode->last_reflink_trans = 0; 6029 6030 out_unlock: 6031 mutex_unlock(&inode->log_mutex); 6032 out: 6033 btrfs_free_path(path); 6034 btrfs_free_path(dst_path); 6035 6036 if (recursive_logging) 6037 ctx->logged_before = orig_logged_before; 6038 6039 return ret; 6040 } 6041 6042 /* 6043 * Check if we need to log an inode. This is used in contexts where while 6044 * logging an inode we need to log another inode (either that it exists or in 6045 * full mode). This is used instead of btrfs_inode_in_log() because the latter 6046 * requires the inode to be in the log and have the log transaction committed, 6047 * while here we do not care if the log transaction was already committed - our 6048 * caller will commit the log later - and we want to avoid logging an inode 6049 * multiple times when multiple tasks have joined the same log transaction. 6050 */ 6051 static bool need_log_inode(struct btrfs_trans_handle *trans, 6052 struct btrfs_inode *inode) 6053 { 6054 /* 6055 * If a directory was not modified (no dentries added or removed), we 6056 * can and should avoid logging it. 6057 */ 6058 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) 6059 return false; 6060 6061 /* 6062 * If this inode does not have new/updated/deleted xattrs since the last 6063 * time it was logged and is flagged as logged in the current transaction, 6064 * we can skip logging it. As for new/deleted names, those are updated in 6065 * the log by link/unlink/rename operations. 6066 * In case the inode was logged and then evicted and reloaded, its 6067 * logged_trans will be 0, in which case we have to fully log it since 6068 * logged_trans is a transient field, not persisted. 6069 */ 6070 if (inode->logged_trans == trans->transid && 6071 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) 6072 return false; 6073 6074 return true; 6075 } 6076 6077 struct btrfs_dir_list { 6078 u64 ino; 6079 struct list_head list; 6080 }; 6081 6082 /* 6083 * Log the inodes of the new dentries of a directory. See log_dir_items() for 6084 * details about why it is needed. 6085 * This is a recursive operation - if an existing dentry corresponds to a 6086 * directory, that directory's new entries are logged too (same behaviour as 6087 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 6088 * the dentries point to we do not lock their i_mutex, otherwise lockdep 6089 * complains about the following circular lock dependency / possible deadlock: 6090 * 6091 * CPU0 CPU1 6092 * ---- ---- 6093 * lock(&type->i_mutex_dir_key#3/2); 6094 * lock(sb_internal#2); 6095 * lock(&type->i_mutex_dir_key#3/2); 6096 * lock(&sb->s_type->i_mutex_key#14); 6097 * 6098 * Where sb_internal is the lock (a counter that works as a lock) acquired by 6099 * sb_start_intwrite() in btrfs_start_transaction(). 6100 * Not locking i_mutex of the inodes is still safe because: 6101 * 6102 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 6103 * that while logging the inode new references (names) are added or removed 6104 * from the inode, leaving the logged inode item with a link count that does 6105 * not match the number of logged inode reference items. This is fine because 6106 * at log replay time we compute the real number of links and correct the 6107 * link count in the inode item (see replay_one_buffer() and 6108 * link_to_fixup_dir()); 6109 * 6110 * 2) For directories we log with a mode of LOG_INODE_ALL.
It's possible that 6111 * while logging the inode's items new index items (key type 6112 * BTRFS_DIR_INDEX_KEY) are added to the fs/subvol tree and the logged inode 6113 * item has a size that doesn't match the sum of the lengths of all the logged 6114 * names - this is ok because at log replay time we set the 6115 * directory's i_size to the correct value (see replay_one_name() and 6116 * do_overwrite_item()). 6117 */ 6118 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 6119 struct btrfs_root *root, 6120 struct btrfs_inode *start_inode, 6121 struct btrfs_log_ctx *ctx) 6122 { 6123 struct btrfs_fs_info *fs_info = root->fs_info; 6124 struct btrfs_path *path; 6125 LIST_HEAD(dir_list); 6126 struct btrfs_dir_list *dir_elem; 6127 int ret = 0; 6128 6129 /* 6130 * If we are logging a new name, as part of a link or rename operation, 6131 * don't bother logging new dentries, as we just want to log the names 6132 * of an inode and to make sure that any new parents exist. 6133 */ 6134 if (ctx->logging_new_name) 6135 return 0; 6136 6137 path = btrfs_alloc_path(); 6138 if (!path) 6139 return -ENOMEM; 6140 6141 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 6142 if (!dir_elem) { 6143 btrfs_free_path(path); 6144 return -ENOMEM; 6145 } 6146 dir_elem->ino = btrfs_ino(start_inode); 6147 list_add_tail(&dir_elem->list, &dir_list); 6148 6149 while (!list_empty(&dir_list)) { 6150 struct extent_buffer *leaf; 6151 struct btrfs_key min_key; 6152 int nritems; 6153 int i; 6154 6155 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 6156 list); 6157 if (ret) 6158 goto next_dir_inode; 6159 6160 min_key.objectid = dir_elem->ino; 6161 min_key.type = BTRFS_DIR_INDEX_KEY; 6162 min_key.offset = 0; 6163 again: 6164 btrfs_release_path(path); 6165 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 6166 if (ret < 0) { 6167 goto next_dir_inode; 6168 } else if (ret > 0) { 6169 ret = 0; 6170 goto next_dir_inode; 6171 } 6172 6173 leaf = path->nodes[0]; 6174 nritems = btrfs_header_nritems(leaf); 6175 for (i = path->slots[0]; i < nritems; i++) { 6176 struct btrfs_dir_item *di; 6177 struct btrfs_key di_key; 6178 struct inode *di_inode; 6179 struct btrfs_dir_list *new_dir_elem; 6180 int log_mode = LOG_INODE_EXISTS; 6181 int type; 6182 6183 btrfs_item_key_to_cpu(leaf, &min_key, i); 6184 if (min_key.objectid != dir_elem->ino || 6185 min_key.type != BTRFS_DIR_INDEX_KEY) 6186 goto next_dir_inode; 6187 6188 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 6189 type = btrfs_dir_type(leaf, di); 6190 if (btrfs_dir_transid(leaf, di) < trans->transid) 6191 continue; 6192 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 6193 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 6194 continue; 6195 6196 btrfs_release_path(path); 6197 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 6198 if (IS_ERR(di_inode)) { 6199 ret = PTR_ERR(di_inode); 6200 goto next_dir_inode; 6201 } 6202 6203 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 6204 btrfs_add_delayed_iput(di_inode); 6205 break; 6206 } 6207 6208 ctx->log_new_dentries = false; 6209 if (type == BTRFS_FT_DIR) 6210 log_mode = LOG_INODE_ALL; 6211 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), 6212 log_mode, ctx); 6213 btrfs_add_delayed_iput(di_inode); 6214 if (ret) 6215 goto next_dir_inode; 6216 if (ctx->log_new_dentries) { 6217 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 6218 GFP_NOFS); 6219 if (!new_dir_elem) { 6220 ret = -ENOMEM; 6221 goto next_dir_inode; 6222 } 6223 new_dir_elem->ino = di_key.objectid; 6224 list_add_tail(&new_dir_elem->list, &dir_list);
6225 } 6226 break; 6227 } 6228 if (min_key.offset < (u64)-1) { 6229 min_key.offset++; 6230 goto again; 6231 } 6232 next_dir_inode: 6233 list_del(&dir_elem->list); 6234 kfree(dir_elem); 6235 } 6236 6237 btrfs_free_path(path); 6238 return ret; 6239 } 6240 6241 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 6242 struct btrfs_inode *inode, 6243 struct btrfs_log_ctx *ctx) 6244 { 6245 struct btrfs_fs_info *fs_info = trans->fs_info; 6246 int ret; 6247 struct btrfs_path *path; 6248 struct btrfs_key key; 6249 struct btrfs_root *root = inode->root; 6250 const u64 ino = btrfs_ino(inode); 6251 6252 path = btrfs_alloc_path(); 6253 if (!path) 6254 return -ENOMEM; 6255 path->skip_locking = 1; 6256 path->search_commit_root = 1; 6257 6258 key.objectid = ino; 6259 key.type = BTRFS_INODE_REF_KEY; 6260 key.offset = 0; 6261 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6262 if (ret < 0) 6263 goto out; 6264 6265 while (true) { 6266 struct extent_buffer *leaf = path->nodes[0]; 6267 int slot = path->slots[0]; 6268 u32 cur_offset = 0; 6269 u32 item_size; 6270 unsigned long ptr; 6271 6272 if (slot >= btrfs_header_nritems(leaf)) { 6273 ret = btrfs_next_leaf(root, path); 6274 if (ret < 0) 6275 goto out; 6276 else if (ret > 0) 6277 break; 6278 continue; 6279 } 6280 6281 btrfs_item_key_to_cpu(leaf, &key, slot); 6282 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 6283 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 6284 break; 6285 6286 item_size = btrfs_item_size(leaf, slot); 6287 ptr = btrfs_item_ptr_offset(leaf, slot); 6288 while (cur_offset < item_size) { 6289 struct btrfs_key inode_key; 6290 struct inode *dir_inode; 6291 6292 inode_key.type = BTRFS_INODE_ITEM_KEY; 6293 inode_key.offset = 0; 6294 6295 if (key.type == BTRFS_INODE_EXTREF_KEY) { 6296 struct btrfs_inode_extref *extref; 6297 6298 extref = (struct btrfs_inode_extref *) 6299 (ptr + cur_offset); 6300 inode_key.objectid = btrfs_inode_extref_parent( 6301 leaf, extref); 6302 cur_offset += sizeof(*extref); 6303 cur_offset += btrfs_inode_extref_name_len(leaf, 6304 extref); 6305 } else { 6306 inode_key.objectid = key.offset; 6307 cur_offset = item_size; 6308 } 6309 6310 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 6311 root); 6312 /* 6313 * If the parent inode was deleted, return an error to 6314 * fall back to a transaction commit. This prevents the 6315 * following situation: an inode is moved from a parent A 6316 * to a parent B, its former parent A is deleted, and the 6317 * inode is then fsync'ed; without the fallback, after a 6318 * log replay the inode would exist at both parents and 6319 * the old parent would still exist. Example: 6320 * 6321 * mkdir /mnt/A 6322 * mkdir /mnt/B 6323 * touch /mnt/B/bar 6324 * sync 6325 * mv /mnt/B/bar /mnt/A/bar 6326 * mv -T /mnt/A /mnt/B 6327 * fsync /mnt/B/bar 6328 * <power fail> 6329 * 6330 * If we ignore the old parent B which got deleted, 6331 * after a log replay we would have file bar linked 6332 * at both parents and the old parent B would still 6333 * exist.
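*
* Returning the error is safe here: the caller falls back to a full
* transaction commit, which persists both the rename and the deletion
* of the old parent, so no log tree is left to replay inconsistently.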
6334 */ 6335 if (IS_ERR(dir_inode)) { 6336 ret = PTR_ERR(dir_inode); 6337 goto out; 6338 } 6339 6340 if (!need_log_inode(trans, BTRFS_I(dir_inode))) { 6341 btrfs_add_delayed_iput(dir_inode); 6342 continue; 6343 } 6344 6345 ctx->log_new_dentries = false; 6346 ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), 6347 LOG_INODE_ALL, ctx); 6348 if (!ret && ctx->log_new_dentries) 6349 ret = log_new_dir_dentries(trans, root, 6350 BTRFS_I(dir_inode), ctx); 6351 btrfs_add_delayed_iput(dir_inode); 6352 if (ret) 6353 goto out; 6354 } 6355 path->slots[0]++; 6356 } 6357 ret = 0; 6358 out: 6359 btrfs_free_path(path); 6360 return ret; 6361 } 6362 6363 static int log_new_ancestors(struct btrfs_trans_handle *trans, 6364 struct btrfs_root *root, 6365 struct btrfs_path *path, 6366 struct btrfs_log_ctx *ctx) 6367 { 6368 struct btrfs_key found_key; 6369 6370 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 6371 6372 while (true) { 6373 struct btrfs_fs_info *fs_info = root->fs_info; 6374 struct extent_buffer *leaf = path->nodes[0]; 6375 int slot = path->slots[0]; 6376 struct btrfs_key search_key; 6377 struct inode *inode; 6378 u64 ino; 6379 int ret = 0; 6380 6381 btrfs_release_path(path); 6382 6383 ino = found_key.offset; 6384 6385 search_key.objectid = found_key.offset; 6386 search_key.type = BTRFS_INODE_ITEM_KEY; 6387 search_key.offset = 0; 6388 inode = btrfs_iget(fs_info->sb, ino, root); 6389 if (IS_ERR(inode)) 6390 return PTR_ERR(inode); 6391 6392 if (BTRFS_I(inode)->generation >= trans->transid && 6393 need_log_inode(trans, BTRFS_I(inode))) 6394 ret = btrfs_log_inode(trans, BTRFS_I(inode), 6395 LOG_INODE_EXISTS, ctx); 6396 btrfs_add_delayed_iput(inode); 6397 if (ret) 6398 return ret; 6399 6400 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 6401 break; 6402 6403 search_key.type = BTRFS_INODE_REF_KEY; 6404 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6405 if (ret < 0) 6406 return ret; 6407 6408 leaf = path->nodes[0]; 6409 slot = path->slots[0]; 6410 if (slot >= btrfs_header_nritems(leaf)) { 6411 ret = btrfs_next_leaf(root, path); 6412 if (ret < 0) 6413 return ret; 6414 else if (ret > 0) 6415 return -ENOENT; 6416 leaf = path->nodes[0]; 6417 slot = path->slots[0]; 6418 } 6419 6420 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6421 if (found_key.objectid != search_key.objectid || 6422 found_key.type != BTRFS_INODE_REF_KEY) 6423 return -ENOENT; 6424 } 6425 return 0; 6426 } 6427 6428 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 6429 struct btrfs_inode *inode, 6430 struct dentry *parent, 6431 struct btrfs_log_ctx *ctx) 6432 { 6433 struct btrfs_root *root = inode->root; 6434 struct dentry *old_parent = NULL; 6435 struct super_block *sb = inode->vfs_inode.i_sb; 6436 int ret = 0; 6437 6438 while (true) { 6439 if (!parent || d_really_is_negative(parent) || 6440 sb != parent->d_sb) 6441 break; 6442 6443 inode = BTRFS_I(d_inode(parent)); 6444 if (root != inode->root) 6445 break; 6446 6447 if (inode->generation >= trans->transid && 6448 need_log_inode(trans, inode)) { 6449 ret = btrfs_log_inode(trans, inode, 6450 LOG_INODE_EXISTS, ctx); 6451 if (ret) 6452 break; 6453 } 6454 if (IS_ROOT(parent)) 6455 break; 6456 6457 parent = dget_parent(parent); 6458 dput(old_parent); 6459 old_parent = parent; 6460 } 6461 dput(old_parent); 6462 6463 return ret; 6464 } 6465 6466 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 6467 struct btrfs_inode *inode, 6468 struct dentry *parent, 6469 struct btrfs_log_ctx *ctx) 6470 { 6471 struct btrfs_root *root = 
inode->root; 6472 const u64 ino = btrfs_ino(inode); 6473 struct btrfs_path *path; 6474 struct btrfs_key search_key; 6475 int ret; 6476 6477 /* 6478 * For the single hard link case, go through a fast path that does not 6479 * need to iterate the fs/subvolume tree. 6480 */ 6481 if (inode->vfs_inode.i_nlink < 2) 6482 return log_new_ancestors_fast(trans, inode, parent, ctx); 6483 6484 path = btrfs_alloc_path(); 6485 if (!path) 6486 return -ENOMEM; 6487 6488 search_key.objectid = ino; 6489 search_key.type = BTRFS_INODE_REF_KEY; 6490 search_key.offset = 0; 6491 again: 6492 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6493 if (ret < 0) 6494 goto out; 6495 if (ret == 0) 6496 path->slots[0]++; 6497 6498 while (true) { 6499 struct extent_buffer *leaf = path->nodes[0]; 6500 int slot = path->slots[0]; 6501 struct btrfs_key found_key; 6502 6503 if (slot >= btrfs_header_nritems(leaf)) { 6504 ret = btrfs_next_leaf(root, path); 6505 if (ret < 0) 6506 goto out; 6507 else if (ret > 0) 6508 break; 6509 continue; 6510 } 6511 6512 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6513 if (found_key.objectid != ino || 6514 found_key.type > BTRFS_INODE_EXTREF_KEY) 6515 break; 6516 6517 /* 6518 * Don't deal with extended references because they are rare 6519 * cases and too complex to deal with (we would need to keep 6520 * track of which subitem we are processing for each item in 6521 * this loop, etc). So just return some error to fall back to 6522 * a transaction commit. 6523 */ 6524 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 6525 ret = -EMLINK; 6526 goto out; 6527 } 6528 6529 /* 6530 * Logging ancestors needs to do more searches on the fs/subvol 6531 * tree, so it releases the path as needed to avoid deadlocks. 6532 * Keep track of the last inode ref key and resume from that key 6533 * after logging all new ancestors for the current hard link. 6534 */ 6535 memcpy(&search_key, &found_key, sizeof(search_key)); 6536 6537 ret = log_new_ancestors(trans, root, path, ctx); 6538 if (ret) 6539 goto out; 6540 btrfs_release_path(path); 6541 goto again; 6542 } 6543 ret = 0; 6544 out: 6545 btrfs_free_path(path); 6546 return ret; 6547 } 6548 6549 /* 6550 * helper function around btrfs_log_inode to make sure newly created 6551 * parent directories also end up in the log. Only a minimal amount of 6552 * logging (the inode item and back references) is done for any parent 6553 * directories that are not yet in the last committed transaction. 6554 */ 6555 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 6556 struct btrfs_inode *inode, 6557 struct dentry *parent, 6558 int inode_only, 6559 struct btrfs_log_ctx *ctx) 6560 { 6561 struct btrfs_root *root = inode->root; 6562 struct btrfs_fs_info *fs_info = root->fs_info; 6563 int ret = 0; 6564 bool log_dentries = false; 6565 6566 if (btrfs_test_opt(fs_info, NOTREELOG)) { 6567 ret = BTRFS_LOG_FORCE_COMMIT; 6568 goto end_no_trans; 6569 } 6570 6571 if (btrfs_root_refs(&root->root_item) == 0) { 6572 ret = BTRFS_LOG_FORCE_COMMIT; 6573 goto end_no_trans; 6574 } 6575 6576 /* 6577 * Skip already logged inodes or inodes corresponding to tmpfiles 6578 * (since logging them is pointless, a link count of 0 means they 6579 * will never be accessible).
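*
* For example, an inode created with O_TMPFILE and never linked in via
* linkat(2) keeps a link count of 0, so nothing a log replay could
* recreate for it would ever be visible in the namespace.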
6580 */ 6581 if ((btrfs_inode_in_log(inode, trans->transid) && 6582 list_empty(&ctx->ordered_extents)) || 6583 inode->vfs_inode.i_nlink == 0) { 6584 ret = BTRFS_NO_LOG_SYNC; 6585 goto end_no_trans; 6586 } 6587 6588 ret = start_log_trans(trans, root, ctx); 6589 if (ret) 6590 goto end_no_trans; 6591 6592 ret = btrfs_log_inode(trans, inode, inode_only, ctx); 6593 if (ret) 6594 goto end_trans; 6595 6596 /* 6597 * for a regular file, if its inode is already on disk, we don't 6598 * have to worry about the parents at all. This is because 6599 * we can use the last_unlink_trans field to record renames 6600 * and other fun in this file. 6601 */ 6602 if (S_ISREG(inode->vfs_inode.i_mode) && 6603 inode->generation < trans->transid && 6604 inode->last_unlink_trans < trans->transid) { 6605 ret = 0; 6606 goto end_trans; 6607 } 6608 6609 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) 6610 log_dentries = true; 6611 6612 /* 6613 * On unlink we must make sure all our current and old parent directory 6614 * inodes are fully logged. This is to prevent leaving dangling 6615 * directory index entries in directories that were our parents but are 6616 * not anymore. Not doing this results in the old parent directory being 6617 * impossible to delete after log replay (rmdir will always fail with 6618 * error -ENOTEMPTY). 6619 * 6620 * Example 1: 6621 * 6622 * mkdir testdir 6623 * touch testdir/foo 6624 * ln testdir/foo testdir/bar 6625 * sync 6626 * unlink testdir/bar 6627 * xfs_io -c fsync testdir/foo 6628 * <power failure> 6629 * mount fs, triggers log replay 6630 * 6631 * If we don't log the parent directory (testdir), after log replay the 6632 * directory still has an entry pointing to the file inode using the bar 6633 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 6634 * the file inode has a link count of 1. 6635 * 6636 * Example 2: 6637 * 6638 * mkdir testdir 6639 * touch foo 6640 * ln foo testdir/foo2 6641 * ln foo testdir/foo3 6642 * sync 6643 * unlink testdir/foo3 6644 * xfs_io -c fsync foo 6645 * <power failure> 6646 * mount fs, triggers log replay 6647 * 6648 * Similar to the first example, after log replay the parent directory 6649 * testdir still has an entry pointing to the file inode with the name foo3 6650 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 6651 * and has a link count of 2. 6652 */ 6653 if (inode->last_unlink_trans >= trans->transid) { 6654 ret = btrfs_log_all_parents(trans, inode, ctx); 6655 if (ret) 6656 goto end_trans; 6657 } 6658 6659 ret = log_all_new_ancestors(trans, inode, parent, ctx); 6660 if (ret) 6661 goto end_trans; 6662 6663 if (log_dentries) 6664 ret = log_new_dir_dentries(trans, root, inode, ctx); 6665 else 6666 ret = 0; 6667 end_trans: 6668 if (ret < 0) { 6669 btrfs_set_log_full_commit(trans); 6670 ret = BTRFS_LOG_FORCE_COMMIT; 6671 } 6672 6673 if (ret) 6674 btrfs_remove_log_ctx(root, ctx); 6675 btrfs_end_log_trans(root); 6676 end_no_trans: 6677 return ret; 6678 } 6679 6680 /* 6681 * it is not safe to log a dentry if the chunk root has added new 6682 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 6683 * If this returns 1, you must commit the transaction to safely get your 6684 * data on disk.
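*
* A sketch of the expected calling pattern (the fsync path does
* roughly this; error handling omitted):
*
*   ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
*   if (ret == 0)
*           ret = btrfs_sync_log(trans, root, &ctx);
*   else
*           ret = btrfs_commit_transaction(trans);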
6685 */ 6686 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 6687 struct dentry *dentry, 6688 struct btrfs_log_ctx *ctx) 6689 { 6690 struct dentry *parent = dget_parent(dentry); 6691 int ret; 6692 6693 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 6694 LOG_INODE_ALL, ctx); 6695 dput(parent); 6696 6697 return ret; 6698 } 6699 6700 /* 6701 * should be called during mount to recover and replay any log trees 6702 * from the FS 6703 */ 6704 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 6705 { 6706 int ret; 6707 struct btrfs_path *path; 6708 struct btrfs_trans_handle *trans; 6709 struct btrfs_key key; 6710 struct btrfs_key found_key; 6711 struct btrfs_root *log; 6712 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 6713 struct walk_control wc = { 6714 .process_func = process_one_buffer, 6715 .stage = LOG_WALK_PIN_ONLY, 6716 }; 6717 6718 path = btrfs_alloc_path(); 6719 if (!path) 6720 return -ENOMEM; 6721 6722 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6723 6724 trans = btrfs_start_transaction(fs_info->tree_root, 0); 6725 if (IS_ERR(trans)) { 6726 ret = PTR_ERR(trans); 6727 goto error; 6728 } 6729 6730 wc.trans = trans; 6731 wc.pin = 1; 6732 6733 ret = walk_log_tree(trans, log_root_tree, &wc); 6734 if (ret) { 6735 btrfs_abort_transaction(trans, ret); 6736 goto error; 6737 } 6738 6739 again: 6740 key.objectid = BTRFS_TREE_LOG_OBJECTID; 6741 key.offset = (u64)-1; 6742 key.type = BTRFS_ROOT_ITEM_KEY; 6743 6744 while (1) { 6745 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 6746 6747 if (ret < 0) { 6748 btrfs_abort_transaction(trans, ret); 6749 goto error; 6750 } 6751 if (ret > 0) { 6752 if (path->slots[0] == 0) 6753 break; 6754 path->slots[0]--; 6755 } 6756 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 6757 path->slots[0]); 6758 btrfs_release_path(path); 6759 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 6760 break; 6761 6762 log = btrfs_read_tree_root(log_root_tree, &found_key); 6763 if (IS_ERR(log)) { 6764 ret = PTR_ERR(log); 6765 btrfs_abort_transaction(trans, ret); 6766 goto error; 6767 } 6768 6769 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 6770 true); 6771 if (IS_ERR(wc.replay_dest)) { 6772 ret = PTR_ERR(wc.replay_dest); 6773 6774 /* 6775 * We didn't find the subvol, likely because it was 6776 * deleted. This is ok, simply skip this log and go to 6777 * the next one. 6778 * 6779 * We need to exclude the root because we can't have 6780 * other log replays overwriting this log as we'll read 6781 * it back in a few more times. This will keep our 6782 * block from being modified, and we'll just bail for 6783 * each subsequent pass.
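*
* (The log root tree is walked once per stage - pin, replay inodes,
* replay everything - so this deleted-subvolume case is hit again on
* each of the later passes, and we bail each time as described above.)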
6784 */ 6785 if (ret == -ENOENT) 6786 ret = btrfs_pin_extent_for_log_replay(trans, 6787 log->node->start, 6788 log->node->len); 6789 btrfs_put_root(log); 6790 6791 if (!ret) 6792 goto next; 6793 btrfs_abort_transaction(trans, ret); 6794 goto error; 6795 } 6796 6797 wc.replay_dest->log_root = log; 6798 ret = btrfs_record_root_in_trans(trans, wc.replay_dest); 6799 if (ret) 6800 /* The loop needs to continue due to the root refs */ 6801 btrfs_abort_transaction(trans, ret); 6802 else 6803 ret = walk_log_tree(trans, log, &wc); 6804 6805 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6806 ret = fixup_inode_link_counts(trans, wc.replay_dest, 6807 path); 6808 if (ret) 6809 btrfs_abort_transaction(trans, ret); 6810 } 6811 6812 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6813 struct btrfs_root *root = wc.replay_dest; 6814 6815 btrfs_release_path(path); 6816 6817 /* 6818 * We have just replayed everything, and the highest 6819 * objectid of fs roots probably has changed in case 6820 * some inode_item's got replayed. 6821 * 6822 * root->objectid_mutex is not acquired as log replay 6823 * can only happen during mount. 6824 */ 6825 ret = btrfs_init_root_free_objectid(root); 6826 if (ret) 6827 btrfs_abort_transaction(trans, ret); 6828 } 6829 6830 wc.replay_dest->log_root = NULL; 6831 btrfs_put_root(wc.replay_dest); 6832 btrfs_put_root(log); 6833 6834 if (ret) 6835 goto error; 6836 next: 6837 if (found_key.offset == 0) 6838 break; 6839 key.offset = found_key.offset - 1; 6840 } 6841 btrfs_release_path(path); 6842 6843 /* step one is to pin it all, step two is to replay just inodes */ 6844 if (wc.pin) { 6845 wc.pin = 0; 6846 wc.process_func = replay_one_buffer; 6847 wc.stage = LOG_WALK_REPLAY_INODES; 6848 goto again; 6849 } 6850 /* step three is to replay everything */ 6851 if (wc.stage < LOG_WALK_REPLAY_ALL) { 6852 wc.stage++; 6853 goto again; 6854 } 6855 6856 btrfs_free_path(path); 6857 6858 /* step four is to commit the transaction, which also unpins the blocks */ 6859 ret = btrfs_commit_transaction(trans); 6860 if (ret) 6861 return ret; 6862 6863 log_root_tree->log_root = NULL; 6864 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6865 btrfs_put_root(log_root_tree); 6866 6867 return 0; 6868 error: 6869 if (wc.trans) 6870 btrfs_end_transaction(wc.trans); 6871 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6872 btrfs_free_path(path); 6873 return ret; 6874 } 6875 6876 /* 6877 * there are some corner cases where we want to force a full 6878 * commit instead of allowing a directory to be logged. 6879 * 6880 * They revolve around files that were unlinked from the directory, and 6881 * this function updates the parent directory so that a full commit is 6882 * properly done if it is fsync'd later after the unlinks are done. 6883 * 6884 * Must be called before the unlink operations (updates to the subvolume tree, 6885 * inodes, etc) are done. 6886 */ 6887 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 6888 struct btrfs_inode *dir, struct btrfs_inode *inode, 6889 int for_rename) 6890 { 6891 /* 6892 * when we're logging a file, if it hasn't been renamed 6893 * or unlinked, and its inode is fully committed on disk, 6894 * we don't have to worry about walking up the directory chain 6895 * to log its parents. 6896 * 6897 * So, we use the last_unlink_trans field to put this transid 6898 * into the file. When the file is logged we check it and 6899 * don't log the parents if the file is fully on disk.
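*
* Sketch of the kind of sequence this guards (hypothetical commands):
*
*   touch dir1/foo
*   sync
*   mv dir1/foo dir2/foo
*   xfs_io -c fsync dir2/foo
*
* The rename records the transid in last_unlink_trans, so the later
* fsync of foo knows it cannot skip logging the parent directories.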
6900 */ 6901 mutex_lock(&inode->log_mutex); 6902 inode->last_unlink_trans = trans->transid; 6903 mutex_unlock(&inode->log_mutex); 6904 6905 /* 6906 * if this directory was already logged, any new 6907 * names for this file/dir will get recorded 6908 */ 6909 if (dir->logged_trans == trans->transid) 6910 return; 6911 6912 /* 6913 * if the inode we're about to unlink was logged, 6914 * the log will be properly updated for any new names 6915 */ 6916 if (inode->logged_trans == trans->transid) 6917 return; 6918 6919 /* 6920 * when renaming files across directories, if the directory 6921 * we're unlinking from gets fsync'd later on, there's 6922 * no way to find the destination directory later and fsync it 6923 * properly. So, we have to be conservative and force commits 6924 * so the new name gets discovered. 6925 */ 6926 if (for_rename) 6927 goto record; 6928 6929 /* we can safely do the unlink without any special recording */ 6930 return; 6931 6932 record: 6933 mutex_lock(&dir->log_mutex); 6934 dir->last_unlink_trans = trans->transid; 6935 mutex_unlock(&dir->log_mutex); 6936 } 6937 6938 /* 6939 * Make sure that if someone attempts to fsync the parent directory of a deleted 6940 * snapshot, it ends up triggering a transaction commit. This is to guarantee 6941 * that after replaying the log tree of the parent directory's root we will not 6942 * see the snapshot anymore and at log replay time we will not see any log tree 6943 * corresponding to the deleted snapshot's root, which could lead to replaying 6944 * it after replaying the log tree of the parent directory (which would replay 6945 * the snapshot delete operation). 6946 * 6947 * Must be called before the actual snapshot destroy operation (updates to the 6948 * parent root and tree of tree roots trees, etc) are done. 6949 */ 6950 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 6951 struct btrfs_inode *dir) 6952 { 6953 mutex_lock(&dir->log_mutex); 6954 dir->last_unlink_trans = trans->transid; 6955 mutex_unlock(&dir->log_mutex); 6956 } 6957 6958 /** 6959 * Update the log after adding a new name for an inode. 6960 * 6961 * @trans: Transaction handle. 6962 * @old_dentry: The dentry associated with the old name and the old 6963 * parent directory. 6964 * @old_dir: The inode of the previous parent directory for the case 6965 * of a rename. For a link operation, it must be NULL. 6966 * @old_dir_index: The index number associated with the old name, meaningful 6967 * only for rename operations (when @old_dir is not NULL). 6968 * Ignored for link operations. 6969 * @parent: The dentry associated with the directory under which the 6970 * new name is located. 6971 * 6972 * Call this after adding a new name for an inode, as a result of a link or 6973 * rename operation, and it will properly update the log to reflect the new name.
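*
* Expected usage, as a sketch (the real call sites live in the link
* and rename paths):
*
*   link:   btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
*   rename: btrfs_log_new_name(trans, old_dentry, old_dir,
*                              old_dir_index, parent);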
6974 */ 6975 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 6976 struct dentry *old_dentry, struct btrfs_inode *old_dir, 6977 u64 old_dir_index, struct dentry *parent) 6978 { 6979 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); 6980 struct btrfs_root *root = inode->root; 6981 struct btrfs_log_ctx ctx; 6982 bool log_pinned = false; 6983 int ret; 6984 6985 /* 6986 * this will force the logging code to walk the dentry chain 6987 * up for the file 6988 */ 6989 if (!S_ISDIR(inode->vfs_inode.i_mode)) 6990 inode->last_unlink_trans = trans->transid; 6991 6992 /* 6993 * if this inode hasn't been logged and the directory we're renaming 6994 * it from hasn't been logged, we don't need to log it 6995 */ 6996 ret = inode_logged(trans, inode, NULL); 6997 if (ret < 0) { 6998 goto out; 6999 } else if (ret == 0) { 7000 if (!old_dir) 7001 return; 7002 /* 7003 * If the inode was not logged and we are doing a rename (old_dir is not 7004 * NULL), check if old_dir was logged - if it was not we can return and 7005 * do nothing. 7006 */ 7007 ret = inode_logged(trans, old_dir, NULL); 7008 if (ret < 0) 7009 goto out; 7010 else if (ret == 0) 7011 return; 7012 } 7013 ret = 0; 7014 7015 /* 7016 * If we are doing a rename (old_dir is not NULL) from a directory that 7017 * was previously logged, make sure that on log replay we get the old 7018 * dir entry deleted. This is needed because we will also log the new 7019 * name of the renamed inode, so we need to make sure that after log 7020 * replay we don't end up with both the new and old dir entries existing. 7021 */ 7022 if (old_dir && old_dir->logged_trans == trans->transid) { 7023 struct btrfs_root *log = old_dir->root->log_root; 7024 struct btrfs_path *path; 7025 7026 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); 7027 7028 /* 7029 * We have two inodes to update in the log, the old directory and 7030 * the inode that got renamed, so we must pin the log to prevent 7031 * anyone from syncing the log until we have updated both inodes 7032 * in the log. 7033 */ 7034 ret = join_running_log_trans(root); 7035 /* 7036 * At least one of the inodes was logged before, so this should 7037 * not fail, but if it does, it's not serious, just bail out and 7038 * mark the log for a full commit. 7039 */ 7040 if (WARN_ON_ONCE(ret < 0)) 7041 goto out; 7042 log_pinned = true; 7043 7044 path = btrfs_alloc_path(); 7045 if (!path) { 7046 ret = -ENOMEM; 7047 goto out; 7048 } 7049 7050 /* 7051 * Another concurrent task might be logging the old directory, 7052 * as that can be triggered when logging another inode that had 7053 * or still has a dentry in the old directory. We lock the old 7054 * directory's log_mutex to ensure the deletion of the old 7055 * name is persisted, because during directory logging we 7056 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of 7057 * the old name's dir index item is in the delayed items, so 7058 * it could be missed by an in-progress directory logging. 7059 */ 7060 mutex_lock(&old_dir->log_mutex); 7061 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), 7062 old_dentry->d_name.name, 7063 old_dentry->d_name.len, old_dir_index); 7064 if (ret > 0) { 7065 /* 7066 * The dentry does not exist in the log, so record its 7067 * deletion.
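* This is done below by inserting a dir log key whose range covers
* only old_dir_index (start == end), which tells log replay that
* this index is authoritatively logged as empty.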
7068 */ 7069 btrfs_release_path(path); 7070 ret = insert_dir_log_key(trans, log, path, 7071 btrfs_ino(old_dir), 7072 old_dir_index, old_dir_index); 7073 } 7074 mutex_unlock(&old_dir->log_mutex); 7075 7076 btrfs_free_path(path); 7077 if (ret < 0) 7078 goto out; 7079 } 7080 7081 btrfs_init_log_ctx(&ctx, &inode->vfs_inode); 7082 ctx.logging_new_name = true; 7083 /* 7084 * We don't care about the return value. If we fail to log the new name 7085 * then we know the next attempt to sync the log will fall back to a full 7086 * transaction commit (due to a call to btrfs_set_log_full_commit()), so 7087 * we don't need to worry about getting a log committed that has an 7088 * inconsistent state after a rename operation. 7089 */ 7090 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); 7091 out: 7092 /* 7093 * If an error happened, mark the log for a full commit, because either it 7094 * is not consistent and up to date or we couldn't find out if one of the 7095 * inodes was logged before in this transaction. Do it before unpinning 7096 * the log, to avoid any races with someone else trying to commit it. 7097 */ 7098 if (ret < 0) 7099 btrfs_set_log_full_commit(trans); 7100 if (log_pinned) 7101 btrfs_end_log_trans(root); 7102 } 7103 7104