// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in RAM, once to create all the inodes logged in the tree and once
 * to do all the other items.
 */

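/*
 * To make the flow above concrete, a rough sketch of the fsync fast path
 * that drives this machinery (simplified, not verbatim; the real entry
 * point is btrfs_sync_file() in fs/btrfs/file.c):
 *
 *	trans = btrfs_start_transaction(root, 0);
 *	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);  // fill the log tree
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx);   // write it to disk
 *	if (ret)                                           // e.g. -EAGAIN below
 *		btrfs_commit_transaction(trans);           // full commit fallback
 *	else
 *		btrfs_end_transaction(trans);
 */
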
/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees. Do the full commit not to mix
		 * nodes from multiple log transactions to do sequential
		 * writing.
		 */
		if (zoned && !created) {
			ret = -EAGAIN;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (ctx && !ctx->logging_new_name) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there are no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

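/*
 * Illustrative pairing (a sketch, not a call site from this file): every
 * path that bumps root->log_writers, whether via start_log_trans(),
 * join_running_log_trans() or btrfs_pin_log_trans(), must balance it with
 * btrfs_end_log_trans() so that log syncers are not left waiting:
 *
 *	if (join_running_log_trans(root) == 0) {
 *		... operate on the running log transaction ...
 *		btrfs_end_log_trans(root);
 *	}
 */
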
static int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
					buf->start + buf->len - 1);
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

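/*
 * For illustration (based on the walk_control fields above, not on a
 * particular call site): a pin-only pass during replay stage
 * LOG_WALK_PIN_ONLY would look roughly like
 *
 *	struct walk_control wc = {
 *		.pin = 1,
 *		.trans = trans,
 *		.process_func = process_one_buffer,
 *	};
 *
 * while freeing a log tree at commit time sets .free = 1 instead, and a
 * sync-time flush sets .write and .wait.
 */
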
/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

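/*
 * Note: read_one_inode() flattens any btrfs_iget() error into a NULL return;
 * the replay callers below translate that NULL into -EIO or -ENOENT depending
 * on whether the inode was expected to exist.
 */
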
/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_del_csums(trans,
							      fs_info->csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
							fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	if (inode)
		iput(inode);
	return ret;
}

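/*
 * Recap of the nbytes accounting in replay_one_extent() above: for
 * REG/PREALLOC extents nbytes is the extent's num_bytes, forced to 0 when
 * disk_bytenr == 0 (a hole); for inline extents it is the ram_bytes.
 * btrfs_update_inode_bytes() then applies that value together with
 * drop_args.bytes_found, the byte count removed by btrfs_drop_extents().
 */
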
/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, so there is no need to check it again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

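/*
 * Both helpers above decode a single name from an inode ref/extref item.
 * Callers walk all names packed into one item with the same pattern used by
 * unlink_old_inode_refs() and add_inode_ref() below, roughly:
 *
 *	ptr = btrfs_item_ptr_offset(eb, slot);
 *	end = ptr + btrfs_item_size_nr(eb, slot);
 *	while (ptr < end) {
 *		ret = ref_get_fields(eb, ptr, &namelen, &name, &index);
 *		...
 *		ptr += sizeof(struct btrfs_inode_ref) + namelen;
 *	}
 */
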
/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists).  For any name that is not in the inode reference item from the
 * log tree, do a proper unlink of that name (that is, remove its entry from
 * the inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

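/*
 * Like backref_in_log(), btrfs_inode_ref_exists() above returns 1 when a ref
 * with the given name exists for (inode, dir), 0 when it does not, and a
 * negative errno on failure; the !! conversions collapse the found-name
 * pointer into that tri-state.
 */
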
static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
				 name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out;
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				  btrfs_ino(BTRFS_I(inode)), ref_index,
				  name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link. Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = btrfs_unlink_inode(trans, root,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, root, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
			if (ret)
				goto out;
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

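/*
 * The two counters above return the number of names found for the inode
 * (count_inode_extrefs() returns a negative errno for failures other than
 * -ENOENT); fixup_inode_link_count() below sums them to rebuild i_nlink.
 */
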
/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		if (ret)
			goto out;
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}

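/*
 * The fixup items consumed above are created by link_to_fixup_dir() below.
 * Each inode that needs a link count check gets one orphan item in the
 * "fixup dir", keyed as:
 *
 *	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
 *	key.type     = BTRFS_ORPHAN_ITEM_KEY;
 *	key.offset   = objectid;   (the inode number to re-count)
 */
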
/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	} else if (ret == -EEXIST) {
		ret = 0;
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
*/ 2037 ret = 0; 2038 update_size = false; 2039 goto out; 2040 } 2041 2042 found_key.objectid = log_key.objectid; 2043 found_key.type = BTRFS_INODE_EXTREF_KEY; 2044 found_key.offset = key->objectid; 2045 ret = backref_in_log(root->log_root, &found_key, key->objectid, name, 2046 name_len); 2047 if (ret < 0) { 2048 goto out; 2049 } else if (ret) { 2050 /* The dentry will be added later. */ 2051 ret = 0; 2052 update_size = false; 2053 goto out; 2054 } 2055 btrfs_release_path(path); 2056 ret = insert_one_name(trans, root, key->objectid, key->offset, 2057 name, name_len, &log_key); 2058 if (ret && ret != -ENOENT && ret != -EEXIST) 2059 goto out; 2060 if (!ret) 2061 name_added = true; 2062 update_size = false; 2063 ret = 0; 2064 goto out; 2065 } 2066 2067 /* 2068 * find all the names in a directory item and reconcile them into 2069 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 2070 * one name in a directory item, but the same code gets used for 2071 * both directory index types 2072 */ 2073 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 2074 struct btrfs_root *root, 2075 struct btrfs_path *path, 2076 struct extent_buffer *eb, int slot, 2077 struct btrfs_key *key) 2078 { 2079 int ret = 0; 2080 u32 item_size = btrfs_item_size_nr(eb, slot); 2081 struct btrfs_dir_item *di; 2082 int name_len; 2083 unsigned long ptr; 2084 unsigned long ptr_end; 2085 struct btrfs_path *fixup_path = NULL; 2086 2087 ptr = btrfs_item_ptr_offset(eb, slot); 2088 ptr_end = ptr + item_size; 2089 while (ptr < ptr_end) { 2090 di = (struct btrfs_dir_item *)ptr; 2091 name_len = btrfs_dir_name_len(eb, di); 2092 ret = replay_one_name(trans, root, path, eb, di, key); 2093 if (ret < 0) 2094 break; 2095 ptr = (unsigned long)(di + 1); 2096 ptr += name_len; 2097 2098 /* 2099 * If this entry refers to a non-directory (directories cannot 2100 * have a link count > 1) and it was added in the transaction 2101 * that was not committed, make sure we fix up the link count of 2102 * the inode that the entry points to. Otherwise something like 2103 * the following would result in a directory pointing to an 2104 * inode with a wrong link count that does not account for this dir 2105 * entry: 2106 * 2107 * mkdir testdir 2108 * touch testdir/foo 2109 * touch testdir/bar 2110 * sync 2111 * 2112 * ln testdir/bar testdir/bar_link 2113 * ln testdir/foo testdir/foo_link 2114 * xfs_io -c "fsync" testdir/bar 2115 * 2116 * <power failure> 2117 * 2118 * mount fs, log replay happens 2119 * 2120 * File foo would remain with a link count of 1 when it has two 2121 * entries pointing to it in the directory testdir. This would 2122 * make it impossible to ever delete the parent directory, as 2123 * it would result in stale dentries that can never be deleted. 2124 */ 2125 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 2126 struct btrfs_key di_key; 2127 2128 if (!fixup_path) { 2129 fixup_path = btrfs_alloc_path(); 2130 if (!fixup_path) { 2131 ret = -ENOMEM; 2132 break; 2133 } 2134 } 2135 2136 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2137 ret = link_to_fixup_dir(trans, root, fixup_path, 2138 di_key.objectid); 2139 if (ret) 2140 break; 2141 } 2142 ret = 0; 2143 } 2144 btrfs_free_path(fixup_path); 2145 return ret; 2146 } 2147 2148 /* 2149 * directory replay has two parts. There are the standard directory 2150 * items in the log copied from the subvolume, and range items 2151 * created in the log while the subvolume was logged.
2152 * 2153 * The range items tell us which parts of the key space the log 2154 * is authoritative for. During replay, if a key in the subvolume 2155 * directory is in a logged range item, but not actually in the log, 2156 * that means it was deleted from the directory before the fsync 2157 * and should be removed. 2158 */ 2159 static noinline int find_dir_range(struct btrfs_root *root, 2160 struct btrfs_path *path, 2161 u64 dirid, int key_type, 2162 u64 *start_ret, u64 *end_ret) 2163 { 2164 struct btrfs_key key; 2165 u64 found_end; 2166 struct btrfs_dir_log_item *item; 2167 int ret; 2168 int nritems; 2169 2170 if (*start_ret == (u64)-1) 2171 return 1; 2172 2173 key.objectid = dirid; 2174 key.type = key_type; 2175 key.offset = *start_ret; 2176 2177 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2178 if (ret < 0) 2179 goto out; 2180 if (ret > 0) { 2181 if (path->slots[0] == 0) 2182 goto out; 2183 path->slots[0]--; 2184 } 2185 if (ret != 0) 2186 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2187 2188 if (key.type != key_type || key.objectid != dirid) { 2189 ret = 1; 2190 goto next; 2191 } 2192 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2193 struct btrfs_dir_log_item); 2194 found_end = btrfs_dir_log_end(path->nodes[0], item); 2195 2196 if (*start_ret >= key.offset && *start_ret <= found_end) { 2197 ret = 0; 2198 *start_ret = key.offset; 2199 *end_ret = found_end; 2200 goto out; 2201 } 2202 ret = 1; 2203 next: 2204 /* check the next slot in the tree to see if it is a valid item */ 2205 nritems = btrfs_header_nritems(path->nodes[0]); 2206 path->slots[0]++; 2207 if (path->slots[0] >= nritems) { 2208 ret = btrfs_next_leaf(root, path); 2209 if (ret) 2210 goto out; 2211 } 2212 2213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2214 2215 if (key.type != key_type || key.objectid != dirid) { 2216 ret = 1; 2217 goto out; 2218 } 2219 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2220 struct btrfs_dir_log_item); 2221 found_end = btrfs_dir_log_end(path->nodes[0], item); 2222 *start_ret = key.offset; 2223 *end_ret = found_end; 2224 ret = 0; 2225 out: 2226 btrfs_release_path(path); 2227 return ret; 2228 } 2229 2230 /* 2231 * this looks for a given directory item in the log.
If the directory 2232 * item is not in the log, the item is removed and the inode it points 2233 * to is unlinked 2234 */ 2235 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2236 struct btrfs_root *root, 2237 struct btrfs_root *log, 2238 struct btrfs_path *path, 2239 struct btrfs_path *log_path, 2240 struct inode *dir, 2241 struct btrfs_key *dir_key) 2242 { 2243 int ret; 2244 struct extent_buffer *eb; 2245 int slot; 2246 u32 item_size; 2247 struct btrfs_dir_item *di; 2248 struct btrfs_dir_item *log_di; 2249 int name_len; 2250 unsigned long ptr; 2251 unsigned long ptr_end; 2252 char *name; 2253 struct inode *inode; 2254 struct btrfs_key location; 2255 2256 again: 2257 eb = path->nodes[0]; 2258 slot = path->slots[0]; 2259 item_size = btrfs_item_size_nr(eb, slot); 2260 ptr = btrfs_item_ptr_offset(eb, slot); 2261 ptr_end = ptr + item_size; 2262 while (ptr < ptr_end) { 2263 di = (struct btrfs_dir_item *)ptr; 2264 name_len = btrfs_dir_name_len(eb, di); 2265 name = kmalloc(name_len, GFP_NOFS); 2266 if (!name) { 2267 ret = -ENOMEM; 2268 goto out; 2269 } 2270 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2271 name_len); 2272 log_di = NULL; 2273 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2274 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2275 dir_key->objectid, 2276 name, name_len, 0); 2277 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2278 log_di = btrfs_lookup_dir_index_item(trans, log, 2279 log_path, 2280 dir_key->objectid, 2281 dir_key->offset, 2282 name, name_len, 0); 2283 } 2284 if (!log_di || log_di == ERR_PTR(-ENOENT)) { 2285 btrfs_dir_item_key_to_cpu(eb, di, &location); 2286 btrfs_release_path(path); 2287 btrfs_release_path(log_path); 2288 inode = read_one_inode(root, location.objectid); 2289 if (!inode) { 2290 kfree(name); 2291 return -EIO; 2292 } 2293 2294 ret = link_to_fixup_dir(trans, root, 2295 path, location.objectid); 2296 if (ret) { 2297 kfree(name); 2298 iput(inode); 2299 goto out; 2300 } 2301 2302 inc_nlink(inode); 2303 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2304 BTRFS_I(inode), name, name_len); 2305 if (!ret) 2306 ret = btrfs_run_delayed_items(trans); 2307 kfree(name); 2308 iput(inode); 2309 if (ret) 2310 goto out; 2311 2312 /* there might still be more names under this key 2313 * check and repeat if required 2314 */ 2315 ret = btrfs_search_slot(NULL, root, dir_key, path, 2316 0, 0); 2317 if (ret == 0) 2318 goto again; 2319 ret = 0; 2320 goto out; 2321 } else if (IS_ERR(log_di)) { 2322 kfree(name); 2323 return PTR_ERR(log_di); 2324 } 2325 btrfs_release_path(log_path); 2326 kfree(name); 2327 2328 ptr = (unsigned long)(di + 1); 2329 ptr += name_len; 2330 } 2331 ret = 0; 2332 out: 2333 btrfs_release_path(path); 2334 btrfs_release_path(log_path); 2335 return ret; 2336 } 2337 2338 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2339 struct btrfs_root *root, 2340 struct btrfs_root *log, 2341 struct btrfs_path *path, 2342 const u64 ino) 2343 { 2344 struct btrfs_key search_key; 2345 struct btrfs_path *log_path; 2346 int i; 2347 int nritems; 2348 int ret; 2349 2350 log_path = btrfs_alloc_path(); 2351 if (!log_path) 2352 return -ENOMEM; 2353 2354 search_key.objectid = ino; 2355 search_key.type = BTRFS_XATTR_ITEM_KEY; 2356 search_key.offset = 0; 2357 again: 2358 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2359 if (ret < 0) 2360 goto out; 2361 process_leaf: 2362 nritems = btrfs_header_nritems(path->nodes[0]); 2363 for (i = path->slots[0]; i < nritems; i++) { 2364 struct btrfs_key key; 
2365 struct btrfs_dir_item *di; 2366 struct btrfs_dir_item *log_di; 2367 u32 total_size; 2368 u32 cur; 2369 2370 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2371 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2372 ret = 0; 2373 goto out; 2374 } 2375 2376 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2377 total_size = btrfs_item_size_nr(path->nodes[0], i); 2378 cur = 0; 2379 while (cur < total_size) { 2380 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2381 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2382 u32 this_len = sizeof(*di) + name_len + data_len; 2383 char *name; 2384 2385 name = kmalloc(name_len, GFP_NOFS); 2386 if (!name) { 2387 ret = -ENOMEM; 2388 goto out; 2389 } 2390 read_extent_buffer(path->nodes[0], name, 2391 (unsigned long)(di + 1), name_len); 2392 2393 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2394 name, name_len, 0); 2395 btrfs_release_path(log_path); 2396 if (!log_di) { 2397 /* Doesn't exist in log tree, so delete it. */ 2398 btrfs_release_path(path); 2399 di = btrfs_lookup_xattr(trans, root, path, ino, 2400 name, name_len, -1); 2401 kfree(name); 2402 if (IS_ERR(di)) { 2403 ret = PTR_ERR(di); 2404 goto out; 2405 } 2406 ASSERT(di); 2407 ret = btrfs_delete_one_dir_name(trans, root, 2408 path, di); 2409 if (ret) 2410 goto out; 2411 btrfs_release_path(path); 2412 search_key = key; 2413 goto again; 2414 } 2415 kfree(name); 2416 if (IS_ERR(log_di)) { 2417 ret = PTR_ERR(log_di); 2418 goto out; 2419 } 2420 cur += this_len; 2421 di = (struct btrfs_dir_item *)((char *)di + this_len); 2422 } 2423 } 2424 ret = btrfs_next_leaf(root, path); 2425 if (ret > 0) 2426 ret = 0; 2427 else if (ret == 0) 2428 goto process_leaf; 2429 out: 2430 btrfs_free_path(log_path); 2431 btrfs_release_path(path); 2432 return ret; 2433 } 2434 2435 2436 /* 2437 * deletion replay happens before we copy any new directory items 2438 * out of the log or out of backreferences from inodes. It 2439 * scans the log to find ranges of keys that the log is authoritative for, 2440 * and then scans the directory to find items in those ranges that are 2441 * not present in the log. 2442 * 2443 * Anything we don't find in the log is unlinked and removed from the 2444 * directory.
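 * For example (illustrative offsets): if a dir log item says the log is authoritative for index offsets [3, 7], and the subvolume directory still has an entry at offset 5 that the log does not contain, then that entry was removed before the fsync and is unlinked here.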
2445 */ 2446 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2447 struct btrfs_root *root, 2448 struct btrfs_root *log, 2449 struct btrfs_path *path, 2450 u64 dirid, int del_all) 2451 { 2452 u64 range_start; 2453 u64 range_end; 2454 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2455 int ret = 0; 2456 struct btrfs_key dir_key; 2457 struct btrfs_key found_key; 2458 struct btrfs_path *log_path; 2459 struct inode *dir; 2460 2461 dir_key.objectid = dirid; 2462 dir_key.type = BTRFS_DIR_ITEM_KEY; 2463 log_path = btrfs_alloc_path(); 2464 if (!log_path) 2465 return -ENOMEM; 2466 2467 dir = read_one_inode(root, dirid); 2468 /* it isn't an error if the inode isn't there, that can happen 2469 * because we replay the deletes before we copy in the inode item 2470 * from the log 2471 */ 2472 if (!dir) { 2473 btrfs_free_path(log_path); 2474 return 0; 2475 } 2476 again: 2477 range_start = 0; 2478 range_end = 0; 2479 while (1) { 2480 if (del_all) 2481 range_end = (u64)-1; 2482 else { 2483 ret = find_dir_range(log, path, dirid, key_type, 2484 &range_start, &range_end); 2485 if (ret != 0) 2486 break; 2487 } 2488 2489 dir_key.offset = range_start; 2490 while (1) { 2491 int nritems; 2492 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2493 0, 0); 2494 if (ret < 0) 2495 goto out; 2496 2497 nritems = btrfs_header_nritems(path->nodes[0]); 2498 if (path->slots[0] >= nritems) { 2499 ret = btrfs_next_leaf(root, path); 2500 if (ret == 1) 2501 break; 2502 else if (ret < 0) 2503 goto out; 2504 } 2505 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2506 path->slots[0]); 2507 if (found_key.objectid != dirid || 2508 found_key.type != dir_key.type) 2509 goto next_type; 2510 2511 if (found_key.offset > range_end) 2512 break; 2513 2514 ret = check_item_in_log(trans, root, log, path, 2515 log_path, dir, 2516 &found_key); 2517 if (ret) 2518 goto out; 2519 if (found_key.offset == (u64)-1) 2520 break; 2521 dir_key.offset = found_key.offset + 1; 2522 } 2523 btrfs_release_path(path); 2524 if (range_end == (u64)-1) 2525 break; 2526 range_start = range_end + 1; 2527 } 2528 2529 next_type: 2530 ret = 0; 2531 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2532 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2533 dir_key.type = BTRFS_DIR_INDEX_KEY; 2534 btrfs_release_path(path); 2535 goto again; 2536 } 2537 out: 2538 btrfs_release_path(path); 2539 btrfs_free_path(log_path); 2540 iput(dir); 2541 return ret; 2542 } 2543 2544 /* 2545 * the process_func used to replay items from the log tree. This 2546 * gets called in two different stages. The first stage just looks 2547 * for inodes and makes sure they are all copied into the subvolume. 2548 * 2549 * The second stage copies all the other item types from the log into 2550 * the subvolume. The two stage approach is slower, but gets rid of 2551 * lots of complexity around inodes referencing other inodes that exist 2552 * only in the log (references come from either directory items or inode 2553 * back refs). 
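 * For example, a dentry copied in the second stage may point to an inode whose inode item exists only in the log; the first stage guarantees that the inode item was already copied into the subvolume, so the new dentry never references a missing inode.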
2554 */ 2555 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2556 struct walk_control *wc, u64 gen, int level) 2557 { 2558 int nritems; 2559 struct btrfs_path *path; 2560 struct btrfs_root *root = wc->replay_dest; 2561 struct btrfs_key key; 2562 int i; 2563 int ret; 2564 2565 ret = btrfs_read_buffer(eb, gen, level, NULL); 2566 if (ret) 2567 return ret; 2568 2569 level = btrfs_header_level(eb); 2570 2571 if (level != 0) 2572 return 0; 2573 2574 path = btrfs_alloc_path(); 2575 if (!path) 2576 return -ENOMEM; 2577 2578 nritems = btrfs_header_nritems(eb); 2579 for (i = 0; i < nritems; i++) { 2580 btrfs_item_key_to_cpu(eb, &key, i); 2581 2582 /* inode keys are done during the first stage */ 2583 if (key.type == BTRFS_INODE_ITEM_KEY && 2584 wc->stage == LOG_WALK_REPLAY_INODES) { 2585 struct btrfs_inode_item *inode_item; 2586 u32 mode; 2587 2588 inode_item = btrfs_item_ptr(eb, i, 2589 struct btrfs_inode_item); 2590 /* 2591 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2592 * and never got linked before the fsync, skip it, as 2593 * replaying it is pointless since it would be deleted 2594 * later. We skip logging tmpfiles, but it's always 2595 * possible we are replaying a log created with a kernel 2596 * that used to log tmpfiles. 2597 */ 2598 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2599 wc->ignore_cur_inode = true; 2600 continue; 2601 } else { 2602 wc->ignore_cur_inode = false; 2603 } 2604 ret = replay_xattr_deletes(wc->trans, root, log, 2605 path, key.objectid); 2606 if (ret) 2607 break; 2608 mode = btrfs_inode_mode(eb, inode_item); 2609 if (S_ISDIR(mode)) { 2610 ret = replay_dir_deletes(wc->trans, 2611 root, log, path, key.objectid, 0); 2612 if (ret) 2613 break; 2614 } 2615 ret = overwrite_item(wc->trans, root, path, 2616 eb, i, &key); 2617 if (ret) 2618 break; 2619 2620 /* 2621 * Before replaying extents, truncate the inode to its 2622 * size. We need to do it now and not after log replay 2623 * because before an fsync we can have prealloc extents 2624 * added beyond the inode's i_size. If we did it after, 2625 * through orphan cleanup for example, we would drop 2626 * those prealloc extents just after replaying them. 2627 */ 2628 if (S_ISREG(mode)) { 2629 struct btrfs_drop_extents_args drop_args = { 0 }; 2630 struct inode *inode; 2631 u64 from; 2632 2633 inode = read_one_inode(root, key.objectid); 2634 if (!inode) { 2635 ret = -EIO; 2636 break; 2637 } 2638 from = ALIGN(i_size_read(inode), 2639 root->fs_info->sectorsize); 2640 drop_args.start = from; 2641 drop_args.end = (u64)-1; 2642 drop_args.drop_cache = true; 2643 ret = btrfs_drop_extents(wc->trans, root, 2644 BTRFS_I(inode), 2645 &drop_args); 2646 if (!ret) { 2647 inode_sub_bytes(inode, 2648 drop_args.bytes_found); 2649 /* Update the inode's nbytes. 
*/ 2650 ret = btrfs_update_inode(wc->trans, 2651 root, BTRFS_I(inode)); 2652 } 2653 iput(inode); 2654 if (ret) 2655 break; 2656 } 2657 2658 ret = link_to_fixup_dir(wc->trans, root, 2659 path, key.objectid); 2660 if (ret) 2661 break; 2662 } 2663 2664 if (wc->ignore_cur_inode) 2665 continue; 2666 2667 if (key.type == BTRFS_DIR_INDEX_KEY && 2668 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2669 ret = replay_one_dir_item(wc->trans, root, path, 2670 eb, i, &key); 2671 if (ret) 2672 break; 2673 } 2674 2675 if (wc->stage < LOG_WALK_REPLAY_ALL) 2676 continue; 2677 2678 /* these keys are simply copied */ 2679 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2680 ret = overwrite_item(wc->trans, root, path, 2681 eb, i, &key); 2682 if (ret) 2683 break; 2684 } else if (key.type == BTRFS_INODE_REF_KEY || 2685 key.type == BTRFS_INODE_EXTREF_KEY) { 2686 ret = add_inode_ref(wc->trans, root, log, path, 2687 eb, i, &key); 2688 if (ret && ret != -ENOENT) 2689 break; 2690 ret = 0; 2691 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2692 ret = replay_one_extent(wc->trans, root, path, 2693 eb, i, &key); 2694 if (ret) 2695 break; 2696 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2697 ret = replay_one_dir_item(wc->trans, root, path, 2698 eb, i, &key); 2699 if (ret) 2700 break; 2701 } 2702 } 2703 btrfs_free_path(path); 2704 return ret; 2705 } 2706 2707 /* 2708 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2709 */ 2710 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2711 { 2712 struct btrfs_block_group *cache; 2713 2714 cache = btrfs_lookup_block_group(fs_info, start); 2715 if (!cache) { 2716 btrfs_err(fs_info, "unable to find block group for %llu", start); 2717 return; 2718 } 2719 2720 spin_lock(&cache->space_info->lock); 2721 spin_lock(&cache->lock); 2722 cache->reserved -= fs_info->nodesize; 2723 cache->space_info->bytes_reserved -= fs_info->nodesize; 2724 spin_unlock(&cache->lock); 2725 spin_unlock(&cache->space_info->lock); 2726 2727 btrfs_put_block_group(cache); 2728 } 2729 2730 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2731 struct btrfs_root *root, 2732 struct btrfs_path *path, int *level, 2733 struct walk_control *wc) 2734 { 2735 struct btrfs_fs_info *fs_info = root->fs_info; 2736 u64 bytenr; 2737 u64 ptr_gen; 2738 struct extent_buffer *next; 2739 struct extent_buffer *cur; 2740 u32 blocksize; 2741 int ret = 0; 2742 2743 while (*level > 0) { 2744 struct btrfs_key first_key; 2745 2746 cur = path->nodes[*level]; 2747 2748 WARN_ON(btrfs_header_level(cur) != *level); 2749 2750 if (path->slots[*level] >= 2751 btrfs_header_nritems(cur)) 2752 break; 2753 2754 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2755 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2756 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2757 blocksize = fs_info->nodesize; 2758 2759 next = btrfs_find_create_tree_block(fs_info, bytenr, 2760 btrfs_header_owner(cur), 2761 *level - 1); 2762 if (IS_ERR(next)) 2763 return PTR_ERR(next); 2764 2765 if (*level == 1) { 2766 ret = wc->process_func(root, next, wc, ptr_gen, 2767 *level - 1); 2768 if (ret) { 2769 free_extent_buffer(next); 2770 return ret; 2771 } 2772 2773 path->slots[*level]++; 2774 if (wc->free) { 2775 ret = btrfs_read_buffer(next, ptr_gen, 2776 *level - 1, &first_key); 2777 if (ret) { 2778 free_extent_buffer(next); 2779 return ret; 2780 } 2781 2782 if (trans) { 2783 btrfs_tree_lock(next); 2784 btrfs_clean_tree_block(next); 2785 btrfs_wait_tree_block_writeback(next); 2786 
btrfs_tree_unlock(next); 2787 ret = btrfs_pin_reserved_extent(trans, 2788 bytenr, blocksize); 2789 if (ret) { 2790 free_extent_buffer(next); 2791 return ret; 2792 } 2793 btrfs_redirty_list_add( 2794 trans->transaction, next); 2795 } else { 2796 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2797 clear_extent_buffer_dirty(next); 2798 unaccount_log_buffer(fs_info, bytenr); 2799 } 2800 } 2801 free_extent_buffer(next); 2802 continue; 2803 } 2804 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2805 if (ret) { 2806 free_extent_buffer(next); 2807 return ret; 2808 } 2809 2810 if (path->nodes[*level-1]) 2811 free_extent_buffer(path->nodes[*level-1]); 2812 path->nodes[*level-1] = next; 2813 *level = btrfs_header_level(next); 2814 path->slots[*level] = 0; 2815 cond_resched(); 2816 } 2817 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2818 2819 cond_resched(); 2820 return 0; 2821 } 2822 2823 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2824 struct btrfs_root *root, 2825 struct btrfs_path *path, int *level, 2826 struct walk_control *wc) 2827 { 2828 struct btrfs_fs_info *fs_info = root->fs_info; 2829 int i; 2830 int slot; 2831 int ret; 2832 2833 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2834 slot = path->slots[i]; 2835 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2836 path->slots[i]++; 2837 *level = i; 2838 WARN_ON(*level == 0); 2839 return 0; 2840 } else { 2841 ret = wc->process_func(root, path->nodes[*level], wc, 2842 btrfs_header_generation(path->nodes[*level]), 2843 *level); 2844 if (ret) 2845 return ret; 2846 2847 if (wc->free) { 2848 struct extent_buffer *next; 2849 2850 next = path->nodes[*level]; 2851 2852 if (trans) { 2853 btrfs_tree_lock(next); 2854 btrfs_clean_tree_block(next); 2855 btrfs_wait_tree_block_writeback(next); 2856 btrfs_tree_unlock(next); 2857 ret = btrfs_pin_reserved_extent(trans, 2858 path->nodes[*level]->start, 2859 path->nodes[*level]->len); 2860 if (ret) 2861 return ret; 2862 } else { 2863 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2864 clear_extent_buffer_dirty(next); 2865 2866 unaccount_log_buffer(fs_info, 2867 path->nodes[*level]->start); 2868 } 2869 } 2870 free_extent_buffer(path->nodes[*level]); 2871 path->nodes[*level] = NULL; 2872 *level = i + 1; 2873 } 2874 } 2875 return 1; 2876 } 2877 2878 /* 2879 * drop the reference count on the tree rooted at 'snap'. This traverses 2880 * the tree freeing any blocks that have a ref count of zero after being 2881 * decremented. 2882 */ 2883 static int walk_log_tree(struct btrfs_trans_handle *trans, 2884 struct btrfs_root *log, struct walk_control *wc) 2885 { 2886 struct btrfs_fs_info *fs_info = log->fs_info; 2887 int ret = 0; 2888 int wret; 2889 int level; 2890 struct btrfs_path *path; 2891 int orig_level; 2892 2893 path = btrfs_alloc_path(); 2894 if (!path) 2895 return -ENOMEM; 2896 2897 level = btrfs_header_level(log->node); 2898 orig_level = level; 2899 path->nodes[level] = log->node; 2900 atomic_inc(&log->node->refs); 2901 path->slots[level] = 0; 2902 2903 while (1) { 2904 wret = walk_down_log_tree(trans, log, path, &level, wc); 2905 if (wret > 0) 2906 break; 2907 if (wret < 0) { 2908 ret = wret; 2909 goto out; 2910 } 2911 2912 wret = walk_up_log_tree(trans, log, path, &level, wc); 2913 if (wret > 0) 2914 break; 2915 if (wret < 0) { 2916 ret = wret; 2917 goto out; 2918 } 2919 } 2920 2921 /* was the root node processed? 
if not, catch it here */ 2922 if (path->nodes[orig_level]) { 2923 ret = wc->process_func(log, path->nodes[orig_level], wc, 2924 btrfs_header_generation(path->nodes[orig_level]), 2925 orig_level); 2926 if (ret) 2927 goto out; 2928 if (wc->free) { 2929 struct extent_buffer *next; 2930 2931 next = path->nodes[orig_level]; 2932 2933 if (trans) { 2934 btrfs_tree_lock(next); 2935 btrfs_clean_tree_block(next); 2936 btrfs_wait_tree_block_writeback(next); 2937 btrfs_tree_unlock(next); 2938 ret = btrfs_pin_reserved_extent(trans, 2939 next->start, next->len); 2940 if (ret) 2941 goto out; 2942 } else { 2943 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2944 clear_extent_buffer_dirty(next); 2945 unaccount_log_buffer(fs_info, next->start); 2946 } 2947 } 2948 } 2949 2950 out: 2951 btrfs_free_path(path); 2952 return ret; 2953 } 2954 2955 /* 2956 * helper function to update the item for a given subvolume's log root 2957 * in the tree of log roots 2958 */ 2959 static int update_log_root(struct btrfs_trans_handle *trans, 2960 struct btrfs_root *log, 2961 struct btrfs_root_item *root_item) 2962 { 2963 struct btrfs_fs_info *fs_info = log->fs_info; 2964 int ret; 2965 2966 if (log->log_transid == 1) { 2967 /* insert root item on the first sync */ 2968 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2969 &log->root_key, root_item); 2970 } else { 2971 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2972 &log->root_key, root_item); 2973 } 2974 return ret; 2975 } 2976 2977 static void wait_log_commit(struct btrfs_root *root, int transid) 2978 { 2979 DEFINE_WAIT(wait); 2980 int index = transid % 2; 2981 2982 /* 2983 * we only allow two pending log transactions at a time, 2984 * so we know that if ours is more than 2 older than the 2985 * current transaction, we're done 2986 */ 2987 for (;;) { 2988 prepare_to_wait(&root->log_commit_wait[index], 2989 &wait, TASK_UNINTERRUPTIBLE); 2990 2991 if (!(root->log_transid_committed < transid && 2992 atomic_read(&root->log_commit[index]))) 2993 break; 2994 2995 mutex_unlock(&root->log_mutex); 2996 schedule(); 2997 mutex_lock(&root->log_mutex); 2998 } 2999 finish_wait(&root->log_commit_wait[index], &wait); 3000 } 3001 3002 static void wait_for_writer(struct btrfs_root *root) 3003 { 3004 DEFINE_WAIT(wait); 3005 3006 for (;;) { 3007 prepare_to_wait(&root->log_writer_wait, &wait, 3008 TASK_UNINTERRUPTIBLE); 3009 if (!atomic_read(&root->log_writers)) 3010 break; 3011 3012 mutex_unlock(&root->log_mutex); 3013 schedule(); 3014 mutex_lock(&root->log_mutex); 3015 } 3016 finish_wait(&root->log_writer_wait, &wait); 3017 } 3018 3019 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 3020 struct btrfs_log_ctx *ctx) 3021 { 3022 if (!ctx) 3023 return; 3024 3025 mutex_lock(&root->log_mutex); 3026 list_del_init(&ctx->list); 3027 mutex_unlock(&root->log_mutex); 3028 } 3029 3030 /* 3031 * Invoked in log mutex context, or the caller must ensure that no other 3032 * task can access the list. 3033 */ 3034 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 3035 int index, int error) 3036 { 3037 struct btrfs_log_ctx *ctx; 3038 struct btrfs_log_ctx *safe; 3039 3040 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 3041 list_del_init(&ctx->list); 3042 ctx->log_ret = error; 3043 } 3044 } 3045 3046 /* 3047 * btrfs_sync_log sends a given tree log down to the disk and 3048 * updates the super blocks to record it.
When this call is done, 3049 * you know that any inodes previously logged are safely on disk only 3050 * if it returns 0. 3051 * 3052 * Any other return value means you need to call btrfs_commit_transaction. 3053 * Some of the edge cases for fsyncing directories that have had unlinks 3054 * or renames done in the past mean that sometimes the only safe 3055 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3056 * that has happened. 3057 */ 3058 int btrfs_sync_log(struct btrfs_trans_handle *trans, 3059 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3060 { 3061 int index1; 3062 int index2; 3063 int mark; 3064 int ret; 3065 struct btrfs_fs_info *fs_info = root->fs_info; 3066 struct btrfs_root *log = root->log_root; 3067 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3068 struct btrfs_root_item new_root_item; 3069 int log_transid = 0; 3070 struct btrfs_log_ctx root_log_ctx; 3071 struct blk_plug plug; 3072 u64 log_root_start; 3073 u64 log_root_level; 3074 3075 mutex_lock(&root->log_mutex); 3076 log_transid = ctx->log_transid; 3077 if (root->log_transid_committed >= log_transid) { 3078 mutex_unlock(&root->log_mutex); 3079 return ctx->log_ret; 3080 } 3081 3082 index1 = log_transid % 2; 3083 if (atomic_read(&root->log_commit[index1])) { 3084 wait_log_commit(root, log_transid); 3085 mutex_unlock(&root->log_mutex); 3086 return ctx->log_ret; 3087 } 3088 ASSERT(log_transid == root->log_transid); 3089 atomic_set(&root->log_commit[index1], 1); 3090 3091 /* wait for previous tree log sync to complete */ 3092 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3093 wait_log_commit(root, log_transid - 1); 3094 3095 while (1) { 3096 int batch = atomic_read(&root->log_batch); 3097 /* when we're on an ssd, just kick the log commit out */ 3098 if (!btrfs_test_opt(fs_info, SSD) && 3099 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3100 mutex_unlock(&root->log_mutex); 3101 schedule_timeout_uninterruptible(1); 3102 mutex_lock(&root->log_mutex); 3103 } 3104 wait_for_writer(root); 3105 if (batch == atomic_read(&root->log_batch)) 3106 break; 3107 } 3108 3109 /* bail out if we need to do a full commit */ 3110 if (btrfs_need_log_full_commit(trans)) { 3111 ret = -EAGAIN; 3112 mutex_unlock(&root->log_mutex); 3113 goto out; 3114 } 3115 3116 if (log_transid % 2 == 0) 3117 mark = EXTENT_DIRTY; 3118 else 3119 mark = EXTENT_NEW; 3120 3121 /* we start IO on all the marked extents here, but we don't actually 3122 * wait for them until later. 3123 */ 3124 blk_start_plug(&plug); 3125 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3126 /* 3127 * -EAGAIN happens when someone, e.g., a concurrent transaction 3128 * commit, writes a dirty extent in this tree-log commit. This 3129 * concurrent write will create a hole while writing out the extents, 3130 * and we cannot proceed on a zoned filesystem, which requires 3131 * sequential writing. While we could bail out to a full commit 3132 * here, we can instead continue, hoping the concurrent writing fills 3133 * the hole. 3134 */ 3135 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 3136 ret = 0; 3137 if (ret) { 3138 blk_finish_plug(&plug); 3139 btrfs_abort_transaction(trans, ret); 3140 btrfs_set_log_full_commit(trans); 3141 mutex_unlock(&root->log_mutex); 3142 goto out; 3143 } 3144 3145 /* 3146 * We _must_ update under the root->log_mutex in order to make sure we 3147 * have a consistent view of the log root we are trying to commit at 3148 * this moment.
3149 * 3150 * We _must_ copy this into a local copy, because we are not holding the 3151 * log_root_tree->log_mutex yet. This is important because when we 3152 * commit the log_root_tree we must have a consistent view of the 3153 * log_root_tree when we update the super block to point at the 3154 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3155 * with the commit and possibly point at the new block which we may not 3156 * have written out. 3157 */ 3158 btrfs_set_root_node(&log->root_item, log->node); 3159 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3160 3161 root->log_transid++; 3162 log->log_transid = root->log_transid; 3163 root->log_start_pid = 0; 3164 /* 3165 * IO has been started, blocks of the log tree have WRITTEN flag set 3166 * in their headers. New modifications of the log will be written to 3167 * new positions, so it's safe to allow log writers to go in. 3168 */ 3169 mutex_unlock(&root->log_mutex); 3170 3171 if (btrfs_is_zoned(fs_info)) { 3172 mutex_lock(&fs_info->tree_root->log_mutex); 3173 if (!log_root_tree->node) { 3174 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 3175 if (ret) { 3176 mutex_unlock(&fs_info->tree_root->log_mutex); 3177 goto out; 3178 } 3179 } 3180 mutex_unlock(&fs_info->tree_root->log_mutex); 3181 } 3182 3183 btrfs_init_log_ctx(&root_log_ctx, NULL); 3184 3185 mutex_lock(&log_root_tree->log_mutex); 3186 3187 index2 = log_root_tree->log_transid % 2; 3188 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3189 root_log_ctx.log_transid = log_root_tree->log_transid; 3190 3191 /* 3192 * Now we are safe to update the log_root_tree because we're under the 3193 * log_mutex, and we're a current writer so we're holding the commit 3194 * open until we drop the log_mutex.
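 * Note that the root item written out below is the new_root_item snapshot taken above, so the log root tree never points at a log tree block that we have not fully written.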
3195 */ 3196 ret = update_log_root(trans, log, &new_root_item); 3197 if (ret) { 3198 if (!list_empty(&root_log_ctx.list)) 3199 list_del_init(&root_log_ctx.list); 3200 3201 blk_finish_plug(&plug); 3202 btrfs_set_log_full_commit(trans); 3203 3204 if (ret != -ENOSPC) { 3205 btrfs_abort_transaction(trans, ret); 3206 mutex_unlock(&log_root_tree->log_mutex); 3207 goto out; 3208 } 3209 btrfs_wait_tree_log_extents(log, mark); 3210 mutex_unlock(&log_root_tree->log_mutex); 3211 ret = -EAGAIN; 3212 goto out; 3213 } 3214 3215 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3216 blk_finish_plug(&plug); 3217 list_del_init(&root_log_ctx.list); 3218 mutex_unlock(&log_root_tree->log_mutex); 3219 ret = root_log_ctx.log_ret; 3220 goto out; 3221 } 3222 3223 index2 = root_log_ctx.log_transid % 2; 3224 if (atomic_read(&log_root_tree->log_commit[index2])) { 3225 blk_finish_plug(&plug); 3226 ret = btrfs_wait_tree_log_extents(log, mark); 3227 wait_log_commit(log_root_tree, 3228 root_log_ctx.log_transid); 3229 mutex_unlock(&log_root_tree->log_mutex); 3230 if (!ret) 3231 ret = root_log_ctx.log_ret; 3232 goto out; 3233 } 3234 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3235 atomic_set(&log_root_tree->log_commit[index2], 1); 3236 3237 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3238 wait_log_commit(log_root_tree, 3239 root_log_ctx.log_transid - 1); 3240 } 3241 3242 /* 3243 * now that we've moved on to the tree of log tree roots, 3244 * check the full commit flag again 3245 */ 3246 if (btrfs_need_log_full_commit(trans)) { 3247 blk_finish_plug(&plug); 3248 btrfs_wait_tree_log_extents(log, mark); 3249 mutex_unlock(&log_root_tree->log_mutex); 3250 ret = -EAGAIN; 3251 goto out_wake_log_root; 3252 } 3253 3254 ret = btrfs_write_marked_extents(fs_info, 3255 &log_root_tree->dirty_log_pages, 3256 EXTENT_DIRTY | EXTENT_NEW); 3257 blk_finish_plug(&plug); 3258 /* 3259 * As described above, -EAGAIN indicates a hole in the extents. We 3260 * cannot wait for these write outs since waiting would cause a 3261 * deadlock. Bail out to the full commit instead.
3262 */ 3263 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3264 btrfs_set_log_full_commit(trans); 3265 btrfs_wait_tree_log_extents(log, mark); 3266 mutex_unlock(&log_root_tree->log_mutex); 3267 goto out_wake_log_root; 3268 } else if (ret) { 3269 btrfs_set_log_full_commit(trans); 3270 btrfs_abort_transaction(trans, ret); 3271 mutex_unlock(&log_root_tree->log_mutex); 3272 goto out_wake_log_root; 3273 } 3274 ret = btrfs_wait_tree_log_extents(log, mark); 3275 if (!ret) 3276 ret = btrfs_wait_tree_log_extents(log_root_tree, 3277 EXTENT_NEW | EXTENT_DIRTY); 3278 if (ret) { 3279 btrfs_set_log_full_commit(trans); 3280 mutex_unlock(&log_root_tree->log_mutex); 3281 goto out_wake_log_root; 3282 } 3283 3284 log_root_start = log_root_tree->node->start; 3285 log_root_level = btrfs_header_level(log_root_tree->node); 3286 log_root_tree->log_transid++; 3287 mutex_unlock(&log_root_tree->log_mutex); 3288 3289 /* 3290 * Here we are guaranteed that nobody is going to write the superblock 3291 * for the current transaction before us, and that we do not write 3292 * our superblock before the previous transaction finishes its commit 3293 * and writes its superblock, because: 3294 * 3295 * 1) We are holding a handle on the current transaction, so nobody 3296 * can commit it until we release the handle; 3297 * 3298 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3299 * if the previous transaction is still committing, and hasn't yet 3300 * written its superblock, we wait for it to do so, because a 3301 * transaction commit acquires the tree_log_mutex when the commit 3302 * begins and releases it only after writing its superblock. 3303 */ 3304 mutex_lock(&fs_info->tree_log_mutex); 3305 3306 /* 3307 * The previous transaction writeout phase could have failed, and thus 3308 * marked the fs in an error state. We must not commit here, as we 3309 * could have updated our generation in the super_for_commit and 3310 * writing the super here would result in transid mismatches. If there 3311 * is an error here just bail. 3312 */ 3313 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { 3314 ret = -EIO; 3315 btrfs_set_log_full_commit(trans); 3316 btrfs_abort_transaction(trans, ret); 3317 mutex_unlock(&fs_info->tree_log_mutex); 3318 goto out_wake_log_root; 3319 } 3320 3321 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3322 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3323 ret = write_all_supers(fs_info, 1); 3324 mutex_unlock(&fs_info->tree_log_mutex); 3325 if (ret) { 3326 btrfs_set_log_full_commit(trans); 3327 btrfs_abort_transaction(trans, ret); 3328 goto out_wake_log_root; 3329 } 3330 3331 /* 3332 * We know there can only be one task here, since we have not yet set 3333 * root->log_commit[index1] to 0 and any task attempting to sync the 3334 * log must wait for the previous log transaction to commit if it's 3335 * still in progress or wait for the current log transaction commit if 3336 * someone else already started it. We use <= and not < because the 3337 * first log transaction has an ID of 0.
3338 */ 3339 ASSERT(root->last_log_commit <= log_transid); 3340 root->last_log_commit = log_transid; 3341 3342 out_wake_log_root: 3343 mutex_lock(&log_root_tree->log_mutex); 3344 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3345 3346 log_root_tree->log_transid_committed++; 3347 atomic_set(&log_root_tree->log_commit[index2], 0); 3348 mutex_unlock(&log_root_tree->log_mutex); 3349 3350 /* 3351 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3352 * all the updates above are seen by the woken threads. It might not be 3353 * necessary, but proving that seems to be hard. 3354 */ 3355 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3356 out: 3357 mutex_lock(&root->log_mutex); 3358 btrfs_remove_all_log_ctxs(root, index1, ret); 3359 root->log_transid_committed++; 3360 atomic_set(&root->log_commit[index1], 0); 3361 mutex_unlock(&root->log_mutex); 3362 3363 /* 3364 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3365 * all the updates above are seen by the woken threads. It might not be 3366 * necessary, but proving that seems to be hard. 3367 */ 3368 cond_wake_up(&root->log_commit_wait[index1]); 3369 return ret; 3370 } 3371 3372 static void free_log_tree(struct btrfs_trans_handle *trans, 3373 struct btrfs_root *log) 3374 { 3375 int ret; 3376 struct walk_control wc = { 3377 .free = 1, 3378 .process_func = process_one_buffer 3379 }; 3380 3381 if (log->node) { 3382 ret = walk_log_tree(trans, log, &wc); 3383 if (ret) { 3384 if (trans) 3385 btrfs_abort_transaction(trans, ret); 3386 else 3387 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3388 } 3389 } 3390 3391 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3392 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3393 extent_io_tree_release(&log->log_csum_range); 3394 3395 if (trans && log->node) 3396 btrfs_redirty_list_add(trans->transaction, log->node); 3397 btrfs_put_root(log); 3398 } 3399 3400 /* 3401 * free all the extents used by the tree log. This should be called 3402 * at commit time of the full transaction 3403 */ 3404 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3405 { 3406 if (root->log_root) { 3407 free_log_tree(trans, root->log_root); 3408 root->log_root = NULL; 3409 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3410 } 3411 return 0; 3412 } 3413 3414 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3415 struct btrfs_fs_info *fs_info) 3416 { 3417 if (fs_info->log_root_tree) { 3418 free_log_tree(trans, fs_info->log_root_tree); 3419 fs_info->log_root_tree = NULL; 3420 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3421 } 3422 return 0; 3423 } 3424 3425 /* 3426 * Check if an inode was logged in the current transaction. This may often 3427 * return some false positives, because logged_trans is an in memory only field, 3428 * not persisted anywhere. This is meant to be used in contexts where a false 3429 * positive has no functional consequences. 3430 */ 3431 static bool inode_logged(struct btrfs_trans_handle *trans, 3432 struct btrfs_inode *inode) 3433 { 3434 if (inode->logged_trans == trans->transid) 3435 return true; 3436 3437 /* 3438 * The inode's logged_trans is always 0 when we load it (because it is 3439 * not persisted in the inode item or elsewhere). 
So if it is 0 and the 3440 * inode was last modified in the current transaction, then the inode may 3441 * have been logged before in the current transaction, then evicted and 3442 * loaded again in the current transaction - or it may never have been logged 3443 * in the current transaction at all. Since we cannot be sure, we have to 3444 * assume it was, otherwise our callers can leave an inconsistent log. 3445 */ 3446 if (inode->logged_trans == 0 && 3447 inode->last_trans == trans->transid && 3448 !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) 3449 return true; 3450 3451 return false; 3452 } 3453 3454 /* 3455 * If both a file and directory are logged, and unlinks or renames are 3456 * mixed in, we have a few interesting corners: 3457 * 3458 * create file X in dir Y 3459 * link file X to X.link in dir Y 3460 * fsync file X 3461 * unlink file X but leave X.link 3462 * fsync dir Y 3463 * 3464 * After a crash we would expect only X.link to exist. But file X 3465 * didn't get fsync'd again so the log has back refs for X and X.link. 3466 * 3467 * We solve this by removing directory entries and inode backrefs from the 3468 * log when a file that was logged in the current transaction is 3469 * unlinked. Any later fsync will include the updated log entries, and 3470 * we'll be able to reconstruct the proper directory items from backrefs. 3471 * 3472 * This optimization allows us to avoid relogging the entire inode 3473 * or the entire directory. 3474 */ 3475 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3476 struct btrfs_root *root, 3477 const char *name, int name_len, 3478 struct btrfs_inode *dir, u64 index) 3479 { 3480 struct btrfs_root *log; 3481 struct btrfs_dir_item *di; 3482 struct btrfs_path *path; 3483 int ret; 3484 int err = 0; 3485 u64 dir_ino = btrfs_ino(dir); 3486 3487 if (!inode_logged(trans, dir)) 3488 return 0; 3489 3490 ret = join_running_log_trans(root); 3491 if (ret) 3492 return 0; 3493 3494 mutex_lock(&dir->log_mutex); 3495 3496 log = root->log_root; 3497 path = btrfs_alloc_path(); 3498 if (!path) { 3499 err = -ENOMEM; 3500 goto out_unlock; 3501 } 3502 3503 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3504 name, name_len, -1); 3505 if (IS_ERR(di)) { 3506 err = PTR_ERR(di); 3507 goto fail; 3508 } 3509 if (di) { 3510 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3511 if (ret) { 3512 err = ret; 3513 goto fail; 3514 } 3515 } 3516 btrfs_release_path(path); 3517 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3518 index, name, name_len, -1); 3519 if (IS_ERR(di)) { 3520 err = PTR_ERR(di); 3521 goto fail; 3522 } 3523 if (di) { 3524 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3525 if (ret) { 3526 err = ret; 3527 goto fail; 3528 } 3529 } 3530 3531 /* 3532 * We do not need to update the size field of the directory's inode item 3533 * because on log replay we update the field to reflect all existing 3534 * entries in the directory (see overwrite_item()).
3535 */ 3536 fail: 3537 btrfs_free_path(path); 3538 out_unlock: 3539 mutex_unlock(&dir->log_mutex); 3540 if (err == -ENOSPC) { 3541 btrfs_set_log_full_commit(trans); 3542 err = 0; 3543 } else if (err < 0 && err != -ENOENT) { 3544 /* ENOENT can be returned if the entry hasn't been fsynced yet */ 3545 btrfs_abort_transaction(trans, err); 3546 } 3547 3548 btrfs_end_log_trans(root); 3549 3550 return err; 3551 } 3552 3553 /* see comments for btrfs_del_dir_entries_in_log */ 3554 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3555 struct btrfs_root *root, 3556 const char *name, int name_len, 3557 struct btrfs_inode *inode, u64 dirid) 3558 { 3559 struct btrfs_root *log; 3560 u64 index; 3561 int ret; 3562 3563 if (!inode_logged(trans, inode)) 3564 return 0; 3565 3566 ret = join_running_log_trans(root); 3567 if (ret) 3568 return 0; 3569 log = root->log_root; 3570 mutex_lock(&inode->log_mutex); 3571 3572 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3573 dirid, &index); 3574 mutex_unlock(&inode->log_mutex); 3575 if (ret == -ENOSPC) { 3576 btrfs_set_log_full_commit(trans); 3577 ret = 0; 3578 } else if (ret < 0 && ret != -ENOENT) 3579 btrfs_abort_transaction(trans, ret); 3580 btrfs_end_log_trans(root); 3581 3582 return ret; 3583 } 3584 3585 /* 3586 * creates a range item in the log for 'dirid'. first_offset and 3587 * last_offset tell us which parts of the key space the log should 3588 * be considered authoritative for. 3589 */ 3590 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3591 struct btrfs_root *log, 3592 struct btrfs_path *path, 3593 int key_type, u64 dirid, 3594 u64 first_offset, u64 last_offset) 3595 { 3596 int ret; 3597 struct btrfs_key key; 3598 struct btrfs_dir_log_item *item; 3599 3600 key.objectid = dirid; 3601 key.offset = first_offset; 3602 if (key_type == BTRFS_DIR_ITEM_KEY) 3603 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3604 else 3605 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3606 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3607 if (ret) 3608 return ret; 3609 3610 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3611 struct btrfs_dir_log_item); 3612 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3613 btrfs_mark_buffer_dirty(path->nodes[0]); 3614 btrfs_release_path(path); 3615 return 0; 3616 } 3617 3618 /* 3619 * log all the items included in the current transaction for a given 3620 * directory. 
This also creates the range items in the log tree required 3621 * to replay anything deleted before the fsync 3622 */ 3623 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3624 struct btrfs_root *root, struct btrfs_inode *inode, 3625 struct btrfs_path *path, 3626 struct btrfs_path *dst_path, int key_type, 3627 struct btrfs_log_ctx *ctx, 3628 u64 min_offset, u64 *last_offset_ret) 3629 { 3630 struct btrfs_key min_key; 3631 struct btrfs_root *log = root->log_root; 3632 struct extent_buffer *src; 3633 int err = 0; 3634 int ret; 3635 int i; 3636 int nritems; 3637 u64 first_offset = min_offset; 3638 u64 last_offset = (u64)-1; 3639 u64 ino = btrfs_ino(inode); 3640 3641 log = root->log_root; 3642 3643 min_key.objectid = ino; 3644 min_key.type = key_type; 3645 min_key.offset = min_offset; 3646 3647 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3648 3649 /* 3650 * we didn't find anything from this transaction, see if there 3651 * is anything at all 3652 */ 3653 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3654 min_key.objectid = ino; 3655 min_key.type = key_type; 3656 min_key.offset = (u64)-1; 3657 btrfs_release_path(path); 3658 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3659 if (ret < 0) { 3660 btrfs_release_path(path); 3661 return ret; 3662 } 3663 ret = btrfs_previous_item(root, path, ino, key_type); 3664 3665 /* if ret == 0 there are items for this type, 3666 * create a range to tell us the last key of this type. 3667 * otherwise, there are no items in this directory after 3668 * *min_offset, and we create a range to indicate that. 3669 */ 3670 if (ret == 0) { 3671 struct btrfs_key tmp; 3672 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3673 path->slots[0]); 3674 if (key_type == tmp.type) 3675 first_offset = max(min_offset, tmp.offset) + 1; 3676 } 3677 goto done; 3678 } 3679 3680 /* go backward to find any previous key */ 3681 ret = btrfs_previous_item(root, path, ino, key_type); 3682 if (ret == 0) { 3683 struct btrfs_key tmp; 3684 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3685 if (key_type == tmp.type) { 3686 first_offset = tmp.offset; 3687 ret = overwrite_item(trans, log, dst_path, 3688 path->nodes[0], path->slots[0], 3689 &tmp); 3690 if (ret) { 3691 err = ret; 3692 goto done; 3693 } 3694 } 3695 } 3696 btrfs_release_path(path); 3697 3698 /* 3699 * Find the first key from this transaction again. See the note for 3700 * log_new_dir_dentries, if we're logging a directory recursively we 3701 * won't be holding its i_mutex, which means we can modify the directory 3702 * while we're logging it. If we remove an entry between our first 3703 * search and this search we'll not find the key again and can just 3704 * bail. 
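 * For example, a concurrent unlink may remove the entry at min_key between our first search and this one; btrfs_search_slot() then returns non-zero and we simply end this batch with what was already logged.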
3705 */ 3706 search: 3707 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3708 if (ret != 0) 3709 goto done; 3710 3711 /* 3712 * we have a block from this transaction, log every item in it 3713 * from our directory 3714 */ 3715 while (1) { 3716 struct btrfs_key tmp; 3717 src = path->nodes[0]; 3718 nritems = btrfs_header_nritems(src); 3719 for (i = path->slots[0]; i < nritems; i++) { 3720 struct btrfs_dir_item *di; 3721 3722 btrfs_item_key_to_cpu(src, &min_key, i); 3723 3724 if (min_key.objectid != ino || min_key.type != key_type) 3725 goto done; 3726 3727 if (need_resched()) { 3728 btrfs_release_path(path); 3729 cond_resched(); 3730 goto search; 3731 } 3732 3733 ret = overwrite_item(trans, log, dst_path, src, i, 3734 &min_key); 3735 if (ret) { 3736 err = ret; 3737 goto done; 3738 } 3739 3740 /* 3741 * We must make sure that when we log a directory entry, 3742 * the corresponding inode, after log replay, has a 3743 * matching link count. For example: 3744 * 3745 * touch foo 3746 * mkdir mydir 3747 * sync 3748 * ln foo mydir/bar 3749 * xfs_io -c "fsync" mydir 3750 * <crash> 3751 * <mount fs and log replay> 3752 * 3753 * This would result in an fsync log that, when replayed, leaves 3754 * our file inode with a link count of 1, but with 3755 * two directory entries pointing to the same inode. 3756 * After removing one of the names, it would not be 3757 * possible to remove the other name, which always resulted 3758 * in stale file handle errors, and it would not 3759 * be possible to rmdir the parent directory, since 3760 * its i_size could never decrement to the value 3761 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 3762 */ 3763 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3764 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3765 if (ctx && 3766 (btrfs_dir_transid(src, di) == trans->transid || 3767 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3768 tmp.type != BTRFS_ROOT_ITEM_KEY) 3769 ctx->log_new_dentries = true; 3770 } 3771 path->slots[0] = nritems; 3772 3773 /* 3774 * look ahead to the next item and see if it is also 3775 * from this directory and from this transaction 3776 */ 3777 ret = btrfs_next_leaf(root, path); 3778 if (ret) { 3779 if (ret == 1) 3780 last_offset = (u64)-1; 3781 else 3782 err = ret; 3783 goto done; 3784 } 3785 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3786 if (tmp.objectid != ino || tmp.type != key_type) { 3787 last_offset = (u64)-1; 3788 goto done; 3789 } 3790 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3791 ret = overwrite_item(trans, log, dst_path, 3792 path->nodes[0], path->slots[0], 3793 &tmp); 3794 if (ret) 3795 err = ret; 3796 else 3797 last_offset = tmp.offset; 3798 goto done; 3799 } 3800 } 3801 done: 3802 btrfs_release_path(path); 3803 btrfs_release_path(dst_path); 3804 3805 if (err == 0) { 3806 *last_offset_ret = last_offset; 3807 /* 3808 * insert the log range keys to indicate where the log 3809 * is valid 3810 */ 3811 ret = insert_dir_log_key(trans, log, path, key_type, 3812 ino, first_offset, last_offset); 3813 if (ret) 3814 err = ret; 3815 } 3816 return err; 3817 } 3818 3819 /* 3820 * logging directories is very similar to logging inodes. We find all the items 3821 * from the current transaction and write them to the log. 3822 * 3823 * The recovery code scans the directory in the subvolume, and if it finds a 3824 * key in the range logged that is not present in the log tree, then it means 3825 * that dir entry was unlinked during the transaction.
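 * For example (illustrative offsets): if this transaction logged dir index keys 10 through 20 and offset 15 is absent from the log tree, the recovery code unlinks the entry at offset 15 from the subvolume directory.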
3826 * 3827 * In order for that scan to work, we must include one key smaller than 3828 * the smallest logged by this transaction and one key larger than the largest 3829 * key logged by this transaction. 3830 */ 3831 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3832 struct btrfs_root *root, struct btrfs_inode *inode, 3833 struct btrfs_path *path, 3834 struct btrfs_path *dst_path, 3835 struct btrfs_log_ctx *ctx) 3836 { 3837 u64 min_key; 3838 u64 max_key; 3839 int ret; 3840 int key_type = BTRFS_DIR_ITEM_KEY; 3841 3842 again: 3843 min_key = 0; 3844 max_key = 0; 3845 while (1) { 3846 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3847 ctx, min_key, &max_key); 3848 if (ret) 3849 return ret; 3850 if (max_key == (u64)-1) 3851 break; 3852 min_key = max_key + 1; 3853 } 3854 3855 if (key_type == BTRFS_DIR_ITEM_KEY) { 3856 key_type = BTRFS_DIR_INDEX_KEY; 3857 goto again; 3858 } 3859 return 0; 3860 } 3861 3862 /* 3863 * a helper function to drop items from the log before we relog an 3864 * inode. max_key_type indicates the highest item type to remove. 3865 * This cannot be run for file data extents because it does not 3866 * free the extents they point to. 3867 */ 3868 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3869 struct btrfs_root *log, 3870 struct btrfs_path *path, 3871 u64 objectid, int max_key_type) 3872 { 3873 int ret; 3874 struct btrfs_key key; 3875 struct btrfs_key found_key; 3876 int start_slot; 3877 3878 key.objectid = objectid; 3879 key.type = max_key_type; 3880 key.offset = (u64)-1; 3881 3882 while (1) { 3883 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3884 BUG_ON(ret == 0); /* Logic error */ 3885 if (ret < 0) 3886 break; 3887 3888 if (path->slots[0] == 0) 3889 break; 3890 3891 path->slots[0]--; 3892 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3893 path->slots[0]); 3894 3895 if (found_key.objectid != objectid) 3896 break; 3897 3898 found_key.offset = 0; 3899 found_key.type = 0; 3900 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); 3901 if (ret < 0) 3902 break; 3903 3904 ret = btrfs_del_items(trans, log, path, start_slot, 3905 path->slots[0] - start_slot + 1); 3906 /* 3907 * If start slot isn't 0 then we don't need to re-search, we've 3908 * found the last guy with the objectid in this tree. 
3909 */ 3910 if (ret || start_slot != 0) 3911 break; 3912 btrfs_release_path(path); 3913 } 3914 btrfs_release_path(path); 3915 if (ret > 0) 3916 ret = 0; 3917 return ret; 3918 } 3919 3920 static void fill_inode_item(struct btrfs_trans_handle *trans, 3921 struct extent_buffer *leaf, 3922 struct btrfs_inode_item *item, 3923 struct inode *inode, int log_inode_only, 3924 u64 logged_isize) 3925 { 3926 struct btrfs_map_token token; 3927 u64 flags; 3928 3929 btrfs_init_map_token(&token, leaf); 3930 3931 if (log_inode_only) { 3932 /* set the generation to zero so the recovery code 3933 * can tell the difference between logging 3934 * just to say 'this inode exists' and logging 3935 * to say 'update this inode with these values' 3936 */ 3937 btrfs_set_token_inode_generation(&token, item, 0); 3938 btrfs_set_token_inode_size(&token, item, logged_isize); 3939 } else { 3940 btrfs_set_token_inode_generation(&token, item, 3941 BTRFS_I(inode)->generation); 3942 btrfs_set_token_inode_size(&token, item, inode->i_size); 3943 } 3944 3945 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 3946 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 3947 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 3948 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 3949 3950 btrfs_set_token_timespec_sec(&token, &item->atime, 3951 inode->i_atime.tv_sec); 3952 btrfs_set_token_timespec_nsec(&token, &item->atime, 3953 inode->i_atime.tv_nsec); 3954 3955 btrfs_set_token_timespec_sec(&token, &item->mtime, 3956 inode->i_mtime.tv_sec); 3957 btrfs_set_token_timespec_nsec(&token, &item->mtime, 3958 inode->i_mtime.tv_nsec); 3959 3960 btrfs_set_token_timespec_sec(&token, &item->ctime, 3961 inode->i_ctime.tv_sec); 3962 btrfs_set_token_timespec_nsec(&token, &item->ctime, 3963 inode->i_ctime.tv_nsec); 3964 3965 /* 3966 * We do not need to set the nbytes field, in fact during a fast fsync 3967 * its value may not even be correct, since a fast fsync does not wait 3968 * for ordered extent completion, which is where we update nbytes; it 3969 * only waits for writeback to complete. During log replay as we find 3970 * file extent items and replay them, we adjust the nbytes field of the 3971 * inode item in subvolume tree as needed (see overwrite_item()). 3972 */ 3973 3974 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 3975 btrfs_set_token_inode_transid(&token, item, trans->transid); 3976 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 3977 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 3978 BTRFS_I(inode)->ro_flags); 3979 btrfs_set_token_inode_flags(&token, item, flags); 3980 btrfs_set_token_inode_block_group(&token, item, 0); 3981 } 3982 3983 static int log_inode_item(struct btrfs_trans_handle *trans, 3984 struct btrfs_root *log, struct btrfs_path *path, 3985 struct btrfs_inode *inode, bool inode_item_dropped) 3986 { 3987 struct btrfs_inode_item *inode_item; 3988 int ret; 3989 3990 /* 3991 * If we are doing a fast fsync and the inode was logged before in the 3992 * current transaction, then we know the inode was previously logged and 3993 * it exists in the log tree. For performance reasons, in this case use 3994 * btrfs_search_slot() directly with ins_len set to 0 so that we never 3995 * attempt a write lock on the leaf's parent, which adds unnecessary lock 3996 * contention in case there are concurrent fsyncs for other inodes of the 3997 * same subvolume.
Using btrfs_insert_empty_item() when the inode item 3998 * already exists can also result in unnecessarily splitting a leaf. 3999 */ 4000 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4001 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4002 ASSERT(ret <= 0); 4003 if (ret > 0) 4004 ret = -ENOENT; 4005 } else { 4006 /* 4007 * This means it is the first fsync in the current transaction, 4008 * so the inode item is not in the log and we need to insert it. 4009 * We can never get -EEXIST because we are only called for a fast 4010 * fsync and in case an inode eviction happens after the inode was 4011 * logged before in the current transaction, when we load again 4012 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4013 * flags and set ->logged_trans to 0. 4014 */ 4015 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4016 sizeof(*inode_item)); 4017 ASSERT(ret != -EEXIST); 4018 } 4019 if (ret) 4020 return ret; 4021 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4022 struct btrfs_inode_item); 4023 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4024 0, 0); 4025 btrfs_release_path(path); 4026 return 0; 4027 } 4028 4029 static int log_csums(struct btrfs_trans_handle *trans, 4030 struct btrfs_inode *inode, 4031 struct btrfs_root *log_root, 4032 struct btrfs_ordered_sum *sums) 4033 { 4034 const u64 lock_end = sums->bytenr + sums->len - 1; 4035 struct extent_state *cached_state = NULL; 4036 int ret; 4037 4038 /* 4039 * If this inode was not used for reflink operations in the current 4040 * transaction with new extents, then do the fast path, no need to 4041 * worry about logging checksum items with overlapping ranges. 4042 */ 4043 if (inode->last_reflink_trans < trans->transid) 4044 return btrfs_csum_file_blocks(trans, log_root, sums); 4045 4046 /* 4047 * Serialize logging for checksums. This is to avoid racing with the 4048 * same checksum being logged by another task that is logging another 4049 * file which happens to refer to the same extent as well. Such races 4050 * can leave checksum items in the log with overlapping ranges. 4051 */ 4052 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 4053 lock_end, &cached_state); 4054 if (ret) 4055 return ret; 4056 /* 4057 * Due to extent cloning, we might have logged a csum item that covers a 4058 * subrange of a cloned extent, and later we can end up logging a csum 4059 * item for a larger subrange of the same extent or the entire range. 4060 * This would leave csum items in the log tree that cover the same range 4061 * and break the searches for checksums in the log tree, resulting in 4062 * some checksums missing in the fs/subvolume tree. So just delete (or 4063 * trim and adjust) any existing csum items in the log for this range. 
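 * For example (an illustrative scenario, not taken from an actual trace): an fsync of file A logs a csum item covering the first 64K of a shared extent, and later in the same transaction an fsync of file B, which reflinks the whole extent, logs csums for the full 128K; without the deletion below, two overlapping csum items would end up in the log.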
4064 */ 4065 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 4066 if (!ret) 4067 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4068 4069 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 4070 &cached_state); 4071 4072 return ret; 4073 } 4074 4075 static noinline int copy_items(struct btrfs_trans_handle *trans, 4076 struct btrfs_inode *inode, 4077 struct btrfs_path *dst_path, 4078 struct btrfs_path *src_path, 4079 int start_slot, int nr, int inode_only, 4080 u64 logged_isize) 4081 { 4082 struct btrfs_fs_info *fs_info = trans->fs_info; 4083 unsigned long src_offset; 4084 unsigned long dst_offset; 4085 struct btrfs_root *log = inode->root->log_root; 4086 struct btrfs_file_extent_item *extent; 4087 struct btrfs_inode_item *inode_item; 4088 struct extent_buffer *src = src_path->nodes[0]; 4089 int ret; 4090 struct btrfs_key *ins_keys; 4091 u32 *ins_sizes; 4092 char *ins_data; 4093 int i; 4094 struct list_head ordered_sums; 4095 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 4096 4097 INIT_LIST_HEAD(&ordered_sums); 4098 4099 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4100 nr * sizeof(u32), GFP_NOFS); 4101 if (!ins_data) 4102 return -ENOMEM; 4103 4104 ins_sizes = (u32 *)ins_data; 4105 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4106 4107 for (i = 0; i < nr; i++) { 4108 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 4109 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 4110 } 4111 ret = btrfs_insert_empty_items(trans, log, dst_path, 4112 ins_keys, ins_sizes, nr); 4113 if (ret) { 4114 kfree(ins_data); 4115 return ret; 4116 } 4117 4118 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 4119 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 4120 dst_path->slots[0]); 4121 4122 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 4123 4124 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 4125 inode_item = btrfs_item_ptr(dst_path->nodes[0], 4126 dst_path->slots[0], 4127 struct btrfs_inode_item); 4128 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4129 &inode->vfs_inode, 4130 inode_only == LOG_INODE_EXISTS, 4131 logged_isize); 4132 } else { 4133 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4134 src_offset, ins_sizes[i]); 4135 } 4136 4137 /* take a reference on file data extents so that truncates 4138 * or deletes of this inode don't have to relog the inode 4139 * again 4140 */ 4141 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 4142 !skip_csum) { 4143 int found_type; 4144 extent = btrfs_item_ptr(src, start_slot + i, 4145 struct btrfs_file_extent_item); 4146 4147 if (btrfs_file_extent_generation(src, extent) < trans->transid) 4148 continue; 4149 4150 found_type = btrfs_file_extent_type(src, extent); 4151 if (found_type == BTRFS_FILE_EXTENT_REG) { 4152 u64 ds, dl, cs, cl; 4153 ds = btrfs_file_extent_disk_bytenr(src, 4154 extent); 4155 /* ds == 0 is a hole */ 4156 if (ds == 0) 4157 continue; 4158 4159 dl = btrfs_file_extent_disk_num_bytes(src, 4160 extent); 4161 cs = btrfs_file_extent_offset(src, extent); 4162 cl = btrfs_file_extent_num_bytes(src, 4163 extent); 4164 if (btrfs_file_extent_compression(src, 4165 extent)) { 4166 cs = 0; 4167 cl = dl; 4168 } 4169 4170 ret = btrfs_lookup_csums_range( 4171 fs_info->csum_root, 4172 ds + cs, ds + cs + cl - 1, 4173 &ordered_sums, 0); 4174 if (ret) 4175 break; 4176 } 4177 } 4178 } 4179 4180 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4181 btrfs_release_path(dst_path); 4182 kfree(ins_data); 4183 4184 /* 4185 * we have to do this after the loop 
above to avoid changing the 4186 * log tree while trying to change the log tree. 4187 */ 4188 while (!list_empty(&ordered_sums)) { 4189 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4190 struct btrfs_ordered_sum, 4191 list); 4192 if (!ret) 4193 ret = log_csums(trans, inode, log, sums); 4194 list_del(&sums->list); 4195 kfree(sums); 4196 } 4197 4198 return ret; 4199 } 4200 4201 static int extent_cmp(void *priv, const struct list_head *a, 4202 const struct list_head *b) 4203 { 4204 const struct extent_map *em1, *em2; 4205 4206 em1 = list_entry(a, struct extent_map, list); 4207 em2 = list_entry(b, struct extent_map, list); 4208 4209 if (em1->start < em2->start) 4210 return -1; 4211 else if (em1->start > em2->start) 4212 return 1; 4213 return 0; 4214 } 4215 4216 static int log_extent_csums(struct btrfs_trans_handle *trans, 4217 struct btrfs_inode *inode, 4218 struct btrfs_root *log_root, 4219 const struct extent_map *em, 4220 struct btrfs_log_ctx *ctx) 4221 { 4222 struct btrfs_ordered_extent *ordered; 4223 u64 csum_offset; 4224 u64 csum_len; 4225 u64 mod_start = em->mod_start; 4226 u64 mod_len = em->mod_len; 4227 LIST_HEAD(ordered_sums); 4228 int ret = 0; 4229 4230 if (inode->flags & BTRFS_INODE_NODATASUM || 4231 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4232 em->block_start == EXTENT_MAP_HOLE) 4233 return 0; 4234 4235 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4236 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4237 const u64 mod_end = mod_start + mod_len; 4238 struct btrfs_ordered_sum *sums; 4239 4240 if (mod_len == 0) 4241 break; 4242 4243 if (ordered_end <= mod_start) 4244 continue; 4245 if (mod_end <= ordered->file_offset) 4246 break; 4247 4248 /* 4249 * We are going to copy all the csums on this ordered extent, so 4250 * go ahead and adjust mod_start and mod_len in case this ordered 4251 * extent has already been logged. 4252 */ 4253 if (ordered->file_offset > mod_start) { 4254 if (ordered_end >= mod_end) 4255 mod_len = ordered->file_offset - mod_start; 4256 /* 4257 * If we have this case 4258 * 4259 * |--------- logged extent ---------| 4260 * |----- ordered extent ----| 4261 * 4262 * Just don't mess with mod_start and mod_len, we'll 4263 * just end up logging more csums than we need and it 4264 * will be ok. 4265 */ 4266 } else { 4267 if (ordered_end < mod_end) { 4268 mod_len = mod_end - ordered_end; 4269 mod_start = ordered_end; 4270 } else { 4271 mod_len = 0; 4272 } 4273 } 4274 4275 /* 4276 * To keep us from looping for the above case of an ordered 4277 * extent that falls inside of the logged extent. 4278 */ 4279 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4280 continue; 4281 4282 list_for_each_entry(sums, &ordered->list, list) { 4283 ret = log_csums(trans, inode, log_root, sums); 4284 if (ret) 4285 return ret; 4286 } 4287 } 4288 4289 /* We're done, found all csums in the ordered extents. */ 4290 if (mod_len == 0) 4291 return 0; 4292 4293 /* If we're compressed we have to save the entire range of csums. */ 4294 if (em->compress_type) { 4295 csum_offset = 0; 4296 csum_len = max(em->block_len, em->orig_block_len); 4297 } else { 4298 csum_offset = mod_start - em->start; 4299 csum_len = mod_len; 4300 } 4301 4302 /* block start is already adjusted for the file extent offset. 
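 * For example (made-up values): for an uncompressed extent with em->start == 1M, em->block_start == 10M, mod_start == 1M + 4K and mod_len == 8K, we get csum_offset == 4K and csum_len == 8K, so the lookup below covers the disk range [10M + 4K, 10M + 12K).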
*/ 4303 ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, 4304 em->block_start + csum_offset, 4305 em->block_start + csum_offset + 4306 csum_len - 1, &ordered_sums, 0); 4307 if (ret) 4308 return ret; 4309 4310 while (!list_empty(&ordered_sums)) { 4311 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4312 struct btrfs_ordered_sum, 4313 list); 4314 if (!ret) 4315 ret = log_csums(trans, inode, log_root, sums); 4316 list_del(&sums->list); 4317 kfree(sums); 4318 } 4319 4320 return ret; 4321 } 4322 4323 static int log_one_extent(struct btrfs_trans_handle *trans, 4324 struct btrfs_inode *inode, struct btrfs_root *root, 4325 const struct extent_map *em, 4326 struct btrfs_path *path, 4327 struct btrfs_log_ctx *ctx) 4328 { 4329 struct btrfs_drop_extents_args drop_args = { 0 }; 4330 struct btrfs_root *log = root->log_root; 4331 struct btrfs_file_extent_item *fi; 4332 struct extent_buffer *leaf; 4333 struct btrfs_map_token token; 4334 struct btrfs_key key; 4335 u64 extent_offset = em->start - em->orig_start; 4336 u64 block_len; 4337 int ret; 4338 4339 ret = log_extent_csums(trans, inode, log, em, ctx); 4340 if (ret) 4341 return ret; 4342 4343 drop_args.path = path; 4344 drop_args.start = em->start; 4345 drop_args.end = em->start + em->len; 4346 drop_args.replace_extent = true; 4347 drop_args.extent_item_size = sizeof(*fi); 4348 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4349 if (ret) 4350 return ret; 4351 4352 if (!drop_args.extent_inserted) { 4353 key.objectid = btrfs_ino(inode); 4354 key.type = BTRFS_EXTENT_DATA_KEY; 4355 key.offset = em->start; 4356 4357 ret = btrfs_insert_empty_item(trans, log, path, &key, 4358 sizeof(*fi)); 4359 if (ret) 4360 return ret; 4361 } 4362 leaf = path->nodes[0]; 4363 btrfs_init_map_token(&token, leaf); 4364 fi = btrfs_item_ptr(leaf, path->slots[0], 4365 struct btrfs_file_extent_item); 4366 4367 btrfs_set_token_file_extent_generation(&token, fi, trans->transid); 4368 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4369 btrfs_set_token_file_extent_type(&token, fi, 4370 BTRFS_FILE_EXTENT_PREALLOC); 4371 else 4372 btrfs_set_token_file_extent_type(&token, fi, 4373 BTRFS_FILE_EXTENT_REG); 4374 4375 block_len = max(em->block_len, em->orig_block_len); 4376 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4377 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4378 em->block_start); 4379 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4380 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4381 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4382 em->block_start - 4383 extent_offset); 4384 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4385 } else { 4386 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); 4387 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); 4388 } 4389 4390 btrfs_set_token_file_extent_offset(&token, fi, extent_offset); 4391 btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); 4392 btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); 4393 btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); 4394 btrfs_set_token_file_extent_encryption(&token, fi, 0); 4395 btrfs_set_token_file_extent_other_encoding(&token, fi, 0); 4396 btrfs_mark_buffer_dirty(leaf); 4397 4398 btrfs_release_path(path); 4399 4400 return ret; 4401 } 4402 4403 /* 4404 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4405 * lose them after doing a fast fsync and replaying the log. 
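 * For example (illustrative): an fallocate with FALLOC_FL_KEEP_SIZE can leave a prealloc extent at [4M, 8M) while i_size is 1M; if a fast fsync did not log that extent, it would be lost after a log replay.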
We scan the 4406 * subvolume's root instead of iterating the inode's extent map tree because 4407 * otherwise we can log incorrect extent items based on extent map conversion. 4408 * That can happen due to the fact that extent maps are merged when they 4409 * are not in the extent map tree's list of modified extents. 4410 */ 4411 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4412 struct btrfs_inode *inode, 4413 struct btrfs_path *path) 4414 { 4415 struct btrfs_root *root = inode->root; 4416 struct btrfs_key key; 4417 const u64 i_size = i_size_read(&inode->vfs_inode); 4418 const u64 ino = btrfs_ino(inode); 4419 struct btrfs_path *dst_path = NULL; 4420 bool dropped_extents = false; 4421 u64 truncate_offset = i_size; 4422 struct extent_buffer *leaf; 4423 int slot; 4424 int ins_nr = 0; 4425 int start_slot; 4426 int ret; 4427 4428 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4429 return 0; 4430 4431 key.objectid = ino; 4432 key.type = BTRFS_EXTENT_DATA_KEY; 4433 key.offset = i_size; 4434 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4435 if (ret < 0) 4436 goto out; 4437 4438 /* 4439 * We must check if there is a prealloc extent that starts before the 4440 * i_size and crosses the i_size boundary. This is to ensure we later 4441 * truncate down to the end of that extent and not to the i_size, as 4442 * otherwise we would end up losing part of the prealloc extent after a 4443 * log replay, and with an implicit hole if there is another prealloc 4444 * extent that starts at an offset beyond i_size. 4445 */ 4446 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4447 if (ret < 0) 4448 goto out; 4449 4450 if (ret == 0) { 4451 struct btrfs_file_extent_item *ei; 4452 4453 leaf = path->nodes[0]; 4454 slot = path->slots[0]; 4455 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4456 4457 if (btrfs_file_extent_type(leaf, ei) == 4458 BTRFS_FILE_EXTENT_PREALLOC) { 4459 u64 extent_end; 4460 4461 btrfs_item_key_to_cpu(leaf, &key, slot); 4462 extent_end = key.offset + 4463 btrfs_file_extent_num_bytes(leaf, ei); 4464 4465 if (extent_end > i_size) 4466 truncate_offset = extent_end; 4467 } 4468 } else { 4469 ret = 0; 4470 } 4471 4472 while (true) { 4473 leaf = path->nodes[0]; 4474 slot = path->slots[0]; 4475 4476 if (slot >= btrfs_header_nritems(leaf)) { 4477 if (ins_nr > 0) { 4478 ret = copy_items(trans, inode, dst_path, path, 4479 start_slot, ins_nr, 1, 0); 4480 if (ret < 0) 4481 goto out; 4482 ins_nr = 0; 4483 } 4484 ret = btrfs_next_leaf(root, path); 4485 if (ret < 0) 4486 goto out; 4487 if (ret > 0) { 4488 ret = 0; 4489 break; 4490 } 4491 continue; 4492 } 4493 4494 btrfs_item_key_to_cpu(leaf, &key, slot); 4495 if (key.objectid > ino) 4496 break; 4497 if (WARN_ON_ONCE(key.objectid < ino) || 4498 key.type < BTRFS_EXTENT_DATA_KEY || 4499 key.offset < i_size) { 4500 path->slots[0]++; 4501 continue; 4502 } 4503 if (!dropped_extents) { 4504 /* 4505 * Avoid logging extent items that were logged in past fsync 4506 * calls, as that would lead to duplicate keys in the log tree.
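 * (Illustrative: with an i_size of 1M and a prealloc extent at [512K, 2M) crossing it, truncate_offset was raised to 2M earlier, so the truncation below does not drop the tail of that extent from the log.)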
4507 */ 4508 do { 4509 ret = btrfs_truncate_inode_items(trans, 4510 root->log_root, 4511 inode, truncate_offset, 4512 BTRFS_EXTENT_DATA_KEY, 4513 NULL); 4514 } while (ret == -EAGAIN); 4515 if (ret) 4516 goto out; 4517 dropped_extents = true; 4518 } 4519 if (ins_nr == 0) 4520 start_slot = slot; 4521 ins_nr++; 4522 path->slots[0]++; 4523 if (!dst_path) { 4524 dst_path = btrfs_alloc_path(); 4525 if (!dst_path) { 4526 ret = -ENOMEM; 4527 goto out; 4528 } 4529 } 4530 } 4531 if (ins_nr > 0) 4532 ret = copy_items(trans, inode, dst_path, path, 4533 start_slot, ins_nr, 1, 0); 4534 out: 4535 btrfs_release_path(path); 4536 btrfs_free_path(dst_path); 4537 return ret; 4538 } 4539 4540 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4541 struct btrfs_root *root, 4542 struct btrfs_inode *inode, 4543 struct btrfs_path *path, 4544 struct btrfs_log_ctx *ctx) 4545 { 4546 struct btrfs_ordered_extent *ordered; 4547 struct btrfs_ordered_extent *tmp; 4548 struct extent_map *em, *n; 4549 struct list_head extents; 4550 struct extent_map_tree *tree = &inode->extent_tree; 4551 int ret = 0; 4552 int num = 0; 4553 4554 INIT_LIST_HEAD(&extents); 4555 4556 write_lock(&tree->lock); 4557 4558 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4559 list_del_init(&em->list); 4560 /* 4561 * Just an arbitrary number; this can be really CPU intensive 4562 * once we start getting a lot of extents, and once we 4563 * have a bunch of extents we just want to commit since it will 4564 * be faster. 4565 */ 4566 if (++num > 32768) { 4567 list_del_init(&tree->modified_extents); 4568 ret = -EFBIG; 4569 goto process; 4570 } 4571 4572 if (em->generation < trans->transid) 4573 continue; 4574 4575 /* We log prealloc extents beyond eof later. */ 4576 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4577 em->start >= i_size_read(&inode->vfs_inode)) 4578 continue; 4579 4580 /* Need a ref to keep it from getting evicted from cache */ 4581 refcount_inc(&em->refs); 4582 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4583 list_add_tail(&em->list, &extents); 4584 num++; 4585 } 4586 4587 list_sort(NULL, &extents, extent_cmp); 4588 process: 4589 while (!list_empty(&extents)) { 4590 em = list_entry(extents.next, struct extent_map, list); 4591 4592 list_del_init(&em->list); 4593 4594 /* 4595 * If we had an error we just need to delete everybody from our 4596 * private list. 4597 */ 4598 if (ret) { 4599 clear_em_logging(tree, em); 4600 free_extent_map(em); 4601 continue; 4602 } 4603 4604 write_unlock(&tree->lock); 4605 4606 ret = log_one_extent(trans, inode, root, em, path, ctx); 4607 write_lock(&tree->lock); 4608 clear_em_logging(tree, em); 4609 free_extent_map(em); 4610 } 4611 WARN_ON(!list_empty(&extents)); 4612 write_unlock(&tree->lock); 4613 4614 btrfs_release_path(path); 4615 if (!ret) 4616 ret = btrfs_log_prealloc_extents(trans, inode, path); 4617 if (ret) 4618 return ret; 4619 4620 /* 4621 * We have logged all extents successfully; now make sure the commit of 4622 * the current transaction waits for the ordered extents to complete 4623 * before it commits and wipes out the log trees, otherwise we would 4624 * lose data if an ordered extent completes after the transaction 4625 * commits and a power failure happens after the transaction commit.
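 * The BTRFS_ORDERED_PENDING flag and the transaction's pending_ordered counter, set below for ordered extents that have not completed yet, implement that wait; BTRFS_ORDERED_COMPLETE is tested again under the ordered_tree lock so we do not flag an extent that completed in the meanwhile.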
4626 */ 4627 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 4628 list_del_init(&ordered->log_list); 4629 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); 4630 4631 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4632 spin_lock_irq(&inode->ordered_tree.lock); 4633 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4634 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); 4635 atomic_inc(&trans->transaction->pending_ordered); 4636 } 4637 spin_unlock_irq(&inode->ordered_tree.lock); 4638 } 4639 btrfs_put_ordered_extent(ordered); 4640 } 4641 4642 return 0; 4643 } 4644 4645 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4646 struct btrfs_path *path, u64 *size_ret) 4647 { 4648 struct btrfs_key key; 4649 int ret; 4650 4651 key.objectid = btrfs_ino(inode); 4652 key.type = BTRFS_INODE_ITEM_KEY; 4653 key.offset = 0; 4654 4655 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4656 if (ret < 0) { 4657 return ret; 4658 } else if (ret > 0) { 4659 *size_ret = 0; 4660 } else { 4661 struct btrfs_inode_item *item; 4662 4663 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4664 struct btrfs_inode_item); 4665 *size_ret = btrfs_inode_size(path->nodes[0], item); 4666 /* 4667 * If the in-memory inode's i_size is smaller than the inode 4668 * size stored in the btree, return the inode's i_size, so 4669 * that we get a correct inode size after replaying the log 4670 * when before a power failure we had a shrinking truncate 4671 * followed by addition of a new name (rename / new hard link). 4672 * Otherwise return the inode size from the btree, to avoid 4673 * data loss when replaying a log due to previously doing a 4674 * write that expands the inode's size and logging a new name 4675 * immediately after. 4676 */ 4677 if (*size_ret > inode->vfs_inode.i_size) 4678 *size_ret = inode->vfs_inode.i_size; 4679 } 4680 4681 btrfs_release_path(path); 4682 return 0; 4683 } 4684 4685 /* 4686 * At the moment we always log all xattrs. This is to figure out at log replay 4687 * time which xattrs must have their deletion replayed. If an xattr is missing 4688 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4689 * because if an xattr is deleted, the inode is fsynced and a power failure 4690 * happens, causing the log to be replayed the next time the fs is mounted, 4691 * we want the xattr to not exist anymore (same behaviour as other filesystems 4692 * with a journal, ext3/4, xfs, f2fs, etc).
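 * For example (illustrative): setfattr -n user.foo -v bar <file>; fsync; setfattr -x user.foo <file>; fsync; <power fail> - after replay the xattr must stay deleted, which is only possible because the logged xattr set can be compared against the fs/subvol tree.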
4693 */ 4694 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4695 struct btrfs_root *root, 4696 struct btrfs_inode *inode, 4697 struct btrfs_path *path, 4698 struct btrfs_path *dst_path) 4699 { 4700 int ret; 4701 struct btrfs_key key; 4702 const u64 ino = btrfs_ino(inode); 4703 int ins_nr = 0; 4704 int start_slot = 0; 4705 bool found_xattrs = false; 4706 4707 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 4708 return 0; 4709 4710 key.objectid = ino; 4711 key.type = BTRFS_XATTR_ITEM_KEY; 4712 key.offset = 0; 4713 4714 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4715 if (ret < 0) 4716 return ret; 4717 4718 while (true) { 4719 int slot = path->slots[0]; 4720 struct extent_buffer *leaf = path->nodes[0]; 4721 int nritems = btrfs_header_nritems(leaf); 4722 4723 if (slot >= nritems) { 4724 if (ins_nr > 0) { 4725 ret = copy_items(trans, inode, dst_path, path, 4726 start_slot, ins_nr, 1, 0); 4727 if (ret < 0) 4728 return ret; 4729 ins_nr = 0; 4730 } 4731 ret = btrfs_next_leaf(root, path); 4732 if (ret < 0) 4733 return ret; 4734 else if (ret > 0) 4735 break; 4736 continue; 4737 } 4738 4739 btrfs_item_key_to_cpu(leaf, &key, slot); 4740 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4741 break; 4742 4743 if (ins_nr == 0) 4744 start_slot = slot; 4745 ins_nr++; 4746 path->slots[0]++; 4747 found_xattrs = true; 4748 cond_resched(); 4749 } 4750 if (ins_nr > 0) { 4751 ret = copy_items(trans, inode, dst_path, path, 4752 start_slot, ins_nr, 1, 0); 4753 if (ret < 0) 4754 return ret; 4755 } 4756 4757 if (!found_xattrs) 4758 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 4759 4760 return 0; 4761 } 4762 4763 /* 4764 * When using the NO_HOLES feature if we punched a hole that causes the 4765 * deletion of entire leafs or all the extent items of the first leaf (the one 4766 * that contains the inode item and references) we may end up not processing 4767 * any extents, because there are no leafs with a generation matching the 4768 * current transaction that have extent items for our inode. So we need to find 4769 * if any holes exist and then log them. We also need to log holes after any 4770 * truncate operation that changes the inode's size. 4771 */ 4772 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 4773 struct btrfs_root *root, 4774 struct btrfs_inode *inode, 4775 struct btrfs_path *path) 4776 { 4777 struct btrfs_fs_info *fs_info = root->fs_info; 4778 struct btrfs_key key; 4779 const u64 ino = btrfs_ino(inode); 4780 const u64 i_size = i_size_read(&inode->vfs_inode); 4781 u64 prev_extent_end = 0; 4782 int ret; 4783 4784 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 4785 return 0; 4786 4787 key.objectid = ino; 4788 key.type = BTRFS_EXTENT_DATA_KEY; 4789 key.offset = 0; 4790 4791 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4792 if (ret < 0) 4793 return ret; 4794 4795 while (true) { 4796 struct extent_buffer *leaf = path->nodes[0]; 4797 4798 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 4799 ret = btrfs_next_leaf(root, path); 4800 if (ret < 0) 4801 return ret; 4802 if (ret > 0) { 4803 ret = 0; 4804 break; 4805 } 4806 leaf = path->nodes[0]; 4807 } 4808 4809 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4810 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 4811 break; 4812 4813 /* We have a hole, log it. 
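 * For example (illustrative numbers): extents at [0, 4K) and [16K, 32K) with an i_size of 32K produce a single hole item for [4K, 16K); the check after the loop would log a trailing hole as well if the last extent ended below i_size.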
*/ 4814 if (prev_extent_end < key.offset) { 4815 const u64 hole_len = key.offset - prev_extent_end; 4816 4817 /* 4818 * Release the path to avoid deadlocks with other code 4819 * paths that search the root while holding locks on 4820 * leafs from the log root. 4821 */ 4822 btrfs_release_path(path); 4823 ret = btrfs_insert_file_extent(trans, root->log_root, 4824 ino, prev_extent_end, 0, 4825 0, hole_len, 0, hole_len, 4826 0, 0, 0); 4827 if (ret < 0) 4828 return ret; 4829 4830 /* 4831 * Search for the same key again in the root. Since it's 4832 * an extent item and we are holding the inode lock, the 4833 * key must still exist. If it doesn't, just emit a warning 4834 * and return an error to fall back to a transaction 4835 * commit. 4836 */ 4837 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4838 if (ret < 0) 4839 return ret; 4840 if (WARN_ON(ret > 0)) 4841 return -ENOENT; 4842 leaf = path->nodes[0]; 4843 } 4844 4845 prev_extent_end = btrfs_file_extent_end(path); 4846 path->slots[0]++; 4847 cond_resched(); 4848 } 4849 4850 if (prev_extent_end < i_size) { 4851 u64 hole_len; 4852 4853 btrfs_release_path(path); 4854 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 4855 ret = btrfs_insert_file_extent(trans, root->log_root, 4856 ino, prev_extent_end, 0, 0, 4857 hole_len, 0, hole_len, 4858 0, 0, 0); 4859 if (ret < 0) 4860 return ret; 4861 } 4862 4863 return 0; 4864 } 4865 4866 /* 4867 * When we are logging a new inode X, check that it does not have a reference 4868 * that matches the reference from some other inode Y created in a past transaction 4869 * and renamed in the current transaction. If we don't do this, then at 4870 * log replay time we can lose inode Y (and all its files if it's a directory): 4871 * 4872 * mkdir /mnt/x 4873 * echo "hello world" > /mnt/x/foobar 4874 * sync 4875 * mv /mnt/x /mnt/y 4876 * mkdir /mnt/x # or touch /mnt/x 4877 * xfs_io -c fsync /mnt/x 4878 * <power fail> 4879 * mount fs, trigger log replay 4880 * 4881 * After the log replay procedure, we would lose the first directory and all its 4882 * files (file foobar). 4883 * For the case where inode Y is not a directory we simply end up losing it: 4884 * 4885 * echo "123" > /mnt/foo 4886 * sync 4887 * mv /mnt/foo /mnt/bar 4888 * echo "abc" > /mnt/foo 4889 * xfs_io -c fsync /mnt/foo 4890 * <power fail> 4891 * 4892 * We also need this for cases where a snapshot entry is replaced by some other 4893 * entry (file or directory) otherwise we end up with an unreplayable log due to 4894 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4895 * if it were a regular entry: 4896 * 4897 * mkdir /mnt/x 4898 * btrfs subvolume snapshot /mnt /mnt/x/snap 4899 * btrfs subvolume delete /mnt/x/snap 4900 * rmdir /mnt/x 4901 * mkdir /mnt/x 4902 * fsync /mnt/x or fsync some new file inside it 4903 * <power fail> 4904 * 4905 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4906 * the same transaction.
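 * btrfs_check_ref_name_override() below returns 0 when no conflict exists, 1 with *other_ino and *other_parent set when some other inode currently owns one of the names, -EAGAIN when a name resolves to a non-inode key (e.g. a BTRFS_ROOT_ITEM_KEY subvolume entry), and a negative errno on failure.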
4907 */ 4908 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4909 const int slot, 4910 const struct btrfs_key *key, 4911 struct btrfs_inode *inode, 4912 u64 *other_ino, u64 *other_parent) 4913 { 4914 int ret; 4915 struct btrfs_path *search_path; 4916 char *name = NULL; 4917 u32 name_len = 0; 4918 u32 item_size = btrfs_item_size_nr(eb, slot); 4919 u32 cur_offset = 0; 4920 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4921 4922 search_path = btrfs_alloc_path(); 4923 if (!search_path) 4924 return -ENOMEM; 4925 search_path->search_commit_root = 1; 4926 search_path->skip_locking = 1; 4927 4928 while (cur_offset < item_size) { 4929 u64 parent; 4930 u32 this_name_len; 4931 u32 this_len; 4932 unsigned long name_ptr; 4933 struct btrfs_dir_item *di; 4934 4935 if (key->type == BTRFS_INODE_REF_KEY) { 4936 struct btrfs_inode_ref *iref; 4937 4938 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4939 parent = key->offset; 4940 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4941 name_ptr = (unsigned long)(iref + 1); 4942 this_len = sizeof(*iref) + this_name_len; 4943 } else { 4944 struct btrfs_inode_extref *extref; 4945 4946 extref = (struct btrfs_inode_extref *)(ptr + 4947 cur_offset); 4948 parent = btrfs_inode_extref_parent(eb, extref); 4949 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4950 name_ptr = (unsigned long)&extref->name; 4951 this_len = sizeof(*extref) + this_name_len; 4952 } 4953 4954 if (this_name_len > name_len) { 4955 char *new_name; 4956 4957 new_name = krealloc(name, this_name_len, GFP_NOFS); 4958 if (!new_name) { 4959 ret = -ENOMEM; 4960 goto out; 4961 } 4962 name_len = this_name_len; 4963 name = new_name; 4964 } 4965 4966 read_extent_buffer(eb, name, name_ptr, this_name_len); 4967 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4968 parent, name, this_name_len, 0); 4969 if (di && !IS_ERR(di)) { 4970 struct btrfs_key di_key; 4971 4972 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4973 di, &di_key); 4974 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4975 if (di_key.objectid != key->objectid) { 4976 ret = 1; 4977 *other_ino = di_key.objectid; 4978 *other_parent = parent; 4979 } else { 4980 ret = 0; 4981 } 4982 } else { 4983 ret = -EAGAIN; 4984 } 4985 goto out; 4986 } else if (IS_ERR(di)) { 4987 ret = PTR_ERR(di); 4988 goto out; 4989 } 4990 btrfs_release_path(search_path); 4991 4992 cur_offset += this_len; 4993 } 4994 ret = 0; 4995 out: 4996 btrfs_free_path(search_path); 4997 kfree(name); 4998 return ret; 4999 } 5000 5001 struct btrfs_ino_list { 5002 u64 ino; 5003 u64 parent; 5004 struct list_head list; 5005 }; 5006 5007 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 5008 struct btrfs_root *root, 5009 struct btrfs_path *path, 5010 struct btrfs_log_ctx *ctx, 5011 u64 ino, u64 parent) 5012 { 5013 struct btrfs_ino_list *ino_elem; 5014 LIST_HEAD(inode_list); 5015 int ret = 0; 5016 5017 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5018 if (!ino_elem) 5019 return -ENOMEM; 5020 ino_elem->ino = ino; 5021 ino_elem->parent = parent; 5022 list_add_tail(&ino_elem->list, &inode_list); 5023 5024 while (!list_empty(&inode_list)) { 5025 struct btrfs_fs_info *fs_info = root->fs_info; 5026 struct btrfs_key key; 5027 struct inode *inode; 5028 5029 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 5030 list); 5031 ino = ino_elem->ino; 5032 parent = ino_elem->parent; 5033 list_del(&ino_elem->list); 5034 kfree(ino_elem); 5035 if (ret) 5036 continue; 5037 5038 btrfs_release_path(path); 5039 5040 inode = 
btrfs_iget(fs_info->sb, ino, root); 5041 /* 5042 * If the other inode that had a conflicting dir entry was 5043 * deleted in the current transaction, we need to log its parent 5044 * directory. 5045 */ 5046 if (IS_ERR(inode)) { 5047 ret = PTR_ERR(inode); 5048 if (ret == -ENOENT) { 5049 inode = btrfs_iget(fs_info->sb, parent, root); 5050 if (IS_ERR(inode)) { 5051 ret = PTR_ERR(inode); 5052 } else { 5053 ret = btrfs_log_inode(trans, root, 5054 BTRFS_I(inode), 5055 LOG_OTHER_INODE_ALL, 5056 ctx); 5057 btrfs_add_delayed_iput(inode); 5058 } 5059 } 5060 continue; 5061 } 5062 /* 5063 * If the inode was already logged skip it - otherwise we can 5064 * hit an infinite loop. Example: 5065 * 5066 * From the commit root (previous transaction) we have the 5067 * following inodes: 5068 * 5069 * inode 257 a directory 5070 * inode 258 with references "zz" and "zz_link" on inode 257 5071 * inode 259 with reference "a" on inode 257 5072 * 5073 * And in the current (uncommitted) transaction we have: 5074 * 5075 * inode 257 a directory, unchanged 5076 * inode 258 with references "a" and "a2" on inode 257 5077 * inode 259 with reference "zz_link" on inode 257 5078 * inode 261 with reference "zz" on inode 257 5079 * 5080 * When logging inode 261 the following infinite loop could 5081 * happen if we don't skip already logged inodes: 5082 * 5083 * - we detect inode 258 as a conflicting inode, with inode 261 5084 * on reference "zz", and log it; 5085 * 5086 * - we detect inode 259 as a conflicting inode, with inode 258 5087 * on reference "a", and log it; 5088 * 5089 * - we detect inode 258 as a conflicting inode, with inode 259 5090 * on reference "zz_link", and log it - again! After this we 5091 * repeat the above steps forever. 5092 */ 5093 spin_lock(&BTRFS_I(inode)->lock); 5094 /* 5095 * Check the inode's logged_trans only instead of 5096 * btrfs_inode_in_log(). This is because the last_log_commit of 5097 * the inode is not updated when we only log that it exists (see 5098 * btrfs_log_inode()). 5099 */ 5100 if (BTRFS_I(inode)->logged_trans == trans->transid) { 5101 spin_unlock(&BTRFS_I(inode)->lock); 5102 btrfs_add_delayed_iput(inode); 5103 continue; 5104 } 5105 spin_unlock(&BTRFS_I(inode)->lock); 5106 /* 5107 * We are safe logging the other inode without acquiring its 5108 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5109 * are safe against concurrent renames of the other inode as 5110 * well because during a rename we pin the log and update the 5111 * log with the new name before we unpin it. 
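 * (Inside btrfs_log_inode(), LOG_OTHER_INODE is turned into LOG_INODE_EXISTS and the log_mutex is taken with SINGLE_DEPTH_NESTING, since our caller already holds the log_mutex of the inode being fsynced.)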
5112 */ 5113 ret = btrfs_log_inode(trans, root, BTRFS_I(inode), 5114 LOG_OTHER_INODE, ctx); 5115 if (ret) { 5116 btrfs_add_delayed_iput(inode); 5117 continue; 5118 } 5119 5120 key.objectid = ino; 5121 key.type = BTRFS_INODE_REF_KEY; 5122 key.offset = 0; 5123 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5124 if (ret < 0) { 5125 btrfs_add_delayed_iput(inode); 5126 continue; 5127 } 5128 5129 while (true) { 5130 struct extent_buffer *leaf = path->nodes[0]; 5131 int slot = path->slots[0]; 5132 u64 other_ino = 0; 5133 u64 other_parent = 0; 5134 5135 if (slot >= btrfs_header_nritems(leaf)) { 5136 ret = btrfs_next_leaf(root, path); 5137 if (ret < 0) { 5138 break; 5139 } else if (ret > 0) { 5140 ret = 0; 5141 break; 5142 } 5143 continue; 5144 } 5145 5146 btrfs_item_key_to_cpu(leaf, &key, slot); 5147 if (key.objectid != ino || 5148 (key.type != BTRFS_INODE_REF_KEY && 5149 key.type != BTRFS_INODE_EXTREF_KEY)) { 5150 ret = 0; 5151 break; 5152 } 5153 5154 ret = btrfs_check_ref_name_override(leaf, slot, &key, 5155 BTRFS_I(inode), &other_ino, 5156 &other_parent); 5157 if (ret < 0) 5158 break; 5159 if (ret > 0) { 5160 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5161 if (!ino_elem) { 5162 ret = -ENOMEM; 5163 break; 5164 } 5165 ino_elem->ino = other_ino; 5166 ino_elem->parent = other_parent; 5167 list_add_tail(&ino_elem->list, &inode_list); 5168 ret = 0; 5169 } 5170 path->slots[0]++; 5171 } 5172 btrfs_add_delayed_iput(inode); 5173 } 5174 5175 return ret; 5176 } 5177 5178 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5179 struct btrfs_inode *inode, 5180 struct btrfs_key *min_key, 5181 const struct btrfs_key *max_key, 5182 struct btrfs_path *path, 5183 struct btrfs_path *dst_path, 5184 const u64 logged_isize, 5185 const bool recursive_logging, 5186 const int inode_only, 5187 struct btrfs_log_ctx *ctx, 5188 bool *need_log_inode_item) 5189 { 5190 struct btrfs_root *root = inode->root; 5191 int ins_start_slot = 0; 5192 int ins_nr = 0; 5193 int ret; 5194 5195 while (1) { 5196 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5197 if (ret < 0) 5198 return ret; 5199 if (ret > 0) { 5200 ret = 0; 5201 break; 5202 } 5203 again: 5204 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5205 if (min_key->objectid != max_key->objectid) 5206 break; 5207 if (min_key->type > max_key->type) 5208 break; 5209 5210 if (min_key->type == BTRFS_INODE_ITEM_KEY) 5211 *need_log_inode_item = false; 5212 5213 if ((min_key->type == BTRFS_INODE_REF_KEY || 5214 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5215 inode->generation == trans->transid && 5216 !recursive_logging) { 5217 u64 other_ino = 0; 5218 u64 other_parent = 0; 5219 5220 ret = btrfs_check_ref_name_override(path->nodes[0], 5221 path->slots[0], min_key, inode, 5222 &other_ino, &other_parent); 5223 if (ret < 0) { 5224 return ret; 5225 } else if (ret > 0 && ctx && 5226 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5227 if (ins_nr > 0) { 5228 ins_nr++; 5229 } else { 5230 ins_nr = 1; 5231 ins_start_slot = path->slots[0]; 5232 } 5233 ret = copy_items(trans, inode, dst_path, path, 5234 ins_start_slot, ins_nr, 5235 inode_only, logged_isize); 5236 if (ret < 0) 5237 return ret; 5238 ins_nr = 0; 5239 5240 ret = log_conflicting_inodes(trans, root, path, 5241 ctx, other_ino, other_parent); 5242 if (ret) 5243 return ret; 5244 btrfs_release_path(path); 5245 goto next_key; 5246 } 5247 } 5248 5249 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 5250 if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5251 if (ins_nr == 
0) 5252 goto next_slot; 5253 ret = copy_items(trans, inode, dst_path, path, 5254 ins_start_slot, 5255 ins_nr, inode_only, logged_isize); 5256 if (ret < 0) 5257 return ret; 5258 ins_nr = 0; 5259 goto next_slot; 5260 } 5261 5262 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5263 ins_nr++; 5264 goto next_slot; 5265 } else if (!ins_nr) { 5266 ins_start_slot = path->slots[0]; 5267 ins_nr = 1; 5268 goto next_slot; 5269 } 5270 5271 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5272 ins_nr, inode_only, logged_isize); 5273 if (ret < 0) 5274 return ret; 5275 ins_nr = 1; 5276 ins_start_slot = path->slots[0]; 5277 next_slot: 5278 path->slots[0]++; 5279 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5280 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5281 path->slots[0]); 5282 goto again; 5283 } 5284 if (ins_nr) { 5285 ret = copy_items(trans, inode, dst_path, path, 5286 ins_start_slot, ins_nr, inode_only, 5287 logged_isize); 5288 if (ret < 0) 5289 return ret; 5290 ins_nr = 0; 5291 } 5292 btrfs_release_path(path); 5293 next_key: 5294 if (min_key->offset < (u64)-1) { 5295 min_key->offset++; 5296 } else if (min_key->type < max_key->type) { 5297 min_key->type++; 5298 min_key->offset = 0; 5299 } else { 5300 break; 5301 } 5302 } 5303 if (ins_nr) 5304 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5305 ins_nr, inode_only, logged_isize); 5306 5307 return ret; 5308 } 5309 5310 /* log a single inode in the tree log. 5311 * At least one parent directory for this inode must exist in the tree 5312 * or be logged already. 5313 * 5314 * Any items from this inode changed by the current transaction are copied 5315 * to the log tree. An extra reference is taken on any extents in this 5316 * file, allowing us to avoid a whole pile of corner cases around logging 5317 * blocks that have been removed from the tree. 5318 * 5319 * See LOG_INODE_ALL and related defines for a description of what inode_only 5320 * does. 5321 * 5322 * This handles both files and directories. 5323 */ 5324 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5325 struct btrfs_root *root, struct btrfs_inode *inode, 5326 int inode_only, 5327 struct btrfs_log_ctx *ctx) 5328 { 5329 struct btrfs_path *path; 5330 struct btrfs_path *dst_path; 5331 struct btrfs_key min_key; 5332 struct btrfs_key max_key; 5333 struct btrfs_root *log = root->log_root; 5334 int err = 0; 5335 int ret = 0; 5336 bool fast_search = false; 5337 u64 ino = btrfs_ino(inode); 5338 struct extent_map_tree *em_tree = &inode->extent_tree; 5339 u64 logged_isize = 0; 5340 bool need_log_inode_item = true; 5341 bool xattrs_logged = false; 5342 bool recursive_logging = false; 5343 bool inode_item_dropped = true; 5344 5345 path = btrfs_alloc_path(); 5346 if (!path) 5347 return -ENOMEM; 5348 dst_path = btrfs_alloc_path(); 5349 if (!dst_path) { 5350 btrfs_free_path(path); 5351 return -ENOMEM; 5352 } 5353 5354 min_key.objectid = ino; 5355 min_key.type = BTRFS_INODE_ITEM_KEY; 5356 min_key.offset = 0; 5357 5358 max_key.objectid = ino; 5359 5360 5361 /* today the code can only do partial logging of directories */ 5362 if (S_ISDIR(inode->vfs_inode.i_mode) || 5363 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5364 &inode->runtime_flags) && 5365 inode_only >= LOG_INODE_EXISTS)) 5366 max_key.type = BTRFS_XATTR_ITEM_KEY; 5367 else 5368 max_key.type = (u8)-1; 5369 max_key.offset = (u64)-1; 5370 5371 /* 5372 * Only run delayed items if we are a directory. 
We want to make sure 5373 * all directory indexes hit the fs/subvolume tree so we can find them 5374 * and figure out which index ranges have to be logged. 5375 * 5376 * Otherwise commit the delayed inode only if the full sync flag is set, 5377 * as we want to make sure an up to date version is in the subvolume 5378 * tree so copy_inode_items_to_log() / copy_items() can find it and copy 5379 * it to the log tree. For a non-full sync, we always log the inode item 5380 * based on the in-memory struct btrfs_inode which is always up to date. 5381 */ 5382 if (S_ISDIR(inode->vfs_inode.i_mode)) 5383 ret = btrfs_commit_inode_delayed_items(trans, inode); 5384 else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) 5385 ret = btrfs_commit_inode_delayed_inode(inode); 5386 5387 if (ret) { 5388 btrfs_free_path(path); 5389 btrfs_free_path(dst_path); 5390 return ret; 5391 } 5392 5393 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { 5394 recursive_logging = true; 5395 if (inode_only == LOG_OTHER_INODE) 5396 inode_only = LOG_INODE_EXISTS; 5397 else 5398 inode_only = LOG_INODE_ALL; 5399 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 5400 } else { 5401 mutex_lock(&inode->log_mutex); 5402 } 5403 5404 /* 5405 * This is for cases where logging a directory could result in losing 5406 * a file after replaying the log. For example, if we move a file from a 5407 * directory A to a directory B, then fsync directory A, we have no way 5408 * to know the file was moved from A to B, so logging just A would 5409 * result in losing the file after a log replay. 5410 */ 5411 if (S_ISDIR(inode->vfs_inode.i_mode) && 5412 inode_only == LOG_INODE_ALL && 5413 inode->last_unlink_trans >= trans->transid) { 5414 btrfs_set_log_full_commit(trans); 5415 err = 1; 5416 goto out_unlock; 5417 } 5418 5419 /* 5420 * a brute force approach to making sure we get the most up to date 5421 * copies of everything. 5422 */ 5423 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5424 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 5425 5426 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); 5427 if (inode_only == LOG_INODE_EXISTS) 5428 max_key_type = BTRFS_XATTR_ITEM_KEY; 5429 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 5430 } else { 5431 if (inode_only == LOG_INODE_EXISTS) { 5432 /* 5433 * Make sure the new inode item we write to the log has 5434 * the same isize as the current one (if it exists). 5435 * This is necessary to prevent data loss after log 5436 * replay, and also to prevent doing a wrong expanding 5437 * truncate - e.g. create a file, write 4K into offset 5438 * 0, fsync, write 4K into offset 4096, add hard link, 5439 * fsync some other file (to sync log), power fail - if 5440 * we use the inode's current i_size, after log replay 5441 * we get an 8Kb file, with the last 4Kb extent as a hole 5442 * (zeroes), as if an expanding truncate happened, 5443 * instead of getting a file of 4Kb only.
5444 */ 5445 err = logged_inode_size(log, inode, path, &logged_isize); 5446 if (err) 5447 goto out_unlock; 5448 } 5449 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5450 &inode->runtime_flags)) { 5451 if (inode_only == LOG_INODE_EXISTS) { 5452 max_key.type = BTRFS_XATTR_ITEM_KEY; 5453 ret = drop_objectid_items(trans, log, path, ino, 5454 max_key.type); 5455 } else { 5456 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5457 &inode->runtime_flags); 5458 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5459 &inode->runtime_flags); 5460 while(1) { 5461 ret = btrfs_truncate_inode_items(trans, 5462 log, inode, 0, 0, NULL); 5463 if (ret != -EAGAIN) 5464 break; 5465 } 5466 } 5467 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5468 &inode->runtime_flags) || 5469 inode_only == LOG_INODE_EXISTS) { 5470 if (inode_only == LOG_INODE_ALL) 5471 fast_search = true; 5472 max_key.type = BTRFS_XATTR_ITEM_KEY; 5473 ret = drop_objectid_items(trans, log, path, ino, 5474 max_key.type); 5475 } else { 5476 if (inode_only == LOG_INODE_ALL) 5477 fast_search = true; 5478 inode_item_dropped = false; 5479 goto log_extents; 5480 } 5481 5482 } 5483 if (ret) { 5484 err = ret; 5485 goto out_unlock; 5486 } 5487 5488 err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5489 path, dst_path, logged_isize, 5490 recursive_logging, inode_only, ctx, 5491 &need_log_inode_item); 5492 if (err) 5493 goto out_unlock; 5494 5495 btrfs_release_path(path); 5496 btrfs_release_path(dst_path); 5497 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5498 if (err) 5499 goto out_unlock; 5500 xattrs_logged = true; 5501 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5502 btrfs_release_path(path); 5503 btrfs_release_path(dst_path); 5504 err = btrfs_log_holes(trans, root, inode, path); 5505 if (err) 5506 goto out_unlock; 5507 } 5508 log_extents: 5509 btrfs_release_path(path); 5510 btrfs_release_path(dst_path); 5511 if (need_log_inode_item) { 5512 err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 5513 if (err) 5514 goto out_unlock; 5515 /* 5516 * If we are doing a fast fsync and the inode was logged before 5517 * in this transaction, we don't need to log the xattrs because 5518 * they were logged before. If xattrs were added, changed or 5519 * deleted since the last time we logged the inode, then we have 5520 * already logged them because the inode had the runtime flag 5521 * BTRFS_INODE_COPY_EVERYTHING set. 
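 * Hence the check below: a second btrfs_log_all_xattrs() call is only needed when the inode was not logged before in the current transaction.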
5522 */ 5523 if (!xattrs_logged && inode->logged_trans < trans->transid) { 5524 err = btrfs_log_all_xattrs(trans, root, inode, path, 5525 dst_path); 5526 if (err) 5527 goto out_unlock; 5528 btrfs_release_path(path); 5529 } 5530 } 5531 if (fast_search) { 5532 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5533 ctx); 5534 if (ret) { 5535 err = ret; 5536 goto out_unlock; 5537 } 5538 } else if (inode_only == LOG_INODE_ALL) { 5539 struct extent_map *em, *n; 5540 5541 write_lock(&em_tree->lock); 5542 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 5543 list_del_init(&em->list); 5544 write_unlock(&em_tree->lock); 5545 } 5546 5547 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5548 ret = log_directory_changes(trans, root, inode, path, dst_path, 5549 ctx); 5550 if (ret) { 5551 err = ret; 5552 goto out_unlock; 5553 } 5554 } 5555 5556 /* 5557 * If we are logging that an ancestor inode exists as part of logging a 5558 * new name from a link or rename operation, don't mark the inode as 5559 * logged - otherwise if an explicit fsync is made against an ancestor, 5560 * the fsync considers the inode in the log and doesn't sync the log, 5561 * resulting in the ancestor missing after a power failure unless the 5562 * log was synced as part of an fsync against any other unrelated inode. 5563 * So keep it simple for this case and just don't flag the ancestors as 5564 * logged. 5565 */ 5566 if (!ctx || 5567 !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && 5568 &inode->vfs_inode != ctx->inode)) { 5569 spin_lock(&inode->lock); 5570 inode->logged_trans = trans->transid; 5571 /* 5572 * Don't update last_log_commit if we logged that an inode exists. 5573 * We do this for two reasons: 5574 * 5575 * 1) We might have had buffered writes to this inode that were 5576 * flushed and had their ordered extents completed in this 5577 * transaction, but we did not previously log the inode with 5578 * LOG_INODE_ALL. Later the inode was evicted and after that 5579 * it was loaded again and this LOG_INODE_EXISTS log operation 5580 * happened. We must make sure that if an explicit fsync against 5581 * the inode is performed later, it logs the new extents, an 5582 * updated inode item, etc, and syncs the log. The same logic 5583 * applies to direct IO writes instead of buffered writes. 5584 * 5585 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 5586 * is logged with an i_size of 0 or whatever value was logged 5587 * before. If later the i_size of the inode is increased by a 5588 * truncate operation, the log is synced through an fsync of 5589 * some other inode and then finally an explicit fsync against 5590 * this inode is made, we must make sure this fsync logs the 5591 * inode with the new i_size, the hole between old i_size and 5592 * the new i_size, and syncs the log. 5593 */ 5594 if (inode_only != LOG_INODE_EXISTS) 5595 inode->last_log_commit = inode->last_sub_trans; 5596 spin_unlock(&inode->lock); 5597 } 5598 out_unlock: 5599 mutex_unlock(&inode->log_mutex); 5600 5601 btrfs_free_path(path); 5602 btrfs_free_path(dst_path); 5603 return err; 5604 } 5605 5606 /* 5607 * Check if we need to log an inode. This is used in contexts where while 5608 * logging an inode we need to log another inode (either that it exists or in 5609 * full mode). 
This is used instead of btrfs_inode_in_log() because the latter 5610 * requires the inode to be in the log and have the log transaction committed, 5611 * while here we do not care if the log transaction was already committed - our 5612 * caller will commit the log later - and we want to avoid logging an inode 5613 * multiple times when multiple tasks have joined the same log transaction. 5614 */ 5615 static bool need_log_inode(struct btrfs_trans_handle *trans, 5616 struct btrfs_inode *inode) 5617 { 5618 /* 5619 * If a directory was not modified (no dentries added or removed), we can 5620 * and should avoid logging it. 5621 */ 5622 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) 5623 return false; 5624 5625 /* 5626 * If this inode does not have new/updated/deleted xattrs since the last 5627 * time it was logged and is flagged as logged in the current transaction, 5628 * we can skip logging it. As for new/deleted names, those are updated in 5629 * the log by link/unlink/rename operations. 5630 * In case the inode was logged and then evicted and reloaded, its 5631 * logged_trans will be 0, in which case we have to fully log it since 5632 * logged_trans is a transient field, not persisted. 5633 */ 5634 if (inode->logged_trans == trans->transid && 5635 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) 5636 return false; 5637 5638 return true; 5639 } 5640 5641 struct btrfs_dir_list { 5642 u64 ino; 5643 struct list_head list; 5644 }; 5645 5646 /* 5647 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5648 * details about why it is needed. 5649 * This is a recursive operation - if an existing dentry corresponds to a 5650 * directory, that directory's new entries are logged too (same behaviour as 5651 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5652 * the dentries point to, we do not lock their i_mutex, otherwise lockdep 5653 * complains about the following circular lock dependency / possible deadlock: 5654 * 5655 * CPU0 CPU1 5656 * ---- ---- 5657 * lock(&type->i_mutex_dir_key#3/2); 5658 * lock(sb_internal#2); 5659 * lock(&type->i_mutex_dir_key#3/2); 5660 * lock(&sb->s_type->i_mutex_key#14); 5661 * 5662 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5663 * sb_start_intwrite() in btrfs_start_transaction(). 5664 * Not locking i_mutex of the inodes is still safe because: 5665 * 5666 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5667 * that while logging the inode new references (names) are added or removed 5668 * from the inode, leaving the logged inode item with a link count that does 5669 * not match the number of logged inode reference items. This is fine because 5670 * at log replay time we compute the real number of links and correct the 5671 * link count in the inode item (see replay_one_buffer() and 5672 * link_to_fixup_dir()); 5673 * 5674 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5675 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5676 * BTRFS_DIR_INDEX_KEY are added to the fs/subvol tree and the logged inode item 5677 * has a size that doesn't match the sum of the lengths of all the logged 5678 * names. This does not result in a problem because if a dir_item key is 5679 * logged but its matching dir_index key is not logged, at log replay time we 5680 * don't use it to replay the respective name (see replay_one_name()).
On the 5681 * other hand if only the dir_index key ends up being logged, the respective 5682 * name is added to the fs/subvol tree with both the dir_item and dir_index 5683 * keys created (see replay_one_name()). 5684 * The directory's inode item with a wrong i_size is not a problem as well, 5685 * since we don't use it at log replay time to set the i_size in the inode 5686 * item of the fs/subvol tree (see overwrite_item()). 5687 */ 5688 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5689 struct btrfs_root *root, 5690 struct btrfs_inode *start_inode, 5691 struct btrfs_log_ctx *ctx) 5692 { 5693 struct btrfs_fs_info *fs_info = root->fs_info; 5694 struct btrfs_root *log = root->log_root; 5695 struct btrfs_path *path; 5696 LIST_HEAD(dir_list); 5697 struct btrfs_dir_list *dir_elem; 5698 int ret = 0; 5699 5700 path = btrfs_alloc_path(); 5701 if (!path) 5702 return -ENOMEM; 5703 5704 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5705 if (!dir_elem) { 5706 btrfs_free_path(path); 5707 return -ENOMEM; 5708 } 5709 dir_elem->ino = btrfs_ino(start_inode); 5710 list_add_tail(&dir_elem->list, &dir_list); 5711 5712 while (!list_empty(&dir_list)) { 5713 struct extent_buffer *leaf; 5714 struct btrfs_key min_key; 5715 int nritems; 5716 int i; 5717 5718 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5719 list); 5720 if (ret) 5721 goto next_dir_inode; 5722 5723 min_key.objectid = dir_elem->ino; 5724 min_key.type = BTRFS_DIR_ITEM_KEY; 5725 min_key.offset = 0; 5726 again: 5727 btrfs_release_path(path); 5728 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5729 if (ret < 0) { 5730 goto next_dir_inode; 5731 } else if (ret > 0) { 5732 ret = 0; 5733 goto next_dir_inode; 5734 } 5735 5736 process_leaf: 5737 leaf = path->nodes[0]; 5738 nritems = btrfs_header_nritems(leaf); 5739 for (i = path->slots[0]; i < nritems; i++) { 5740 struct btrfs_dir_item *di; 5741 struct btrfs_key di_key; 5742 struct inode *di_inode; 5743 struct btrfs_dir_list *new_dir_elem; 5744 int log_mode = LOG_INODE_EXISTS; 5745 int type; 5746 5747 btrfs_item_key_to_cpu(leaf, &min_key, i); 5748 if (min_key.objectid != dir_elem->ino || 5749 min_key.type != BTRFS_DIR_ITEM_KEY) 5750 goto next_dir_inode; 5751 5752 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5753 type = btrfs_dir_type(leaf, di); 5754 if (btrfs_dir_transid(leaf, di) < trans->transid && 5755 type != BTRFS_FT_DIR) 5756 continue; 5757 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5758 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5759 continue; 5760 5761 btrfs_release_path(path); 5762 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 5763 if (IS_ERR(di_inode)) { 5764 ret = PTR_ERR(di_inode); 5765 goto next_dir_inode; 5766 } 5767 5768 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 5769 btrfs_add_delayed_iput(di_inode); 5770 break; 5771 } 5772 5773 ctx->log_new_dentries = false; 5774 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5775 log_mode = LOG_INODE_ALL; 5776 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5777 log_mode, ctx); 5778 btrfs_add_delayed_iput(di_inode); 5779 if (ret) 5780 goto next_dir_inode; 5781 if (ctx->log_new_dentries) { 5782 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5783 GFP_NOFS); 5784 if (!new_dir_elem) { 5785 ret = -ENOMEM; 5786 goto next_dir_inode; 5787 } 5788 new_dir_elem->ino = di_key.objectid; 5789 list_add_tail(&new_dir_elem->list, &dir_list); 5790 } 5791 break; 5792 } 5793 if (i == nritems) { 5794 ret = btrfs_next_leaf(log, path); 5795 if (ret < 0) { 5796 goto next_dir_inode; 5797 
			} else if (ret > 0) {
				ret = 0;
				goto next_dir_inode;
			}
			goto process_leaf;
		}
		if (min_key.offset < (u64)-1) {
			min_key.offset++;
			goto again;
		}
next_dir_inode:
		list_del(&dir_elem->list);
		kfree(dir_elem);
	}

	btrfs_free_path(path);
	return ret;
}

static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root *root = inode->root;
	const u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->skip_locking = 1;
	path->search_commit_root = 1;

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		u32 cur_offset = 0;
		u32 item_size;
		unsigned long ptr;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		item_size = btrfs_item_size_nr(leaf, slot);
		ptr = btrfs_item_ptr_offset(leaf, slot);
		while (cur_offset < item_size) {
			struct btrfs_key inode_key;
			struct inode *dir_inode;

			inode_key.type = BTRFS_INODE_ITEM_KEY;
			inode_key.offset = 0;

			if (key.type == BTRFS_INODE_EXTREF_KEY) {
				struct btrfs_inode_extref *extref;

				extref = (struct btrfs_inode_extref *)
					(ptr + cur_offset);
				inode_key.objectid = btrfs_inode_extref_parent(
					leaf, extref);
				cur_offset += sizeof(*extref);
				cur_offset += btrfs_inode_extref_name_len(leaf,
					extref);
			} else {
				inode_key.objectid = key.offset;
				cur_offset = item_size;
			}

			dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
					       root);
			/*
			 * If the parent inode was deleted, return an error to
			 * fall back to a transaction commit. This is to prevent
			 * getting an inode that was moved from one parent A to
			 * a parent B, got its former parent A deleted and then
			 * it got fsync'ed, from existing at both parents after
			 * a log replay (and the old parent still existing).
			 * Example:
			 *
			 * mkdir /mnt/A
			 * mkdir /mnt/B
			 * touch /mnt/B/bar
			 * sync
			 * mv /mnt/B/bar /mnt/A/bar
			 * mv -T /mnt/A /mnt/B
			 * fsync /mnt/B/bar
			 * <power fail>
			 *
			 * If we ignore the old parent B which got deleted,
			 * after a log replay we would have file bar linked
			 * at both parents and the old parent B would still
			 * exist.
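			 *
			 * Propagating the error here thus forces the fsync
			 * to fall back to a full transaction commit, which
			 * persists the rename and the removal of the old
			 * parent together.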
			 */
			if (IS_ERR(dir_inode)) {
				ret = PTR_ERR(dir_inode);
				goto out;
			}

			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
				btrfs_add_delayed_iput(dir_inode);
				continue;
			}

			if (ctx)
				ctx->log_new_dentries = false;
			ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
					      LOG_INODE_ALL, ctx);
			if (!ret && ctx && ctx->log_new_dentries)
				ret = log_new_dir_dentries(trans, root,
						BTRFS_I(dir_inode), ctx);
			btrfs_add_delayed_iput(dir_inode);
			if (ret)
				goto out;
		}
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int log_new_ancestors(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_log_ctx *ctx)
{
	struct btrfs_key found_key;

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	while (true) {
		struct btrfs_fs_info *fs_info = root->fs_info;
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key search_key;
		struct inode *inode;
		u64 ino;
		int ret = 0;

		btrfs_release_path(path);

		ino = found_key.offset;

		search_key.objectid = found_key.offset;
		search_key.type = BTRFS_INODE_ITEM_KEY;
		search_key.offset = 0;
		inode = btrfs_iget(fs_info->sb, ino, root);
		if (IS_ERR(inode))
			return PTR_ERR(inode);

		if (BTRFS_I(inode)->generation >= trans->transid &&
		    need_log_inode(trans, BTRFS_I(inode)))
			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
					      LOG_INODE_EXISTS, ctx);
		btrfs_add_delayed_iput(inode);
		if (ret)
			return ret;

		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
			break;

		search_key.type = BTRFS_INODE_REF_KEY;
		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
		if (ret < 0)
			return ret;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				return -ENOENT;
			leaf = path->nodes[0];
			slot = path->slots[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != search_key.objectid ||
		    found_key.type != BTRFS_INODE_REF_KEY)
			return -ENOENT;
	}
	return 0;
}

static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct dentry *old_parent = NULL;
	struct super_block *sb = inode->vfs_inode.i_sb;
	int ret = 0;

	while (true) {
		if (!parent || d_really_is_negative(parent) ||
		    sb != parent->d_sb)
			break;

		inode = BTRFS_I(d_inode(parent));
		if (root != inode->root)
			break;

		if (inode->generation >= trans->transid &&
		    need_log_inode(trans, inode)) {
			ret = btrfs_log_inode(trans, root, inode,
					      LOG_INODE_EXISTS, ctx);
			if (ret)
				break;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	dput(old_parent);

	return ret;
}

static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct dentry *parent,
				 struct btrfs_log_ctx *ctx)
6046 { 6047 struct btrfs_root *root = inode->root; 6048 const u64 ino = btrfs_ino(inode); 6049 struct btrfs_path *path; 6050 struct btrfs_key search_key; 6051 int ret; 6052 6053 /* 6054 * For a single hard link case, go through a fast path that does not 6055 * need to iterate the fs/subvolume tree. 6056 */ 6057 if (inode->vfs_inode.i_nlink < 2) 6058 return log_new_ancestors_fast(trans, inode, parent, ctx); 6059 6060 path = btrfs_alloc_path(); 6061 if (!path) 6062 return -ENOMEM; 6063 6064 search_key.objectid = ino; 6065 search_key.type = BTRFS_INODE_REF_KEY; 6066 search_key.offset = 0; 6067 again: 6068 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6069 if (ret < 0) 6070 goto out; 6071 if (ret == 0) 6072 path->slots[0]++; 6073 6074 while (true) { 6075 struct extent_buffer *leaf = path->nodes[0]; 6076 int slot = path->slots[0]; 6077 struct btrfs_key found_key; 6078 6079 if (slot >= btrfs_header_nritems(leaf)) { 6080 ret = btrfs_next_leaf(root, path); 6081 if (ret < 0) 6082 goto out; 6083 else if (ret > 0) 6084 break; 6085 continue; 6086 } 6087 6088 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6089 if (found_key.objectid != ino || 6090 found_key.type > BTRFS_INODE_EXTREF_KEY) 6091 break; 6092 6093 /* 6094 * Don't deal with extended references because they are rare 6095 * cases and too complex to deal with (we would need to keep 6096 * track of which subitem we are processing for each item in 6097 * this loop, etc). So just return some error to fallback to 6098 * a transaction commit. 6099 */ 6100 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 6101 ret = -EMLINK; 6102 goto out; 6103 } 6104 6105 /* 6106 * Logging ancestors needs to do more searches on the fs/subvol 6107 * tree, so it releases the path as needed to avoid deadlocks. 6108 * Keep track of the last inode ref key and resume from that key 6109 * after logging all new ancestors for the current hard link. 6110 */ 6111 memcpy(&search_key, &found_key, sizeof(search_key)); 6112 6113 ret = log_new_ancestors(trans, root, path, ctx); 6114 if (ret) 6115 goto out; 6116 btrfs_release_path(path); 6117 goto again; 6118 } 6119 ret = 0; 6120 out: 6121 btrfs_free_path(path); 6122 return ret; 6123 } 6124 6125 /* 6126 * helper function around btrfs_log_inode to make sure newly created 6127 * parent directories also end up in the log. A minimal inode and backref 6128 * only logging is done of any parent directories that are older than 6129 * the last committed transaction 6130 */ 6131 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 6132 struct btrfs_inode *inode, 6133 struct dentry *parent, 6134 int inode_only, 6135 struct btrfs_log_ctx *ctx) 6136 { 6137 struct btrfs_root *root = inode->root; 6138 struct btrfs_fs_info *fs_info = root->fs_info; 6139 int ret = 0; 6140 bool log_dentries = false; 6141 6142 if (btrfs_test_opt(fs_info, NOTREELOG)) { 6143 ret = 1; 6144 goto end_no_trans; 6145 } 6146 6147 if (btrfs_root_refs(&root->root_item) == 0) { 6148 ret = 1; 6149 goto end_no_trans; 6150 } 6151 6152 /* 6153 * Skip already logged inodes or inodes corresponding to tmpfiles 6154 * (since logging them is pointless, a link count of 0 means they 6155 * will never be accessible). 
6156 */ 6157 if ((btrfs_inode_in_log(inode, trans->transid) && 6158 list_empty(&ctx->ordered_extents)) || 6159 inode->vfs_inode.i_nlink == 0) { 6160 ret = BTRFS_NO_LOG_SYNC; 6161 goto end_no_trans; 6162 } 6163 6164 ret = start_log_trans(trans, root, ctx); 6165 if (ret) 6166 goto end_no_trans; 6167 6168 ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); 6169 if (ret) 6170 goto end_trans; 6171 6172 /* 6173 * for regular files, if its inode is already on disk, we don't 6174 * have to worry about the parents at all. This is because 6175 * we can use the last_unlink_trans field to record renames 6176 * and other fun in this file. 6177 */ 6178 if (S_ISREG(inode->vfs_inode.i_mode) && 6179 inode->generation < trans->transid && 6180 inode->last_unlink_trans < trans->transid) { 6181 ret = 0; 6182 goto end_trans; 6183 } 6184 6185 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 6186 log_dentries = true; 6187 6188 /* 6189 * On unlink we must make sure all our current and old parent directory 6190 * inodes are fully logged. This is to prevent leaving dangling 6191 * directory index entries in directories that were our parents but are 6192 * not anymore. Not doing this results in old parent directory being 6193 * impossible to delete after log replay (rmdir will always fail with 6194 * error -ENOTEMPTY). 6195 * 6196 * Example 1: 6197 * 6198 * mkdir testdir 6199 * touch testdir/foo 6200 * ln testdir/foo testdir/bar 6201 * sync 6202 * unlink testdir/bar 6203 * xfs_io -c fsync testdir/foo 6204 * <power failure> 6205 * mount fs, triggers log replay 6206 * 6207 * If we don't log the parent directory (testdir), after log replay the 6208 * directory still has an entry pointing to the file inode using the bar 6209 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 6210 * the file inode has a link count of 1. 6211 * 6212 * Example 2: 6213 * 6214 * mkdir testdir 6215 * touch foo 6216 * ln foo testdir/foo2 6217 * ln foo testdir/foo3 6218 * sync 6219 * unlink testdir/foo3 6220 * xfs_io -c fsync foo 6221 * <power failure> 6222 * mount fs, triggers log replay 6223 * 6224 * Similar as the first example, after log replay the parent directory 6225 * testdir still has an entry pointing to the inode file with name foo3 6226 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 6227 * and has a link count of 2. 6228 */ 6229 if (inode->last_unlink_trans >= trans->transid) { 6230 ret = btrfs_log_all_parents(trans, inode, ctx); 6231 if (ret) 6232 goto end_trans; 6233 } 6234 6235 ret = log_all_new_ancestors(trans, inode, parent, ctx); 6236 if (ret) 6237 goto end_trans; 6238 6239 if (log_dentries) 6240 ret = log_new_dir_dentries(trans, root, inode, ctx); 6241 else 6242 ret = 0; 6243 end_trans: 6244 if (ret < 0) { 6245 btrfs_set_log_full_commit(trans); 6246 ret = 1; 6247 } 6248 6249 if (ret) 6250 btrfs_remove_log_ctx(root, ctx); 6251 btrfs_end_log_trans(root); 6252 end_no_trans: 6253 return ret; 6254 } 6255 6256 /* 6257 * it is not safe to log dentry if the chunk root has added new 6258 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 6259 * If this returns 1, you must commit the transaction to safely get your 6260 * data on disk. 
6261 */ 6262 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 6263 struct dentry *dentry, 6264 struct btrfs_log_ctx *ctx) 6265 { 6266 struct dentry *parent = dget_parent(dentry); 6267 int ret; 6268 6269 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 6270 LOG_INODE_ALL, ctx); 6271 dput(parent); 6272 6273 return ret; 6274 } 6275 6276 /* 6277 * should be called during mount to recover any replay any log trees 6278 * from the FS 6279 */ 6280 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 6281 { 6282 int ret; 6283 struct btrfs_path *path; 6284 struct btrfs_trans_handle *trans; 6285 struct btrfs_key key; 6286 struct btrfs_key found_key; 6287 struct btrfs_root *log; 6288 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 6289 struct walk_control wc = { 6290 .process_func = process_one_buffer, 6291 .stage = LOG_WALK_PIN_ONLY, 6292 }; 6293 6294 path = btrfs_alloc_path(); 6295 if (!path) 6296 return -ENOMEM; 6297 6298 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6299 6300 trans = btrfs_start_transaction(fs_info->tree_root, 0); 6301 if (IS_ERR(trans)) { 6302 ret = PTR_ERR(trans); 6303 goto error; 6304 } 6305 6306 wc.trans = trans; 6307 wc.pin = 1; 6308 6309 ret = walk_log_tree(trans, log_root_tree, &wc); 6310 if (ret) { 6311 btrfs_handle_fs_error(fs_info, ret, 6312 "Failed to pin buffers while recovering log root tree."); 6313 goto error; 6314 } 6315 6316 again: 6317 key.objectid = BTRFS_TREE_LOG_OBJECTID; 6318 key.offset = (u64)-1; 6319 key.type = BTRFS_ROOT_ITEM_KEY; 6320 6321 while (1) { 6322 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 6323 6324 if (ret < 0) { 6325 btrfs_handle_fs_error(fs_info, ret, 6326 "Couldn't find tree log root."); 6327 goto error; 6328 } 6329 if (ret > 0) { 6330 if (path->slots[0] == 0) 6331 break; 6332 path->slots[0]--; 6333 } 6334 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 6335 path->slots[0]); 6336 btrfs_release_path(path); 6337 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 6338 break; 6339 6340 log = btrfs_read_tree_root(log_root_tree, &found_key); 6341 if (IS_ERR(log)) { 6342 ret = PTR_ERR(log); 6343 btrfs_handle_fs_error(fs_info, ret, 6344 "Couldn't read tree log root."); 6345 goto error; 6346 } 6347 6348 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 6349 true); 6350 if (IS_ERR(wc.replay_dest)) { 6351 ret = PTR_ERR(wc.replay_dest); 6352 6353 /* 6354 * We didn't find the subvol, likely because it was 6355 * deleted. This is ok, simply skip this log and go to 6356 * the next one. 6357 * 6358 * We need to exclude the root because we can't have 6359 * other log replays overwriting this log as we'll read 6360 * it back in a few more times. This will keep our 6361 * block from being modified, and we'll just bail for 6362 * each subsequent pass. 
			 */
			if (ret == -ENOENT)
				ret = btrfs_pin_extent_for_log_replay(trans,
							log->node->start,
							log->node->len);
			btrfs_put_root(log);

			if (!ret)
				goto next;
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read target root for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
		if (ret)
			/* The loop needs to continue due to the root refs */
			btrfs_handle_fs_error(fs_info, ret,
				"failed to record the log root in transaction");
		else
			ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			struct btrfs_root *root = wc.replay_dest;

			btrfs_release_path(path);

			/*
			 * We have just replayed everything, and the highest
			 * objectid of fs roots probably has changed in case
			 * some inode_item's got replayed.
			 *
			 * root->objectid_mutex is not acquired as log replay
			 * could only happen during mount.
			 */
			ret = btrfs_init_root_free_objectid(root);
		}

		wc.replay_dest->log_root = NULL;
		btrfs_put_root(wc.replay_dest);
		btrfs_put_root(log);

		if (ret)
			goto error;
next:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	log_root_tree->log_root = NULL;
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_put_root(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans);
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_free_path(path);
	return ret;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
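	 *
	 * (See the S_ISREG() check in btrfs_log_inode_parent(): the parents
	 * are only skipped when last_unlink_trans is older than the current
	 * transaction.)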
	 */
	mutex_lock(&inode->log_mutex);
	inode->last_unlink_trans = trans->transid;
	mutex_unlock(&inode->log_mutex);

	/*
	 * if this directory was already logged any new
	 * names for this file/dir will get recorded
	 */
	if (dir->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (inode->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Make sure that if someone attempts to fsync the parent directory of a deleted
 * snapshot, it ends up triggering a transaction commit. This is to guarantee
 * that after replaying the log tree of the parent directory's root we will not
 * see the snapshot anymore and at log replay time we will not see any log tree
 * corresponding to the deleted snapshot's root, which could lead to replaying
 * it after replaying the log tree of the parent directory (which would replay
 * the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and tree of tree roots, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 */
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
			struct dentry *parent)
{
	struct btrfs_log_ctx ctx;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (!S_ISDIR(inode->vfs_inode.i_mode))
		inode->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming it
	 * from hasn't been logged, we don't need to log it
	 */
	if (!inode_logged(trans, inode) &&
	    (!old_dir || !inode_logged(trans, old_dir)))
		return;

	/*
	 * If we are doing a rename (old_dir is not NULL) from a directory that
	 * was previously logged, make sure the next log attempt on the directory
	 * is not skipped and logs the inode again. This is because the log may
	 * not currently be authoritative for a range including the old
	 * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
	 * sure after a log replay we do not end up with both the new and old
	 * dentries around (in case the inode is a directory we would have a
	 * directory with two hard links and two inode references for different
	 * parents. The next log attempt of old_dir will happen at
	 * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
	 * below, because we have previously set inode->last_unlink_trans to the
	 * current transaction ID, either here or at btrfs_record_unlink_dir()
	 * in case the inode is a directory.
	 */
	if (old_dir)
		old_dir->logged_trans = 0;

	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
	ctx.logging_new_name = true;
	/*
	 * We don't care about the return value. If we fail to log the new name
	 * then we know the next attempt to sync the log will fall back to a
	 * full transaction commit (due to a call to btrfs_set_log_full_commit()),
	 * so we don't need to worry about getting a log committed that has an
	 * inconsistent state after a rename operation.
	 */
	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}