// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 * LOG_OTHER_INODE and LOG_OTHER_INODE_ALL are used when logging an
 * inode other than the one fsync was called against (a conflicting
 * inode found while logging); they behave like LOG_INODE_EXISTS and
 * LOG_INODE_ALL respectively
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit.
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant: without the extra logging
 * a crash might unlink the old name without recreating the new one.
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * Stages for the tree walking.  The first stage (LOG_WALK_PIN_ONLY)
 * only pins down the blocks we find, the second
 * (LOG_WALK_REPLAY_INODES) makes sure that all the inodes we find in
 * the log are created in the subvolume.
 *
 * The last stages deal with directories and links and extents
 * and all the other fun semantics.
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};
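/*
 * Rough sketch of how the stages are driven: replay performs one walk
 * of the log per stage, in the order above.  First pin every block the
 * log references, then create the logged inodes, then put back the
 * directory index items, and finally replay all the remaining items.
 * Each pass only acts on the items its stage is responsible for (see
 * btrfs_recover_log_trees()).
 */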
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	int ret = 0;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret)
				set_bit(BTRFS_ROOT_HAS_LOG_TREE,
					&tree_root->state);
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (ctx && !ctx->logging_new_name) {
		int index = root->log_transid % 2;

		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}
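/*
 * Sketch of the writer accounting used above and below:
 * start_log_trans(), join_running_log_trans() and btrfs_pin_log_trans()
 * each take a reference on root->log_writers, and btrfs_end_log_trans()
 * drops it and wakes root->log_writer_wait.  The log sync code waits
 * for the writer count to drop before it writes the log tree out, so a
 * log transaction is never committed while someone is still adding
 * items to it.
 */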
/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

static int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
					buf->start + buf->len - 1);
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};
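/*
 * Typical configurations, as a rough sketch: freeing a log tree at
 * transaction commit time walks with ->free set, flushing the log for
 * a sync uses ->write and ->wait (per the field comments above), and
 * crash recovery starts with ->pin set and ->stage == LOG_WALK_PIN_ONLY
 * before moving on to the replay stages (see btrfs_recover_log_trees()
 * and free_log_tree()).
 */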
/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;

		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item,
						     ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}
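/*
 * For example, if the log contains key (257 INODE_ITEM 0) with a
 * generation of zero, the item was only logged so that inode 257
 * exists after replay (typically as part of a directory fsync), and
 * the checks above deliberately keep the subvolume's copy of the inode
 * item instead of overwriting it with the logged one.
 */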
/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the dirty extent: here we did a shallow
		 * file extent item copy and skipped the normal backref
		 * update, modifying the extent tree all by ourselves.
		 * So we need to record the dirty extent for qgroup by hand,
		 * as the owner of the file extent changed from the log tree
		 * (doesn't affect qgroup) to the fs/file tree (affects
		 * qgroup).
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher, will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;

				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_del_csums(trans,
							fs_info->csum_root,
							sums->bytenr,
							sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
							fs_info->csum_root,
							sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}
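/*
 * Note that a directory entry is stored under two keys: one at
 * (dirid DIR_ITEM hash(name)) and one at (dirid DIR_INDEX index), so
 * the helper above only reports a match when both entries exist and
 * point at the expected inode.
 */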
/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir,
						inode, victim_name,
						victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name,
					   (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				/* don't leak the name copy on error */
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
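/*
 * On-disk layout reminder for the two reference flavours parsed above:
 *
 *   key (ino INODE_REF parent_dir_ino)
 *	struct btrfs_inode_ref { index, name_len } followed by the name,
 *	repeated for every name the inode has in that directory
 *
 *   key (ino INODE_EXTREF hash(parent_dir_ino, name))
 *	struct btrfs_inode_extref { parent_objectid, index, name_len }
 *	followed by the name; used when the regular ref item would grow
 *	past what fits in a leaf
 */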
/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists).  For any name that is not in the inode reference item from the
 * log tree, do a proper unlink of that name (that is, remove its entry from
 * the inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
				path->slots[0], name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}

static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
				 BTRFS_I(other_inode), name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out;
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}
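/*
 * Example of the collision handled above: suppose inode 258 was given
 * the name "foo" that inode 259 used to have, and 259 was renamed to
 * something else, all in the logged transaction.  Since 259 has a
 * higher number it is replayed after 258, so when we add 258's link
 * for "foo" we still find 259's stale dentry and must unlink it first;
 * 259's own new names are put back when its references are replayed
 * later.
 */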
/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				  btrfs_ino(BTRFS_I(inode)), ref_index,
				  name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = btrfs_unlink_inode(trans, root,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, root, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, BTRFS_I(inode));
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb,
				    slot, key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root, struct btrfs_inode *inode,
			    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}
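/*
 * For example, an inode with two hard links in the same directory has
 * a single INODE_REF item packing both names, so the inner loops above
 * walk every name inside every (ext)ref item and each one contributes
 * to nlink.
 */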
/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, BTRFS_I(inode));
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}
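/*
 * The fixup entries created above all live under a single well known
 * key space, for example:
 *
 *     key (TREE_LOG_FIXUP ORPHAN_ITEM 257)
 *
 * records that inode 257 needs its link count checked.
 * fixup_inode_link_counts() walks these keys once replay is done,
 * recounts the back references of each recorded inode and corrects
 * i_nlink.
 */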
/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	/*
	 * Check if the inode reference exists in the log for the given name,
	 * inode and parent inode
	 */
	found_key.objectid = log_key.objectid;
	found_key.type = BTRFS_INODE_REF_KEY;
	found_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
*/
1999 ret = 0;
2000 update_size = false;
2001 goto out;
2002 }
2003
2004 found_key.objectid = log_key.objectid;
2005 found_key.type = BTRFS_INODE_EXTREF_KEY;
2006 found_key.offset = key->objectid;
2007 ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2008 name_len);
2009 if (ret < 0) {
2010 goto out;
2011 } else if (ret) {
2012 /* The dentry will be added later. */
2013 ret = 0;
2014 update_size = false;
2015 goto out;
2016 }
2017 btrfs_release_path(path);
2018 ret = insert_one_name(trans, root, key->objectid, key->offset,
2019 name, name_len, &log_key);
2020 if (ret && ret != -ENOENT && ret != -EEXIST)
2021 goto out;
2022 if (!ret)
2023 name_added = true;
2024 update_size = false;
2025 ret = 0;
2026 goto out;
2027 }
2028
2029 /*
2030 * find all the names in a directory item and reconcile them into
2031 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
2032 * one name in a directory item, but the same code gets used for
2033 * both directory index types
2034 */
2035 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *root,
2037 struct btrfs_path *path,
2038 struct extent_buffer *eb, int slot,
2039 struct btrfs_key *key)
2040 {
2041 int ret = 0;
2042 u32 item_size = btrfs_item_size_nr(eb, slot);
2043 struct btrfs_dir_item *di;
2044 int name_len;
2045 unsigned long ptr;
2046 unsigned long ptr_end;
2047 struct btrfs_path *fixup_path = NULL;
2048
2049 ptr = btrfs_item_ptr_offset(eb, slot);
2050 ptr_end = ptr + item_size;
2051 while (ptr < ptr_end) {
2052 di = (struct btrfs_dir_item *)ptr;
2053 name_len = btrfs_dir_name_len(eb, di);
2054 ret = replay_one_name(trans, root, path, eb, di, key);
2055 if (ret < 0)
2056 break;
2057 ptr = (unsigned long)(di + 1);
2058 ptr += name_len;
2059
2060 /*
2061 * If this entry refers to a non-directory (directories cannot
2062 * have a link count > 1) and it was added in a transaction
2063 * that was not committed, make sure we fix up the link count
2064 * of the inode that the entry points to. Otherwise something
2065 * like the following would result in a directory pointing to
2066 * an inode with a wrong link count that does not account for
2067 * this dir entry:
2068 *
2069 * mkdir testdir
2070 * touch testdir/foo
2071 * touch testdir/bar
2072 * sync
2073 *
2074 * ln testdir/bar testdir/bar_link
2075 * ln testdir/foo testdir/foo_link
2076 * xfs_io -c "fsync" testdir/bar
2077 *
2078 * <power failure>
2079 *
2080 * mount fs, log replay happens
2081 *
2082 * File foo would remain with a link count of 1 when it has two
2083 * entries pointing to it in the directory testdir. This would
2084 * make it impossible to ever delete the parent directory, as
2085 * it would result in stale dentries that can never be deleted.
2086 */
2087 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2088 struct btrfs_key di_key;
2089
2090 if (!fixup_path) {
2091 fixup_path = btrfs_alloc_path();
2092 if (!fixup_path) {
2093 ret = -ENOMEM;
2094 break;
2095 }
2096 }
2097
2098 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2099 ret = link_to_fixup_dir(trans, root, fixup_path,
2100 di_key.objectid);
2101 if (ret)
2102 break;
2103 }
2104 ret = 0;
2105 }
2106 btrfs_free_path(fixup_path);
2107 return ret;
2108 }
2109
2110 /*
2111 * directory replay has two parts. There are the standard directory
2112 * items in the log copied from the subvolume, and range items
2113 * created in the log while the subvolume was logged.
2114 * 2115 * The range items tell us which parts of the key space the log 2116 * is authoritative for. During replay, if a key in the subvolume 2117 * directory is in a logged range item, but not actually in the log 2118 * that means it was deleted from the directory before the fsync 2119 * and should be removed. 2120 */ 2121 static noinline int find_dir_range(struct btrfs_root *root, 2122 struct btrfs_path *path, 2123 u64 dirid, int key_type, 2124 u64 *start_ret, u64 *end_ret) 2125 { 2126 struct btrfs_key key; 2127 u64 found_end; 2128 struct btrfs_dir_log_item *item; 2129 int ret; 2130 int nritems; 2131 2132 if (*start_ret == (u64)-1) 2133 return 1; 2134 2135 key.objectid = dirid; 2136 key.type = key_type; 2137 key.offset = *start_ret; 2138 2139 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2140 if (ret < 0) 2141 goto out; 2142 if (ret > 0) { 2143 if (path->slots[0] == 0) 2144 goto out; 2145 path->slots[0]--; 2146 } 2147 if (ret != 0) 2148 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2149 2150 if (key.type != key_type || key.objectid != dirid) { 2151 ret = 1; 2152 goto next; 2153 } 2154 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2155 struct btrfs_dir_log_item); 2156 found_end = btrfs_dir_log_end(path->nodes[0], item); 2157 2158 if (*start_ret >= key.offset && *start_ret <= found_end) { 2159 ret = 0; 2160 *start_ret = key.offset; 2161 *end_ret = found_end; 2162 goto out; 2163 } 2164 ret = 1; 2165 next: 2166 /* check the next slot in the tree to see if it is a valid item */ 2167 nritems = btrfs_header_nritems(path->nodes[0]); 2168 path->slots[0]++; 2169 if (path->slots[0] >= nritems) { 2170 ret = btrfs_next_leaf(root, path); 2171 if (ret) 2172 goto out; 2173 } 2174 2175 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2176 2177 if (key.type != key_type || key.objectid != dirid) { 2178 ret = 1; 2179 goto out; 2180 } 2181 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2182 struct btrfs_dir_log_item); 2183 found_end = btrfs_dir_log_end(path->nodes[0], item); 2184 *start_ret = key.offset; 2185 *end_ret = found_end; 2186 ret = 0; 2187 out: 2188 btrfs_release_path(path); 2189 return ret; 2190 } 2191 2192 /* 2193 * this looks for a given directory item in the log. 
If the directory 2194 * item is not in the log, the item is removed and the inode it points 2195 * to is unlinked 2196 */ 2197 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2198 struct btrfs_root *root, 2199 struct btrfs_root *log, 2200 struct btrfs_path *path, 2201 struct btrfs_path *log_path, 2202 struct inode *dir, 2203 struct btrfs_key *dir_key) 2204 { 2205 int ret; 2206 struct extent_buffer *eb; 2207 int slot; 2208 u32 item_size; 2209 struct btrfs_dir_item *di; 2210 struct btrfs_dir_item *log_di; 2211 int name_len; 2212 unsigned long ptr; 2213 unsigned long ptr_end; 2214 char *name; 2215 struct inode *inode; 2216 struct btrfs_key location; 2217 2218 again: 2219 eb = path->nodes[0]; 2220 slot = path->slots[0]; 2221 item_size = btrfs_item_size_nr(eb, slot); 2222 ptr = btrfs_item_ptr_offset(eb, slot); 2223 ptr_end = ptr + item_size; 2224 while (ptr < ptr_end) { 2225 di = (struct btrfs_dir_item *)ptr; 2226 name_len = btrfs_dir_name_len(eb, di); 2227 name = kmalloc(name_len, GFP_NOFS); 2228 if (!name) { 2229 ret = -ENOMEM; 2230 goto out; 2231 } 2232 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2233 name_len); 2234 log_di = NULL; 2235 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2236 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2237 dir_key->objectid, 2238 name, name_len, 0); 2239 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2240 log_di = btrfs_lookup_dir_index_item(trans, log, 2241 log_path, 2242 dir_key->objectid, 2243 dir_key->offset, 2244 name, name_len, 0); 2245 } 2246 if (!log_di || log_di == ERR_PTR(-ENOENT)) { 2247 btrfs_dir_item_key_to_cpu(eb, di, &location); 2248 btrfs_release_path(path); 2249 btrfs_release_path(log_path); 2250 inode = read_one_inode(root, location.objectid); 2251 if (!inode) { 2252 kfree(name); 2253 return -EIO; 2254 } 2255 2256 ret = link_to_fixup_dir(trans, root, 2257 path, location.objectid); 2258 if (ret) { 2259 kfree(name); 2260 iput(inode); 2261 goto out; 2262 } 2263 2264 inc_nlink(inode); 2265 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2266 BTRFS_I(inode), name, name_len); 2267 if (!ret) 2268 ret = btrfs_run_delayed_items(trans); 2269 kfree(name); 2270 iput(inode); 2271 if (ret) 2272 goto out; 2273 2274 /* there might still be more names under this key 2275 * check and repeat if required 2276 */ 2277 ret = btrfs_search_slot(NULL, root, dir_key, path, 2278 0, 0); 2279 if (ret == 0) 2280 goto again; 2281 ret = 0; 2282 goto out; 2283 } else if (IS_ERR(log_di)) { 2284 kfree(name); 2285 return PTR_ERR(log_di); 2286 } 2287 btrfs_release_path(log_path); 2288 kfree(name); 2289 2290 ptr = (unsigned long)(di + 1); 2291 ptr += name_len; 2292 } 2293 ret = 0; 2294 out: 2295 btrfs_release_path(path); 2296 btrfs_release_path(log_path); 2297 return ret; 2298 } 2299 2300 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2301 struct btrfs_root *root, 2302 struct btrfs_root *log, 2303 struct btrfs_path *path, 2304 const u64 ino) 2305 { 2306 struct btrfs_key search_key; 2307 struct btrfs_path *log_path; 2308 int i; 2309 int nritems; 2310 int ret; 2311 2312 log_path = btrfs_alloc_path(); 2313 if (!log_path) 2314 return -ENOMEM; 2315 2316 search_key.objectid = ino; 2317 search_key.type = BTRFS_XATTR_ITEM_KEY; 2318 search_key.offset = 0; 2319 again: 2320 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2321 if (ret < 0) 2322 goto out; 2323 process_leaf: 2324 nritems = btrfs_header_nritems(path->nodes[0]); 2325 for (i = path->slots[0]; i < nritems; i++) { 2326 struct btrfs_key key; 
2327 struct btrfs_dir_item *di;
2328 struct btrfs_dir_item *log_di;
2329 u32 total_size;
2330 u32 cur;
2331
2332 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2333 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2334 ret = 0;
2335 goto out;
2336 }
2337
2338 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2339 total_size = btrfs_item_size_nr(path->nodes[0], i);
2340 cur = 0;
2341 while (cur < total_size) {
2342 u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2343 u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2344 u32 this_len = sizeof(*di) + name_len + data_len;
2345 char *name;
2346
2347 name = kmalloc(name_len, GFP_NOFS);
2348 if (!name) {
2349 ret = -ENOMEM;
2350 goto out;
2351 }
2352 read_extent_buffer(path->nodes[0], name,
2353 (unsigned long)(di + 1), name_len);
2354
2355 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2356 name, name_len, 0);
2357 btrfs_release_path(log_path);
2358 if (!log_di) {
2359 /* Doesn't exist in log tree, so delete it. */
2360 btrfs_release_path(path);
2361 di = btrfs_lookup_xattr(trans, root, path, ino,
2362 name, name_len, -1);
2363 kfree(name);
2364 if (IS_ERR(di)) {
2365 ret = PTR_ERR(di);
2366 goto out;
2367 }
2368 ASSERT(di);
2369 ret = btrfs_delete_one_dir_name(trans, root,
2370 path, di);
2371 if (ret)
2372 goto out;
2373 btrfs_release_path(path);
2374 search_key = key;
2375 goto again;
2376 }
2377 kfree(name);
2378 if (IS_ERR(log_di)) {
2379 ret = PTR_ERR(log_di);
2380 goto out;
2381 }
2382 cur += this_len;
2383 di = (struct btrfs_dir_item *)((char *)di + this_len);
2384 }
2385 }
2386 ret = btrfs_next_leaf(root, path);
2387 if (ret > 0)
2388 ret = 0;
2389 else if (ret == 0)
2390 goto process_leaf;
2391 out:
2392 btrfs_free_path(log_path);
2393 btrfs_release_path(path);
2394 return ret;
2395 }
2396
2397
2398 /*
2399 * deletion replay happens before we copy any new directory items
2400 * out of the log or out of backreferences from inodes. It
2401 * scans the log to find ranges of keys that the log is authoritative
2402 * for, and then scans the directory to find items in those ranges that
2403 * are not present in the log.
2404 *
2405 * Anything we don't find in the log is unlinked and removed from the
2406 * directory.
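 *
 * An illustrative sequence, in the style of the other examples in this
 * file (hypothetical names, not from a real trace):
 *
 * mkdir testdir
 * touch testdir/foo
 * touch testdir/bar
 * sync
 * unlink testdir/bar
 * xfs_io -c "fsync" testdir
 * <power failure>
 *
 * The log's range items cover testdir's key space, but only the entry for
 * "foo" is present in the log. During replay the subvolume copy of testdir
 * still contains "bar"; it falls inside an authoritative range yet is
 * missing from the log, so it is unlinked.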
2407 */ 2408 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2409 struct btrfs_root *root, 2410 struct btrfs_root *log, 2411 struct btrfs_path *path, 2412 u64 dirid, int del_all) 2413 { 2414 u64 range_start; 2415 u64 range_end; 2416 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2417 int ret = 0; 2418 struct btrfs_key dir_key; 2419 struct btrfs_key found_key; 2420 struct btrfs_path *log_path; 2421 struct inode *dir; 2422 2423 dir_key.objectid = dirid; 2424 dir_key.type = BTRFS_DIR_ITEM_KEY; 2425 log_path = btrfs_alloc_path(); 2426 if (!log_path) 2427 return -ENOMEM; 2428 2429 dir = read_one_inode(root, dirid); 2430 /* it isn't an error if the inode isn't there, that can happen 2431 * because we replay the deletes before we copy in the inode item 2432 * from the log 2433 */ 2434 if (!dir) { 2435 btrfs_free_path(log_path); 2436 return 0; 2437 } 2438 again: 2439 range_start = 0; 2440 range_end = 0; 2441 while (1) { 2442 if (del_all) 2443 range_end = (u64)-1; 2444 else { 2445 ret = find_dir_range(log, path, dirid, key_type, 2446 &range_start, &range_end); 2447 if (ret != 0) 2448 break; 2449 } 2450 2451 dir_key.offset = range_start; 2452 while (1) { 2453 int nritems; 2454 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2455 0, 0); 2456 if (ret < 0) 2457 goto out; 2458 2459 nritems = btrfs_header_nritems(path->nodes[0]); 2460 if (path->slots[0] >= nritems) { 2461 ret = btrfs_next_leaf(root, path); 2462 if (ret == 1) 2463 break; 2464 else if (ret < 0) 2465 goto out; 2466 } 2467 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2468 path->slots[0]); 2469 if (found_key.objectid != dirid || 2470 found_key.type != dir_key.type) 2471 goto next_type; 2472 2473 if (found_key.offset > range_end) 2474 break; 2475 2476 ret = check_item_in_log(trans, root, log, path, 2477 log_path, dir, 2478 &found_key); 2479 if (ret) 2480 goto out; 2481 if (found_key.offset == (u64)-1) 2482 break; 2483 dir_key.offset = found_key.offset + 1; 2484 } 2485 btrfs_release_path(path); 2486 if (range_end == (u64)-1) 2487 break; 2488 range_start = range_end + 1; 2489 } 2490 2491 next_type: 2492 ret = 0; 2493 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2494 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2495 dir_key.type = BTRFS_DIR_INDEX_KEY; 2496 btrfs_release_path(path); 2497 goto again; 2498 } 2499 out: 2500 btrfs_release_path(path); 2501 btrfs_free_path(log_path); 2502 iput(dir); 2503 return ret; 2504 } 2505 2506 /* 2507 * the process_func used to replay items from the log tree. This 2508 * gets called in two different stages. The first stage just looks 2509 * for inodes and makes sure they are all copied into the subvolume. 2510 * 2511 * The second stage copies all the other item types from the log into 2512 * the subvolume. The two stage approach is slower, but gets rid of 2513 * lots of complexity around inodes referencing other inodes that exist 2514 * only in the log (references come from either directory items or inode 2515 * back refs). 
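 *
 * For example, the log may contain a dir index entry whose target inode
 * item sits in a later leaf of the same log tree. Creating all inodes
 * first (the LOG_WALK_REPLAY_INODES stage) guarantees that the entry's
 * target exists by the time the entry itself is replayed.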
2516 */ 2517 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2518 struct walk_control *wc, u64 gen, int level) 2519 { 2520 int nritems; 2521 struct btrfs_path *path; 2522 struct btrfs_root *root = wc->replay_dest; 2523 struct btrfs_key key; 2524 int i; 2525 int ret; 2526 2527 ret = btrfs_read_buffer(eb, gen, level, NULL); 2528 if (ret) 2529 return ret; 2530 2531 level = btrfs_header_level(eb); 2532 2533 if (level != 0) 2534 return 0; 2535 2536 path = btrfs_alloc_path(); 2537 if (!path) 2538 return -ENOMEM; 2539 2540 nritems = btrfs_header_nritems(eb); 2541 for (i = 0; i < nritems; i++) { 2542 btrfs_item_key_to_cpu(eb, &key, i); 2543 2544 /* inode keys are done during the first stage */ 2545 if (key.type == BTRFS_INODE_ITEM_KEY && 2546 wc->stage == LOG_WALK_REPLAY_INODES) { 2547 struct btrfs_inode_item *inode_item; 2548 u32 mode; 2549 2550 inode_item = btrfs_item_ptr(eb, i, 2551 struct btrfs_inode_item); 2552 /* 2553 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2554 * and never got linked before the fsync, skip it, as 2555 * replaying it is pointless since it would be deleted 2556 * later. We skip logging tmpfiles, but it's always 2557 * possible we are replaying a log created with a kernel 2558 * that used to log tmpfiles. 2559 */ 2560 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2561 wc->ignore_cur_inode = true; 2562 continue; 2563 } else { 2564 wc->ignore_cur_inode = false; 2565 } 2566 ret = replay_xattr_deletes(wc->trans, root, log, 2567 path, key.objectid); 2568 if (ret) 2569 break; 2570 mode = btrfs_inode_mode(eb, inode_item); 2571 if (S_ISDIR(mode)) { 2572 ret = replay_dir_deletes(wc->trans, 2573 root, log, path, key.objectid, 0); 2574 if (ret) 2575 break; 2576 } 2577 ret = overwrite_item(wc->trans, root, path, 2578 eb, i, &key); 2579 if (ret) 2580 break; 2581 2582 /* 2583 * Before replaying extents, truncate the inode to its 2584 * size. We need to do it now and not after log replay 2585 * because before an fsync we can have prealloc extents 2586 * added beyond the inode's i_size. If we did it after, 2587 * through orphan cleanup for example, we would drop 2588 * those prealloc extents just after replaying them. 2589 */ 2590 if (S_ISREG(mode)) { 2591 struct btrfs_drop_extents_args drop_args = { 0 }; 2592 struct inode *inode; 2593 u64 from; 2594 2595 inode = read_one_inode(root, key.objectid); 2596 if (!inode) { 2597 ret = -EIO; 2598 break; 2599 } 2600 from = ALIGN(i_size_read(inode), 2601 root->fs_info->sectorsize); 2602 drop_args.start = from; 2603 drop_args.end = (u64)-1; 2604 drop_args.drop_cache = true; 2605 ret = btrfs_drop_extents(wc->trans, root, 2606 BTRFS_I(inode), 2607 &drop_args); 2608 if (!ret) { 2609 inode_sub_bytes(inode, 2610 drop_args.bytes_found); 2611 /* Update the inode's nbytes. 
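inode_sub_bytes() above only adjusted the in-memory counter, so persist the new value here.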
*/ 2612 ret = btrfs_update_inode(wc->trans, 2613 root, BTRFS_I(inode)); 2614 } 2615 iput(inode); 2616 if (ret) 2617 break; 2618 } 2619 2620 ret = link_to_fixup_dir(wc->trans, root, 2621 path, key.objectid); 2622 if (ret) 2623 break; 2624 } 2625 2626 if (wc->ignore_cur_inode) 2627 continue; 2628 2629 if (key.type == BTRFS_DIR_INDEX_KEY && 2630 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2631 ret = replay_one_dir_item(wc->trans, root, path, 2632 eb, i, &key); 2633 if (ret) 2634 break; 2635 } 2636 2637 if (wc->stage < LOG_WALK_REPLAY_ALL) 2638 continue; 2639 2640 /* these keys are simply copied */ 2641 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2642 ret = overwrite_item(wc->trans, root, path, 2643 eb, i, &key); 2644 if (ret) 2645 break; 2646 } else if (key.type == BTRFS_INODE_REF_KEY || 2647 key.type == BTRFS_INODE_EXTREF_KEY) { 2648 ret = add_inode_ref(wc->trans, root, log, path, 2649 eb, i, &key); 2650 if (ret && ret != -ENOENT) 2651 break; 2652 ret = 0; 2653 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2654 ret = replay_one_extent(wc->trans, root, path, 2655 eb, i, &key); 2656 if (ret) 2657 break; 2658 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2659 ret = replay_one_dir_item(wc->trans, root, path, 2660 eb, i, &key); 2661 if (ret) 2662 break; 2663 } 2664 } 2665 btrfs_free_path(path); 2666 return ret; 2667 } 2668 2669 /* 2670 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2671 */ 2672 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2673 { 2674 struct btrfs_block_group *cache; 2675 2676 cache = btrfs_lookup_block_group(fs_info, start); 2677 if (!cache) { 2678 btrfs_err(fs_info, "unable to find block group for %llu", start); 2679 return; 2680 } 2681 2682 spin_lock(&cache->space_info->lock); 2683 spin_lock(&cache->lock); 2684 cache->reserved -= fs_info->nodesize; 2685 cache->space_info->bytes_reserved -= fs_info->nodesize; 2686 spin_unlock(&cache->lock); 2687 spin_unlock(&cache->space_info->lock); 2688 2689 btrfs_put_block_group(cache); 2690 } 2691 2692 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2693 struct btrfs_root *root, 2694 struct btrfs_path *path, int *level, 2695 struct walk_control *wc) 2696 { 2697 struct btrfs_fs_info *fs_info = root->fs_info; 2698 u64 bytenr; 2699 u64 ptr_gen; 2700 struct extent_buffer *next; 2701 struct extent_buffer *cur; 2702 u32 blocksize; 2703 int ret = 0; 2704 2705 while (*level > 0) { 2706 struct btrfs_key first_key; 2707 2708 cur = path->nodes[*level]; 2709 2710 WARN_ON(btrfs_header_level(cur) != *level); 2711 2712 if (path->slots[*level] >= 2713 btrfs_header_nritems(cur)) 2714 break; 2715 2716 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2717 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2718 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2719 blocksize = fs_info->nodesize; 2720 2721 next = btrfs_find_create_tree_block(fs_info, bytenr, 2722 btrfs_header_owner(cur), 2723 *level - 1); 2724 if (IS_ERR(next)) 2725 return PTR_ERR(next); 2726 2727 if (*level == 1) { 2728 ret = wc->process_func(root, next, wc, ptr_gen, 2729 *level - 1); 2730 if (ret) { 2731 free_extent_buffer(next); 2732 return ret; 2733 } 2734 2735 path->slots[*level]++; 2736 if (wc->free) { 2737 ret = btrfs_read_buffer(next, ptr_gen, 2738 *level - 1, &first_key); 2739 if (ret) { 2740 free_extent_buffer(next); 2741 return ret; 2742 } 2743 2744 if (trans) { 2745 btrfs_tree_lock(next); 2746 btrfs_clean_tree_block(next); 2747 btrfs_wait_tree_block_writeback(next); 2748 
btrfs_tree_unlock(next); 2749 ret = btrfs_pin_reserved_extent(trans, 2750 bytenr, blocksize); 2751 if (ret) { 2752 free_extent_buffer(next); 2753 return ret; 2754 } 2755 } else { 2756 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2757 clear_extent_buffer_dirty(next); 2758 unaccount_log_buffer(fs_info, bytenr); 2759 } 2760 } 2761 free_extent_buffer(next); 2762 continue; 2763 } 2764 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2765 if (ret) { 2766 free_extent_buffer(next); 2767 return ret; 2768 } 2769 2770 if (path->nodes[*level-1]) 2771 free_extent_buffer(path->nodes[*level-1]); 2772 path->nodes[*level-1] = next; 2773 *level = btrfs_header_level(next); 2774 path->slots[*level] = 0; 2775 cond_resched(); 2776 } 2777 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2778 2779 cond_resched(); 2780 return 0; 2781 } 2782 2783 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2784 struct btrfs_root *root, 2785 struct btrfs_path *path, int *level, 2786 struct walk_control *wc) 2787 { 2788 struct btrfs_fs_info *fs_info = root->fs_info; 2789 int i; 2790 int slot; 2791 int ret; 2792 2793 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2794 slot = path->slots[i]; 2795 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2796 path->slots[i]++; 2797 *level = i; 2798 WARN_ON(*level == 0); 2799 return 0; 2800 } else { 2801 ret = wc->process_func(root, path->nodes[*level], wc, 2802 btrfs_header_generation(path->nodes[*level]), 2803 *level); 2804 if (ret) 2805 return ret; 2806 2807 if (wc->free) { 2808 struct extent_buffer *next; 2809 2810 next = path->nodes[*level]; 2811 2812 if (trans) { 2813 btrfs_tree_lock(next); 2814 btrfs_clean_tree_block(next); 2815 btrfs_wait_tree_block_writeback(next); 2816 btrfs_tree_unlock(next); 2817 ret = btrfs_pin_reserved_extent(trans, 2818 path->nodes[*level]->start, 2819 path->nodes[*level]->len); 2820 if (ret) 2821 return ret; 2822 } else { 2823 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2824 clear_extent_buffer_dirty(next); 2825 2826 unaccount_log_buffer(fs_info, 2827 path->nodes[*level]->start); 2828 } 2829 } 2830 free_extent_buffer(path->nodes[*level]); 2831 path->nodes[*level] = NULL; 2832 *level = i + 1; 2833 } 2834 } 2835 return 1; 2836 } 2837 2838 /* 2839 * drop the reference count on the tree rooted at 'snap'. This traverses 2840 * the tree freeing any blocks that have a ref count of zero after being 2841 * decremented. 2842 */ 2843 static int walk_log_tree(struct btrfs_trans_handle *trans, 2844 struct btrfs_root *log, struct walk_control *wc) 2845 { 2846 struct btrfs_fs_info *fs_info = log->fs_info; 2847 int ret = 0; 2848 int wret; 2849 int level; 2850 struct btrfs_path *path; 2851 int orig_level; 2852 2853 path = btrfs_alloc_path(); 2854 if (!path) 2855 return -ENOMEM; 2856 2857 level = btrfs_header_level(log->node); 2858 orig_level = level; 2859 path->nodes[level] = log->node; 2860 atomic_inc(&log->node->refs); 2861 path->slots[level] = 0; 2862 2863 while (1) { 2864 wret = walk_down_log_tree(trans, log, path, &level, wc); 2865 if (wret > 0) 2866 break; 2867 if (wret < 0) { 2868 ret = wret; 2869 goto out; 2870 } 2871 2872 wret = walk_up_log_tree(trans, log, path, &level, wc); 2873 if (wret > 0) 2874 break; 2875 if (wret < 0) { 2876 ret = wret; 2877 goto out; 2878 } 2879 } 2880 2881 /* was the root node processed? 
if not, catch it here */
2882 if (path->nodes[orig_level]) {
2883 ret = wc->process_func(log, path->nodes[orig_level], wc,
2884 btrfs_header_generation(path->nodes[orig_level]),
2885 orig_level);
2886 if (ret)
2887 goto out;
2888 if (wc->free) {
2889 struct extent_buffer *next;
2890
2891 next = path->nodes[orig_level];
2892
2893 if (trans) {
2894 btrfs_tree_lock(next);
2895 btrfs_clean_tree_block(next);
2896 btrfs_wait_tree_block_writeback(next);
2897 btrfs_tree_unlock(next);
2898 ret = btrfs_pin_reserved_extent(trans,
2899 next->start, next->len);
2900 if (ret)
2901 goto out;
2902 } else {
2903 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2904 clear_extent_buffer_dirty(next);
2905 unaccount_log_buffer(fs_info, next->start);
2906 }
2907 }
2908 }
2909
2910 out:
2911 btrfs_free_path(path);
2912 return ret;
2913 }
2914
2915 /*
2916 * helper function to update the item for a given subvolume's log root
2917 * in the tree of log roots
2918 */
2919 static int update_log_root(struct btrfs_trans_handle *trans,
2920 struct btrfs_root *log,
2921 struct btrfs_root_item *root_item)
2922 {
2923 struct btrfs_fs_info *fs_info = log->fs_info;
2924 int ret;
2925
2926 if (log->log_transid == 1) {
2927 /* insert root item on the first sync */
2928 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2929 &log->root_key, root_item);
2930 } else {
2931 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2932 &log->root_key, root_item);
2933 }
2934 return ret;
2935 }
2936
2937 static void wait_log_commit(struct btrfs_root *root, int transid)
2938 {
2939 DEFINE_WAIT(wait);
2940 int index = transid % 2;
2941
2942 /*
2943 * we only allow two pending log transactions at a time,
2944 * so we know that if ours is more than 2 older than the
2945 * current transaction, we're done
2946 */
2947 for (;;) {
2948 prepare_to_wait(&root->log_commit_wait[index],
2949 &wait, TASK_UNINTERRUPTIBLE);
2950
2951 if (!(root->log_transid_committed < transid &&
2952 atomic_read(&root->log_commit[index])))
2953 break;
2954
2955 mutex_unlock(&root->log_mutex);
2956 schedule();
2957 mutex_lock(&root->log_mutex);
2958 }
2959 finish_wait(&root->log_commit_wait[index], &wait);
2960 }
2961
2962 static void wait_for_writer(struct btrfs_root *root)
2963 {
2964 DEFINE_WAIT(wait);
2965
2966 for (;;) {
2967 prepare_to_wait(&root->log_writer_wait, &wait,
2968 TASK_UNINTERRUPTIBLE);
2969 if (!atomic_read(&root->log_writers))
2970 break;
2971
2972 mutex_unlock(&root->log_mutex);
2973 schedule();
2974 mutex_lock(&root->log_mutex);
2975 }
2976 finish_wait(&root->log_writer_wait, &wait);
2977 }
2978
2979 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2980 struct btrfs_log_ctx *ctx)
2981 {
2982 if (!ctx)
2983 return;
2984
2985 mutex_lock(&root->log_mutex);
2986 list_del_init(&ctx->list);
2987 mutex_unlock(&root->log_mutex);
2988 }
2989
2990 /*
2991 * Invoked in log mutex context, or with the guarantee that no other
2992 * task can access the list.
2993 */
2994 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2995 int index, int error)
2996 {
2997 struct btrfs_log_ctx *ctx;
2998 struct btrfs_log_ctx *safe;
2999
3000 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3001 list_del_init(&ctx->list);
3002 ctx->log_ret = error;
3003 }
3004
3005 INIT_LIST_HEAD(&root->log_ctxs[index]);
3006 }
3007
3008 /*
3009 * btrfs_sync_log sends a given tree log down to the disk and
3010 * updates the super blocks to record it.
When this call is done, 3011 * you know that any inodes previously logged are safely on disk only 3012 * if it returns 0. 3013 * 3014 * Any other return value means you need to call btrfs_commit_transaction. 3015 * Some of the edge cases for fsyncing directories that have had unlinks 3016 * or renames done in the past mean that sometimes the only safe 3017 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3018 * that has happened. 3019 */ 3020 int btrfs_sync_log(struct btrfs_trans_handle *trans, 3021 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3022 { 3023 int index1; 3024 int index2; 3025 int mark; 3026 int ret; 3027 struct btrfs_fs_info *fs_info = root->fs_info; 3028 struct btrfs_root *log = root->log_root; 3029 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3030 struct btrfs_root_item new_root_item; 3031 int log_transid = 0; 3032 struct btrfs_log_ctx root_log_ctx; 3033 struct blk_plug plug; 3034 u64 log_root_start; 3035 u64 log_root_level; 3036 3037 mutex_lock(&root->log_mutex); 3038 log_transid = ctx->log_transid; 3039 if (root->log_transid_committed >= log_transid) { 3040 mutex_unlock(&root->log_mutex); 3041 return ctx->log_ret; 3042 } 3043 3044 index1 = log_transid % 2; 3045 if (atomic_read(&root->log_commit[index1])) { 3046 wait_log_commit(root, log_transid); 3047 mutex_unlock(&root->log_mutex); 3048 return ctx->log_ret; 3049 } 3050 ASSERT(log_transid == root->log_transid); 3051 atomic_set(&root->log_commit[index1], 1); 3052 3053 /* wait for previous tree log sync to complete */ 3054 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3055 wait_log_commit(root, log_transid - 1); 3056 3057 while (1) { 3058 int batch = atomic_read(&root->log_batch); 3059 /* when we're on an ssd, just kick the log commit out */ 3060 if (!btrfs_test_opt(fs_info, SSD) && 3061 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3062 mutex_unlock(&root->log_mutex); 3063 schedule_timeout_uninterruptible(1); 3064 mutex_lock(&root->log_mutex); 3065 } 3066 wait_for_writer(root); 3067 if (batch == atomic_read(&root->log_batch)) 3068 break; 3069 } 3070 3071 /* bail out if we need to do a full commit */ 3072 if (btrfs_need_log_full_commit(trans)) { 3073 ret = -EAGAIN; 3074 mutex_unlock(&root->log_mutex); 3075 goto out; 3076 } 3077 3078 if (log_transid % 2 == 0) 3079 mark = EXTENT_DIRTY; 3080 else 3081 mark = EXTENT_NEW; 3082 3083 /* we start IO on all the marked extents here, but we don't actually 3084 * wait for them until later. 3085 */ 3086 blk_start_plug(&plug); 3087 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3088 if (ret) { 3089 blk_finish_plug(&plug); 3090 btrfs_abort_transaction(trans, ret); 3091 btrfs_set_log_full_commit(trans); 3092 mutex_unlock(&root->log_mutex); 3093 goto out; 3094 } 3095 3096 /* 3097 * We _must_ update under the root->log_mutex in order to make sure we 3098 * have a consistent view of the log root we are trying to commit at 3099 * this moment. 3100 * 3101 * We _must_ copy this into a local copy, because we are not holding the 3102 * log_root_tree->log_mutex yet. This is important because when we 3103 * commit the log_root_tree we must have a consistent view of the 3104 * log_root_tree when we update the super block to point at the 3105 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3106 * with the commit and possibly point at the new block which we may not 3107 * have written out. 
3108 */ 3109 btrfs_set_root_node(&log->root_item, log->node); 3110 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3111 3112 root->log_transid++; 3113 log->log_transid = root->log_transid; 3114 root->log_start_pid = 0; 3115 /* 3116 * IO has been started, blocks of the log tree have WRITTEN flag set 3117 * in their headers. new modifications of the log will be written to 3118 * new positions. so it's safe to allow log writers to go in. 3119 */ 3120 mutex_unlock(&root->log_mutex); 3121 3122 btrfs_init_log_ctx(&root_log_ctx, NULL); 3123 3124 mutex_lock(&log_root_tree->log_mutex); 3125 3126 index2 = log_root_tree->log_transid % 2; 3127 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3128 root_log_ctx.log_transid = log_root_tree->log_transid; 3129 3130 /* 3131 * Now we are safe to update the log_root_tree because we're under the 3132 * log_mutex, and we're a current writer so we're holding the commit 3133 * open until we drop the log_mutex. 3134 */ 3135 ret = update_log_root(trans, log, &new_root_item); 3136 if (ret) { 3137 if (!list_empty(&root_log_ctx.list)) 3138 list_del_init(&root_log_ctx.list); 3139 3140 blk_finish_plug(&plug); 3141 btrfs_set_log_full_commit(trans); 3142 3143 if (ret != -ENOSPC) { 3144 btrfs_abort_transaction(trans, ret); 3145 mutex_unlock(&log_root_tree->log_mutex); 3146 goto out; 3147 } 3148 btrfs_wait_tree_log_extents(log, mark); 3149 mutex_unlock(&log_root_tree->log_mutex); 3150 ret = -EAGAIN; 3151 goto out; 3152 } 3153 3154 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3155 blk_finish_plug(&plug); 3156 list_del_init(&root_log_ctx.list); 3157 mutex_unlock(&log_root_tree->log_mutex); 3158 ret = root_log_ctx.log_ret; 3159 goto out; 3160 } 3161 3162 index2 = root_log_ctx.log_transid % 2; 3163 if (atomic_read(&log_root_tree->log_commit[index2])) { 3164 blk_finish_plug(&plug); 3165 ret = btrfs_wait_tree_log_extents(log, mark); 3166 wait_log_commit(log_root_tree, 3167 root_log_ctx.log_transid); 3168 mutex_unlock(&log_root_tree->log_mutex); 3169 if (!ret) 3170 ret = root_log_ctx.log_ret; 3171 goto out; 3172 } 3173 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3174 atomic_set(&log_root_tree->log_commit[index2], 1); 3175 3176 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3177 wait_log_commit(log_root_tree, 3178 root_log_ctx.log_transid - 1); 3179 } 3180 3181 /* 3182 * now that we've moved on to the tree of log tree roots, 3183 * check the full commit flag again 3184 */ 3185 if (btrfs_need_log_full_commit(trans)) { 3186 blk_finish_plug(&plug); 3187 btrfs_wait_tree_log_extents(log, mark); 3188 mutex_unlock(&log_root_tree->log_mutex); 3189 ret = -EAGAIN; 3190 goto out_wake_log_root; 3191 } 3192 3193 ret = btrfs_write_marked_extents(fs_info, 3194 &log_root_tree->dirty_log_pages, 3195 EXTENT_DIRTY | EXTENT_NEW); 3196 blk_finish_plug(&plug); 3197 if (ret) { 3198 btrfs_set_log_full_commit(trans); 3199 btrfs_abort_transaction(trans, ret); 3200 mutex_unlock(&log_root_tree->log_mutex); 3201 goto out_wake_log_root; 3202 } 3203 ret = btrfs_wait_tree_log_extents(log, mark); 3204 if (!ret) 3205 ret = btrfs_wait_tree_log_extents(log_root_tree, 3206 EXTENT_NEW | EXTENT_DIRTY); 3207 if (ret) { 3208 btrfs_set_log_full_commit(trans); 3209 mutex_unlock(&log_root_tree->log_mutex); 3210 goto out_wake_log_root; 3211 } 3212 3213 log_root_start = log_root_tree->node->start; 3214 log_root_level = btrfs_header_level(log_root_tree->node); 3215 log_root_tree->log_transid++; 3216 
mutex_unlock(&log_root_tree->log_mutex);
3217
3218 /*
3219 * Here we are guaranteed that nobody is going to write the superblock
3220 * for the current transaction before us, and that we will not write
3221 * our superblock before the previous transaction finishes its commit
3222 * and writes its superblock, because:
3223 *
3224 * 1) We are holding a handle on the current transaction, so nobody
3225 * can commit it until we release the handle;
3226 *
3227 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3228 * if the previous transaction is still committing, and hasn't yet
3229 * written its superblock, we wait for it to do so, because a
3230 * transaction commit acquires the tree_log_mutex when the commit
3231 * begins and releases it only after writing its superblock.
3232 */
3233 mutex_lock(&fs_info->tree_log_mutex);
3234 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3235 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3236 ret = write_all_supers(fs_info, 1);
3237 mutex_unlock(&fs_info->tree_log_mutex);
3238 if (ret) {
3239 btrfs_set_log_full_commit(trans);
3240 btrfs_abort_transaction(trans, ret);
3241 goto out_wake_log_root;
3242 }
3243
3244 mutex_lock(&root->log_mutex);
3245 if (root->last_log_commit < log_transid)
3246 root->last_log_commit = log_transid;
3247 mutex_unlock(&root->log_mutex);
3248
3249 out_wake_log_root:
3250 mutex_lock(&log_root_tree->log_mutex);
3251 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3252
3253 log_root_tree->log_transid_committed++;
3254 atomic_set(&log_root_tree->log_commit[index2], 0);
3255 mutex_unlock(&log_root_tree->log_mutex);
3256
3257 /*
3258 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3259 * all the updates above are seen by the woken threads. It might not be
3260 * necessary, but proving that seems to be hard.
3261 */
3262 cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3263 out:
3264 mutex_lock(&root->log_mutex);
3265 btrfs_remove_all_log_ctxs(root, index1, ret);
3266 root->log_transid_committed++;
3267 atomic_set(&root->log_commit[index1], 0);
3268 mutex_unlock(&root->log_mutex);
3269
3270 /*
3271 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3272 * all the updates above are seen by the woken threads. It might not be
3273 * necessary, but proving that seems to be hard.
3274 */
3275 cond_wake_up(&root->log_commit_wait[index1]);
3276 return ret;
3277 }
3278
3279 static void free_log_tree(struct btrfs_trans_handle *trans,
3280 struct btrfs_root *log)
3281 {
3282 int ret;
3283 struct walk_control wc = {
3284 .free = 1,
3285 .process_func = process_one_buffer
3286 };
3287
3288 ret = walk_log_tree(trans, log, &wc);
3289 if (ret) {
3290 if (trans)
3291 btrfs_abort_transaction(trans, ret);
3292 else
3293 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3294 }
3295
3296 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3297 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3298 extent_io_tree_release(&log->log_csum_range);
3299 btrfs_put_root(log);
3300 }
3301
3302 /*
3303 * free all the extents used by the tree log.
This should be called
3304 * at commit time of the full transaction
3305 */
3306 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3307 {
3308 if (root->log_root) {
3309 free_log_tree(trans, root->log_root);
3310 root->log_root = NULL;
3311 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3312 }
3313 return 0;
3314 }
3315
3316 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3317 struct btrfs_fs_info *fs_info)
3318 {
3319 if (fs_info->log_root_tree) {
3320 free_log_tree(trans, fs_info->log_root_tree);
3321 fs_info->log_root_tree = NULL;
3322 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3323 }
3324 return 0;
3325 }
3326
3327 /*
3328 * Check if an inode was logged in the current transaction. We can't always rely
3329 * on an inode's logged_trans value, because it's an in-memory only field and
3330 * therefore not persisted. This means that its value is lost if the inode gets
3331 * evicted and loaded again from disk (in which case it has a value of 0, which
3332 * is certainly smaller than any possible transaction ID). When that happens,
3333 * the full_sync flag is set in the inode's runtime flags, so in that case we
3334 * assume eviction happened and ignore the logged_trans value, assuming the
3335 * worst case, that the inode was logged before in the current transaction.
3336 */
3337 static bool inode_logged(struct btrfs_trans_handle *trans,
3338 struct btrfs_inode *inode)
3339 {
3340 if (inode->logged_trans == trans->transid)
3341 return true;
3342
3343 if (inode->last_trans == trans->transid &&
3344 test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3345 !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3346 return true;
3347
3348 return false;
3349 }
3350
3351 /*
3352 * If both a file and directory are logged, and unlinks or renames are
3353 * mixed in, we have a few interesting corners:
3354 *
3355 * create file X in dir Y
3356 * link file X to X.link in dir Y
3357 * fsync file X
3358 * unlink file X but leave X.link
3359 * fsync dir Y
3360 *
3361 * After a crash we would expect only X.link to exist. But file X
3362 * didn't get fsync'd again so the log has back refs for X and X.link.
3363 *
3364 * We solve this by removing directory entries and inode backrefs from the
3365 * log when a file that was logged in the current transaction is
3366 * unlinked. Any later fsync will include the updated log entries, and
3367 * we'll be able to reconstruct the proper directory items from backrefs.
3368 *
3369 * This optimization allows us to avoid relogging the entire inode
3370 * or the entire directory.
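 *
 * btrfs_del_dir_entries_in_log() below removes the directory entries from
 * the log, and btrfs_del_inode_ref_in_log() removes the matching inode
 * back reference.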
3371 */ 3372 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3373 struct btrfs_root *root, 3374 const char *name, int name_len, 3375 struct btrfs_inode *dir, u64 index) 3376 { 3377 struct btrfs_root *log; 3378 struct btrfs_dir_item *di; 3379 struct btrfs_path *path; 3380 int ret; 3381 int err = 0; 3382 int bytes_del = 0; 3383 u64 dir_ino = btrfs_ino(dir); 3384 3385 if (!inode_logged(trans, dir)) 3386 return 0; 3387 3388 ret = join_running_log_trans(root); 3389 if (ret) 3390 return 0; 3391 3392 mutex_lock(&dir->log_mutex); 3393 3394 log = root->log_root; 3395 path = btrfs_alloc_path(); 3396 if (!path) { 3397 err = -ENOMEM; 3398 goto out_unlock; 3399 } 3400 3401 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3402 name, name_len, -1); 3403 if (IS_ERR(di)) { 3404 err = PTR_ERR(di); 3405 goto fail; 3406 } 3407 if (di) { 3408 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3409 bytes_del += name_len; 3410 if (ret) { 3411 err = ret; 3412 goto fail; 3413 } 3414 } 3415 btrfs_release_path(path); 3416 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3417 index, name, name_len, -1); 3418 if (IS_ERR(di)) { 3419 err = PTR_ERR(di); 3420 goto fail; 3421 } 3422 if (di) { 3423 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3424 bytes_del += name_len; 3425 if (ret) { 3426 err = ret; 3427 goto fail; 3428 } 3429 } 3430 3431 /* update the directory size in the log to reflect the names 3432 * we have removed 3433 */ 3434 if (bytes_del) { 3435 struct btrfs_key key; 3436 3437 key.objectid = dir_ino; 3438 key.offset = 0; 3439 key.type = BTRFS_INODE_ITEM_KEY; 3440 btrfs_release_path(path); 3441 3442 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3443 if (ret < 0) { 3444 err = ret; 3445 goto fail; 3446 } 3447 if (ret == 0) { 3448 struct btrfs_inode_item *item; 3449 u64 i_size; 3450 3451 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3452 struct btrfs_inode_item); 3453 i_size = btrfs_inode_size(path->nodes[0], item); 3454 if (i_size > bytes_del) 3455 i_size -= bytes_del; 3456 else 3457 i_size = 0; 3458 btrfs_set_inode_size(path->nodes[0], item, i_size); 3459 btrfs_mark_buffer_dirty(path->nodes[0]); 3460 } else 3461 ret = 0; 3462 btrfs_release_path(path); 3463 } 3464 fail: 3465 btrfs_free_path(path); 3466 out_unlock: 3467 mutex_unlock(&dir->log_mutex); 3468 if (err == -ENOSPC) { 3469 btrfs_set_log_full_commit(trans); 3470 err = 0; 3471 } else if (err < 0 && err != -ENOENT) { 3472 /* ENOENT can be returned if the entry hasn't been fsynced yet */ 3473 btrfs_abort_transaction(trans, err); 3474 } 3475 3476 btrfs_end_log_trans(root); 3477 3478 return err; 3479 } 3480 3481 /* see comments for btrfs_del_dir_entries_in_log */ 3482 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3483 struct btrfs_root *root, 3484 const char *name, int name_len, 3485 struct btrfs_inode *inode, u64 dirid) 3486 { 3487 struct btrfs_root *log; 3488 u64 index; 3489 int ret; 3490 3491 if (!inode_logged(trans, inode)) 3492 return 0; 3493 3494 ret = join_running_log_trans(root); 3495 if (ret) 3496 return 0; 3497 log = root->log_root; 3498 mutex_lock(&inode->log_mutex); 3499 3500 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3501 dirid, &index); 3502 mutex_unlock(&inode->log_mutex); 3503 if (ret == -ENOSPC) { 3504 btrfs_set_log_full_commit(trans); 3505 ret = 0; 3506 } else if (ret < 0 && ret != -ENOENT) 3507 btrfs_abort_transaction(trans, ret); 3508 btrfs_end_log_trans(root); 3509 3510 return ret; 3511 } 3512 3513 /* 3514 * creates a range item 
in the log for 'dirid'. first_offset and 3515 * last_offset tell us which parts of the key space the log should 3516 * be considered authoritative for. 3517 */ 3518 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3519 struct btrfs_root *log, 3520 struct btrfs_path *path, 3521 int key_type, u64 dirid, 3522 u64 first_offset, u64 last_offset) 3523 { 3524 int ret; 3525 struct btrfs_key key; 3526 struct btrfs_dir_log_item *item; 3527 3528 key.objectid = dirid; 3529 key.offset = first_offset; 3530 if (key_type == BTRFS_DIR_ITEM_KEY) 3531 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3532 else 3533 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3534 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3535 if (ret) 3536 return ret; 3537 3538 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3539 struct btrfs_dir_log_item); 3540 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3541 btrfs_mark_buffer_dirty(path->nodes[0]); 3542 btrfs_release_path(path); 3543 return 0; 3544 } 3545 3546 /* 3547 * log all the items included in the current transaction for a given 3548 * directory. This also creates the range items in the log tree required 3549 * to replay anything deleted before the fsync 3550 */ 3551 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3552 struct btrfs_root *root, struct btrfs_inode *inode, 3553 struct btrfs_path *path, 3554 struct btrfs_path *dst_path, int key_type, 3555 struct btrfs_log_ctx *ctx, 3556 u64 min_offset, u64 *last_offset_ret) 3557 { 3558 struct btrfs_key min_key; 3559 struct btrfs_root *log = root->log_root; 3560 struct extent_buffer *src; 3561 int err = 0; 3562 int ret; 3563 int i; 3564 int nritems; 3565 u64 first_offset = min_offset; 3566 u64 last_offset = (u64)-1; 3567 u64 ino = btrfs_ino(inode); 3568 3569 log = root->log_root; 3570 3571 min_key.objectid = ino; 3572 min_key.type = key_type; 3573 min_key.offset = min_offset; 3574 3575 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3576 3577 /* 3578 * we didn't find anything from this transaction, see if there 3579 * is anything at all 3580 */ 3581 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3582 min_key.objectid = ino; 3583 min_key.type = key_type; 3584 min_key.offset = (u64)-1; 3585 btrfs_release_path(path); 3586 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3587 if (ret < 0) { 3588 btrfs_release_path(path); 3589 return ret; 3590 } 3591 ret = btrfs_previous_item(root, path, ino, key_type); 3592 3593 /* if ret == 0 there are items for this type, 3594 * create a range to tell us the last key of this type. 3595 * otherwise, there are no items in this directory after 3596 * *min_offset, and we create a range to indicate that. 3597 */ 3598 if (ret == 0) { 3599 struct btrfs_key tmp; 3600 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3601 path->slots[0]); 3602 if (key_type == tmp.type) 3603 first_offset = max(min_offset, tmp.offset) + 1; 3604 } 3605 goto done; 3606 } 3607 3608 /* go backward to find any previous key */ 3609 ret = btrfs_previous_item(root, path, ino, key_type); 3610 if (ret == 0) { 3611 struct btrfs_key tmp; 3612 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3613 if (key_type == tmp.type) { 3614 first_offset = tmp.offset; 3615 ret = overwrite_item(trans, log, dst_path, 3616 path->nodes[0], path->slots[0], 3617 &tmp); 3618 if (ret) { 3619 err = ret; 3620 goto done; 3621 } 3622 } 3623 } 3624 btrfs_release_path(path); 3625 3626 /* 3627 * Find the first key from this transaction again. 
See the note for
3628 * log_new_dir_dentries: if we're logging a directory recursively we
3629 * won't be holding its i_mutex, which means we can modify the directory
3630 * while we're logging it. If we remove an entry between our first
3631 * search and this search we'll not find the key again and can just
3632 * bail.
3633 */
3634 search:
3635 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3636 if (ret != 0)
3637 goto done;
3638
3639 /*
3640 * we have a block from this transaction, log every item in it
3641 * from our directory
3642 */
3643 while (1) {
3644 struct btrfs_key tmp;
3645 src = path->nodes[0];
3646 nritems = btrfs_header_nritems(src);
3647 for (i = path->slots[0]; i < nritems; i++) {
3648 struct btrfs_dir_item *di;
3649
3650 btrfs_item_key_to_cpu(src, &min_key, i);
3651
3652 if (min_key.objectid != ino || min_key.type != key_type)
3653 goto done;
3654
3655 if (need_resched()) {
3656 btrfs_release_path(path);
3657 cond_resched();
3658 goto search;
3659 }
3660
3661 ret = overwrite_item(trans, log, dst_path, src, i,
3662 &min_key);
3663 if (ret) {
3664 err = ret;
3665 goto done;
3666 }
3667
3668 /*
3669 * We must make sure that when we log a directory entry,
3670 * the corresponding inode, after log replay, has a
3671 * matching link count. For example:
3672 *
3673 * touch foo
3674 * mkdir mydir
3675 * sync
3676 * ln foo mydir/bar
3677 * xfs_io -c "fsync" mydir
3678 * <crash>
3679 * <mount fs and log replay>
3680 *
3681 * This would result in an fsync log that, when replayed,
3682 * leaves our file inode with a link count of 1 while two
3683 * directory entries point to it. After removing one of
3684 * the names, it would not be possible to remove the other
3685 * name, which always resulted in stale file handle errors,
3686 * and it would not be possible to rmdir the parent
3687 * directory, since its i_size could never decrement to
3688 * the value BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY
3689 * errors.
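 *
 * That is why ctx->log_new_dentries is set below for entries created in
 * the current transaction (or pointing at directories): it is a hint to
 * the caller that the inodes behind those new dentries need to be logged
 * as well.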
*/
3690 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3691 btrfs_dir_item_key_to_cpu(src, di, &tmp);
3692 if (ctx &&
3693 (btrfs_dir_transid(src, di) == trans->transid ||
3694 btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3695 tmp.type != BTRFS_ROOT_ITEM_KEY)
3696 ctx->log_new_dentries = true;
3697 }
3698 path->slots[0] = nritems;
3699
3700 /*
3701 * look ahead to the next item and see if it is also
3702 * from this directory and from this transaction
3703 */
3704 ret = btrfs_next_leaf(root, path);
3705 if (ret) {
3706 if (ret == 1)
3707 last_offset = (u64)-1;
3708 else
3709 err = ret;
3710 goto done;
3711 }
3712 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3713 if (tmp.objectid != ino || tmp.type != key_type) {
3714 last_offset = (u64)-1;
3715 goto done;
3716 }
3717 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3718 ret = overwrite_item(trans, log, dst_path,
3719 path->nodes[0], path->slots[0],
3720 &tmp);
3721 if (ret)
3722 err = ret;
3723 else
3724 last_offset = tmp.offset;
3725 goto done;
3726 }
3727 }
3728 done:
3729 btrfs_release_path(path);
3730 btrfs_release_path(dst_path);
3731
3732 if (err == 0) {
3733 *last_offset_ret = last_offset;
3734 /*
3735 * insert the log range keys to indicate where the log
3736 * is valid
3737 */
3738 ret = insert_dir_log_key(trans, log, path, key_type,
3739 ino, first_offset, last_offset);
3740 if (ret)
3741 err = ret;
3742 }
3743 return err;
3744 }
3745
3746
3747 /*
3748 * logging directories is very similar to logging inodes. We find all the items
3749 * from the current transaction and write them to the log.
3750 *
3751 * The recovery code scans the directory in the subvolume, and if it finds a
3752 * key in the range logged that is not present in the log tree, then it means
3753 * that dir entry was unlinked during the transaction.
3754 *
3755 * In order for that scan to work, we must include one key smaller than
3756 * the smallest logged by this transaction and one key larger than the largest
3757 * key logged by this transaction.
3758 */
3759 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3760 struct btrfs_root *root, struct btrfs_inode *inode,
3761 struct btrfs_path *path,
3762 struct btrfs_path *dst_path,
3763 struct btrfs_log_ctx *ctx)
3764 {
3765 u64 min_key;
3766 u64 max_key;
3767 int ret;
3768 int key_type = BTRFS_DIR_ITEM_KEY;
3769
3770 again:
3771 min_key = 0;
3772 max_key = 0;
3773 while (1) {
3774 ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3775 ctx, min_key, &max_key);
3776 if (ret)
3777 return ret;
3778 if (max_key == (u64)-1)
3779 break;
3780 min_key = max_key + 1;
3781 }
3782
3783 if (key_type == BTRFS_DIR_ITEM_KEY) {
3784 key_type = BTRFS_DIR_INDEX_KEY;
3785 goto again;
3786 }
3787 return 0;
3788 }
3789
3790 /*
3791 * a helper function to drop items from the log before we relog an
3792 * inode. max_key_type indicates the highest item type to remove.
3793 * This cannot be run for file data extents because it does not
3794 * free the extents they point to.
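 *
 * For example, a max_key_type of BTRFS_XATTR_ITEM_KEY drops the inode
 * item, inode refs and xattrs of 'objectid' from the log (every key type
 * that sorts <= BTRFS_XATTR_ITEM_KEY), while dir items and file extent
 * items, which sort after it, are left untouched.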
*/
3796 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3797 struct btrfs_root *log,
3798 struct btrfs_path *path,
3799 u64 objectid, int max_key_type)
3800 {
3801 int ret;
3802 struct btrfs_key key;
3803 struct btrfs_key found_key;
3804 int start_slot;
3805
3806 key.objectid = objectid;
3807 key.type = max_key_type;
3808 key.offset = (u64)-1;
3809
3810 while (1) {
3811 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3812 BUG_ON(ret == 0); /* Logic error */
3813 if (ret < 0)
3814 break;
3815
3816 if (path->slots[0] == 0)
3817 break;
3818
3819 path->slots[0]--;
3820 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3821 path->slots[0]);
3822
3823 if (found_key.objectid != objectid)
3824 break;
3825
3826 found_key.offset = 0;
3827 found_key.type = 0;
3828 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
3829 if (ret < 0)
3830 break;
3831
3832 ret = btrfs_del_items(trans, log, path, start_slot,
3833 path->slots[0] - start_slot + 1);
3834 /*
3835 * If start slot isn't 0 then we don't need to re-search, we've
3836 * found the last guy with the objectid in this tree.
3837 */
3838 if (ret || start_slot != 0)
3839 break;
3840 btrfs_release_path(path);
3841 }
3842 btrfs_release_path(path);
3843 if (ret > 0)
3844 ret = 0;
3845 return ret;
3846 }
3847
3848 static void fill_inode_item(struct btrfs_trans_handle *trans,
3849 struct extent_buffer *leaf,
3850 struct btrfs_inode_item *item,
3851 struct inode *inode, int log_inode_only,
3852 u64 logged_isize)
3853 {
3854 struct btrfs_map_token token;
3855
3856 btrfs_init_map_token(&token, leaf);
3857
3858 if (log_inode_only) {
3859 /* set the generation to zero so the recovery code
3860 * can tell the difference between a log entry made
3861 * just to say 'this inode exists' and one made
3862 * to say 'update this inode with these values'
3863 */
3864 btrfs_set_token_inode_generation(&token, item, 0);
3865 btrfs_set_token_inode_size(&token, item, logged_isize);
3866 } else {
3867 btrfs_set_token_inode_generation(&token, item,
3868 BTRFS_I(inode)->generation);
3869 btrfs_set_token_inode_size(&token, item, inode->i_size);
3870 }
3871
3872 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3873 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3874 btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3875 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3876
3877 btrfs_set_token_timespec_sec(&token, &item->atime,
3878 inode->i_atime.tv_sec);
3879 btrfs_set_token_timespec_nsec(&token, &item->atime,
3880 inode->i_atime.tv_nsec);
3881
3882 btrfs_set_token_timespec_sec(&token, &item->mtime,
3883 inode->i_mtime.tv_sec);
3884 btrfs_set_token_timespec_nsec(&token, &item->mtime,
3885 inode->i_mtime.tv_nsec);
3886
3887 btrfs_set_token_timespec_sec(&token, &item->ctime,
3888 inode->i_ctime.tv_sec);
3889 btrfs_set_token_timespec_nsec(&token, &item->ctime,
3890 inode->i_ctime.tv_nsec);
3891
3892 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3893
3894 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3895 btrfs_set_token_inode_transid(&token, item, trans->transid);
3896 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3897 btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
3898 btrfs_set_token_inode_block_group(&token, item, 0);
3899 }
3900
3901 static int log_inode_item(struct btrfs_trans_handle *trans,
3902 struct btrfs_root *log, struct btrfs_path *path,
3903 struct btrfs_inode *inode)
3904 {
3905 struct btrfs_inode_item
*inode_item; 3906 int ret; 3907 3908 ret = btrfs_insert_empty_item(trans, log, path, 3909 &inode->location, sizeof(*inode_item)); 3910 if (ret && ret != -EEXIST) 3911 return ret; 3912 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3913 struct btrfs_inode_item); 3914 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3915 0, 0); 3916 btrfs_release_path(path); 3917 return 0; 3918 } 3919 3920 static int log_csums(struct btrfs_trans_handle *trans, 3921 struct btrfs_inode *inode, 3922 struct btrfs_root *log_root, 3923 struct btrfs_ordered_sum *sums) 3924 { 3925 const u64 lock_end = sums->bytenr + sums->len - 1; 3926 struct extent_state *cached_state = NULL; 3927 int ret; 3928 3929 /* 3930 * If this inode was not used for reflink operations in the current 3931 * transaction with new extents, then do the fast path, no need to 3932 * worry about logging checksum items with overlapping ranges. 3933 */ 3934 if (inode->last_reflink_trans < trans->transid) 3935 return btrfs_csum_file_blocks(trans, log_root, sums); 3936 3937 /* 3938 * Serialize logging for checksums. This is to avoid racing with the 3939 * same checksum being logged by another task that is logging another 3940 * file which happens to refer to the same extent as well. Such races 3941 * can leave checksum items in the log with overlapping ranges. 3942 */ 3943 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 3944 lock_end, &cached_state); 3945 if (ret) 3946 return ret; 3947 /* 3948 * Due to extent cloning, we might have logged a csum item that covers a 3949 * subrange of a cloned extent, and later we can end up logging a csum 3950 * item for a larger subrange of the same extent or the entire range. 3951 * This would leave csum items in the log tree that cover the same range 3952 * and break the searches for checksums in the log tree, resulting in 3953 * some checksums missing in the fs/subvolume tree. So just delete (or 3954 * trim and adjust) any existing csum items in the log for this range. 
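 *
 * A sketch with made-up numbers: if a csum item covering bytes 0-16K of an
 * extent was logged by an earlier fsync and we now log bytes 0-64K of the
 * same extent, deleting the old item first prevents two log items from
 * both claiming to hold the checksums for the 0-16K range.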
3955 */ 3956 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 3957 if (!ret) 3958 ret = btrfs_csum_file_blocks(trans, log_root, sums); 3959 3960 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 3961 &cached_state); 3962 3963 return ret; 3964 } 3965 3966 static noinline int copy_items(struct btrfs_trans_handle *trans, 3967 struct btrfs_inode *inode, 3968 struct btrfs_path *dst_path, 3969 struct btrfs_path *src_path, 3970 int start_slot, int nr, int inode_only, 3971 u64 logged_isize) 3972 { 3973 struct btrfs_fs_info *fs_info = trans->fs_info; 3974 unsigned long src_offset; 3975 unsigned long dst_offset; 3976 struct btrfs_root *log = inode->root->log_root; 3977 struct btrfs_file_extent_item *extent; 3978 struct btrfs_inode_item *inode_item; 3979 struct extent_buffer *src = src_path->nodes[0]; 3980 int ret; 3981 struct btrfs_key *ins_keys; 3982 u32 *ins_sizes; 3983 char *ins_data; 3984 int i; 3985 struct list_head ordered_sums; 3986 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3987 3988 INIT_LIST_HEAD(&ordered_sums); 3989 3990 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3991 nr * sizeof(u32), GFP_NOFS); 3992 if (!ins_data) 3993 return -ENOMEM; 3994 3995 ins_sizes = (u32 *)ins_data; 3996 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3997 3998 for (i = 0; i < nr; i++) { 3999 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 4000 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 4001 } 4002 ret = btrfs_insert_empty_items(trans, log, dst_path, 4003 ins_keys, ins_sizes, nr); 4004 if (ret) { 4005 kfree(ins_data); 4006 return ret; 4007 } 4008 4009 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 4010 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 4011 dst_path->slots[0]); 4012 4013 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 4014 4015 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 4016 inode_item = btrfs_item_ptr(dst_path->nodes[0], 4017 dst_path->slots[0], 4018 struct btrfs_inode_item); 4019 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4020 &inode->vfs_inode, 4021 inode_only == LOG_INODE_EXISTS, 4022 logged_isize); 4023 } else { 4024 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4025 src_offset, ins_sizes[i]); 4026 } 4027 4028 /* take a reference on file data extents so that truncates 4029 * or deletes of this inode don't have to relog the inode 4030 * again 4031 */ 4032 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 4033 !skip_csum) { 4034 int found_type; 4035 extent = btrfs_item_ptr(src, start_slot + i, 4036 struct btrfs_file_extent_item); 4037 4038 if (btrfs_file_extent_generation(src, extent) < trans->transid) 4039 continue; 4040 4041 found_type = btrfs_file_extent_type(src, extent); 4042 if (found_type == BTRFS_FILE_EXTENT_REG) { 4043 u64 ds, dl, cs, cl; 4044 ds = btrfs_file_extent_disk_bytenr(src, 4045 extent); 4046 /* ds == 0 is a hole */ 4047 if (ds == 0) 4048 continue; 4049 4050 dl = btrfs_file_extent_disk_num_bytes(src, 4051 extent); 4052 cs = btrfs_file_extent_offset(src, extent); 4053 cl = btrfs_file_extent_num_bytes(src, 4054 extent); 4055 if (btrfs_file_extent_compression(src, 4056 extent)) { 4057 cs = 0; 4058 cl = dl; 4059 } 4060 4061 ret = btrfs_lookup_csums_range( 4062 fs_info->csum_root, 4063 ds + cs, ds + cs + cl - 1, 4064 &ordered_sums, 0); 4065 if (ret) 4066 break; 4067 } 4068 } 4069 } 4070 4071 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4072 btrfs_release_path(dst_path); 4073 kfree(ins_data); 4074 4075 /* 4076 * we have to do this after the loop 
above to avoid changing the 4077 * log tree while trying to change the log tree. 4078 */ 4079 while (!list_empty(&ordered_sums)) { 4080 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4081 struct btrfs_ordered_sum, 4082 list); 4083 if (!ret) 4084 ret = log_csums(trans, inode, log, sums); 4085 list_del(&sums->list); 4086 kfree(sums); 4087 } 4088 4089 return ret; 4090 } 4091 4092 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 4093 { 4094 struct extent_map *em1, *em2; 4095 4096 em1 = list_entry(a, struct extent_map, list); 4097 em2 = list_entry(b, struct extent_map, list); 4098 4099 if (em1->start < em2->start) 4100 return -1; 4101 else if (em1->start > em2->start) 4102 return 1; 4103 return 0; 4104 } 4105 4106 static int log_extent_csums(struct btrfs_trans_handle *trans, 4107 struct btrfs_inode *inode, 4108 struct btrfs_root *log_root, 4109 const struct extent_map *em, 4110 struct btrfs_log_ctx *ctx) 4111 { 4112 struct btrfs_ordered_extent *ordered; 4113 u64 csum_offset; 4114 u64 csum_len; 4115 u64 mod_start = em->mod_start; 4116 u64 mod_len = em->mod_len; 4117 LIST_HEAD(ordered_sums); 4118 int ret = 0; 4119 4120 if (inode->flags & BTRFS_INODE_NODATASUM || 4121 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4122 em->block_start == EXTENT_MAP_HOLE) 4123 return 0; 4124 4125 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4126 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4127 const u64 mod_end = mod_start + mod_len; 4128 struct btrfs_ordered_sum *sums; 4129 4130 if (mod_len == 0) 4131 break; 4132 4133 if (ordered_end <= mod_start) 4134 continue; 4135 if (mod_end <= ordered->file_offset) 4136 break; 4137 4138 /* 4139 * We are going to copy all the csums on this ordered extent, so 4140 * go ahead and adjust mod_start and mod_len in case this ordered 4141 * extent has already been logged. 4142 */ 4143 if (ordered->file_offset > mod_start) { 4144 if (ordered_end >= mod_end) 4145 mod_len = ordered->file_offset - mod_start; 4146 /* 4147 * If we have this case 4148 * 4149 * |--------- logged extent ---------| 4150 * |----- ordered extent ----| 4151 * 4152 * Just don't mess with mod_start and mod_len, we'll 4153 * just end up logging more csums than we need and it 4154 * will be ok. 4155 */ 4156 } else { 4157 if (ordered_end < mod_end) { 4158 mod_len = mod_end - ordered_end; 4159 mod_start = ordered_end; 4160 } else { 4161 mod_len = 0; 4162 } 4163 } 4164 4165 /* 4166 * To keep us from looping for the above case of an ordered 4167 * extent that falls inside of the logged extent. 4168 */ 4169 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4170 continue; 4171 4172 list_for_each_entry(sums, &ordered->list, list) { 4173 ret = log_csums(trans, inode, log_root, sums); 4174 if (ret) 4175 return ret; 4176 } 4177 } 4178 4179 /* We're done, found all csums in the ordered extents. */ 4180 if (mod_len == 0) 4181 return 0; 4182 4183 /* If we're compressed we have to save the entire range of csums. */ 4184 if (em->compress_type) { 4185 csum_offset = 0; 4186 csum_len = max(em->block_len, em->orig_block_len); 4187 } else { 4188 csum_offset = mod_start - em->start; 4189 csum_len = mod_len; 4190 } 4191 4192 /* block start is already adjusted for the file extent offset. 
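	 * For example, if the file extent item points at a 1M extent on
	 * disk but uses it starting at an 8K offset (a non-compressed
	 * extent with btrfs_file_extent_offset() == 8K), em->block_start
	 * is disk_bytenr + 8K, so the lookup below covers exactly the
	 * logged file range. (Numbers are illustrative only.)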
*/ 4193 ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, 4194 em->block_start + csum_offset, 4195 em->block_start + csum_offset + 4196 csum_len - 1, &ordered_sums, 0); 4197 if (ret) 4198 return ret; 4199 4200 while (!list_empty(&ordered_sums)) { 4201 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4202 struct btrfs_ordered_sum, 4203 list); 4204 if (!ret) 4205 ret = log_csums(trans, inode, log_root, sums); 4206 list_del(&sums->list); 4207 kfree(sums); 4208 } 4209 4210 return ret; 4211 } 4212 4213 static int log_one_extent(struct btrfs_trans_handle *trans, 4214 struct btrfs_inode *inode, struct btrfs_root *root, 4215 const struct extent_map *em, 4216 struct btrfs_path *path, 4217 struct btrfs_log_ctx *ctx) 4218 { 4219 struct btrfs_drop_extents_args drop_args = { 0 }; 4220 struct btrfs_root *log = root->log_root; 4221 struct btrfs_file_extent_item *fi; 4222 struct extent_buffer *leaf; 4223 struct btrfs_map_token token; 4224 struct btrfs_key key; 4225 u64 extent_offset = em->start - em->orig_start; 4226 u64 block_len; 4227 int ret; 4228 4229 ret = log_extent_csums(trans, inode, log, em, ctx); 4230 if (ret) 4231 return ret; 4232 4233 drop_args.path = path; 4234 drop_args.start = em->start; 4235 drop_args.end = em->start + em->len; 4236 drop_args.replace_extent = true; 4237 drop_args.extent_item_size = sizeof(*fi); 4238 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4239 if (ret) 4240 return ret; 4241 4242 if (!drop_args.extent_inserted) { 4243 key.objectid = btrfs_ino(inode); 4244 key.type = BTRFS_EXTENT_DATA_KEY; 4245 key.offset = em->start; 4246 4247 ret = btrfs_insert_empty_item(trans, log, path, &key, 4248 sizeof(*fi)); 4249 if (ret) 4250 return ret; 4251 } 4252 leaf = path->nodes[0]; 4253 btrfs_init_map_token(&token, leaf); 4254 fi = btrfs_item_ptr(leaf, path->slots[0], 4255 struct btrfs_file_extent_item); 4256 4257 btrfs_set_token_file_extent_generation(&token, fi, trans->transid); 4258 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4259 btrfs_set_token_file_extent_type(&token, fi, 4260 BTRFS_FILE_EXTENT_PREALLOC); 4261 else 4262 btrfs_set_token_file_extent_type(&token, fi, 4263 BTRFS_FILE_EXTENT_REG); 4264 4265 block_len = max(em->block_len, em->orig_block_len); 4266 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4267 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4268 em->block_start); 4269 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4270 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4271 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4272 em->block_start - 4273 extent_offset); 4274 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4275 } else { 4276 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); 4277 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); 4278 } 4279 4280 btrfs_set_token_file_extent_offset(&token, fi, extent_offset); 4281 btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); 4282 btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); 4283 btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); 4284 btrfs_set_token_file_extent_encryption(&token, fi, 0); 4285 btrfs_set_token_file_extent_other_encoding(&token, fi, 0); 4286 btrfs_mark_buffer_dirty(leaf); 4287 4288 btrfs_release_path(path); 4289 4290 return ret; 4291 } 4292 4293 /* 4294 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4295 * lose them after doing a fast fsync and replaying the log. 
We scan the 4296 * subvolume's root instead of iterating the inode's extent map tree because 4297 * otherwise we can log incorrect extent items based on extent map conversion. 4298 * That can happen due to the fact that extent maps are merged when they 4299 * are not in the extent map tree's list of modified extents. 4300 */ 4301 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4302 struct btrfs_inode *inode, 4303 struct btrfs_path *path) 4304 { 4305 struct btrfs_root *root = inode->root; 4306 struct btrfs_key key; 4307 const u64 i_size = i_size_read(&inode->vfs_inode); 4308 const u64 ino = btrfs_ino(inode); 4309 struct btrfs_path *dst_path = NULL; 4310 bool dropped_extents = false; 4311 u64 truncate_offset = i_size; 4312 struct extent_buffer *leaf; 4313 int slot; 4314 int ins_nr = 0; 4315 int start_slot; 4316 int ret; 4317 4318 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4319 return 0; 4320 4321 key.objectid = ino; 4322 key.type = BTRFS_EXTENT_DATA_KEY; 4323 key.offset = i_size; 4324 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4325 if (ret < 0) 4326 goto out; 4327 4328 /* 4329 * We must check if there is a prealloc extent that starts before the 4330 * i_size and crosses the i_size boundary. This is to ensure later we 4331 * truncate down to the end of that extent and not to the i_size, as 4332 * otherwise we end up losing part of the prealloc extent after a log 4333 * replay and with an implicit hole if there is another prealloc extent 4334 * that starts at an offset beyond i_size. 4335 */ 4336 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4337 if (ret < 0) 4338 goto out; 4339 4340 if (ret == 0) { 4341 struct btrfs_file_extent_item *ei; 4342 4343 leaf = path->nodes[0]; 4344 slot = path->slots[0]; 4345 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4346 4347 if (btrfs_file_extent_type(leaf, ei) == 4348 BTRFS_FILE_EXTENT_PREALLOC) { 4349 u64 extent_end; 4350 4351 btrfs_item_key_to_cpu(leaf, &key, slot); 4352 extent_end = key.offset + 4353 btrfs_file_extent_num_bytes(leaf, ei); 4354 4355 if (extent_end > i_size) 4356 truncate_offset = extent_end; 4357 } 4358 } else { 4359 ret = 0; 4360 } 4361 4362 while (true) { 4363 leaf = path->nodes[0]; 4364 slot = path->slots[0]; 4365 4366 if (slot >= btrfs_header_nritems(leaf)) { 4367 if (ins_nr > 0) { 4368 ret = copy_items(trans, inode, dst_path, path, 4369 start_slot, ins_nr, 1, 0); 4370 if (ret < 0) 4371 goto out; 4372 ins_nr = 0; 4373 } 4374 ret = btrfs_next_leaf(root, path); 4375 if (ret < 0) 4376 goto out; 4377 if (ret > 0) { 4378 ret = 0; 4379 break; 4380 } 4381 continue; 4382 } 4383 4384 btrfs_item_key_to_cpu(leaf, &key, slot); 4385 if (key.objectid > ino) 4386 break; 4387 if (WARN_ON_ONCE(key.objectid < ino) || 4388 key.type < BTRFS_EXTENT_DATA_KEY || 4389 key.offset < i_size) { 4390 path->slots[0]++; 4391 continue; 4392 } 4393 if (!dropped_extents) { 4394 /* 4395 * Avoid logging extent items logged in past fsync calls 4396 * and leading to duplicate keys in the log tree. 
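			 * For example (offsets illustrative): if a previous
			 * fsync in this transaction already copied a prealloc
			 * extent item at (ino, BTRFS_EXTENT_DATA_KEY, 4M) to
			 * the log, copying it again without the truncation
			 * below would attempt to insert a duplicate key and
			 * fail with -EEXIST.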
 */
			do {
				ret = btrfs_truncate_inode_items(trans,
							 root->log_root,
							 inode, truncate_offset,
							 BTRFS_EXTENT_DATA_KEY);
			} while (ret == -EAGAIN);
			if (ret)
				goto out;
			dropped_extents = true;
		}
		if (ins_nr == 0)
			start_slot = slot;
		ins_nr++;
		path->slots[0]++;
		if (!dst_path) {
			dst_path = btrfs_alloc_path();
			if (!dst_path) {
				ret = -ENOMEM;
				goto out;
			}
		}
	}
	if (ins_nr > 0)
		ret = copy_items(trans, inode, dst_path, path,
				 start_slot, ins_nr, 1, 0);
out:
	btrfs_release_path(path);
	btrfs_free_path(dst_path);
	return ret;
}

static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_inode *inode,
				     struct btrfs_path *path,
				     struct btrfs_log_ctx *ctx)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_extent *tmp;
	struct extent_map *em, *n;
	struct list_head extents;
	struct extent_map_tree *tree = &inode->extent_tree;
	int ret = 0;
	int num = 0;

	INIT_LIST_HEAD(&extents);

	write_lock(&tree->lock);

	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
		list_del_init(&em->list);
		/*
		 * Just an arbitrary number: logging extents gets really CPU
		 * intensive once there are many of them, and past this point
		 * a full transaction commit is faster anyway, so give up and
		 * force one.
		 */
		if (++num > 32768) {
			list_del_init(&tree->modified_extents);
			ret = -EFBIG;
			goto process;
		}

		if (em->generation < trans->transid)
			continue;

		/* We log prealloc extents beyond eof later. */
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
		    em->start >= i_size_read(&inode->vfs_inode))
			continue;

		/* Need a ref to keep it from getting evicted from cache */
		refcount_inc(&em->refs);
		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
		list_add_tail(&em->list, &extents);
	}

	list_sort(NULL, &extents, extent_cmp);
process:
	while (!list_empty(&extents)) {
		em = list_entry(extents.next, struct extent_map, list);

		list_del_init(&em->list);

		/*
		 * If we had an error we just need to delete everybody from our
		 * private list.
		 */
		if (ret) {
			clear_em_logging(tree, em);
			free_extent_map(em);
			continue;
		}

		write_unlock(&tree->lock);

		ret = log_one_extent(trans, inode, root, em, path, ctx);
		write_lock(&tree->lock);
		clear_em_logging(tree, em);
		free_extent_map(em);
	}
	WARN_ON(!list_empty(&extents));
	write_unlock(&tree->lock);

	btrfs_release_path(path);
	if (!ret)
		ret = btrfs_log_prealloc_extents(trans, inode, path);
	if (ret)
		return ret;

	/*
	 * We have logged all extents successfully, now make sure the commit of
	 * the current transaction waits for the ordered extents to complete
	 * before it commits and wipes out the log trees, otherwise we would
	 * lose data if an ordered extent completes after the transaction
	 * commits and a power failure happens after the transaction commit.
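	 *
	 * A possible failure timeline (illustrative): fsync logs the extents
	 * and returns, the transaction then commits and frees the log trees,
	 * an ordered extent completes only afterwards, and power is lost
	 * before its csums and file extent item reach the subvolume tree.
	 * The pending_ordered accounting below makes the transaction commit
	 * wait for such ordered extents instead.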
	 */
	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
		list_del_init(&ordered->log_list);
		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);

		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
			spin_lock_irq(&inode->ordered_tree.lock);
			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
				atomic_inc(&trans->transaction->pending_ordered);
			}
			spin_unlock_irq(&inode->ordered_tree.lock);
		}
		btrfs_put_ordered_extent(ordered);
	}

	return 0;
}

static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
			     struct btrfs_path *path, u64 *size_ret)
{
	struct btrfs_key key;
	int ret;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		*size_ret = 0;
	} else {
		struct btrfs_inode_item *item;

		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				      struct btrfs_inode_item);
		*size_ret = btrfs_inode_size(path->nodes[0], item);
		/*
		 * If the in-memory inode's i_size is smaller than the inode
		 * size stored in the btree, return the inode's i_size, so
		 * that we get a correct inode size after replaying the log
		 * when before a power failure we had a shrinking truncate
		 * followed by addition of a new name (rename / new hard link).
		 * Otherwise return the inode size from the btree, to avoid
		 * data loss when replaying a log due to previously doing a
		 * write that expands the inode's size and logging a new name
		 * immediately after.
		 */
		if (*size_ret > inode->vfs_inode.i_size)
			*size_ret = inode->vfs_inode.i_size;
	}

	btrfs_release_path(path);
	return 0;
}

/*
 * At the moment we always log all xattrs. This is to figure out at log replay
 * time which xattrs must have their deletion replayed. If a xattr is missing
 * in the log tree and exists in the fs/subvol tree, we delete it. This is
 * because if a xattr is deleted, the inode is fsynced and a power failure
 * happens, causing the log to be replayed the next time the fs is mounted,
 * we want the xattr to not exist anymore (same behaviour as other filesystems
 * with a journal, ext3/4, xfs, f2fs, etc).
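 * For example (illustrative sequence):
 *
 * setfattr -n user.test -v 123 file
 * sync
 * setfattr -x user.test file
 * xfs_io -c fsync file
 * <power fail>
 *
 * After replaying the log, user.test must be gone: the logged inode has no
 * xattr item for it, while the fs/subvol tree still does, so replay deletes
 * it.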
4582 */ 4583 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4584 struct btrfs_root *root, 4585 struct btrfs_inode *inode, 4586 struct btrfs_path *path, 4587 struct btrfs_path *dst_path) 4588 { 4589 int ret; 4590 struct btrfs_key key; 4591 const u64 ino = btrfs_ino(inode); 4592 int ins_nr = 0; 4593 int start_slot = 0; 4594 bool found_xattrs = false; 4595 4596 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 4597 return 0; 4598 4599 key.objectid = ino; 4600 key.type = BTRFS_XATTR_ITEM_KEY; 4601 key.offset = 0; 4602 4603 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4604 if (ret < 0) 4605 return ret; 4606 4607 while (true) { 4608 int slot = path->slots[0]; 4609 struct extent_buffer *leaf = path->nodes[0]; 4610 int nritems = btrfs_header_nritems(leaf); 4611 4612 if (slot >= nritems) { 4613 if (ins_nr > 0) { 4614 ret = copy_items(trans, inode, dst_path, path, 4615 start_slot, ins_nr, 1, 0); 4616 if (ret < 0) 4617 return ret; 4618 ins_nr = 0; 4619 } 4620 ret = btrfs_next_leaf(root, path); 4621 if (ret < 0) 4622 return ret; 4623 else if (ret > 0) 4624 break; 4625 continue; 4626 } 4627 4628 btrfs_item_key_to_cpu(leaf, &key, slot); 4629 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4630 break; 4631 4632 if (ins_nr == 0) 4633 start_slot = slot; 4634 ins_nr++; 4635 path->slots[0]++; 4636 found_xattrs = true; 4637 cond_resched(); 4638 } 4639 if (ins_nr > 0) { 4640 ret = copy_items(trans, inode, dst_path, path, 4641 start_slot, ins_nr, 1, 0); 4642 if (ret < 0) 4643 return ret; 4644 } 4645 4646 if (!found_xattrs) 4647 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 4648 4649 return 0; 4650 } 4651 4652 /* 4653 * When using the NO_HOLES feature if we punched a hole that causes the 4654 * deletion of entire leafs or all the extent items of the first leaf (the one 4655 * that contains the inode item and references) we may end up not processing 4656 * any extents, because there are no leafs with a generation matching the 4657 * current transaction that have extent items for our inode. So we need to find 4658 * if any holes exist and then log them. We also need to log holes after any 4659 * truncate operation that changes the inode's size. 4660 */ 4661 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 4662 struct btrfs_root *root, 4663 struct btrfs_inode *inode, 4664 struct btrfs_path *path) 4665 { 4666 struct btrfs_fs_info *fs_info = root->fs_info; 4667 struct btrfs_key key; 4668 const u64 ino = btrfs_ino(inode); 4669 const u64 i_size = i_size_read(&inode->vfs_inode); 4670 u64 prev_extent_end = 0; 4671 int ret; 4672 4673 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 4674 return 0; 4675 4676 key.objectid = ino; 4677 key.type = BTRFS_EXTENT_DATA_KEY; 4678 key.offset = 0; 4679 4680 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4681 if (ret < 0) 4682 return ret; 4683 4684 while (true) { 4685 struct extent_buffer *leaf = path->nodes[0]; 4686 4687 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 4688 ret = btrfs_next_leaf(root, path); 4689 if (ret < 0) 4690 return ret; 4691 if (ret > 0) { 4692 ret = 0; 4693 break; 4694 } 4695 leaf = path->nodes[0]; 4696 } 4697 4698 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4699 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 4700 break; 4701 4702 /* We have a hole, log it. 
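		 * For example (ranges illustrative), with NO_HOLES enabled
		 * and extent items only at file ranges [0, 4K) and
		 * [1M, 1M + 4K), the range [4K, 1M) has no extent item at
		 * all, so we insert a file extent item with a disk_bytenr
		 * of 0 for it in the log.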
		 */
		if (prev_extent_end < key.offset) {
			const u64 hole_len = key.offset - prev_extent_end;

			/*
			 * Release the path to avoid deadlocks with other code
			 * paths that search the root while holding locks on
			 * leafs from the log root.
			 */
			btrfs_release_path(path);
			ret = btrfs_insert_file_extent(trans, root->log_root,
						       ino, prev_extent_end, 0,
						       0, hole_len, 0, hole_len,
						       0, 0, 0);
			if (ret < 0)
				return ret;

			/*
			 * Search for the same key again in the root. Since it's
			 * an extent item and we are holding the inode lock, the
			 * key must still exist. If it doesn't, just emit a
			 * warning and return an error to fall back to a
			 * transaction commit.
			 */
			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
			if (ret < 0)
				return ret;
			if (WARN_ON(ret > 0))
				return -ENOENT;
			leaf = path->nodes[0];
		}

		prev_extent_end = btrfs_file_extent_end(path);
		path->slots[0]++;
		cond_resched();
	}

	if (prev_extent_end < i_size) {
		u64 hole_len;

		btrfs_release_path(path);
		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
		ret = btrfs_insert_file_extent(trans, root->log_root,
					       ino, prev_extent_end, 0, 0,
					       hole_len, 0, hole_len,
					       0, 0, 0);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * When we are logging a new inode X, check if it doesn't have a reference that
 * matches the reference from some other inode Y created in a past transaction
 * and that was renamed in the current transaction. If we don't do this, then at
 * log replay time we can lose inode Y (and all its files if it's a directory):
 *
 * mkdir /mnt/x
 * echo "hello world" > /mnt/x/foobar
 * sync
 * mv /mnt/x /mnt/y
 * mkdir /mnt/x		# or touch /mnt/x
 * xfs_io -c fsync /mnt/x
 * <power fail>
 * mount fs, trigger log replay
 *
 * After the log replay procedure, we would lose the first directory and all its
 * files (file foobar).
 * For the case where inode Y is not a directory we simply end up losing it:
 *
 * echo "123" > /mnt/foo
 * sync
 * mv /mnt/foo /mnt/bar
 * echo "abc" > /mnt/foo
 * xfs_io -c fsync /mnt/foo
 * <power fail>
 *
 * We also need this for cases where a snapshot entry is replaced by some other
 * entry (file or directory) otherwise we end up with an unreplayable log due to
 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
 * if it were a regular entry:
 *
 * mkdir /mnt/x
 * btrfs subvolume snapshot /mnt /mnt/x/snap
 * btrfs subvolume delete /mnt/x/snap
 * rmdir /mnt/x
 * mkdir /mnt/x
 * fsync /mnt/x or fsync some new file inside it
 * <power fail>
 *
 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
 * the same transaction.
4796 */ 4797 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4798 const int slot, 4799 const struct btrfs_key *key, 4800 struct btrfs_inode *inode, 4801 u64 *other_ino, u64 *other_parent) 4802 { 4803 int ret; 4804 struct btrfs_path *search_path; 4805 char *name = NULL; 4806 u32 name_len = 0; 4807 u32 item_size = btrfs_item_size_nr(eb, slot); 4808 u32 cur_offset = 0; 4809 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4810 4811 search_path = btrfs_alloc_path(); 4812 if (!search_path) 4813 return -ENOMEM; 4814 search_path->search_commit_root = 1; 4815 search_path->skip_locking = 1; 4816 4817 while (cur_offset < item_size) { 4818 u64 parent; 4819 u32 this_name_len; 4820 u32 this_len; 4821 unsigned long name_ptr; 4822 struct btrfs_dir_item *di; 4823 4824 if (key->type == BTRFS_INODE_REF_KEY) { 4825 struct btrfs_inode_ref *iref; 4826 4827 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4828 parent = key->offset; 4829 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4830 name_ptr = (unsigned long)(iref + 1); 4831 this_len = sizeof(*iref) + this_name_len; 4832 } else { 4833 struct btrfs_inode_extref *extref; 4834 4835 extref = (struct btrfs_inode_extref *)(ptr + 4836 cur_offset); 4837 parent = btrfs_inode_extref_parent(eb, extref); 4838 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4839 name_ptr = (unsigned long)&extref->name; 4840 this_len = sizeof(*extref) + this_name_len; 4841 } 4842 4843 if (this_name_len > name_len) { 4844 char *new_name; 4845 4846 new_name = krealloc(name, this_name_len, GFP_NOFS); 4847 if (!new_name) { 4848 ret = -ENOMEM; 4849 goto out; 4850 } 4851 name_len = this_name_len; 4852 name = new_name; 4853 } 4854 4855 read_extent_buffer(eb, name, name_ptr, this_name_len); 4856 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4857 parent, name, this_name_len, 0); 4858 if (di && !IS_ERR(di)) { 4859 struct btrfs_key di_key; 4860 4861 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4862 di, &di_key); 4863 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4864 if (di_key.objectid != key->objectid) { 4865 ret = 1; 4866 *other_ino = di_key.objectid; 4867 *other_parent = parent; 4868 } else { 4869 ret = 0; 4870 } 4871 } else { 4872 ret = -EAGAIN; 4873 } 4874 goto out; 4875 } else if (IS_ERR(di)) { 4876 ret = PTR_ERR(di); 4877 goto out; 4878 } 4879 btrfs_release_path(search_path); 4880 4881 cur_offset += this_len; 4882 } 4883 ret = 0; 4884 out: 4885 btrfs_free_path(search_path); 4886 kfree(name); 4887 return ret; 4888 } 4889 4890 struct btrfs_ino_list { 4891 u64 ino; 4892 u64 parent; 4893 struct list_head list; 4894 }; 4895 4896 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 4897 struct btrfs_root *root, 4898 struct btrfs_path *path, 4899 struct btrfs_log_ctx *ctx, 4900 u64 ino, u64 parent) 4901 { 4902 struct btrfs_ino_list *ino_elem; 4903 LIST_HEAD(inode_list); 4904 int ret = 0; 4905 4906 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 4907 if (!ino_elem) 4908 return -ENOMEM; 4909 ino_elem->ino = ino; 4910 ino_elem->parent = parent; 4911 list_add_tail(&ino_elem->list, &inode_list); 4912 4913 while (!list_empty(&inode_list)) { 4914 struct btrfs_fs_info *fs_info = root->fs_info; 4915 struct btrfs_key key; 4916 struct inode *inode; 4917 4918 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 4919 list); 4920 ino = ino_elem->ino; 4921 parent = ino_elem->parent; 4922 list_del(&ino_elem->list); 4923 kfree(ino_elem); 4924 if (ret) 4925 continue; 4926 4927 btrfs_release_path(path); 4928 4929 inode = 
btrfs_iget(fs_info->sb, ino, root); 4930 /* 4931 * If the other inode that had a conflicting dir entry was 4932 * deleted in the current transaction, we need to log its parent 4933 * directory. 4934 */ 4935 if (IS_ERR(inode)) { 4936 ret = PTR_ERR(inode); 4937 if (ret == -ENOENT) { 4938 inode = btrfs_iget(fs_info->sb, parent, root); 4939 if (IS_ERR(inode)) { 4940 ret = PTR_ERR(inode); 4941 } else { 4942 ret = btrfs_log_inode(trans, root, 4943 BTRFS_I(inode), 4944 LOG_OTHER_INODE_ALL, 4945 ctx); 4946 btrfs_add_delayed_iput(inode); 4947 } 4948 } 4949 continue; 4950 } 4951 /* 4952 * If the inode was already logged skip it - otherwise we can 4953 * hit an infinite loop. Example: 4954 * 4955 * From the commit root (previous transaction) we have the 4956 * following inodes: 4957 * 4958 * inode 257 a directory 4959 * inode 258 with references "zz" and "zz_link" on inode 257 4960 * inode 259 with reference "a" on inode 257 4961 * 4962 * And in the current (uncommitted) transaction we have: 4963 * 4964 * inode 257 a directory, unchanged 4965 * inode 258 with references "a" and "a2" on inode 257 4966 * inode 259 with reference "zz_link" on inode 257 4967 * inode 261 with reference "zz" on inode 257 4968 * 4969 * When logging inode 261 the following infinite loop could 4970 * happen if we don't skip already logged inodes: 4971 * 4972 * - we detect inode 258 as a conflicting inode, with inode 261 4973 * on reference "zz", and log it; 4974 * 4975 * - we detect inode 259 as a conflicting inode, with inode 258 4976 * on reference "a", and log it; 4977 * 4978 * - we detect inode 258 as a conflicting inode, with inode 259 4979 * on reference "zz_link", and log it - again! After this we 4980 * repeat the above steps forever. 4981 */ 4982 spin_lock(&BTRFS_I(inode)->lock); 4983 /* 4984 * Check the inode's logged_trans only instead of 4985 * btrfs_inode_in_log(). This is because the last_log_commit of 4986 * the inode is not updated when we only log that it exists and 4987 * it has the full sync bit set (see btrfs_log_inode()). 4988 */ 4989 if (BTRFS_I(inode)->logged_trans == trans->transid) { 4990 spin_unlock(&BTRFS_I(inode)->lock); 4991 btrfs_add_delayed_iput(inode); 4992 continue; 4993 } 4994 spin_unlock(&BTRFS_I(inode)->lock); 4995 /* 4996 * We are safe logging the other inode without acquiring its 4997 * lock as long as we log with the LOG_INODE_EXISTS mode. We 4998 * are safe against concurrent renames of the other inode as 4999 * well because during a rename we pin the log and update the 5000 * log with the new name before we unpin it. 
5001 */ 5002 ret = btrfs_log_inode(trans, root, BTRFS_I(inode), 5003 LOG_OTHER_INODE, ctx); 5004 if (ret) { 5005 btrfs_add_delayed_iput(inode); 5006 continue; 5007 } 5008 5009 key.objectid = ino; 5010 key.type = BTRFS_INODE_REF_KEY; 5011 key.offset = 0; 5012 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5013 if (ret < 0) { 5014 btrfs_add_delayed_iput(inode); 5015 continue; 5016 } 5017 5018 while (true) { 5019 struct extent_buffer *leaf = path->nodes[0]; 5020 int slot = path->slots[0]; 5021 u64 other_ino = 0; 5022 u64 other_parent = 0; 5023 5024 if (slot >= btrfs_header_nritems(leaf)) { 5025 ret = btrfs_next_leaf(root, path); 5026 if (ret < 0) { 5027 break; 5028 } else if (ret > 0) { 5029 ret = 0; 5030 break; 5031 } 5032 continue; 5033 } 5034 5035 btrfs_item_key_to_cpu(leaf, &key, slot); 5036 if (key.objectid != ino || 5037 (key.type != BTRFS_INODE_REF_KEY && 5038 key.type != BTRFS_INODE_EXTREF_KEY)) { 5039 ret = 0; 5040 break; 5041 } 5042 5043 ret = btrfs_check_ref_name_override(leaf, slot, &key, 5044 BTRFS_I(inode), &other_ino, 5045 &other_parent); 5046 if (ret < 0) 5047 break; 5048 if (ret > 0) { 5049 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5050 if (!ino_elem) { 5051 ret = -ENOMEM; 5052 break; 5053 } 5054 ino_elem->ino = other_ino; 5055 ino_elem->parent = other_parent; 5056 list_add_tail(&ino_elem->list, &inode_list); 5057 ret = 0; 5058 } 5059 path->slots[0]++; 5060 } 5061 btrfs_add_delayed_iput(inode); 5062 } 5063 5064 return ret; 5065 } 5066 5067 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5068 struct btrfs_inode *inode, 5069 struct btrfs_key *min_key, 5070 const struct btrfs_key *max_key, 5071 struct btrfs_path *path, 5072 struct btrfs_path *dst_path, 5073 const u64 logged_isize, 5074 const bool recursive_logging, 5075 const int inode_only, 5076 struct btrfs_log_ctx *ctx, 5077 bool *need_log_inode_item) 5078 { 5079 struct btrfs_root *root = inode->root; 5080 int ins_start_slot = 0; 5081 int ins_nr = 0; 5082 int ret; 5083 5084 while (1) { 5085 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5086 if (ret < 0) 5087 return ret; 5088 if (ret > 0) { 5089 ret = 0; 5090 break; 5091 } 5092 again: 5093 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5094 if (min_key->objectid != max_key->objectid) 5095 break; 5096 if (min_key->type > max_key->type) 5097 break; 5098 5099 if (min_key->type == BTRFS_INODE_ITEM_KEY) 5100 *need_log_inode_item = false; 5101 5102 if ((min_key->type == BTRFS_INODE_REF_KEY || 5103 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5104 inode->generation == trans->transid && 5105 !recursive_logging) { 5106 u64 other_ino = 0; 5107 u64 other_parent = 0; 5108 5109 ret = btrfs_check_ref_name_override(path->nodes[0], 5110 path->slots[0], min_key, inode, 5111 &other_ino, &other_parent); 5112 if (ret < 0) { 5113 return ret; 5114 } else if (ret > 0 && ctx && 5115 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5116 if (ins_nr > 0) { 5117 ins_nr++; 5118 } else { 5119 ins_nr = 1; 5120 ins_start_slot = path->slots[0]; 5121 } 5122 ret = copy_items(trans, inode, dst_path, path, 5123 ins_start_slot, ins_nr, 5124 inode_only, logged_isize); 5125 if (ret < 0) 5126 return ret; 5127 ins_nr = 0; 5128 5129 ret = log_conflicting_inodes(trans, root, path, 5130 ctx, other_ino, other_parent); 5131 if (ret) 5132 return ret; 5133 btrfs_release_path(path); 5134 goto next_key; 5135 } 5136 } 5137 5138 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 5139 if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5140 if (ins_nr == 
0) 5141 goto next_slot; 5142 ret = copy_items(trans, inode, dst_path, path, 5143 ins_start_slot, 5144 ins_nr, inode_only, logged_isize); 5145 if (ret < 0) 5146 return ret; 5147 ins_nr = 0; 5148 goto next_slot; 5149 } 5150 5151 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5152 ins_nr++; 5153 goto next_slot; 5154 } else if (!ins_nr) { 5155 ins_start_slot = path->slots[0]; 5156 ins_nr = 1; 5157 goto next_slot; 5158 } 5159 5160 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5161 ins_nr, inode_only, logged_isize); 5162 if (ret < 0) 5163 return ret; 5164 ins_nr = 1; 5165 ins_start_slot = path->slots[0]; 5166 next_slot: 5167 path->slots[0]++; 5168 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5169 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5170 path->slots[0]); 5171 goto again; 5172 } 5173 if (ins_nr) { 5174 ret = copy_items(trans, inode, dst_path, path, 5175 ins_start_slot, ins_nr, inode_only, 5176 logged_isize); 5177 if (ret < 0) 5178 return ret; 5179 ins_nr = 0; 5180 } 5181 btrfs_release_path(path); 5182 next_key: 5183 if (min_key->offset < (u64)-1) { 5184 min_key->offset++; 5185 } else if (min_key->type < max_key->type) { 5186 min_key->type++; 5187 min_key->offset = 0; 5188 } else { 5189 break; 5190 } 5191 } 5192 if (ins_nr) 5193 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5194 ins_nr, inode_only, logged_isize); 5195 5196 return ret; 5197 } 5198 5199 /* log a single inode in the tree log. 5200 * At least one parent directory for this inode must exist in the tree 5201 * or be logged already. 5202 * 5203 * Any items from this inode changed by the current transaction are copied 5204 * to the log tree. An extra reference is taken on any extents in this 5205 * file, allowing us to avoid a whole pile of corner cases around logging 5206 * blocks that have been removed from the tree. 5207 * 5208 * See LOG_INODE_ALL and related defines for a description of what inode_only 5209 * does. 5210 * 5211 * This handles both files and directories. 5212 */ 5213 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5214 struct btrfs_root *root, struct btrfs_inode *inode, 5215 int inode_only, 5216 struct btrfs_log_ctx *ctx) 5217 { 5218 struct btrfs_path *path; 5219 struct btrfs_path *dst_path; 5220 struct btrfs_key min_key; 5221 struct btrfs_key max_key; 5222 struct btrfs_root *log = root->log_root; 5223 int err = 0; 5224 int ret = 0; 5225 bool fast_search = false; 5226 u64 ino = btrfs_ino(inode); 5227 struct extent_map_tree *em_tree = &inode->extent_tree; 5228 u64 logged_isize = 0; 5229 bool need_log_inode_item = true; 5230 bool xattrs_logged = false; 5231 bool recursive_logging = false; 5232 5233 path = btrfs_alloc_path(); 5234 if (!path) 5235 return -ENOMEM; 5236 dst_path = btrfs_alloc_path(); 5237 if (!dst_path) { 5238 btrfs_free_path(path); 5239 return -ENOMEM; 5240 } 5241 5242 min_key.objectid = ino; 5243 min_key.type = BTRFS_INODE_ITEM_KEY; 5244 min_key.offset = 0; 5245 5246 max_key.objectid = ino; 5247 5248 5249 /* today the code can only do partial logging of directories */ 5250 if (S_ISDIR(inode->vfs_inode.i_mode) || 5251 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5252 &inode->runtime_flags) && 5253 inode_only >= LOG_INODE_EXISTS)) 5254 max_key.type = BTRFS_XATTR_ITEM_KEY; 5255 else 5256 max_key.type = (u8)-1; 5257 max_key.offset = (u64)-1; 5258 5259 /* 5260 * Only run delayed items if we are a directory. 
We want to make sure
	 * all directory indexes hit the fs/subvolume tree so we can find them
	 * and figure out which index ranges have to be logged.
	 *
	 * Otherwise commit the delayed inode only if the full sync flag is set,
	 * as we want to make sure an up to date version is in the subvolume
	 * tree so copy_inode_items_to_log() / copy_items() can find it and copy
	 * it to the log tree. For a non full sync, we always log the inode item
	 * based on the in-memory struct btrfs_inode which is always up to date.
	 */
	if (S_ISDIR(inode->vfs_inode.i_mode))
		ret = btrfs_commit_inode_delayed_items(trans, inode);
	else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
		ret = btrfs_commit_inode_delayed_inode(inode);

	if (ret) {
		btrfs_free_path(path);
		btrfs_free_path(dst_path);
		return ret;
	}

	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
		recursive_logging = true;
		if (inode_only == LOG_OTHER_INODE)
			inode_only = LOG_INODE_EXISTS;
		else
			inode_only = LOG_INODE_ALL;
		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
	} else {
		mutex_lock(&inode->log_mutex);
	}

	/*
	 * A brute force approach to making sure we get the most up to date
	 * copies of everything.
	 */
	if (S_ISDIR(inode->vfs_inode.i_mode)) {
		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;

		if (inode_only == LOG_INODE_EXISTS)
			max_key_type = BTRFS_XATTR_ITEM_KEY;
		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
	} else {
		if (inode_only == LOG_INODE_EXISTS) {
			/*
			 * Make sure the new inode item we write to the log has
			 * the same isize as the current one (if it exists).
			 * This is necessary to prevent data loss after log
			 * replay, and also to prevent doing a wrong expanding
			 * truncate - e.g. create file, write 4K into offset
			 * 0, fsync, write 4K into offset 4096, add hard link,
			 * fsync some other file (to sync log), power fail - if
			 * we use the inode's current i_size, after log replay
			 * we get an 8Kb file, with the last 4Kb extent as a hole
			 * (zeroes), as if an expanding truncate happened,
			 * instead of getting a file of 4Kb only.
5316 */ 5317 err = logged_inode_size(log, inode, path, &logged_isize); 5318 if (err) 5319 goto out_unlock; 5320 } 5321 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5322 &inode->runtime_flags)) { 5323 if (inode_only == LOG_INODE_EXISTS) { 5324 max_key.type = BTRFS_XATTR_ITEM_KEY; 5325 ret = drop_objectid_items(trans, log, path, ino, 5326 max_key.type); 5327 } else { 5328 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5329 &inode->runtime_flags); 5330 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5331 &inode->runtime_flags); 5332 while(1) { 5333 ret = btrfs_truncate_inode_items(trans, 5334 log, inode, 0, 0); 5335 if (ret != -EAGAIN) 5336 break; 5337 } 5338 } 5339 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5340 &inode->runtime_flags) || 5341 inode_only == LOG_INODE_EXISTS) { 5342 if (inode_only == LOG_INODE_ALL) 5343 fast_search = true; 5344 max_key.type = BTRFS_XATTR_ITEM_KEY; 5345 ret = drop_objectid_items(trans, log, path, ino, 5346 max_key.type); 5347 } else { 5348 if (inode_only == LOG_INODE_ALL) 5349 fast_search = true; 5350 goto log_extents; 5351 } 5352 5353 } 5354 if (ret) { 5355 err = ret; 5356 goto out_unlock; 5357 } 5358 5359 err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5360 path, dst_path, logged_isize, 5361 recursive_logging, inode_only, ctx, 5362 &need_log_inode_item); 5363 if (err) 5364 goto out_unlock; 5365 5366 btrfs_release_path(path); 5367 btrfs_release_path(dst_path); 5368 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5369 if (err) 5370 goto out_unlock; 5371 xattrs_logged = true; 5372 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5373 btrfs_release_path(path); 5374 btrfs_release_path(dst_path); 5375 err = btrfs_log_holes(trans, root, inode, path); 5376 if (err) 5377 goto out_unlock; 5378 } 5379 log_extents: 5380 btrfs_release_path(path); 5381 btrfs_release_path(dst_path); 5382 if (need_log_inode_item) { 5383 err = log_inode_item(trans, log, dst_path, inode); 5384 if (!err && !xattrs_logged) { 5385 err = btrfs_log_all_xattrs(trans, root, inode, path, 5386 dst_path); 5387 btrfs_release_path(path); 5388 } 5389 if (err) 5390 goto out_unlock; 5391 } 5392 if (fast_search) { 5393 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5394 ctx); 5395 if (ret) { 5396 err = ret; 5397 goto out_unlock; 5398 } 5399 } else if (inode_only == LOG_INODE_ALL) { 5400 struct extent_map *em, *n; 5401 5402 write_lock(&em_tree->lock); 5403 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 5404 list_del_init(&em->list); 5405 write_unlock(&em_tree->lock); 5406 } 5407 5408 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5409 ret = log_directory_changes(trans, root, inode, path, dst_path, 5410 ctx); 5411 if (ret) { 5412 err = ret; 5413 goto out_unlock; 5414 } 5415 } 5416 5417 /* 5418 * If we are logging that an ancestor inode exists as part of logging a 5419 * new name from a link or rename operation, don't mark the inode as 5420 * logged - otherwise if an explicit fsync is made against an ancestor, 5421 * the fsync considers the inode in the log and doesn't sync the log, 5422 * resulting in the ancestor missing after a power failure unless the 5423 * log was synced as part of an fsync against any other unrelated inode. 5424 * So keep it simple for this case and just don't flag the ancestors as 5425 * logged. 
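	 * For example (illustrative): mkdir /mnt/dir, then rename a file
	 * into /mnt/dir - the rename logs dir with LOG_INODE_EXISTS but
	 * does not sync the log. If that flagged dir as logged, a later
	 * explicit fsync of /mnt/dir would consider it already in the log,
	 * skip syncing, and dir could be missing after a power failure.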
5426 */ 5427 if (!ctx || 5428 !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && 5429 &inode->vfs_inode != ctx->inode)) { 5430 spin_lock(&inode->lock); 5431 inode->logged_trans = trans->transid; 5432 /* 5433 * Don't update last_log_commit if we logged that an inode exists 5434 * after it was loaded to memory (full_sync bit set). 5435 * This is to prevent data loss when we do a write to the inode, 5436 * then the inode gets evicted after all delalloc was flushed, 5437 * then we log it exists (due to a rename for example) and then 5438 * fsync it. This last fsync would do nothing (not logging the 5439 * extents previously written). 5440 */ 5441 if (inode_only != LOG_INODE_EXISTS || 5442 !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) 5443 inode->last_log_commit = inode->last_sub_trans; 5444 spin_unlock(&inode->lock); 5445 } 5446 out_unlock: 5447 mutex_unlock(&inode->log_mutex); 5448 5449 btrfs_free_path(path); 5450 btrfs_free_path(dst_path); 5451 return err; 5452 } 5453 5454 /* 5455 * Check if we must fallback to a transaction commit when logging an inode. 5456 * This must be called after logging the inode and is used only in the context 5457 * when fsyncing an inode requires the need to log some other inode - in which 5458 * case we can't lock the i_mutex of each other inode we need to log as that 5459 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5460 * log inodes up or down in the hierarchy) or rename operations for example. So 5461 * we take the log_mutex of the inode after we have logged it and then check for 5462 * its last_unlink_trans value - this is safe because any task setting 5463 * last_unlink_trans must take the log_mutex and it must do this before it does 5464 * the actual unlink operation, so if we do this check before a concurrent task 5465 * sets last_unlink_trans it means we've logged a consistent version/state of 5466 * all the inode items, otherwise we are not sure and must do a transaction 5467 * commit (the concurrent task might have only updated last_unlink_trans before 5468 * we logged the inode or it might have also done the unlink). 5469 */ 5470 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5471 struct btrfs_inode *inode) 5472 { 5473 bool ret = false; 5474 5475 mutex_lock(&inode->log_mutex); 5476 if (inode->last_unlink_trans >= trans->transid) { 5477 /* 5478 * Make sure any commits to the log are forced to be full 5479 * commits. 5480 */ 5481 btrfs_set_log_full_commit(trans); 5482 ret = true; 5483 } 5484 mutex_unlock(&inode->log_mutex); 5485 5486 return ret; 5487 } 5488 5489 /* 5490 * follow the dentry parent pointers up the chain and see if any 5491 * of the directories in it require a full commit before they can 5492 * be logged. Returns zero if nothing special needs to be done or 1 if 5493 * a full commit is required. 5494 */ 5495 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5496 struct btrfs_inode *inode, 5497 struct dentry *parent, 5498 struct super_block *sb) 5499 { 5500 int ret = 0; 5501 struct dentry *old_parent = NULL; 5502 5503 /* 5504 * for regular files, if its inode is already on disk, we don't 5505 * have to worry about the parents at all. This is because 5506 * we can use the last_unlink_trans field to record renames 5507 * and other fun in this file. 
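	 *
	 * E.g. a regular file created and committed in a past transaction,
	 * and not renamed or unlinked since, has both its generation and
	 * its last_unlink_trans smaller than trans->transid, so the check
	 * below skips walking the parent chain entirely (illustrative
	 * common case).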
	 */
	if (S_ISREG(inode->vfs_inode.i_mode) &&
	    inode->generation < trans->transid &&
	    inode->last_unlink_trans < trans->transid)
		goto out;

	if (!S_ISDIR(inode->vfs_inode.i_mode)) {
		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
			goto out;
		inode = BTRFS_I(d_inode(parent));
	}

	while (1) {
		if (btrfs_must_commit_transaction(trans, inode)) {
			ret = 1;
			break;
		}

		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
			break;

		if (IS_ROOT(parent)) {
			inode = BTRFS_I(d_inode(parent));
			if (btrfs_must_commit_transaction(trans, inode))
				ret = 1;
			break;
		}

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
		inode = BTRFS_I(d_inode(parent));

	}
	dput(old_parent);
out:
	return ret;
}

struct btrfs_dir_list {
	u64 ino;
	struct list_head list;
};

/*
 * Log the inodes of the new dentries of a directory. See log_dir_items() for
 * details about why it is needed.
 * This is a recursive operation - if an existing dentry corresponds to a
 * directory, that directory's new entries are logged too (same behaviour as
 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
 * the dentries point to we do not lock their i_mutex, otherwise lockdep
 * complains about the following circular lock dependency / possible deadlock:
 *
 *           CPU0                                        CPU1
 *           ----                                        ----
 * lock(&type->i_mutex_dir_key#3/2);
 *                                            lock(sb_internal#2);
 *                                            lock(&type->i_mutex_dir_key#3/2);
 * lock(&sb->s_type->i_mutex_key#14);
 *
 * Where sb_internal is the lock (a counter that works as a lock) acquired by
 * sb_start_intwrite() in btrfs_start_transaction().
 * Not locking i_mutex of the inodes is still safe because:
 *
 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
 *    that while logging the inode new references (names) are added or removed
 *    from the inode, leaving the logged inode item with a link count that does
 *    not match the number of logged inode reference items. This is fine because
 *    at log replay time we compute the real number of links and correct the
 *    link count in the inode item (see replay_one_buffer() and
 *    link_to_fixup_dir());
 *
 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
 *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
 *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
 *    has a size that doesn't match the sum of the lengths of all the logged
 *    names. This does not result in a problem because if a dir_item key is
 *    logged but its matching dir_index key is not logged, at log replay time we
 *    don't use it to replay the respective name (see replay_one_name()). On the
 *    other hand if only the dir_index key ends up being logged, the respective
 *    name is added to the fs/subvol tree with both the dir_item and dir_index
 *    keys created (see replay_one_name()).
 *    The directory's inode item with a wrong i_size is not a problem either,
 *    since we don't use it at log replay time to set the i_size in the inode
 *    item of the fs/subvol tree (see overwrite_item()).
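 *
 * A minimal example of why the walk is needed (illustrative):
 *
 * mkdir /mnt/x
 * touch /mnt/x/foo
 * xfs_io -c fsync /mnt/x
 * <power fail>
 *
 * The directory's log records a dentry for foo, but foo's inode was never
 * fsynced; by also logging the inodes the new dentries point to (and
 * recursing into new subdirectories) replay can recreate them.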
5593 */ 5594 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5595 struct btrfs_root *root, 5596 struct btrfs_inode *start_inode, 5597 struct btrfs_log_ctx *ctx) 5598 { 5599 struct btrfs_fs_info *fs_info = root->fs_info; 5600 struct btrfs_root *log = root->log_root; 5601 struct btrfs_path *path; 5602 LIST_HEAD(dir_list); 5603 struct btrfs_dir_list *dir_elem; 5604 int ret = 0; 5605 5606 path = btrfs_alloc_path(); 5607 if (!path) 5608 return -ENOMEM; 5609 5610 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5611 if (!dir_elem) { 5612 btrfs_free_path(path); 5613 return -ENOMEM; 5614 } 5615 dir_elem->ino = btrfs_ino(start_inode); 5616 list_add_tail(&dir_elem->list, &dir_list); 5617 5618 while (!list_empty(&dir_list)) { 5619 struct extent_buffer *leaf; 5620 struct btrfs_key min_key; 5621 int nritems; 5622 int i; 5623 5624 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5625 list); 5626 if (ret) 5627 goto next_dir_inode; 5628 5629 min_key.objectid = dir_elem->ino; 5630 min_key.type = BTRFS_DIR_ITEM_KEY; 5631 min_key.offset = 0; 5632 again: 5633 btrfs_release_path(path); 5634 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5635 if (ret < 0) { 5636 goto next_dir_inode; 5637 } else if (ret > 0) { 5638 ret = 0; 5639 goto next_dir_inode; 5640 } 5641 5642 process_leaf: 5643 leaf = path->nodes[0]; 5644 nritems = btrfs_header_nritems(leaf); 5645 for (i = path->slots[0]; i < nritems; i++) { 5646 struct btrfs_dir_item *di; 5647 struct btrfs_key di_key; 5648 struct inode *di_inode; 5649 struct btrfs_dir_list *new_dir_elem; 5650 int log_mode = LOG_INODE_EXISTS; 5651 int type; 5652 5653 btrfs_item_key_to_cpu(leaf, &min_key, i); 5654 if (min_key.objectid != dir_elem->ino || 5655 min_key.type != BTRFS_DIR_ITEM_KEY) 5656 goto next_dir_inode; 5657 5658 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5659 type = btrfs_dir_type(leaf, di); 5660 if (btrfs_dir_transid(leaf, di) < trans->transid && 5661 type != BTRFS_FT_DIR) 5662 continue; 5663 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5664 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5665 continue; 5666 5667 btrfs_release_path(path); 5668 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 5669 if (IS_ERR(di_inode)) { 5670 ret = PTR_ERR(di_inode); 5671 goto next_dir_inode; 5672 } 5673 5674 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5675 btrfs_add_delayed_iput(di_inode); 5676 break; 5677 } 5678 5679 ctx->log_new_dentries = false; 5680 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5681 log_mode = LOG_INODE_ALL; 5682 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5683 log_mode, ctx); 5684 if (!ret && 5685 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5686 ret = 1; 5687 btrfs_add_delayed_iput(di_inode); 5688 if (ret) 5689 goto next_dir_inode; 5690 if (ctx->log_new_dentries) { 5691 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5692 GFP_NOFS); 5693 if (!new_dir_elem) { 5694 ret = -ENOMEM; 5695 goto next_dir_inode; 5696 } 5697 new_dir_elem->ino = di_key.objectid; 5698 list_add_tail(&new_dir_elem->list, &dir_list); 5699 } 5700 break; 5701 } 5702 if (i == nritems) { 5703 ret = btrfs_next_leaf(log, path); 5704 if (ret < 0) { 5705 goto next_dir_inode; 5706 } else if (ret > 0) { 5707 ret = 0; 5708 goto next_dir_inode; 5709 } 5710 goto process_leaf; 5711 } 5712 if (min_key.offset < (u64)-1) { 5713 min_key.offset++; 5714 goto again; 5715 } 5716 next_dir_inode: 5717 list_del(&dir_elem->list); 5718 kfree(dir_elem); 5719 } 5720 5721 btrfs_free_path(path); 5722 return ret; 5723 } 
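/*
 * Log every parent directory that references the inode (one per hard link,
 * found by walking its inode ref items), with LOG_INODE_ALL, so that every
 * new name added in this transaction is visible after a log replay. If a
 * parent directory was deleted in the current transaction, an error is
 * returned so the caller falls back to a transaction commit (see the
 * comment about deleted parents further below).
 */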
5724 5725 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5726 struct btrfs_inode *inode, 5727 struct btrfs_log_ctx *ctx) 5728 { 5729 struct btrfs_fs_info *fs_info = trans->fs_info; 5730 int ret; 5731 struct btrfs_path *path; 5732 struct btrfs_key key; 5733 struct btrfs_root *root = inode->root; 5734 const u64 ino = btrfs_ino(inode); 5735 5736 path = btrfs_alloc_path(); 5737 if (!path) 5738 return -ENOMEM; 5739 path->skip_locking = 1; 5740 path->search_commit_root = 1; 5741 5742 key.objectid = ino; 5743 key.type = BTRFS_INODE_REF_KEY; 5744 key.offset = 0; 5745 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5746 if (ret < 0) 5747 goto out; 5748 5749 while (true) { 5750 struct extent_buffer *leaf = path->nodes[0]; 5751 int slot = path->slots[0]; 5752 u32 cur_offset = 0; 5753 u32 item_size; 5754 unsigned long ptr; 5755 5756 if (slot >= btrfs_header_nritems(leaf)) { 5757 ret = btrfs_next_leaf(root, path); 5758 if (ret < 0) 5759 goto out; 5760 else if (ret > 0) 5761 break; 5762 continue; 5763 } 5764 5765 btrfs_item_key_to_cpu(leaf, &key, slot); 5766 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5767 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5768 break; 5769 5770 item_size = btrfs_item_size_nr(leaf, slot); 5771 ptr = btrfs_item_ptr_offset(leaf, slot); 5772 while (cur_offset < item_size) { 5773 struct btrfs_key inode_key; 5774 struct inode *dir_inode; 5775 5776 inode_key.type = BTRFS_INODE_ITEM_KEY; 5777 inode_key.offset = 0; 5778 5779 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5780 struct btrfs_inode_extref *extref; 5781 5782 extref = (struct btrfs_inode_extref *) 5783 (ptr + cur_offset); 5784 inode_key.objectid = btrfs_inode_extref_parent( 5785 leaf, extref); 5786 cur_offset += sizeof(*extref); 5787 cur_offset += btrfs_inode_extref_name_len(leaf, 5788 extref); 5789 } else { 5790 inode_key.objectid = key.offset; 5791 cur_offset = item_size; 5792 } 5793 5794 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 5795 root); 5796 /* 5797 * If the parent inode was deleted, return an error to 5798 * fallback to a transaction commit. This is to prevent 5799 * getting an inode that was moved from one parent A to 5800 * a parent B, got its former parent A deleted and then 5801 * it got fsync'ed, from existing at both parents after 5802 * a log replay (and the old parent still existing). 5803 * Example: 5804 * 5805 * mkdir /mnt/A 5806 * mkdir /mnt/B 5807 * touch /mnt/B/bar 5808 * sync 5809 * mv /mnt/B/bar /mnt/A/bar 5810 * mv -T /mnt/A /mnt/B 5811 * fsync /mnt/B/bar 5812 * <power fail> 5813 * 5814 * If we ignore the old parent B which got deleted, 5815 * after a log replay we would have file bar linked 5816 * at both parents and the old parent B would still 5817 * exist. 

static int log_new_ancestors(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_log_ctx *ctx)
{
	struct btrfs_key found_key;

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	while (true) {
		struct btrfs_fs_info *fs_info = root->fs_info;
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key search_key;
		struct inode *inode;
		u64 ino;
		int ret = 0;

		btrfs_release_path(path);

		ino = found_key.offset;

		search_key.objectid = found_key.offset;
		search_key.type = BTRFS_INODE_ITEM_KEY;
		search_key.offset = 0;
		inode = btrfs_iget(fs_info->sb, ino, root);
		if (IS_ERR(inode))
			return PTR_ERR(inode);

		if (BTRFS_I(inode)->generation >= trans->transid)
			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
					      LOG_INODE_EXISTS, ctx);
		btrfs_add_delayed_iput(inode);
		if (ret)
			return ret;

		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
			break;

		search_key.type = BTRFS_INODE_REF_KEY;
		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
		if (ret < 0)
			return ret;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				return -ENOENT;
			leaf = path->nodes[0];
			slot = path->slots[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != search_key.objectid ||
		    found_key.type != BTRFS_INODE_REF_KEY)
			return -ENOENT;
	}
	return 0;
}

static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct dentry *old_parent = NULL;
	struct super_block *sb = inode->vfs_inode.i_sb;
	int ret = 0;

	while (true) {
		if (!parent || d_really_is_negative(parent) ||
		    sb != parent->d_sb)
			break;

		inode = BTRFS_I(d_inode(parent));
		if (root != inode->root)
			break;

		if (inode->generation >= trans->transid) {
			ret = btrfs_log_inode(trans, root, inode,
					      LOG_INODE_EXISTS, ctx);
			if (ret)
				break;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	dput(old_parent);

	return ret;
}
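
/*
 * log_new_ancestors_fast() climbs the dentry chain with dget_parent() until
 * IS_ROOT(), logging any ancestor created in the current transaction. A rough
 * user-space analogy of that climb (a sketch only, not this function's
 * behavior: it fsyncs every ancestor directory of a path up to an assumed
 * mount point at /mnt):
 *
 *	#include <fcntl.h>
 *	#include <libgen.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int fsync_ancestors(const char *file_path)
 *	{
 *		char buf[4096];
 *
 *		snprintf(buf, sizeof(buf), "%s", file_path);
 *		for (;;) {
 *			char *dir = dirname(buf);	// like dget_parent()
 *			int fd = open(dir, O_RDONLY | O_DIRECTORY);
 *
 *			if (fd < 0)
 *				return -1;
 *			if (fsync(fd)) {		// "log" this ancestor
 *				close(fd);
 *				return -1;
 *			}
 *			close(fd);
 *			if (!strcmp(dir, "/mnt") ||	// like IS_ROOT()
 *			    !strcmp(dir, "/"))
 *				return 0;
 *			memmove(buf, dir, strlen(dir) + 1);
 *		}
 *	}
 */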

static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct dentry *parent,
				 struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	const u64 ino = btrfs_ino(inode);
	struct btrfs_path *path;
	struct btrfs_key search_key;
	int ret;

	/*
	 * For the single hard link case, go through the fast path that does
	 * not need to iterate the fs/subvolume tree.
	 */
	if (inode->vfs_inode.i_nlink < 2)
		return log_new_ancestors_fast(trans, inode, parent, ctx);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = 0;
again:
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 0)
		path->slots[0]++;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key found_key;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != ino ||
		    found_key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		/*
		 * Don't deal with extended references because they are rare
		 * cases and too complex to deal with (we would need to keep
		 * track of which subitem we are processing for each item in
		 * this loop, etc). So just return some error to fall back to
		 * a transaction commit.
		 */
		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = -EMLINK;
			goto out;
		}

		/*
		 * Logging ancestors needs to do more searches on the fs/subvol
		 * tree, so it releases the path as needed to avoid deadlocks.
		 * Keep track of the last inode ref key and resume from that
		 * key after logging all new ancestors for the current hard
		 * link.
		 */
		memcpy(&search_key, &found_key, sizeof(search_key));

		ret = log_new_ancestors(trans, root, path, ctx);
		if (ret)
			goto out;
		btrfs_release_path(path);
		goto again;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
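
/*
 * The slow path above only triggers once an inode has more than one hard
 * link, since each link can hang off a different chain of new ancestor
 * directories. A user-space sequence that exercises it (a sketch; assumes a
 * btrfs filesystem mounted at /mnt):
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd;
 *
 *		mkdir("/mnt/d1", 0755);
 *		mkdir("/mnt/d2", 0755);
 *		fd = open("/mnt/d1/foo", O_CREAT | O_WRONLY, 0644);
 *		close(fd);
 *		link("/mnt/d1/foo", "/mnt/d2/bar");	// i_nlink is now 2
 *		fd = open("/mnt/d1/foo", O_WRONLY);
 *		fsync(fd);	// must log new ancestors of both d1 and d2
 *		close(fd);
 *		return 0;
 *	}
 */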

/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log. Minimal inode and backref
 * only logging is done for any parent directories that are older than
 * the last committed transaction
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  int inode_only,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct super_block *sb;
	int ret = 0;
	bool log_dentries = false;

	sb = inode->vfs_inode.i_sb;

	if (btrfs_test_opt(fs_info, NOTREELOG)) {
		ret = 1;
		goto end_no_trans;
	}

	if (btrfs_root_refs(&root->root_item) == 0) {
		ret = 1;
		goto end_no_trans;
	}

	ret = check_parent_dirs_for_sync(trans, inode, parent, sb);
	if (ret)
		goto end_no_trans;

	/*
	 * Skip already logged inodes or inodes corresponding to tmpfiles
	 * (since logging them is pointless, a link count of 0 means they
	 * will never be accessible).
	 */
	if (btrfs_inode_in_log(inode, trans->transid) ||
	    inode->vfs_inode.i_nlink == 0) {
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
	if (ret)
		goto end_trans;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->vfs_inode.i_mode) &&
	    inode->generation < trans->transid &&
	    inode->last_unlink_trans < trans->transid) {
		ret = 0;
		goto end_trans;
	}

	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
		log_dentries = true;

	/*
	 * On unlink we must make sure all our current and old parent directory
	 * inodes are fully logged. This is to prevent leaving dangling
	 * directory index entries in directories that were our parents but are
	 * not anymore. Not doing this results in the old parent directory
	 * being impossible to delete after log replay (rmdir will always fail
	 * with error -ENOTEMPTY).
	 *
	 * Example 1:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * ln testdir/foo testdir/bar
	 * sync
	 * unlink testdir/bar
	 * xfs_io -c fsync testdir/foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * If we don't log the parent directory (testdir), after log replay the
	 * directory still has an entry pointing to the file inode using the
	 * bar name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist
	 * and the file inode has a link count of 1.
	 *
	 * Example 2:
	 *
	 * mkdir testdir
	 * touch foo
	 * ln foo testdir/foo2
	 * ln foo testdir/foo3
	 * sync
	 * unlink testdir/foo3
	 * xfs_io -c fsync foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * Similar to the first example, after log replay the parent directory
	 * testdir still has an entry pointing to the file inode with name foo3
	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
	 * and has a link count of 2.
	 */
	if (inode->last_unlink_trans >= trans->transid) {
		ret = btrfs_log_all_parents(trans, inode, ctx);
		if (ret)
			goto end_trans;
	}

	ret = log_all_new_ancestors(trans, inode, parent, ctx);
	if (ret)
		goto end_trans;

	if (log_dentries)
		ret = log_new_dir_dentries(trans, root, inode, ctx);
	else
		ret = 0;
end_trans:
	if (ret < 0) {
		btrfs_set_log_full_commit(trans);
		ret = 1;
	}

	if (ret)
		btrfs_remove_log_ctx(root, ctx);
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}
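
/*
 * Example 1 from the comment above, as a user-space program (a sketch;
 * assumes a btrfs filesystem mounted at /mnt and a power failure right after
 * fsync() returns). Because the unlink bumped last_unlink_trans on foo's
 * inode, the fsync also fully logs testdir, so replay removes the stale
 * "bar" entry:
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd;
 *
 *		mkdir("/mnt/testdir", 0755);
 *		fd = open("/mnt/testdir/foo", O_CREAT | O_WRONLY, 0644);
 *		close(fd);
 *		link("/mnt/testdir/foo", "/mnt/testdir/bar");
 *		sync();
 *		unlink("/mnt/testdir/bar");
 *		fd = open("/mnt/testdir/foo", O_WRONLY);
 *		fsync(fd);	// like xfs_io -c fsync testdir/foo
 *		close(fd);
 *		return 0;	// <power failure>, mount replays the log
 *	}
 */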

/*
 * it is not safe to log a dentry if the chunk root has added new
 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct dentry *dentry,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
				     LOG_INODE_ALL, ctx);
	dput(parent);

	return ret;
}

/*
 * should be called during mount to recover and replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = LOG_WALK_PIN_ONLY,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
			"Failed to pin buffers while recovering log root tree.");
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_tree_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read tree log root.");
			goto error;
		}

		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
						   true);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);

			/*
			 * We didn't find the subvol, likely because it was
			 * deleted. This is ok, simply skip this log and go to
			 * the next one.
			 *
			 * We need to exclude the root because we can't have
			 * other log replays overwriting this log as we'll read
			 * it back in a few more times. This will keep our
			 * block from being modified, and we'll just bail for
			 * each subsequent pass.
			 */
			if (ret == -ENOENT)
				ret = btrfs_pin_extent_for_log_replay(trans,
							log->node->start,
							log->node->len);
			btrfs_put_root(log);

			if (!ret)
				goto next;
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read target root for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			struct btrfs_root *root = wc.replay_dest;

			btrfs_release_path(path);

			/*
			 * We have just replayed everything, and the highest
			 * objectid of fs roots probably has changed in case
			 * some inode_item's got replayed.
			 *
			 * root->objectid_mutex is not acquired as log replay
			 * could only happen during mount.
			 */
			ret = btrfs_find_highest_objectid(root,
						  &root->highest_objectid);
		}

		wc.replay_dest->log_root = NULL;
		btrfs_put_root(wc.replay_dest);
		btrfs_put_root(log);

		if (ret)
			goto error;
next:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	log_root_tree->log_root = NULL;
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_put_root(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans);
	btrfs_free_path(path);
	return ret;
}
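
/*
 * The goto-again structure above is effectively a staged state machine: every
 * log tree is walked once per stage, and the stage only advances after all
 * trees have completed the current pass. A minimal sketch of that control
 * flow (the walk_one() helper is hypothetical, standing in for
 * walk_log_tree()):
 *
 *	#include <stdio.h>
 *
 *	enum { PIN_ONLY, REPLAY_INODES, REPLAY_DIR_INDEX, REPLAY_ALL };
 *
 *	static void walk_one(int tree, int stage)
 *	{
 *		printf("log tree %d: pass %d\n", tree, stage);
 *	}
 *
 *	int main(void)
 *	{
 *		int stage = PIN_ONLY;
 *
 *		while (stage <= REPLAY_ALL) {
 *			for (int tree = 0; tree < 3; tree++)
 *				walk_one(tree, stage);	// the "again:" loop
 *			stage++;	// pin, then inodes, dir index, the rest
 *		}
 *		return 0;
 *	}
 */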

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	mutex_lock(&inode->log_mutex);
	inode->last_unlink_trans = trans->transid;
	mutex_unlock(&inode->log_mutex);

	/*
	 * if this directory was already logged, any new
	 * names for this file/dir will get recorded
	 */
	if (dir->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (inode->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}
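
/*
 * A user-space sequence hitting the for_rename case above (a sketch; assumes
 * a btrfs filesystem mounted at /mnt and a power failure right after fsync()
 * returns). Recording last_unlink_trans in A makes the later fsync of A fall
 * back to a full transaction commit, so B/foo is not lost:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd;
 *
 *		mkdir("/mnt/A", 0755);
 *		mkdir("/mnt/B", 0755);
 *		fd = open("/mnt/A/foo", O_CREAT | O_WRONLY, 0644);
 *		close(fd);
 *		sync();
 *		rename("/mnt/A/foo", "/mnt/B/foo");
 *		fd = open("/mnt/A", O_RDONLY | O_DIRECTORY);
 *		fsync(fd);	// forces a full commit via last_unlink_trans
 *		close(fd);
 *		return 0;	// <power failure>
 *	}
 */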

/*
 * Make sure that if someone attempts to fsync the parent directory of a
 * deleted snapshot, it ends up triggering a transaction commit. This is to
 * guarantee that after replaying the log tree of the parent directory's root
 * we will not see the snapshot anymore and at log replay time we will not
 * see any log tree corresponding to the deleted snapshot's root, which could
 * lead to replaying it after replaying the log tree of the parent directory
 * (which would replay the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and tree of tree roots trees, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 */
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
			struct dentry *parent)
{
	struct btrfs_log_ctx ctx;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (!S_ISDIR(inode->vfs_inode.i_mode))
		inode->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming it
	 * from hasn't been logged, we don't need to log it
	 */
	if (inode->logged_trans < trans->transid &&
	    (!old_dir || old_dir->logged_trans < trans->transid))
		return;

	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
	ctx.logging_new_name = true;
	/*
	 * We don't care about the return value. If we fail to log the new name
	 * then we know the next attempt to sync the log will fall back to a
	 * full transaction commit (due to a call to
	 * btrfs_set_log_full_commit()), so we don't need to worry about
	 * getting a log committed that has an inconsistent state after a
	 * rename operation.
	 */
	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}
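
/*
 * A user-space sequence where btrfs_log_new_name() matters (a sketch; assumes
 * a btrfs filesystem mounted at /mnt and a power failure right after the
 * second fsync() returns). Since the inode was logged by the first fsync, the
 * rename must update the log so that replay does not leave the file reachable
 * only under its old name:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd;
 *
 *		fd = open("/mnt/foo", O_CREAT | O_WRONLY, 0644);
 *		fsync(fd);			// inode is now in the log
 *		close(fd);
 *		rename("/mnt/foo", "/mnt/bar");	// btrfs_log_new_name() runs
 *		fd = open("/mnt/bar", O_WRONLY);
 *		fsync(fd);			// syncs the updated log
 *		close(fd);
 *		return 0;			// <power failure>
 *	}
 */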