// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
#include "block-group.h"
#include "space-info.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 * LOG_OTHER_INODE and LOG_OTHER_INODE_ALL mean we are logging an inode
 * other than the one fsync was called on (used while resolving
 * conflicting names)
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume, and the
 * third stage (2) replays directory index items.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx && !ctx->logging_new_name) {
		int index = root->log_transid % 2;

		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}
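/*
 * Typical usage, a sketch of the pairing done by the fsync path: every
 * successful start_log_trans() is matched by a btrfs_end_log_trans() so
 * the writer count taken above is always dropped again, e.g.:
 *
 *	ret = start_log_trans(trans, root, ctx);
 *	if (ret)
 *		return ret;
 *	ret = btrfs_log_inode(trans, root, inode, LOG_INODE_ALL, ctx);
 *	btrfs_end_log_trans(root);
 */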
/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}
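/*
 * The two helpers below flush a single log tree block to disk (or wait
 * for such a flush) through the btree inode's address space, using the
 * buffer's disk byte range.  They back the ->write and ->wait flags of
 * struct walk_control further down.
 */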
static int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
					buf->start + buf->len - 1);
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};
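/*
 * Example initializations, a sketch based on how the walk is driven
 * elsewhere in this file (not a complete list): log replay begins with
 *
 *	struct walk_control wc = {
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *		.pin = 1,
 *	};
 *
 * and advances wc.stage through the LOG_WALK_* values, while freeing a
 * log tree at transaction commit time uses .free = 1 instead.
 */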
/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;

		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range.  We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls).  In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other.  For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.  Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;

				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_del_csums(trans,
							fs_info->csum_root,
							sums->bytenr,
							sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
							fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 *
 * Returns 1 if the name was found in the log, 0 if it was not, and
 * < 0 on error.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}
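/*
 * Helper for add_inode_ref(): resolve conflicts before a logged name
 * for 'inode' can be (re)added below directory 'dir'.  Any name on the
 * inode's back reference items in the subvolume tree that is not also
 * in the log is unlinked, and any dir item or dir index entry that
 * clashes with the new name or sequence number is dropped.  Returns 1
 * if the back reference belongs to the root directory itself (nothing
 * to do), 0 on success and a negative errno on error.  *search_done is
 * set once the subvolume tree has been searched so the caller does not
 * repeat the work for every name in the logged item.
 */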
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir,
							 inode, victim_name,
							 victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name,
					   (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}
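/*
 * Decode a single name entry from an INODE_EXTREF or INODE_REF item at
 * 'ref_ptr' inside 'eb': the two helpers below copy the name into a
 * freshly allocated buffer (the caller must kfree() it) and optionally
 * return the dir index and, for extended refs, the parent objectid.
 */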
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists).  For any name that is not in the inode reference item from the
 * log tree, do a proper unlink of that name (that is, remove its entry from
 * the inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
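/*
 * Check whether the subvolume tree of 'inode' already has a back
 * reference of the given type for 'name' below directory 'dir'.
 * Returns 1 if the reference exists, 0 if not, and < 0 on error.
 */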
static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
				path->slots[0], name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}
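/*
 * Add a link from 'dir' to 'inode' with the given name during replay.
 * If the name already points at a different inode (one that sorts after
 * the current one in the log and therefore has not been processed yet),
 * that stale dentry is unlinked first, see the comment in the body.
 */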
static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
				 BTRFS_I(other_inode), name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it.  We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out;
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				  btrfs_ino(BTRFS_I(inode)), ref_index,
				  name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata.  if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = btrfs_unlink_inode(trans, root,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it.  We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, root, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb,
				    slot, key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}
static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}
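/*
 * The two counters below walk all INODE_EXTREF / INODE_REF items of an
 * inode in the given root and return how many names point at it; their
 * sum is the link count the inode should have after replay (see
 * fixup_inode_link_count() further down).
 */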
static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}
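/*
 * The orphan items inserted above under BTRFS_TREE_LOG_FIXUP_OBJECTID
 * are consumed by fixup_inode_link_counts() once replay is done, at
 * which point the link count of each recorded inode is recomputed from
 * its back references.
 */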
/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	/*
	 * Check if the inode reference exists in the log for the given name,
	 * inode and parent inode
	 */
	found_key.objectid = log_key.objectid;
	found_key.type = BTRFS_INODE_REF_KEY;
	found_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}

	found_key.objectid = log_key.objectid;
	found_key.type = BTRFS_INODE_EXTREF_KEY;
	found_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
			     name_len);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
*/
1997 ret = 0;
1998 update_size = false;
1999 goto out;
2000 }
2001
2002 found_key.objectid = log_key.objectid;
2003 found_key.type = BTRFS_INODE_EXTREF_KEY;
2004 found_key.offset = key->objectid;
2005 ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2006 name_len);
2007 if (ret < 0) {
2008 goto out;
2009 } else if (ret) {
2010 /* The dentry will be added later. */
2011 ret = 0;
2012 update_size = false;
2013 goto out;
2014 }
2015 btrfs_release_path(path);
2016 ret = insert_one_name(trans, root, key->objectid, key->offset,
2017 name, name_len, &log_key);
2018 if (ret && ret != -ENOENT && ret != -EEXIST)
2019 goto out;
2020 if (!ret)
2021 name_added = true;
2022 update_size = false;
2023 ret = 0;
2024 goto out;
2025 }
2026
2027 /*
2028 * find all the names in a directory item and reconcile them into
2029 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
2030 * one name in a directory item, but the same code gets used for
2031 * both directory index types
2032 */
2033 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2034 struct btrfs_root *root,
2035 struct btrfs_path *path,
2036 struct extent_buffer *eb, int slot,
2037 struct btrfs_key *key)
2038 {
2039 int ret = 0;
2040 u32 item_size = btrfs_item_size_nr(eb, slot);
2041 struct btrfs_dir_item *di;
2042 int name_len;
2043 unsigned long ptr;
2044 unsigned long ptr_end;
2045 struct btrfs_path *fixup_path = NULL;
2046
2047 ptr = btrfs_item_ptr_offset(eb, slot);
2048 ptr_end = ptr + item_size;
2049 while (ptr < ptr_end) {
2050 di = (struct btrfs_dir_item *)ptr;
2051 name_len = btrfs_dir_name_len(eb, di);
2052 ret = replay_one_name(trans, root, path, eb, di, key);
2053 if (ret < 0)
2054 break;
2055 ptr = (unsigned long)(di + 1);
2056 ptr += name_len;
2057
2058 /*
2059 * If this entry refers to a non-directory (directories can not
2060 * have a link count > 1) and it was added in the transaction
2061 * that was not committed, make sure we fixup the link count of
2062 * the inode the entry points to. Otherwise something like
2063 * the following would result in a directory pointing to an
2064 * inode with a wrong link count that does not account for this
2065 * dir entry:
2066 *
2067 * mkdir testdir
2068 * touch testdir/foo
2069 * touch testdir/bar
2070 * sync
2071 *
2072 * ln testdir/bar testdir/bar_link
2073 * ln testdir/foo testdir/foo_link
2074 * xfs_io -c "fsync" testdir/bar
2075 *
2076 * <power failure>
2077 *
2078 * mount fs, log replay happens
2079 *
2080 * File foo would remain with a link count of 1 when it has two
2081 * entries pointing to it in the directory testdir. This would
2082 * make it impossible to ever delete the parent directory, as
2083 * it would result in stale dentries that can never be deleted.
2084 */
2085 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2086 struct btrfs_key di_key;
2087
2088 if (!fixup_path) {
2089 fixup_path = btrfs_alloc_path();
2090 if (!fixup_path) {
2091 ret = -ENOMEM;
2092 break;
2093 }
2094 }
2095
2096 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2097 ret = link_to_fixup_dir(trans, root, fixup_path,
2098 di_key.objectid);
2099 if (ret)
2100 break;
2101 }
2102 ret = 0;
2103 }
2104 btrfs_free_path(fixup_path);
2105 return ret;
2106 }
2107
2108 /*
2109 * directory replay has two parts. There are the standard directory
2110 * items in the log copied from the subvolume, and range items
2111 * created in the log while the subvolume was logged.
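* (The range items are created by insert_dir_log_key(), called from
* log_dir_items() when the directory is logged; there is one item type
* for dir item keys and another for dir index keys.)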
2112 * 2113 * The range items tell us which parts of the key space the log 2114 * is authoritative for. During replay, if a key in the subvolume 2115 * directory is in a logged range item, but not actually in the log 2116 * that means it was deleted from the directory before the fsync 2117 * and should be removed. 2118 */ 2119 static noinline int find_dir_range(struct btrfs_root *root, 2120 struct btrfs_path *path, 2121 u64 dirid, int key_type, 2122 u64 *start_ret, u64 *end_ret) 2123 { 2124 struct btrfs_key key; 2125 u64 found_end; 2126 struct btrfs_dir_log_item *item; 2127 int ret; 2128 int nritems; 2129 2130 if (*start_ret == (u64)-1) 2131 return 1; 2132 2133 key.objectid = dirid; 2134 key.type = key_type; 2135 key.offset = *start_ret; 2136 2137 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2138 if (ret < 0) 2139 goto out; 2140 if (ret > 0) { 2141 if (path->slots[0] == 0) 2142 goto out; 2143 path->slots[0]--; 2144 } 2145 if (ret != 0) 2146 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2147 2148 if (key.type != key_type || key.objectid != dirid) { 2149 ret = 1; 2150 goto next; 2151 } 2152 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2153 struct btrfs_dir_log_item); 2154 found_end = btrfs_dir_log_end(path->nodes[0], item); 2155 2156 if (*start_ret >= key.offset && *start_ret <= found_end) { 2157 ret = 0; 2158 *start_ret = key.offset; 2159 *end_ret = found_end; 2160 goto out; 2161 } 2162 ret = 1; 2163 next: 2164 /* check the next slot in the tree to see if it is a valid item */ 2165 nritems = btrfs_header_nritems(path->nodes[0]); 2166 path->slots[0]++; 2167 if (path->slots[0] >= nritems) { 2168 ret = btrfs_next_leaf(root, path); 2169 if (ret) 2170 goto out; 2171 } 2172 2173 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2174 2175 if (key.type != key_type || key.objectid != dirid) { 2176 ret = 1; 2177 goto out; 2178 } 2179 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2180 struct btrfs_dir_log_item); 2181 found_end = btrfs_dir_log_end(path->nodes[0], item); 2182 *start_ret = key.offset; 2183 *end_ret = found_end; 2184 ret = 0; 2185 out: 2186 btrfs_release_path(path); 2187 return ret; 2188 } 2189 2190 /* 2191 * this looks for a given directory item in the log. 
If the directory 2192 * item is not in the log, the item is removed and the inode it points 2193 * to is unlinked 2194 */ 2195 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2196 struct btrfs_root *root, 2197 struct btrfs_root *log, 2198 struct btrfs_path *path, 2199 struct btrfs_path *log_path, 2200 struct inode *dir, 2201 struct btrfs_key *dir_key) 2202 { 2203 int ret; 2204 struct extent_buffer *eb; 2205 int slot; 2206 u32 item_size; 2207 struct btrfs_dir_item *di; 2208 struct btrfs_dir_item *log_di; 2209 int name_len; 2210 unsigned long ptr; 2211 unsigned long ptr_end; 2212 char *name; 2213 struct inode *inode; 2214 struct btrfs_key location; 2215 2216 again: 2217 eb = path->nodes[0]; 2218 slot = path->slots[0]; 2219 item_size = btrfs_item_size_nr(eb, slot); 2220 ptr = btrfs_item_ptr_offset(eb, slot); 2221 ptr_end = ptr + item_size; 2222 while (ptr < ptr_end) { 2223 di = (struct btrfs_dir_item *)ptr; 2224 name_len = btrfs_dir_name_len(eb, di); 2225 name = kmalloc(name_len, GFP_NOFS); 2226 if (!name) { 2227 ret = -ENOMEM; 2228 goto out; 2229 } 2230 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2231 name_len); 2232 log_di = NULL; 2233 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2234 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2235 dir_key->objectid, 2236 name, name_len, 0); 2237 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2238 log_di = btrfs_lookup_dir_index_item(trans, log, 2239 log_path, 2240 dir_key->objectid, 2241 dir_key->offset, 2242 name, name_len, 0); 2243 } 2244 if (!log_di || log_di == ERR_PTR(-ENOENT)) { 2245 btrfs_dir_item_key_to_cpu(eb, di, &location); 2246 btrfs_release_path(path); 2247 btrfs_release_path(log_path); 2248 inode = read_one_inode(root, location.objectid); 2249 if (!inode) { 2250 kfree(name); 2251 return -EIO; 2252 } 2253 2254 ret = link_to_fixup_dir(trans, root, 2255 path, location.objectid); 2256 if (ret) { 2257 kfree(name); 2258 iput(inode); 2259 goto out; 2260 } 2261 2262 inc_nlink(inode); 2263 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2264 BTRFS_I(inode), name, name_len); 2265 if (!ret) 2266 ret = btrfs_run_delayed_items(trans); 2267 kfree(name); 2268 iput(inode); 2269 if (ret) 2270 goto out; 2271 2272 /* there might still be more names under this key 2273 * check and repeat if required 2274 */ 2275 ret = btrfs_search_slot(NULL, root, dir_key, path, 2276 0, 0); 2277 if (ret == 0) 2278 goto again; 2279 ret = 0; 2280 goto out; 2281 } else if (IS_ERR(log_di)) { 2282 kfree(name); 2283 return PTR_ERR(log_di); 2284 } 2285 btrfs_release_path(log_path); 2286 kfree(name); 2287 2288 ptr = (unsigned long)(di + 1); 2289 ptr += name_len; 2290 } 2291 ret = 0; 2292 out: 2293 btrfs_release_path(path); 2294 btrfs_release_path(log_path); 2295 return ret; 2296 } 2297 2298 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2299 struct btrfs_root *root, 2300 struct btrfs_root *log, 2301 struct btrfs_path *path, 2302 const u64 ino) 2303 { 2304 struct btrfs_key search_key; 2305 struct btrfs_path *log_path; 2306 int i; 2307 int nritems; 2308 int ret; 2309 2310 log_path = btrfs_alloc_path(); 2311 if (!log_path) 2312 return -ENOMEM; 2313 2314 search_key.objectid = ino; 2315 search_key.type = BTRFS_XATTR_ITEM_KEY; 2316 search_key.offset = 0; 2317 again: 2318 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2319 if (ret < 0) 2320 goto out; 2321 process_leaf: 2322 nritems = btrfs_header_nritems(path->nodes[0]); 2323 for (i = path->slots[0]; i < nritems; i++) { 2324 struct btrfs_key key; 
2325 struct btrfs_dir_item *di;
2326 struct btrfs_dir_item *log_di;
2327 u32 total_size;
2328 u32 cur;
2329
2330 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2331 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2332 ret = 0;
2333 goto out;
2334 }
2335
2336 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2337 total_size = btrfs_item_size_nr(path->nodes[0], i);
2338 cur = 0;
2339 while (cur < total_size) {
2340 u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2341 u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2342 u32 this_len = sizeof(*di) + name_len + data_len;
2343 char *name;
2344
2345 name = kmalloc(name_len, GFP_NOFS);
2346 if (!name) {
2347 ret = -ENOMEM;
2348 goto out;
2349 }
2350 read_extent_buffer(path->nodes[0], name,
2351 (unsigned long)(di + 1), name_len);
2352
2353 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2354 name, name_len, 0);
2355 btrfs_release_path(log_path);
2356 if (!log_di) {
2357 /* Doesn't exist in log tree, so delete it. */
2358 btrfs_release_path(path);
2359 di = btrfs_lookup_xattr(trans, root, path, ino,
2360 name, name_len, -1);
2361 kfree(name);
2362 if (IS_ERR(di)) {
2363 ret = PTR_ERR(di);
2364 goto out;
2365 }
2366 ASSERT(di);
2367 ret = btrfs_delete_one_dir_name(trans, root,
2368 path, di);
2369 if (ret)
2370 goto out;
2371 btrfs_release_path(path);
2372 search_key = key;
2373 goto again;
2374 }
2375 kfree(name);
2376 if (IS_ERR(log_di)) {
2377 ret = PTR_ERR(log_di);
2378 goto out;
2379 }
2380 cur += this_len;
2381 di = (struct btrfs_dir_item *)((char *)di + this_len);
2382 }
2383 }
2384 ret = btrfs_next_leaf(root, path);
2385 if (ret > 0)
2386 ret = 0;
2387 else if (ret == 0)
2388 goto process_leaf;
2389 out:
2390 btrfs_free_path(log_path);
2391 btrfs_release_path(path);
2392 return ret;
2393 }
2394
2395
2396 /*
2397 * deletion replay happens before we copy any new directory items
2398 * out of the log or out of backreferences from inodes. It
2399 * scans the log to find ranges of keys that the log is authoritative for,
2400 * and then scans the directory to find items in those ranges that are
2401 * not present in the log.
2402 *
2403 * Anything we don't find in the log is unlinked and removed from the
2404 * directory.
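*
* For example:
*
* mkdir dir
* touch dir/foo
* sync
* rm dir/foo
* xfs_io -c "fsync" dir
* <power failure>
*
* The committed subvolume still contains the entry for foo, the log is
* authoritative for the key range that contained it, and the entry is not
* in the log, so deletion replay unlinks foo from the directory.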
2405 */ 2406 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2407 struct btrfs_root *root, 2408 struct btrfs_root *log, 2409 struct btrfs_path *path, 2410 u64 dirid, int del_all) 2411 { 2412 u64 range_start; 2413 u64 range_end; 2414 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2415 int ret = 0; 2416 struct btrfs_key dir_key; 2417 struct btrfs_key found_key; 2418 struct btrfs_path *log_path; 2419 struct inode *dir; 2420 2421 dir_key.objectid = dirid; 2422 dir_key.type = BTRFS_DIR_ITEM_KEY; 2423 log_path = btrfs_alloc_path(); 2424 if (!log_path) 2425 return -ENOMEM; 2426 2427 dir = read_one_inode(root, dirid); 2428 /* it isn't an error if the inode isn't there, that can happen 2429 * because we replay the deletes before we copy in the inode item 2430 * from the log 2431 */ 2432 if (!dir) { 2433 btrfs_free_path(log_path); 2434 return 0; 2435 } 2436 again: 2437 range_start = 0; 2438 range_end = 0; 2439 while (1) { 2440 if (del_all) 2441 range_end = (u64)-1; 2442 else { 2443 ret = find_dir_range(log, path, dirid, key_type, 2444 &range_start, &range_end); 2445 if (ret != 0) 2446 break; 2447 } 2448 2449 dir_key.offset = range_start; 2450 while (1) { 2451 int nritems; 2452 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2453 0, 0); 2454 if (ret < 0) 2455 goto out; 2456 2457 nritems = btrfs_header_nritems(path->nodes[0]); 2458 if (path->slots[0] >= nritems) { 2459 ret = btrfs_next_leaf(root, path); 2460 if (ret == 1) 2461 break; 2462 else if (ret < 0) 2463 goto out; 2464 } 2465 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2466 path->slots[0]); 2467 if (found_key.objectid != dirid || 2468 found_key.type != dir_key.type) 2469 goto next_type; 2470 2471 if (found_key.offset > range_end) 2472 break; 2473 2474 ret = check_item_in_log(trans, root, log, path, 2475 log_path, dir, 2476 &found_key); 2477 if (ret) 2478 goto out; 2479 if (found_key.offset == (u64)-1) 2480 break; 2481 dir_key.offset = found_key.offset + 1; 2482 } 2483 btrfs_release_path(path); 2484 if (range_end == (u64)-1) 2485 break; 2486 range_start = range_end + 1; 2487 } 2488 2489 next_type: 2490 ret = 0; 2491 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2492 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2493 dir_key.type = BTRFS_DIR_INDEX_KEY; 2494 btrfs_release_path(path); 2495 goto again; 2496 } 2497 out: 2498 btrfs_release_path(path); 2499 btrfs_free_path(log_path); 2500 iput(dir); 2501 return ret; 2502 } 2503 2504 /* 2505 * the process_func used to replay items from the log tree. This 2506 * gets called in two different stages. The first stage just looks 2507 * for inodes and makes sure they are all copied into the subvolume. 2508 * 2509 * The second stage copies all the other item types from the log into 2510 * the subvolume. The two stage approach is slower, but gets rid of 2511 * lots of complexity around inodes referencing other inodes that exist 2512 * only in the log (references come from either directory items or inode 2513 * back refs). 
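*
* For example, a dentry in the log may point to an inode whose inode item
* sits in a later leaf of the log tree. Creating all inodes during the
* first stage guarantees the later stages never add a name for an inode
* that does not yet exist in the subvolume.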
2514 */ 2515 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2516 struct walk_control *wc, u64 gen, int level) 2517 { 2518 int nritems; 2519 struct btrfs_path *path; 2520 struct btrfs_root *root = wc->replay_dest; 2521 struct btrfs_key key; 2522 int i; 2523 int ret; 2524 2525 ret = btrfs_read_buffer(eb, gen, level, NULL); 2526 if (ret) 2527 return ret; 2528 2529 level = btrfs_header_level(eb); 2530 2531 if (level != 0) 2532 return 0; 2533 2534 path = btrfs_alloc_path(); 2535 if (!path) 2536 return -ENOMEM; 2537 2538 nritems = btrfs_header_nritems(eb); 2539 for (i = 0; i < nritems; i++) { 2540 btrfs_item_key_to_cpu(eb, &key, i); 2541 2542 /* inode keys are done during the first stage */ 2543 if (key.type == BTRFS_INODE_ITEM_KEY && 2544 wc->stage == LOG_WALK_REPLAY_INODES) { 2545 struct btrfs_inode_item *inode_item; 2546 u32 mode; 2547 2548 inode_item = btrfs_item_ptr(eb, i, 2549 struct btrfs_inode_item); 2550 /* 2551 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2552 * and never got linked before the fsync, skip it, as 2553 * replaying it is pointless since it would be deleted 2554 * later. We skip logging tmpfiles, but it's always 2555 * possible we are replaying a log created with a kernel 2556 * that used to log tmpfiles. 2557 */ 2558 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2559 wc->ignore_cur_inode = true; 2560 continue; 2561 } else { 2562 wc->ignore_cur_inode = false; 2563 } 2564 ret = replay_xattr_deletes(wc->trans, root, log, 2565 path, key.objectid); 2566 if (ret) 2567 break; 2568 mode = btrfs_inode_mode(eb, inode_item); 2569 if (S_ISDIR(mode)) { 2570 ret = replay_dir_deletes(wc->trans, 2571 root, log, path, key.objectid, 0); 2572 if (ret) 2573 break; 2574 } 2575 ret = overwrite_item(wc->trans, root, path, 2576 eb, i, &key); 2577 if (ret) 2578 break; 2579 2580 /* 2581 * Before replaying extents, truncate the inode to its 2582 * size. We need to do it now and not after log replay 2583 * because before an fsync we can have prealloc extents 2584 * added beyond the inode's i_size. If we did it after, 2585 * through orphan cleanup for example, we would drop 2586 * those prealloc extents just after replaying them. 2587 */ 2588 if (S_ISREG(mode)) { 2589 struct inode *inode; 2590 u64 from; 2591 2592 inode = read_one_inode(root, key.objectid); 2593 if (!inode) { 2594 ret = -EIO; 2595 break; 2596 } 2597 from = ALIGN(i_size_read(inode), 2598 root->fs_info->sectorsize); 2599 ret = btrfs_drop_extents(wc->trans, root, inode, 2600 from, (u64)-1, 1); 2601 if (!ret) { 2602 /* Update the inode's nbytes. 
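Dropping the extents beyond the
* aligned i_size changed the in-memory inode, so write the inode item
* back or the new nbytes value would be lost.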
*/ 2603 ret = btrfs_update_inode(wc->trans, 2604 root, inode); 2605 } 2606 iput(inode); 2607 if (ret) 2608 break; 2609 } 2610 2611 ret = link_to_fixup_dir(wc->trans, root, 2612 path, key.objectid); 2613 if (ret) 2614 break; 2615 } 2616 2617 if (wc->ignore_cur_inode) 2618 continue; 2619 2620 if (key.type == BTRFS_DIR_INDEX_KEY && 2621 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2622 ret = replay_one_dir_item(wc->trans, root, path, 2623 eb, i, &key); 2624 if (ret) 2625 break; 2626 } 2627 2628 if (wc->stage < LOG_WALK_REPLAY_ALL) 2629 continue; 2630 2631 /* these keys are simply copied */ 2632 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2633 ret = overwrite_item(wc->trans, root, path, 2634 eb, i, &key); 2635 if (ret) 2636 break; 2637 } else if (key.type == BTRFS_INODE_REF_KEY || 2638 key.type == BTRFS_INODE_EXTREF_KEY) { 2639 ret = add_inode_ref(wc->trans, root, log, path, 2640 eb, i, &key); 2641 if (ret && ret != -ENOENT) 2642 break; 2643 ret = 0; 2644 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2645 ret = replay_one_extent(wc->trans, root, path, 2646 eb, i, &key); 2647 if (ret) 2648 break; 2649 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2650 ret = replay_one_dir_item(wc->trans, root, path, 2651 eb, i, &key); 2652 if (ret) 2653 break; 2654 } 2655 } 2656 btrfs_free_path(path); 2657 return ret; 2658 } 2659 2660 /* 2661 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2662 */ 2663 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2664 { 2665 struct btrfs_block_group *cache; 2666 2667 cache = btrfs_lookup_block_group(fs_info, start); 2668 if (!cache) { 2669 btrfs_err(fs_info, "unable to find block group for %llu", start); 2670 return; 2671 } 2672 2673 spin_lock(&cache->space_info->lock); 2674 spin_lock(&cache->lock); 2675 cache->reserved -= fs_info->nodesize; 2676 cache->space_info->bytes_reserved -= fs_info->nodesize; 2677 spin_unlock(&cache->lock); 2678 spin_unlock(&cache->space_info->lock); 2679 2680 btrfs_put_block_group(cache); 2681 } 2682 2683 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2684 struct btrfs_root *root, 2685 struct btrfs_path *path, int *level, 2686 struct walk_control *wc) 2687 { 2688 struct btrfs_fs_info *fs_info = root->fs_info; 2689 u64 bytenr; 2690 u64 ptr_gen; 2691 struct extent_buffer *next; 2692 struct extent_buffer *cur; 2693 u32 blocksize; 2694 int ret = 0; 2695 2696 while (*level > 0) { 2697 struct btrfs_key first_key; 2698 2699 cur = path->nodes[*level]; 2700 2701 WARN_ON(btrfs_header_level(cur) != *level); 2702 2703 if (path->slots[*level] >= 2704 btrfs_header_nritems(cur)) 2705 break; 2706 2707 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2708 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2709 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2710 blocksize = fs_info->nodesize; 2711 2712 next = btrfs_find_create_tree_block(fs_info, bytenr); 2713 if (IS_ERR(next)) 2714 return PTR_ERR(next); 2715 2716 if (*level == 1) { 2717 ret = wc->process_func(root, next, wc, ptr_gen, 2718 *level - 1); 2719 if (ret) { 2720 free_extent_buffer(next); 2721 return ret; 2722 } 2723 2724 path->slots[*level]++; 2725 if (wc->free) { 2726 ret = btrfs_read_buffer(next, ptr_gen, 2727 *level - 1, &first_key); 2728 if (ret) { 2729 free_extent_buffer(next); 2730 return ret; 2731 } 2732 2733 if (trans) { 2734 btrfs_tree_lock(next); 2735 btrfs_set_lock_blocking_write(next); 2736 btrfs_clean_tree_block(next); 2737 btrfs_wait_tree_block_writeback(next); 2738 
btrfs_tree_unlock(next);
2739 ret = btrfs_pin_reserved_extent(trans,
2740 bytenr, blocksize);
2741 if (ret) {
2742 free_extent_buffer(next);
2743 return ret;
2744 }
2745 } else {
2746 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2747 clear_extent_buffer_dirty(next);
2748 unaccount_log_buffer(fs_info, bytenr);
2749 }
2750 }
2751 free_extent_buffer(next);
2752 continue;
2753 }
2754 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2755 if (ret) {
2756 free_extent_buffer(next);
2757 return ret;
2758 }
2759
2760 if (path->nodes[*level-1])
2761 free_extent_buffer(path->nodes[*level-1]);
2762 path->nodes[*level-1] = next;
2763 *level = btrfs_header_level(next);
2764 path->slots[*level] = 0;
2765 cond_resched();
2766 }
2767 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2768
2769 cond_resched();
2770 return 0;
2771 }
2772
2773 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2774 struct btrfs_root *root,
2775 struct btrfs_path *path, int *level,
2776 struct walk_control *wc)
2777 {
2778 struct btrfs_fs_info *fs_info = root->fs_info;
2779 int i;
2780 int slot;
2781 int ret;
2782
2783 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2784 slot = path->slots[i];
2785 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2786 path->slots[i]++;
2787 *level = i;
2788 WARN_ON(*level == 0);
2789 return 0;
2790 } else {
2791 ret = wc->process_func(root, path->nodes[*level], wc,
2792 btrfs_header_generation(path->nodes[*level]),
2793 *level);
2794 if (ret)
2795 return ret;
2796
2797 if (wc->free) {
2798 struct extent_buffer *next;
2799
2800 next = path->nodes[*level];
2801
2802 if (trans) {
2803 btrfs_tree_lock(next);
2804 btrfs_set_lock_blocking_write(next);
2805 btrfs_clean_tree_block(next);
2806 btrfs_wait_tree_block_writeback(next);
2807 btrfs_tree_unlock(next);
2808 ret = btrfs_pin_reserved_extent(trans,
2809 path->nodes[*level]->start,
2810 path->nodes[*level]->len);
2811 if (ret)
2812 return ret;
2813 } else {
2814 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2815 clear_extent_buffer_dirty(next);
2816
2817 unaccount_log_buffer(fs_info,
2818 path->nodes[*level]->start);
2819 }
2820 }
2821 free_extent_buffer(path->nodes[*level]);
2822 path->nodes[*level] = NULL;
2823 *level = i + 1;
2824 }
2825 }
2826 return 1;
2827 }
2828
2829 /*
2830 * drop the reference count on the tree rooted at 'log'. This traverses
2831 * the tree freeing any blocks that have a ref count of zero after being
2832 * decremented.
2833 */
2834 static int walk_log_tree(struct btrfs_trans_handle *trans,
2835 struct btrfs_root *log, struct walk_control *wc)
2836 {
2837 struct btrfs_fs_info *fs_info = log->fs_info;
2838 int ret = 0;
2839 int wret;
2840 int level;
2841 struct btrfs_path *path;
2842 int orig_level;
2843
2844 path = btrfs_alloc_path();
2845 if (!path)
2846 return -ENOMEM;
2847
2848 level = btrfs_header_level(log->node);
2849 orig_level = level;
2850 path->nodes[level] = log->node;
2851 atomic_inc(&log->node->refs);
2852 path->slots[level] = 0;
2853
2854 while (1) {
2855 wret = walk_down_log_tree(trans, log, path, &level, wc);
2856 if (wret > 0)
2857 break;
2858 if (wret < 0) {
2859 ret = wret;
2860 goto out;
2861 }
2862
2863 wret = walk_up_log_tree(trans, log, path, &level, wc);
2864 if (wret > 0)
2865 break;
2866 if (wret < 0) {
2867 ret = wret;
2868 goto out;
2869 }
2870 }
2871
2872 /* was the root node processed?
if not, catch it here */
2873 if (path->nodes[orig_level]) {
2874 ret = wc->process_func(log, path->nodes[orig_level], wc,
2875 btrfs_header_generation(path->nodes[orig_level]),
2876 orig_level);
2877 if (ret)
2878 goto out;
2879 if (wc->free) {
2880 struct extent_buffer *next;
2881
2882 next = path->nodes[orig_level];
2883
2884 if (trans) {
2885 btrfs_tree_lock(next);
2886 btrfs_set_lock_blocking_write(next);
2887 btrfs_clean_tree_block(next);
2888 btrfs_wait_tree_block_writeback(next);
2889 btrfs_tree_unlock(next);
2890 ret = btrfs_pin_reserved_extent(trans,
2891 next->start, next->len);
2892 if (ret)
2893 goto out;
2894 } else {
2895 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2896 clear_extent_buffer_dirty(next);
2897 unaccount_log_buffer(fs_info, next->start);
2898 }
2899 }
2900 }
2901
2902 out:
2903 btrfs_free_path(path);
2904 return ret;
2905 }
2906
2907 /*
2908 * helper function to update the item for a given subvolume's log root
2909 * in the tree of log roots
2910 */
2911 static int update_log_root(struct btrfs_trans_handle *trans,
2912 struct btrfs_root *log,
2913 struct btrfs_root_item *root_item)
2914 {
2915 struct btrfs_fs_info *fs_info = log->fs_info;
2916 int ret;
2917
2918 if (log->log_transid == 1) {
2919 /* insert root item on the first sync */
2920 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2921 &log->root_key, root_item);
2922 } else {
2923 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2924 &log->root_key, root_item);
2925 }
2926 return ret;
2927 }
2928
2929 static void wait_log_commit(struct btrfs_root *root, int transid)
2930 {
2931 DEFINE_WAIT(wait);
2932 int index = transid % 2;
2933
2934 /*
2935 * we only allow two pending log transactions at a time,
2936 * so we know that if ours is more than 2 older than the
2937 * current transaction, we're done
2938 */
2939 for (;;) {
2940 prepare_to_wait(&root->log_commit_wait[index],
2941 &wait, TASK_UNINTERRUPTIBLE);
2942
2943 if (!(root->log_transid_committed < transid &&
2944 atomic_read(&root->log_commit[index])))
2945 break;
2946
2947 mutex_unlock(&root->log_mutex);
2948 schedule();
2949 mutex_lock(&root->log_mutex);
2950 }
2951 finish_wait(&root->log_commit_wait[index], &wait);
2952 }
2953
2954 static void wait_for_writer(struct btrfs_root *root)
2955 {
2956 DEFINE_WAIT(wait);
2957
2958 for (;;) {
2959 prepare_to_wait(&root->log_writer_wait, &wait,
2960 TASK_UNINTERRUPTIBLE);
2961 if (!atomic_read(&root->log_writers))
2962 break;
2963
2964 mutex_unlock(&root->log_mutex);
2965 schedule();
2966 mutex_lock(&root->log_mutex);
2967 }
2968 finish_wait(&root->log_writer_wait, &wait);
2969 }
2970
2971 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2972 struct btrfs_log_ctx *ctx)
2973 {
2974 if (!ctx)
2975 return;
2976
2977 mutex_lock(&root->log_mutex);
2978 list_del_init(&ctx->list);
2979 mutex_unlock(&root->log_mutex);
2980 }
2981
2982 /*
2983 * Invoked while holding the log mutex, or be sure there is no other task
2984 * which can access the list.
2985 */
2986 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2987 int index, int error)
2988 {
2989 struct btrfs_log_ctx *ctx;
2990 struct btrfs_log_ctx *safe;
2991
2992 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2993 list_del_init(&ctx->list);
2994 ctx->log_ret = error;
2995 }
2996
2997 INIT_LIST_HEAD(&root->log_ctxs[index]);
2998 }
2999
3000 /*
3001 * btrfs_sync_log sends a given tree log down to the disk and
3002 * updates the super blocks to record it.
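* (The super block copies are written out via write_all_supers() once the
* log root tree's blocks are safely on disk.)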
When this call is done, 3003 * you know that any inodes previously logged are safely on disk only 3004 * if it returns 0. 3005 * 3006 * Any other return value means you need to call btrfs_commit_transaction. 3007 * Some of the edge cases for fsyncing directories that have had unlinks 3008 * or renames done in the past mean that sometimes the only safe 3009 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3010 * that has happened. 3011 */ 3012 int btrfs_sync_log(struct btrfs_trans_handle *trans, 3013 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3014 { 3015 int index1; 3016 int index2; 3017 int mark; 3018 int ret; 3019 struct btrfs_fs_info *fs_info = root->fs_info; 3020 struct btrfs_root *log = root->log_root; 3021 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3022 struct btrfs_root_item new_root_item; 3023 int log_transid = 0; 3024 struct btrfs_log_ctx root_log_ctx; 3025 struct blk_plug plug; 3026 3027 mutex_lock(&root->log_mutex); 3028 log_transid = ctx->log_transid; 3029 if (root->log_transid_committed >= log_transid) { 3030 mutex_unlock(&root->log_mutex); 3031 return ctx->log_ret; 3032 } 3033 3034 index1 = log_transid % 2; 3035 if (atomic_read(&root->log_commit[index1])) { 3036 wait_log_commit(root, log_transid); 3037 mutex_unlock(&root->log_mutex); 3038 return ctx->log_ret; 3039 } 3040 ASSERT(log_transid == root->log_transid); 3041 atomic_set(&root->log_commit[index1], 1); 3042 3043 /* wait for previous tree log sync to complete */ 3044 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3045 wait_log_commit(root, log_transid - 1); 3046 3047 while (1) { 3048 int batch = atomic_read(&root->log_batch); 3049 /* when we're on an ssd, just kick the log commit out */ 3050 if (!btrfs_test_opt(fs_info, SSD) && 3051 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3052 mutex_unlock(&root->log_mutex); 3053 schedule_timeout_uninterruptible(1); 3054 mutex_lock(&root->log_mutex); 3055 } 3056 wait_for_writer(root); 3057 if (batch == atomic_read(&root->log_batch)) 3058 break; 3059 } 3060 3061 /* bail out if we need to do a full commit */ 3062 if (btrfs_need_log_full_commit(trans)) { 3063 ret = -EAGAIN; 3064 mutex_unlock(&root->log_mutex); 3065 goto out; 3066 } 3067 3068 if (log_transid % 2 == 0) 3069 mark = EXTENT_DIRTY; 3070 else 3071 mark = EXTENT_NEW; 3072 3073 /* we start IO on all the marked extents here, but we don't actually 3074 * wait for them until later. 3075 */ 3076 blk_start_plug(&plug); 3077 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3078 if (ret) { 3079 blk_finish_plug(&plug); 3080 btrfs_abort_transaction(trans, ret); 3081 btrfs_set_log_full_commit(trans); 3082 mutex_unlock(&root->log_mutex); 3083 goto out; 3084 } 3085 3086 /* 3087 * We _must_ update under the root->log_mutex in order to make sure we 3088 * have a consistent view of the log root we are trying to commit at 3089 * this moment. 3090 * 3091 * We _must_ copy this into a local copy, because we are not holding the 3092 * log_root_tree->log_mutex yet. This is important because when we 3093 * commit the log_root_tree we must have a consistent view of the 3094 * log_root_tree when we update the super block to point at the 3095 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3096 * with the commit and possibly point at the new block which we may not 3097 * have written out. 
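*
* In other words, the root item copied here must describe exactly the log
* tree blocks submitted by btrfs_write_marked_extents() above, not
* whatever a later log commit installs in log->root_item.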
3098 */ 3099 btrfs_set_root_node(&log->root_item, log->node); 3100 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3101 3102 root->log_transid++; 3103 log->log_transid = root->log_transid; 3104 root->log_start_pid = 0; 3105 /* 3106 * IO has been started, blocks of the log tree have WRITTEN flag set 3107 * in their headers. new modifications of the log will be written to 3108 * new positions. so it's safe to allow log writers to go in. 3109 */ 3110 mutex_unlock(&root->log_mutex); 3111 3112 btrfs_init_log_ctx(&root_log_ctx, NULL); 3113 3114 mutex_lock(&log_root_tree->log_mutex); 3115 3116 index2 = log_root_tree->log_transid % 2; 3117 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3118 root_log_ctx.log_transid = log_root_tree->log_transid; 3119 3120 /* 3121 * Now we are safe to update the log_root_tree because we're under the 3122 * log_mutex, and we're a current writer so we're holding the commit 3123 * open until we drop the log_mutex. 3124 */ 3125 ret = update_log_root(trans, log, &new_root_item); 3126 if (ret) { 3127 if (!list_empty(&root_log_ctx.list)) 3128 list_del_init(&root_log_ctx.list); 3129 3130 blk_finish_plug(&plug); 3131 btrfs_set_log_full_commit(trans); 3132 3133 if (ret != -ENOSPC) { 3134 btrfs_abort_transaction(trans, ret); 3135 mutex_unlock(&log_root_tree->log_mutex); 3136 goto out; 3137 } 3138 btrfs_wait_tree_log_extents(log, mark); 3139 mutex_unlock(&log_root_tree->log_mutex); 3140 ret = -EAGAIN; 3141 goto out; 3142 } 3143 3144 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3145 blk_finish_plug(&plug); 3146 list_del_init(&root_log_ctx.list); 3147 mutex_unlock(&log_root_tree->log_mutex); 3148 ret = root_log_ctx.log_ret; 3149 goto out; 3150 } 3151 3152 index2 = root_log_ctx.log_transid % 2; 3153 if (atomic_read(&log_root_tree->log_commit[index2])) { 3154 blk_finish_plug(&plug); 3155 ret = btrfs_wait_tree_log_extents(log, mark); 3156 wait_log_commit(log_root_tree, 3157 root_log_ctx.log_transid); 3158 mutex_unlock(&log_root_tree->log_mutex); 3159 if (!ret) 3160 ret = root_log_ctx.log_ret; 3161 goto out; 3162 } 3163 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3164 atomic_set(&log_root_tree->log_commit[index2], 1); 3165 3166 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3167 wait_log_commit(log_root_tree, 3168 root_log_ctx.log_transid - 1); 3169 } 3170 3171 /* 3172 * now that we've moved on to the tree of log tree roots, 3173 * check the full commit flag again 3174 */ 3175 if (btrfs_need_log_full_commit(trans)) { 3176 blk_finish_plug(&plug); 3177 btrfs_wait_tree_log_extents(log, mark); 3178 mutex_unlock(&log_root_tree->log_mutex); 3179 ret = -EAGAIN; 3180 goto out_wake_log_root; 3181 } 3182 3183 ret = btrfs_write_marked_extents(fs_info, 3184 &log_root_tree->dirty_log_pages, 3185 EXTENT_DIRTY | EXTENT_NEW); 3186 blk_finish_plug(&plug); 3187 if (ret) { 3188 btrfs_set_log_full_commit(trans); 3189 btrfs_abort_transaction(trans, ret); 3190 mutex_unlock(&log_root_tree->log_mutex); 3191 goto out_wake_log_root; 3192 } 3193 ret = btrfs_wait_tree_log_extents(log, mark); 3194 if (!ret) 3195 ret = btrfs_wait_tree_log_extents(log_root_tree, 3196 EXTENT_NEW | EXTENT_DIRTY); 3197 if (ret) { 3198 btrfs_set_log_full_commit(trans); 3199 mutex_unlock(&log_root_tree->log_mutex); 3200 goto out_wake_log_root; 3201 } 3202 3203 btrfs_set_super_log_root(fs_info->super_for_commit, 3204 log_root_tree->node->start); 3205 btrfs_set_super_log_root_level(fs_info->super_for_commit, 3206 
btrfs_header_level(log_root_tree->node)); 3207 3208 log_root_tree->log_transid++; 3209 mutex_unlock(&log_root_tree->log_mutex); 3210 3211 /* 3212 * Nobody else is going to jump in and write the ctree 3213 * super here because the log_commit atomic below is protecting 3214 * us. We must be called with a transaction handle pinning 3215 * the running transaction open, so a full commit can't hop 3216 * in and cause problems either. 3217 */ 3218 ret = write_all_supers(fs_info, 1); 3219 if (ret) { 3220 btrfs_set_log_full_commit(trans); 3221 btrfs_abort_transaction(trans, ret); 3222 goto out_wake_log_root; 3223 } 3224 3225 mutex_lock(&root->log_mutex); 3226 if (root->last_log_commit < log_transid) 3227 root->last_log_commit = log_transid; 3228 mutex_unlock(&root->log_mutex); 3229 3230 out_wake_log_root: 3231 mutex_lock(&log_root_tree->log_mutex); 3232 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3233 3234 log_root_tree->log_transid_committed++; 3235 atomic_set(&log_root_tree->log_commit[index2], 0); 3236 mutex_unlock(&log_root_tree->log_mutex); 3237 3238 /* 3239 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3240 * all the updates above are seen by the woken threads. It might not be 3241 * necessary, but proving that seems to be hard. 3242 */ 3243 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3244 out: 3245 mutex_lock(&root->log_mutex); 3246 btrfs_remove_all_log_ctxs(root, index1, ret); 3247 root->log_transid_committed++; 3248 atomic_set(&root->log_commit[index1], 0); 3249 mutex_unlock(&root->log_mutex); 3250 3251 /* 3252 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3253 * all the updates above are seen by the woken threads. It might not be 3254 * necessary, but proving that seems to be hard. 3255 */ 3256 cond_wake_up(&root->log_commit_wait[index1]); 3257 return ret; 3258 } 3259 3260 static void free_log_tree(struct btrfs_trans_handle *trans, 3261 struct btrfs_root *log) 3262 { 3263 int ret; 3264 struct walk_control wc = { 3265 .free = 1, 3266 .process_func = process_one_buffer 3267 }; 3268 3269 ret = walk_log_tree(trans, log, &wc); 3270 if (ret) { 3271 if (trans) 3272 btrfs_abort_transaction(trans, ret); 3273 else 3274 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3275 } 3276 3277 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3278 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3279 extent_io_tree_release(&log->log_csum_range); 3280 btrfs_put_root(log); 3281 } 3282 3283 /* 3284 * free all the extents used by the tree log. This should be called 3285 * at commit time of the full transaction 3286 */ 3287 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3288 { 3289 if (root->log_root) { 3290 free_log_tree(trans, root->log_root); 3291 root->log_root = NULL; 3292 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3293 } 3294 return 0; 3295 } 3296 3297 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3298 struct btrfs_fs_info *fs_info) 3299 { 3300 if (fs_info->log_root_tree) { 3301 free_log_tree(trans, fs_info->log_root_tree); 3302 fs_info->log_root_tree = NULL; 3303 } 3304 return 0; 3305 } 3306 3307 /* 3308 * Check if an inode was logged in the current transaction. We can't always rely 3309 * on an inode's logged_trans value, because it's an in-memory only field and 3310 * therefore not persisted. 
This means that its value is lost if the inode gets
3311 * evicted and loaded again from disk (in which case it has a value of 0, and
3312 * certainly it is smaller than any possible transaction ID). When that happens
3313 * the full_sync flag is set in the inode's runtime flags, so in that case we
3314 * assume eviction happened and ignore the logged_trans value, assuming the
3315 * worst case, that the inode was logged before in the current transaction.
3316 */
3317 static bool inode_logged(struct btrfs_trans_handle *trans,
3318 struct btrfs_inode *inode)
3319 {
3320 if (inode->logged_trans == trans->transid)
3321 return true;
3322
3323 if (inode->last_trans == trans->transid &&
3324 test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3325 !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3326 return true;
3327
3328 return false;
3329 }
3330
3331 /*
3332 * If both a file and directory are logged, and unlinks or renames are
3333 * mixed in, we have a few interesting corners:
3334 *
3335 * create file X in dir Y
3336 * link file X to X.link in dir Y
3337 * fsync file X
3338 * unlink file X but leave X.link
3339 * fsync dir Y
3340 *
3341 * After a crash we would expect only X.link to exist. But file X
3342 * didn't get fsync'd again so the log has back refs for X and X.link.
3343 *
3344 * We solve this by removing directory entries and inode backrefs from the
3345 * log when a file that was logged in the current transaction is
3346 * unlinked. Any later fsync will include the updated log entries, and
3347 * we'll be able to reconstruct the proper directory items from backrefs.
3348 *
3349 * This optimization allows us to avoid relogging the entire inode
3350 * or the entire directory.
3351 */
3352 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3353 struct btrfs_root *root,
3354 const char *name, int name_len,
3355 struct btrfs_inode *dir, u64 index)
3356 {
3357 struct btrfs_root *log;
3358 struct btrfs_dir_item *di;
3359 struct btrfs_path *path;
3360 int ret;
3361 int err = 0;
3362 int bytes_del = 0;
3363 u64 dir_ino = btrfs_ino(dir);
3364
3365 if (!inode_logged(trans, dir))
3366 return 0;
3367
3368 ret = join_running_log_trans(root);
3369 if (ret)
3370 return 0;
3371
3372 mutex_lock(&dir->log_mutex);
3373
3374 log = root->log_root;
3375 path = btrfs_alloc_path();
3376 if (!path) {
3377 err = -ENOMEM;
3378 goto out_unlock;
3379 }
3380
3381 di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3382 name, name_len, -1);
3383 if (IS_ERR(di)) {
3384 err = PTR_ERR(di);
3385 goto fail;
3386 }
3387 if (di) {
3388 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3389 bytes_del += name_len;
3390 if (ret) {
3391 err = ret;
3392 goto fail;
3393 }
3394 }
3395 btrfs_release_path(path);
3396 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3397 index, name, name_len, -1);
3398 if (IS_ERR(di)) {
3399 err = PTR_ERR(di);
3400 goto fail;
3401 }
3402 if (di) {
3403 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3404 bytes_del += name_len;
3405 if (ret) {
3406 err = ret;
3407 goto fail;
3408 }
3409 }
3410
3411 /* update the directory size in the log to reflect the names
3412 * we have removed
3413 */
3414 if (bytes_del) {
3415 struct btrfs_key key;
3416
3417 key.objectid = dir_ino;
3418 key.offset = 0;
3419 key.type = BTRFS_INODE_ITEM_KEY;
3420 btrfs_release_path(path);
3421
3422 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3423 if (ret < 0) {
3424 err = ret;
3425 goto fail;
3426 }
3427 if (ret == 0) {
3428 struct btrfs_inode_item
*item; 3429 u64 i_size; 3430 3431 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3432 struct btrfs_inode_item); 3433 i_size = btrfs_inode_size(path->nodes[0], item); 3434 if (i_size > bytes_del) 3435 i_size -= bytes_del; 3436 else 3437 i_size = 0; 3438 btrfs_set_inode_size(path->nodes[0], item, i_size); 3439 btrfs_mark_buffer_dirty(path->nodes[0]); 3440 } else 3441 ret = 0; 3442 btrfs_release_path(path); 3443 } 3444 fail: 3445 btrfs_free_path(path); 3446 out_unlock: 3447 mutex_unlock(&dir->log_mutex); 3448 if (err == -ENOSPC) { 3449 btrfs_set_log_full_commit(trans); 3450 err = 0; 3451 } else if (err < 0 && err != -ENOENT) { 3452 /* ENOENT can be returned if the entry hasn't been fsynced yet */ 3453 btrfs_abort_transaction(trans, err); 3454 } 3455 3456 btrfs_end_log_trans(root); 3457 3458 return err; 3459 } 3460 3461 /* see comments for btrfs_del_dir_entries_in_log */ 3462 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3463 struct btrfs_root *root, 3464 const char *name, int name_len, 3465 struct btrfs_inode *inode, u64 dirid) 3466 { 3467 struct btrfs_root *log; 3468 u64 index; 3469 int ret; 3470 3471 if (!inode_logged(trans, inode)) 3472 return 0; 3473 3474 ret = join_running_log_trans(root); 3475 if (ret) 3476 return 0; 3477 log = root->log_root; 3478 mutex_lock(&inode->log_mutex); 3479 3480 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3481 dirid, &index); 3482 mutex_unlock(&inode->log_mutex); 3483 if (ret == -ENOSPC) { 3484 btrfs_set_log_full_commit(trans); 3485 ret = 0; 3486 } else if (ret < 0 && ret != -ENOENT) 3487 btrfs_abort_transaction(trans, ret); 3488 btrfs_end_log_trans(root); 3489 3490 return ret; 3491 } 3492 3493 /* 3494 * creates a range item in the log for 'dirid'. first_offset and 3495 * last_offset tell us which parts of the key space the log should 3496 * be considered authoritative for. 3497 */ 3498 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3499 struct btrfs_root *log, 3500 struct btrfs_path *path, 3501 int key_type, u64 dirid, 3502 u64 first_offset, u64 last_offset) 3503 { 3504 int ret; 3505 struct btrfs_key key; 3506 struct btrfs_dir_log_item *item; 3507 3508 key.objectid = dirid; 3509 key.offset = first_offset; 3510 if (key_type == BTRFS_DIR_ITEM_KEY) 3511 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3512 else 3513 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3514 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3515 if (ret) 3516 return ret; 3517 3518 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3519 struct btrfs_dir_log_item); 3520 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3521 btrfs_mark_buffer_dirty(path->nodes[0]); 3522 btrfs_release_path(path); 3523 return 0; 3524 } 3525 3526 /* 3527 * log all the items included in the current transaction for a given 3528 * directory. 
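(log_directory_changes()
* calls this once for BTRFS_DIR_ITEM_KEY and once for
* BTRFS_DIR_INDEX_KEY.)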
This also creates the range items in the log tree required 3529 * to replay anything deleted before the fsync 3530 */ 3531 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3532 struct btrfs_root *root, struct btrfs_inode *inode, 3533 struct btrfs_path *path, 3534 struct btrfs_path *dst_path, int key_type, 3535 struct btrfs_log_ctx *ctx, 3536 u64 min_offset, u64 *last_offset_ret) 3537 { 3538 struct btrfs_key min_key; 3539 struct btrfs_root *log = root->log_root; 3540 struct extent_buffer *src; 3541 int err = 0; 3542 int ret; 3543 int i; 3544 int nritems; 3545 u64 first_offset = min_offset; 3546 u64 last_offset = (u64)-1; 3547 u64 ino = btrfs_ino(inode); 3548 3549 log = root->log_root; 3550 3551 min_key.objectid = ino; 3552 min_key.type = key_type; 3553 min_key.offset = min_offset; 3554 3555 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3556 3557 /* 3558 * we didn't find anything from this transaction, see if there 3559 * is anything at all 3560 */ 3561 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3562 min_key.objectid = ino; 3563 min_key.type = key_type; 3564 min_key.offset = (u64)-1; 3565 btrfs_release_path(path); 3566 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3567 if (ret < 0) { 3568 btrfs_release_path(path); 3569 return ret; 3570 } 3571 ret = btrfs_previous_item(root, path, ino, key_type); 3572 3573 /* if ret == 0 there are items for this type, 3574 * create a range to tell us the last key of this type. 3575 * otherwise, there are no items in this directory after 3576 * *min_offset, and we create a range to indicate that. 3577 */ 3578 if (ret == 0) { 3579 struct btrfs_key tmp; 3580 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3581 path->slots[0]); 3582 if (key_type == tmp.type) 3583 first_offset = max(min_offset, tmp.offset) + 1; 3584 } 3585 goto done; 3586 } 3587 3588 /* go backward to find any previous key */ 3589 ret = btrfs_previous_item(root, path, ino, key_type); 3590 if (ret == 0) { 3591 struct btrfs_key tmp; 3592 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3593 if (key_type == tmp.type) { 3594 first_offset = tmp.offset; 3595 ret = overwrite_item(trans, log, dst_path, 3596 path->nodes[0], path->slots[0], 3597 &tmp); 3598 if (ret) { 3599 err = ret; 3600 goto done; 3601 } 3602 } 3603 } 3604 btrfs_release_path(path); 3605 3606 /* 3607 * Find the first key from this transaction again. See the note for 3608 * log_new_dir_dentries, if we're logging a directory recursively we 3609 * won't be holding its i_mutex, which means we can modify the directory 3610 * while we're logging it. If we remove an entry between our first 3611 * search and this search we'll not find the key again and can just 3612 * bail. 
3613 */
3614 search:
3615 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3616 if (ret != 0)
3617 goto done;
3618
3619 /*
3620 * we have a block from this transaction, log every item in it
3621 * from our directory
3622 */
3623 while (1) {
3624 struct btrfs_key tmp;
3625 src = path->nodes[0];
3626 nritems = btrfs_header_nritems(src);
3627 for (i = path->slots[0]; i < nritems; i++) {
3628 struct btrfs_dir_item *di;
3629
3630 btrfs_item_key_to_cpu(src, &min_key, i);
3631
3632 if (min_key.objectid != ino || min_key.type != key_type)
3633 goto done;
3634
3635 if (need_resched()) {
3636 btrfs_release_path(path);
3637 cond_resched();
3638 goto search;
3639 }
3640
3641 ret = overwrite_item(trans, log, dst_path, src, i,
3642 &min_key);
3643 if (ret) {
3644 err = ret;
3645 goto done;
3646 }
3647
3648 /*
3649 * We must make sure that when we log a directory entry,
3650 * the corresponding inode, after log replay, has a
3651 * matching link count. For example:
3652 *
3653 * touch foo
3654 * mkdir mydir
3655 * sync
3656 * ln foo mydir/bar
3657 * xfs_io -c "fsync" mydir
3658 * <crash>
3659 * <mount fs and log replay>
3660 *
3661 * Would result in an fsync log that, when replayed, leaves
3662 * our file inode with a link count of 1, but we get
3663 * two directory entries pointing to the same inode.
3664 * After removing one of the names, it would not be
3665 * possible to remove the other name, which always
3666 * resulted in stale file handle errors, and it would not
3667 * be possible to rmdir the parent directory, since
3668 * its i_size could never decrement to the value
3669 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3670 */
3671 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3672 btrfs_dir_item_key_to_cpu(src, di, &tmp);
3673 if (ctx &&
3674 (btrfs_dir_transid(src, di) == trans->transid ||
3675 btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3676 tmp.type != BTRFS_ROOT_ITEM_KEY)
3677 ctx->log_new_dentries = true;
3678 }
3679 path->slots[0] = nritems;
3680
3681 /*
3682 * look ahead to the next item and see if it is also
3683 * from this directory and from this transaction
3684 */
3685 ret = btrfs_next_leaf(root, path);
3686 if (ret) {
3687 if (ret == 1)
3688 last_offset = (u64)-1;
3689 else
3690 err = ret;
3691 goto done;
3692 }
3693 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3694 if (tmp.objectid != ino || tmp.type != key_type) {
3695 last_offset = (u64)-1;
3696 goto done;
3697 }
3698 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3699 ret = overwrite_item(trans, log, dst_path,
3700 path->nodes[0], path->slots[0],
3701 &tmp);
3702 if (ret)
3703 err = ret;
3704 else
3705 last_offset = tmp.offset;
3706 goto done;
3707 }
3708 }
3709 done:
3710 btrfs_release_path(path);
3711 btrfs_release_path(dst_path);
3712
3713 if (err == 0) {
3714 *last_offset_ret = last_offset;
3715 /*
3716 * insert the log range keys to indicate where the log
3717 * is valid
3718 */
3719 ret = insert_dir_log_key(trans, log, path, key_type,
3720 ino, first_offset, last_offset);
3721 if (ret)
3722 err = ret;
3723 }
3724 return err;
3725 }
3726
3727 /*
3728 * logging directories is very similar to logging inodes. We find all the items
3729 * from the current transaction and write them to the log.
3730 *
3731 * The recovery code scans the directory in the subvolume, and if it finds a
3732 * key in the range logged that is not present in the log tree, then it means
3733 * that dir entry was unlinked during the transaction.
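*
* For example, if the last dir index key logged here is 20 and nothing
* follows it in this transaction, last_offset becomes (u64)-1 and the
* range item marks the log authoritative for everything past index 20.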
3734 * 3735 * In order for that scan to work, we must include one key smaller than 3736 * the smallest logged by this transaction and one key larger than the largest 3737 * key logged by this transaction. 3738 */ 3739 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3740 struct btrfs_root *root, struct btrfs_inode *inode, 3741 struct btrfs_path *path, 3742 struct btrfs_path *dst_path, 3743 struct btrfs_log_ctx *ctx) 3744 { 3745 u64 min_key; 3746 u64 max_key; 3747 int ret; 3748 int key_type = BTRFS_DIR_ITEM_KEY; 3749 3750 again: 3751 min_key = 0; 3752 max_key = 0; 3753 while (1) { 3754 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3755 ctx, min_key, &max_key); 3756 if (ret) 3757 return ret; 3758 if (max_key == (u64)-1) 3759 break; 3760 min_key = max_key + 1; 3761 } 3762 3763 if (key_type == BTRFS_DIR_ITEM_KEY) { 3764 key_type = BTRFS_DIR_INDEX_KEY; 3765 goto again; 3766 } 3767 return 0; 3768 } 3769 3770 /* 3771 * a helper function to drop items from the log before we relog an 3772 * inode. max_key_type indicates the highest item type to remove. 3773 * This cannot be run for file data extents because it does not 3774 * free the extents they point to. 3775 */ 3776 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3777 struct btrfs_root *log, 3778 struct btrfs_path *path, 3779 u64 objectid, int max_key_type) 3780 { 3781 int ret; 3782 struct btrfs_key key; 3783 struct btrfs_key found_key; 3784 int start_slot; 3785 3786 key.objectid = objectid; 3787 key.type = max_key_type; 3788 key.offset = (u64)-1; 3789 3790 while (1) { 3791 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3792 BUG_ON(ret == 0); /* Logic error */ 3793 if (ret < 0) 3794 break; 3795 3796 if (path->slots[0] == 0) 3797 break; 3798 3799 path->slots[0]--; 3800 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3801 path->slots[0]); 3802 3803 if (found_key.objectid != objectid) 3804 break; 3805 3806 found_key.offset = 0; 3807 found_key.type = 0; 3808 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); 3809 if (ret < 0) 3810 break; 3811 3812 ret = btrfs_del_items(trans, log, path, start_slot, 3813 path->slots[0] - start_slot + 1); 3814 /* 3815 * If start slot isn't 0 then we don't need to re-search, we've 3816 * found the last guy with the objectid in this tree. 
3817 */
3818 if (ret || start_slot != 0)
3819 break;
3820 btrfs_release_path(path);
3821 }
3822 btrfs_release_path(path);
3823 if (ret > 0)
3824 ret = 0;
3825 return ret;
3826 }
3827
3828 static void fill_inode_item(struct btrfs_trans_handle *trans,
3829 struct extent_buffer *leaf,
3830 struct btrfs_inode_item *item,
3831 struct inode *inode, int log_inode_only,
3832 u64 logged_isize)
3833 {
3834 struct btrfs_map_token token;
3835
3836 btrfs_init_map_token(&token, leaf);
3837
3838 if (log_inode_only) {
3839 /* set the generation to zero so the recovery code
3840 * can tell the difference between logging an inode
3841 * just to say 'this inode exists' and logging it
3842 * to say 'update this inode with these values'
3843 */
3844 btrfs_set_token_inode_generation(&token, item, 0);
3845 btrfs_set_token_inode_size(&token, item, logged_isize);
3846 } else {
3847 btrfs_set_token_inode_generation(&token, item,
3848 BTRFS_I(inode)->generation);
3849 btrfs_set_token_inode_size(&token, item, inode->i_size);
3850 }
3851
3852 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3853 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3854 btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3855 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3856
3857 btrfs_set_token_timespec_sec(&token, &item->atime,
3858 inode->i_atime.tv_sec);
3859 btrfs_set_token_timespec_nsec(&token, &item->atime,
3860 inode->i_atime.tv_nsec);
3861
3862 btrfs_set_token_timespec_sec(&token, &item->mtime,
3863 inode->i_mtime.tv_sec);
3864 btrfs_set_token_timespec_nsec(&token, &item->mtime,
3865 inode->i_mtime.tv_nsec);
3866
3867 btrfs_set_token_timespec_sec(&token, &item->ctime,
3868 inode->i_ctime.tv_sec);
3869 btrfs_set_token_timespec_nsec(&token, &item->ctime,
3870 inode->i_ctime.tv_nsec);
3871
3872 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3873
3874 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3875 btrfs_set_token_inode_transid(&token, item, trans->transid);
3876 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3877 btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
3878 btrfs_set_token_inode_block_group(&token, item, 0);
3879 }
3880
3881 static int log_inode_item(struct btrfs_trans_handle *trans,
3882 struct btrfs_root *log, struct btrfs_path *path,
3883 struct btrfs_inode *inode)
3884 {
3885 struct btrfs_inode_item *inode_item;
3886 int ret;
3887
3888 ret = btrfs_insert_empty_item(trans, log, path,
3889 &inode->location, sizeof(*inode_item));
3890 if (ret && ret != -EEXIST)
3891 return ret;
3892 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3893 struct btrfs_inode_item);
3894 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3895 0, 0);
3896 btrfs_release_path(path);
3897 return 0;
3898 }
3899
3900 static int log_csums(struct btrfs_trans_handle *trans,
3901 struct btrfs_inode *inode,
3902 struct btrfs_root *log_root,
3903 struct btrfs_ordered_sum *sums)
3904 {
3905 const u64 lock_end = sums->bytenr + sums->len - 1;
3906 struct extent_state *cached_state = NULL;
3907 int ret;
3908
3909 /*
3910 * If this inode was not used for reflink operations with new extents
3911 * in the current transaction, then take the fast path; no need to
3912 * worry about logging checksum items with overlapping ranges.
3913 */
3914 if (inode->last_reflink_trans < trans->transid)
3915 return btrfs_csum_file_blocks(trans, log_root, sums);
3916
3917 /*
3918 * Serialize logging for checksums.
This is to avoid racing with the 3919 * same checksum being logged by another task that is logging another 3920 * file which happens to refer to the same extent as well. Such races 3921 * can leave checksum items in the log with overlapping ranges. 3922 */ 3923 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 3924 lock_end, &cached_state); 3925 if (ret) 3926 return ret; 3927 /* 3928 * Due to extent cloning, we might have logged a csum item that covers a 3929 * subrange of a cloned extent, and later we can end up logging a csum 3930 * item for a larger subrange of the same extent or the entire range. 3931 * This would leave csum items in the log tree that cover the same range 3932 * and break the searches for checksums in the log tree, resulting in 3933 * some checksums missing in the fs/subvolume tree. So just delete (or 3934 * trim and adjust) any existing csum items in the log for this range. 3935 */ 3936 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 3937 if (!ret) 3938 ret = btrfs_csum_file_blocks(trans, log_root, sums); 3939 3940 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 3941 &cached_state); 3942 3943 return ret; 3944 } 3945 3946 static noinline int copy_items(struct btrfs_trans_handle *trans, 3947 struct btrfs_inode *inode, 3948 struct btrfs_path *dst_path, 3949 struct btrfs_path *src_path, 3950 int start_slot, int nr, int inode_only, 3951 u64 logged_isize) 3952 { 3953 struct btrfs_fs_info *fs_info = trans->fs_info; 3954 unsigned long src_offset; 3955 unsigned long dst_offset; 3956 struct btrfs_root *log = inode->root->log_root; 3957 struct btrfs_file_extent_item *extent; 3958 struct btrfs_inode_item *inode_item; 3959 struct extent_buffer *src = src_path->nodes[0]; 3960 int ret; 3961 struct btrfs_key *ins_keys; 3962 u32 *ins_sizes; 3963 char *ins_data; 3964 int i; 3965 struct list_head ordered_sums; 3966 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3967 3968 INIT_LIST_HEAD(&ordered_sums); 3969 3970 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3971 nr * sizeof(u32), GFP_NOFS); 3972 if (!ins_data) 3973 return -ENOMEM; 3974 3975 ins_sizes = (u32 *)ins_data; 3976 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3977 3978 for (i = 0; i < nr; i++) { 3979 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3980 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3981 } 3982 ret = btrfs_insert_empty_items(trans, log, dst_path, 3983 ins_keys, ins_sizes, nr); 3984 if (ret) { 3985 kfree(ins_data); 3986 return ret; 3987 } 3988 3989 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3990 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3991 dst_path->slots[0]); 3992 3993 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3994 3995 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3996 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3997 dst_path->slots[0], 3998 struct btrfs_inode_item); 3999 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4000 &inode->vfs_inode, 4001 inode_only == LOG_INODE_EXISTS, 4002 logged_isize); 4003 } else { 4004 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4005 src_offset, ins_sizes[i]); 4006 } 4007 4008 /* take a reference on file data extents so that truncates 4009 * or deletes of this inode don't have to relog the inode 4010 * again 4011 */ 4012 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 4013 !skip_csum) { 4014 int found_type; 4015 extent = btrfs_item_ptr(src, start_slot + i, 4016 struct btrfs_file_extent_item); 4017 4018 if 
(btrfs_file_extent_generation(src, extent) < trans->transid) 4019 continue; 4020 4021 found_type = btrfs_file_extent_type(src, extent); 4022 if (found_type == BTRFS_FILE_EXTENT_REG) { 4023 u64 ds, dl, cs, cl; 4024 ds = btrfs_file_extent_disk_bytenr(src, 4025 extent); 4026 /* ds == 0 is a hole */ 4027 if (ds == 0) 4028 continue; 4029 4030 dl = btrfs_file_extent_disk_num_bytes(src, 4031 extent); 4032 cs = btrfs_file_extent_offset(src, extent); 4033 cl = btrfs_file_extent_num_bytes(src, 4034 extent); 4035 if (btrfs_file_extent_compression(src, 4036 extent)) { 4037 cs = 0; 4038 cl = dl; 4039 } 4040 4041 ret = btrfs_lookup_csums_range( 4042 fs_info->csum_root, 4043 ds + cs, ds + cs + cl - 1, 4044 &ordered_sums, 0); 4045 if (ret) 4046 break; 4047 } 4048 } 4049 } 4050 4051 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4052 btrfs_release_path(dst_path); 4053 kfree(ins_data); 4054 4055 /* 4056 * we have to do this after the loop above to avoid changing the 4057 * log tree while trying to change the log tree. 4058 */ 4059 while (!list_empty(&ordered_sums)) { 4060 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4061 struct btrfs_ordered_sum, 4062 list); 4063 if (!ret) 4064 ret = log_csums(trans, inode, log, sums); 4065 list_del(&sums->list); 4066 kfree(sums); 4067 } 4068 4069 return ret; 4070 } 4071 4072 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 4073 { 4074 struct extent_map *em1, *em2; 4075 4076 em1 = list_entry(a, struct extent_map, list); 4077 em2 = list_entry(b, struct extent_map, list); 4078 4079 if (em1->start < em2->start) 4080 return -1; 4081 else if (em1->start > em2->start) 4082 return 1; 4083 return 0; 4084 } 4085 4086 static int log_extent_csums(struct btrfs_trans_handle *trans, 4087 struct btrfs_inode *inode, 4088 struct btrfs_root *log_root, 4089 const struct extent_map *em, 4090 struct btrfs_log_ctx *ctx) 4091 { 4092 struct btrfs_ordered_extent *ordered; 4093 u64 csum_offset; 4094 u64 csum_len; 4095 u64 mod_start = em->mod_start; 4096 u64 mod_len = em->mod_len; 4097 LIST_HEAD(ordered_sums); 4098 int ret = 0; 4099 4100 if (inode->flags & BTRFS_INODE_NODATASUM || 4101 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4102 em->block_start == EXTENT_MAP_HOLE) 4103 return 0; 4104 4105 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4106 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4107 const u64 mod_end = mod_start + mod_len; 4108 struct btrfs_ordered_sum *sums; 4109 4110 if (mod_len == 0) 4111 break; 4112 4113 if (ordered_end <= mod_start) 4114 continue; 4115 if (mod_end <= ordered->file_offset) 4116 break; 4117 4118 /* 4119 * We are going to copy all the csums on this ordered extent, so 4120 * go ahead and adjust mod_start and mod_len in case this ordered 4121 * extent has already been logged. 4122 */ 4123 if (ordered->file_offset > mod_start) { 4124 if (ordered_end >= mod_end) 4125 mod_len = ordered->file_offset - mod_start; 4126 /* 4127 * If we have this case 4128 * 4129 * |--------- logged extent ---------| 4130 * |----- ordered extent ----| 4131 * 4132 * Just don't mess with mod_start and mod_len, we'll 4133 * just end up logging more csums than we need and it 4134 * will be ok. 4135 */ 4136 } else { 4137 if (ordered_end < mod_end) { 4138 mod_len = mod_end - ordered_end; 4139 mod_start = ordered_end; 4140 } else { 4141 mod_len = 0; 4142 } 4143 } 4144 4145 /* 4146 * To keep us from looping for the above case of an ordered 4147 * extent that falls inside of the logged extent. 
4148 */ 4149 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4150 continue; 4151 4152 list_for_each_entry(sums, &ordered->list, list) { 4153 ret = log_csums(trans, inode, log_root, sums); 4154 if (ret) 4155 return ret; 4156 } 4157 } 4158 4159 /* We're done, found all csums in the ordered extents. */ 4160 if (mod_len == 0) 4161 return 0; 4162 4163 /* If we're compressed we have to save the entire range of csums. */ 4164 if (em->compress_type) { 4165 csum_offset = 0; 4166 csum_len = max(em->block_len, em->orig_block_len); 4167 } else { 4168 csum_offset = mod_start - em->start; 4169 csum_len = mod_len; 4170 } 4171 4172 /* block start is already adjusted for the file extent offset. */ 4173 ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, 4174 em->block_start + csum_offset, 4175 em->block_start + csum_offset + 4176 csum_len - 1, &ordered_sums, 0); 4177 if (ret) 4178 return ret; 4179 4180 while (!list_empty(&ordered_sums)) { 4181 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4182 struct btrfs_ordered_sum, 4183 list); 4184 if (!ret) 4185 ret = log_csums(trans, inode, log_root, sums); 4186 list_del(&sums->list); 4187 kfree(sums); 4188 } 4189 4190 return ret; 4191 } 4192 4193 static int log_one_extent(struct btrfs_trans_handle *trans, 4194 struct btrfs_inode *inode, struct btrfs_root *root, 4195 const struct extent_map *em, 4196 struct btrfs_path *path, 4197 struct btrfs_log_ctx *ctx) 4198 { 4199 struct btrfs_root *log = root->log_root; 4200 struct btrfs_file_extent_item *fi; 4201 struct extent_buffer *leaf; 4202 struct btrfs_map_token token; 4203 struct btrfs_key key; 4204 u64 extent_offset = em->start - em->orig_start; 4205 u64 block_len; 4206 int ret; 4207 int extent_inserted = 0; 4208 4209 ret = log_extent_csums(trans, inode, log, em, ctx); 4210 if (ret) 4211 return ret; 4212 4213 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 4214 em->start + em->len, NULL, 0, 1, 4215 sizeof(*fi), &extent_inserted); 4216 if (ret) 4217 return ret; 4218 4219 if (!extent_inserted) { 4220 key.objectid = btrfs_ino(inode); 4221 key.type = BTRFS_EXTENT_DATA_KEY; 4222 key.offset = em->start; 4223 4224 ret = btrfs_insert_empty_item(trans, log, path, &key, 4225 sizeof(*fi)); 4226 if (ret) 4227 return ret; 4228 } 4229 leaf = path->nodes[0]; 4230 btrfs_init_map_token(&token, leaf); 4231 fi = btrfs_item_ptr(leaf, path->slots[0], 4232 struct btrfs_file_extent_item); 4233 4234 btrfs_set_token_file_extent_generation(&token, fi, trans->transid); 4235 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4236 btrfs_set_token_file_extent_type(&token, fi, 4237 BTRFS_FILE_EXTENT_PREALLOC); 4238 else 4239 btrfs_set_token_file_extent_type(&token, fi, 4240 BTRFS_FILE_EXTENT_REG); 4241 4242 block_len = max(em->block_len, em->orig_block_len); 4243 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4244 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4245 em->block_start); 4246 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4247 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4248 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4249 em->block_start - 4250 extent_offset); 4251 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4252 } else { 4253 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); 4254 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); 4255 } 4256 4257 btrfs_set_token_file_extent_offset(&token, fi, extent_offset); 4258 btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); 4259 
btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); 4260 btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); 4261 btrfs_set_token_file_extent_encryption(&token, fi, 0); 4262 btrfs_set_token_file_extent_other_encoding(&token, fi, 0); 4263 btrfs_mark_buffer_dirty(leaf); 4264 4265 btrfs_release_path(path); 4266 4267 return ret; 4268 } 4269 4270 /* 4271 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4272 * lose them after doing a fast fsync and replaying the log. We scan the 4273 * subvolume's root instead of iterating the inode's extent map tree because 4274 * otherwise we can log incorrect extent items based on extent map conversion. 4275 * That can happen due to the fact that extent maps are merged when they 4276 * are not in the extent map tree's list of modified extents. 4277 */ 4278 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4279 struct btrfs_inode *inode, 4280 struct btrfs_path *path) 4281 { 4282 struct btrfs_root *root = inode->root; 4283 struct btrfs_key key; 4284 const u64 i_size = i_size_read(&inode->vfs_inode); 4285 const u64 ino = btrfs_ino(inode); 4286 struct btrfs_path *dst_path = NULL; 4287 bool dropped_extents = false; 4288 u64 truncate_offset = i_size; 4289 struct extent_buffer *leaf; 4290 int slot; 4291 int ins_nr = 0; 4292 int start_slot; 4293 int ret; 4294 4295 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4296 return 0; 4297 4298 key.objectid = ino; 4299 key.type = BTRFS_EXTENT_DATA_KEY; 4300 key.offset = i_size; 4301 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4302 if (ret < 0) 4303 goto out; 4304 4305 /* 4306 * We must check if there is a prealloc extent that starts before the 4307 * i_size and crosses the i_size boundary. This is to ensure later we 4308 * truncate down to the end of that extent and not to the i_size, as 4309 * otherwise we end up losing part of the prealloc extent after a log 4310 * replay and with an implicit hole if there is another prealloc extent 4311 * that starts at an offset beyond i_size. 
4312 */ 4313 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4314 if (ret < 0) 4315 goto out; 4316 4317 if (ret == 0) { 4318 struct btrfs_file_extent_item *ei; 4319 4320 leaf = path->nodes[0]; 4321 slot = path->slots[0]; 4322 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4323 4324 if (btrfs_file_extent_type(leaf, ei) == 4325 BTRFS_FILE_EXTENT_PREALLOC) { 4326 u64 extent_end; 4327 4328 btrfs_item_key_to_cpu(leaf, &key, slot); 4329 extent_end = key.offset + 4330 btrfs_file_extent_num_bytes(leaf, ei); 4331 4332 if (extent_end > i_size) 4333 truncate_offset = extent_end; 4334 } 4335 } else { 4336 ret = 0; 4337 } 4338 4339 while (true) { 4340 leaf = path->nodes[0]; 4341 slot = path->slots[0]; 4342 4343 if (slot >= btrfs_header_nritems(leaf)) { 4344 if (ins_nr > 0) { 4345 ret = copy_items(trans, inode, dst_path, path, 4346 start_slot, ins_nr, 1, 0); 4347 if (ret < 0) 4348 goto out; 4349 ins_nr = 0; 4350 } 4351 ret = btrfs_next_leaf(root, path); 4352 if (ret < 0) 4353 goto out; 4354 if (ret > 0) { 4355 ret = 0; 4356 break; 4357 } 4358 continue; 4359 } 4360 4361 btrfs_item_key_to_cpu(leaf, &key, slot); 4362 if (key.objectid > ino) 4363 break; 4364 if (WARN_ON_ONCE(key.objectid < ino) || 4365 key.type < BTRFS_EXTENT_DATA_KEY || 4366 key.offset < i_size) { 4367 path->slots[0]++; 4368 continue; 4369 } 4370 if (!dropped_extents) { 4371 /* 4372 * Avoid logging extent items logged in past fsync calls 4373 * and leading to duplicate keys in the log tree. 4374 */ 4375 do { 4376 ret = btrfs_truncate_inode_items(trans, 4377 root->log_root, 4378 &inode->vfs_inode, 4379 truncate_offset, 4380 BTRFS_EXTENT_DATA_KEY); 4381 } while (ret == -EAGAIN); 4382 if (ret) 4383 goto out; 4384 dropped_extents = true; 4385 } 4386 if (ins_nr == 0) 4387 start_slot = slot; 4388 ins_nr++; 4389 path->slots[0]++; 4390 if (!dst_path) { 4391 dst_path = btrfs_alloc_path(); 4392 if (!dst_path) { 4393 ret = -ENOMEM; 4394 goto out; 4395 } 4396 } 4397 } 4398 if (ins_nr > 0) 4399 ret = copy_items(trans, inode, dst_path, path, 4400 start_slot, ins_nr, 1, 0); 4401 out: 4402 btrfs_release_path(path); 4403 btrfs_free_path(dst_path); 4404 return ret; 4405 } 4406 4407 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4408 struct btrfs_root *root, 4409 struct btrfs_inode *inode, 4410 struct btrfs_path *path, 4411 struct btrfs_log_ctx *ctx) 4412 { 4413 struct btrfs_ordered_extent *ordered; 4414 struct btrfs_ordered_extent *tmp; 4415 struct extent_map *em, *n; 4416 struct list_head extents; 4417 struct extent_map_tree *tree = &inode->extent_tree; 4418 u64 test_gen; 4419 int ret = 0; 4420 int num = 0; 4421 4422 INIT_LIST_HEAD(&extents); 4423 4424 write_lock(&tree->lock); 4425 test_gen = root->fs_info->last_trans_committed; 4426 4427 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4428 list_del_init(&em->list); 4429 /* 4430 * Just an arbitrary number, this can be really CPU intensive 4431 * once we start getting a lot of extents, and really once we 4432 * have a bunch of extents we just want to commit since it will 4433 * be faster. 4434 */ 4435 if (++num > 32768) { 4436 list_del_init(&tree->modified_extents); 4437 ret = -EFBIG; 4438 goto process; 4439 } 4440 4441 if (em->generation <= test_gen) 4442 continue; 4443 4444 /* We log prealloc extents beyond eof later. 
*/ 4445 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4446 em->start >= i_size_read(&inode->vfs_inode)) 4447 continue; 4448 4449 /* Need a ref to keep it from getting evicted from cache */ 4450 refcount_inc(&em->refs); 4451 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4452 list_add_tail(&em->list, &extents); 4453 num++; 4454 } 4455 4456 list_sort(NULL, &extents, extent_cmp); 4457 process: 4458 while (!list_empty(&extents)) { 4459 em = list_entry(extents.next, struct extent_map, list); 4460 4461 list_del_init(&em->list); 4462 4463 /* 4464 * If we had an error we just need to delete everybody from our 4465 * private list. 4466 */ 4467 if (ret) { 4468 clear_em_logging(tree, em); 4469 free_extent_map(em); 4470 continue; 4471 } 4472 4473 write_unlock(&tree->lock); 4474 4475 ret = log_one_extent(trans, inode, root, em, path, ctx); 4476 write_lock(&tree->lock); 4477 clear_em_logging(tree, em); 4478 free_extent_map(em); 4479 } 4480 WARN_ON(!list_empty(&extents)); 4481 write_unlock(&tree->lock); 4482 4483 btrfs_release_path(path); 4484 if (!ret) 4485 ret = btrfs_log_prealloc_extents(trans, inode, path); 4486 if (ret) 4487 return ret; 4488 4489 /* 4490 * We have logged all extents successfully, now make sure the commit of 4491 * the current transaction waits for the ordered extents to complete 4492 * before it commits and wipes out the log trees, otherwise we would 4493 * lose data if an ordered extents completes after the transaction 4494 * commits and a power failure happens after the transaction commit. 4495 */ 4496 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 4497 list_del_init(&ordered->log_list); 4498 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); 4499 4500 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4501 spin_lock_irq(&inode->ordered_tree.lock); 4502 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4503 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); 4504 atomic_inc(&trans->transaction->pending_ordered); 4505 } 4506 spin_unlock_irq(&inode->ordered_tree.lock); 4507 } 4508 btrfs_put_ordered_extent(ordered); 4509 } 4510 4511 return 0; 4512 } 4513 4514 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4515 struct btrfs_path *path, u64 *size_ret) 4516 { 4517 struct btrfs_key key; 4518 int ret; 4519 4520 key.objectid = btrfs_ino(inode); 4521 key.type = BTRFS_INODE_ITEM_KEY; 4522 key.offset = 0; 4523 4524 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4525 if (ret < 0) { 4526 return ret; 4527 } else if (ret > 0) { 4528 *size_ret = 0; 4529 } else { 4530 struct btrfs_inode_item *item; 4531 4532 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4533 struct btrfs_inode_item); 4534 *size_ret = btrfs_inode_size(path->nodes[0], item); 4535 /* 4536 * If the in-memory inode's i_size is smaller then the inode 4537 * size stored in the btree, return the inode's i_size, so 4538 * that we get a correct inode size after replaying the log 4539 * when before a power failure we had a shrinking truncate 4540 * followed by addition of a new name (rename / new hard link). 4541 * Otherwise return the inode size from the btree, to avoid 4542 * data loss when replaying a log due to previously doing a 4543 * write that expands the inode's size and logging a new name 4544 * immediately after. 4545 */ 4546 if (*size_ret > inode->vfs_inode.i_size) 4547 *size_ret = inode->vfs_inode.i_size; 4548 } 4549 4550 btrfs_release_path(path); 4551 return 0; 4552 } 4553 4554 /* 4555 * At the moment we always log all xattrs. 
This is to figure out at log replay 4556 * time which xattrs must have their deletion replayed. If a xattr is missing 4557 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4558 * because if a xattr is deleted, the inode is fsynced and a power failure 4559 * happens, causing the log to be replayed the next time the fs is mounted, 4560 * we want the xattr to not exist anymore (same behaviour as other filesystems 4561 * with a journal, ext3/4, xfs, f2fs, etc). 4562 */ 4563 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4564 struct btrfs_root *root, 4565 struct btrfs_inode *inode, 4566 struct btrfs_path *path, 4567 struct btrfs_path *dst_path) 4568 { 4569 int ret; 4570 struct btrfs_key key; 4571 const u64 ino = btrfs_ino(inode); 4572 int ins_nr = 0; 4573 int start_slot = 0; 4574 4575 key.objectid = ino; 4576 key.type = BTRFS_XATTR_ITEM_KEY; 4577 key.offset = 0; 4578 4579 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4580 if (ret < 0) 4581 return ret; 4582 4583 while (true) { 4584 int slot = path->slots[0]; 4585 struct extent_buffer *leaf = path->nodes[0]; 4586 int nritems = btrfs_header_nritems(leaf); 4587 4588 if (slot >= nritems) { 4589 if (ins_nr > 0) { 4590 ret = copy_items(trans, inode, dst_path, path, 4591 start_slot, ins_nr, 1, 0); 4592 if (ret < 0) 4593 return ret; 4594 ins_nr = 0; 4595 } 4596 ret = btrfs_next_leaf(root, path); 4597 if (ret < 0) 4598 return ret; 4599 else if (ret > 0) 4600 break; 4601 continue; 4602 } 4603 4604 btrfs_item_key_to_cpu(leaf, &key, slot); 4605 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4606 break; 4607 4608 if (ins_nr == 0) 4609 start_slot = slot; 4610 ins_nr++; 4611 path->slots[0]++; 4612 cond_resched(); 4613 } 4614 if (ins_nr > 0) { 4615 ret = copy_items(trans, inode, dst_path, path, 4616 start_slot, ins_nr, 1, 0); 4617 if (ret < 0) 4618 return ret; 4619 } 4620 4621 return 0; 4622 } 4623 4624 /* 4625 * When using the NO_HOLES feature if we punched a hole that causes the 4626 * deletion of entire leafs or all the extent items of the first leaf (the one 4627 * that contains the inode item and references) we may end up not processing 4628 * any extents, because there are no leafs with a generation matching the 4629 * current transaction that have extent items for our inode. So we need to find 4630 * if any holes exist and then log them. We also need to log holes after any 4631 * truncate operation that changes the inode's size. 
4632 */ 4633 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 4634 struct btrfs_root *root, 4635 struct btrfs_inode *inode, 4636 struct btrfs_path *path) 4637 { 4638 struct btrfs_fs_info *fs_info = root->fs_info; 4639 struct btrfs_key key; 4640 const u64 ino = btrfs_ino(inode); 4641 const u64 i_size = i_size_read(&inode->vfs_inode); 4642 u64 prev_extent_end = 0; 4643 int ret; 4644 4645 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 4646 return 0; 4647 4648 key.objectid = ino; 4649 key.type = BTRFS_EXTENT_DATA_KEY; 4650 key.offset = 0; 4651 4652 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4653 if (ret < 0) 4654 return ret; 4655 4656 while (true) { 4657 struct extent_buffer *leaf = path->nodes[0]; 4658 4659 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 4660 ret = btrfs_next_leaf(root, path); 4661 if (ret < 0) 4662 return ret; 4663 if (ret > 0) { 4664 ret = 0; 4665 break; 4666 } 4667 leaf = path->nodes[0]; 4668 } 4669 4670 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4671 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 4672 break; 4673 4674 /* We have a hole, log it. */ 4675 if (prev_extent_end < key.offset) { 4676 const u64 hole_len = key.offset - prev_extent_end; 4677 4678 /* 4679 * Release the path to avoid deadlocks with other code 4680 * paths that search the root while holding locks on 4681 * leafs from the log root. 4682 */ 4683 btrfs_release_path(path); 4684 ret = btrfs_insert_file_extent(trans, root->log_root, 4685 ino, prev_extent_end, 0, 4686 0, hole_len, 0, hole_len, 4687 0, 0, 0); 4688 if (ret < 0) 4689 return ret; 4690 4691 /* 4692 * Search for the same key again in the root. Since it's 4693 * an extent item and we are holding the inode lock, the 4694 * key must still exist. If it doesn't just emit warning 4695 * and return an error to fall back to a transaction 4696 * commit. 4697 */ 4698 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4699 if (ret < 0) 4700 return ret; 4701 if (WARN_ON(ret > 0)) 4702 return -ENOENT; 4703 leaf = path->nodes[0]; 4704 } 4705 4706 prev_extent_end = btrfs_file_extent_end(path); 4707 path->slots[0]++; 4708 cond_resched(); 4709 } 4710 4711 if (prev_extent_end < i_size) { 4712 u64 hole_len; 4713 4714 btrfs_release_path(path); 4715 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 4716 ret = btrfs_insert_file_extent(trans, root->log_root, 4717 ino, prev_extent_end, 0, 0, 4718 hole_len, 0, hole_len, 4719 0, 0, 0); 4720 if (ret < 0) 4721 return ret; 4722 } 4723 4724 return 0; 4725 } 4726 4727 /* 4728 * When we are logging a new inode X, check if it doesn't have a reference that 4729 * matches the reference from some other inode Y created in a past transaction 4730 * and that was renamed in the current transaction. If we don't do this, then at 4731 * log replay time we can lose inode Y (and all its files if it's a directory): 4732 * 4733 * mkdir /mnt/x 4734 * echo "hello world" > /mnt/x/foobar 4735 * sync 4736 * mv /mnt/x /mnt/y 4737 * mkdir /mnt/x # or touch /mnt/x 4738 * xfs_io -c fsync /mnt/x 4739 * <power fail> 4740 * mount fs, trigger log replay 4741 * 4742 * After the log replay procedure, we would lose the first directory and all its 4743 * files (file foobar). 
4744 * For the case where inode Y is not a directory we simply end up losing it: 4745 * 4746 * echo "123" > /mnt/foo 4747 * sync 4748 * mv /mnt/foo /mnt/bar 4749 * echo "abc" > /mnt/foo 4750 * xfs_io -c fsync /mnt/foo 4751 * <power fail> 4752 * 4753 * We also need this for cases where a snapshot entry is replaced by some other 4754 * entry (file or directory) otherwise we end up with an unreplayable log due to 4755 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4756 * if it were a regular entry: 4757 * 4758 * mkdir /mnt/x 4759 * btrfs subvolume snapshot /mnt /mnt/x/snap 4760 * btrfs subvolume delete /mnt/x/snap 4761 * rmdir /mnt/x 4762 * mkdir /mnt/x 4763 * fsync /mnt/x or fsync some new file inside it 4764 * <power fail> 4765 * 4766 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4767 * the same transaction. 4768 */ 4769 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4770 const int slot, 4771 const struct btrfs_key *key, 4772 struct btrfs_inode *inode, 4773 u64 *other_ino, u64 *other_parent) 4774 { 4775 int ret; 4776 struct btrfs_path *search_path; 4777 char *name = NULL; 4778 u32 name_len = 0; 4779 u32 item_size = btrfs_item_size_nr(eb, slot); 4780 u32 cur_offset = 0; 4781 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4782 4783 search_path = btrfs_alloc_path(); 4784 if (!search_path) 4785 return -ENOMEM; 4786 search_path->search_commit_root = 1; 4787 search_path->skip_locking = 1; 4788 4789 while (cur_offset < item_size) { 4790 u64 parent; 4791 u32 this_name_len; 4792 u32 this_len; 4793 unsigned long name_ptr; 4794 struct btrfs_dir_item *di; 4795 4796 if (key->type == BTRFS_INODE_REF_KEY) { 4797 struct btrfs_inode_ref *iref; 4798 4799 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4800 parent = key->offset; 4801 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4802 name_ptr = (unsigned long)(iref + 1); 4803 this_len = sizeof(*iref) + this_name_len; 4804 } else { 4805 struct btrfs_inode_extref *extref; 4806 4807 extref = (struct btrfs_inode_extref *)(ptr + 4808 cur_offset); 4809 parent = btrfs_inode_extref_parent(eb, extref); 4810 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4811 name_ptr = (unsigned long)&extref->name; 4812 this_len = sizeof(*extref) + this_name_len; 4813 } 4814 4815 if (this_name_len > name_len) { 4816 char *new_name; 4817 4818 new_name = krealloc(name, this_name_len, GFP_NOFS); 4819 if (!new_name) { 4820 ret = -ENOMEM; 4821 goto out; 4822 } 4823 name_len = this_name_len; 4824 name = new_name; 4825 } 4826 4827 read_extent_buffer(eb, name, name_ptr, this_name_len); 4828 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4829 parent, name, this_name_len, 0); 4830 if (di && !IS_ERR(di)) { 4831 struct btrfs_key di_key; 4832 4833 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4834 di, &di_key); 4835 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4836 if (di_key.objectid != key->objectid) { 4837 ret = 1; 4838 *other_ino = di_key.objectid; 4839 *other_parent = parent; 4840 } else { 4841 ret = 0; 4842 } 4843 } else { 4844 ret = -EAGAIN; 4845 } 4846 goto out; 4847 } else if (IS_ERR(di)) { 4848 ret = PTR_ERR(di); 4849 goto out; 4850 } 4851 btrfs_release_path(search_path); 4852 4853 cur_offset += this_len; 4854 } 4855 ret = 0; 4856 out: 4857 btrfs_free_path(search_path); 4858 kfree(name); 4859 return ret; 4860 } 4861 4862 struct btrfs_ino_list { 4863 u64 ino; 4864 u64 parent; 4865 struct list_head list; 4866 }; 4867 4868 static int log_conflicting_inodes(struct 
btrfs_trans_handle *trans, 4869 struct btrfs_root *root, 4870 struct btrfs_path *path, 4871 struct btrfs_log_ctx *ctx, 4872 u64 ino, u64 parent) 4873 { 4874 struct btrfs_ino_list *ino_elem; 4875 LIST_HEAD(inode_list); 4876 int ret = 0; 4877 4878 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 4879 if (!ino_elem) 4880 return -ENOMEM; 4881 ino_elem->ino = ino; 4882 ino_elem->parent = parent; 4883 list_add_tail(&ino_elem->list, &inode_list); 4884 4885 while (!list_empty(&inode_list)) { 4886 struct btrfs_fs_info *fs_info = root->fs_info; 4887 struct btrfs_key key; 4888 struct inode *inode; 4889 4890 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 4891 list); 4892 ino = ino_elem->ino; 4893 parent = ino_elem->parent; 4894 list_del(&ino_elem->list); 4895 kfree(ino_elem); 4896 if (ret) 4897 continue; 4898 4899 btrfs_release_path(path); 4900 4901 inode = btrfs_iget(fs_info->sb, ino, root); 4902 /* 4903 * If the other inode that had a conflicting dir entry was 4904 * deleted in the current transaction, we need to log its parent 4905 * directory. 4906 */ 4907 if (IS_ERR(inode)) { 4908 ret = PTR_ERR(inode); 4909 if (ret == -ENOENT) { 4910 inode = btrfs_iget(fs_info->sb, parent, root); 4911 if (IS_ERR(inode)) { 4912 ret = PTR_ERR(inode); 4913 } else { 4914 ret = btrfs_log_inode(trans, root, 4915 BTRFS_I(inode), 4916 LOG_OTHER_INODE_ALL, 4917 ctx); 4918 btrfs_add_delayed_iput(inode); 4919 } 4920 } 4921 continue; 4922 } 4923 /* 4924 * If the inode was already logged skip it - otherwise we can 4925 * hit an infinite loop. Example: 4926 * 4927 * From the commit root (previous transaction) we have the 4928 * following inodes: 4929 * 4930 * inode 257 a directory 4931 * inode 258 with references "zz" and "zz_link" on inode 257 4932 * inode 259 with reference "a" on inode 257 4933 * 4934 * And in the current (uncommitted) transaction we have: 4935 * 4936 * inode 257 a directory, unchanged 4937 * inode 258 with references "a" and "a2" on inode 257 4938 * inode 259 with reference "zz_link" on inode 257 4939 * inode 261 with reference "zz" on inode 257 4940 * 4941 * When logging inode 261 the following infinite loop could 4942 * happen if we don't skip already logged inodes: 4943 * 4944 * - we detect inode 258 as a conflicting inode, with inode 261 4945 * on reference "zz", and log it; 4946 * 4947 * - we detect inode 259 as a conflicting inode, with inode 258 4948 * on reference "a", and log it; 4949 * 4950 * - we detect inode 258 as a conflicting inode, with inode 259 4951 * on reference "zz_link", and log it - again! After this we 4952 * repeat the above steps forever. 4953 */ 4954 spin_lock(&BTRFS_I(inode)->lock); 4955 /* 4956 * Check the inode's logged_trans only instead of 4957 * btrfs_inode_in_log(). This is because the last_log_commit of 4958 * the inode is not updated when we only log that it exists and 4959 * it has the full sync bit set (see btrfs_log_inode()). 4960 */ 4961 if (BTRFS_I(inode)->logged_trans == trans->transid) { 4962 spin_unlock(&BTRFS_I(inode)->lock); 4963 btrfs_add_delayed_iput(inode); 4964 continue; 4965 } 4966 spin_unlock(&BTRFS_I(inode)->lock); 4967 /* 4968 * We are safe logging the other inode without acquiring its 4969 * lock as long as we log with the LOG_INODE_EXISTS mode. We 4970 * are safe against concurrent renames of the other inode as 4971 * well because during a rename we pin the log and update the 4972 * log with the new name before we unpin it. 
4973 */ 4974 ret = btrfs_log_inode(trans, root, BTRFS_I(inode), 4975 LOG_OTHER_INODE, ctx); 4976 if (ret) { 4977 btrfs_add_delayed_iput(inode); 4978 continue; 4979 } 4980 4981 key.objectid = ino; 4982 key.type = BTRFS_INODE_REF_KEY; 4983 key.offset = 0; 4984 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4985 if (ret < 0) { 4986 btrfs_add_delayed_iput(inode); 4987 continue; 4988 } 4989 4990 while (true) { 4991 struct extent_buffer *leaf = path->nodes[0]; 4992 int slot = path->slots[0]; 4993 u64 other_ino = 0; 4994 u64 other_parent = 0; 4995 4996 if (slot >= btrfs_header_nritems(leaf)) { 4997 ret = btrfs_next_leaf(root, path); 4998 if (ret < 0) { 4999 break; 5000 } else if (ret > 0) { 5001 ret = 0; 5002 break; 5003 } 5004 continue; 5005 } 5006 5007 btrfs_item_key_to_cpu(leaf, &key, slot); 5008 if (key.objectid != ino || 5009 (key.type != BTRFS_INODE_REF_KEY && 5010 key.type != BTRFS_INODE_EXTREF_KEY)) { 5011 ret = 0; 5012 break; 5013 } 5014 5015 ret = btrfs_check_ref_name_override(leaf, slot, &key, 5016 BTRFS_I(inode), &other_ino, 5017 &other_parent); 5018 if (ret < 0) 5019 break; 5020 if (ret > 0) { 5021 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5022 if (!ino_elem) { 5023 ret = -ENOMEM; 5024 break; 5025 } 5026 ino_elem->ino = other_ino; 5027 ino_elem->parent = other_parent; 5028 list_add_tail(&ino_elem->list, &inode_list); 5029 ret = 0; 5030 } 5031 path->slots[0]++; 5032 } 5033 btrfs_add_delayed_iput(inode); 5034 } 5035 5036 return ret; 5037 } 5038 5039 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5040 struct btrfs_inode *inode, 5041 struct btrfs_key *min_key, 5042 const struct btrfs_key *max_key, 5043 struct btrfs_path *path, 5044 struct btrfs_path *dst_path, 5045 const u64 logged_isize, 5046 const bool recursive_logging, 5047 const int inode_only, 5048 struct btrfs_log_ctx *ctx, 5049 bool *need_log_inode_item) 5050 { 5051 struct btrfs_root *root = inode->root; 5052 int ins_start_slot = 0; 5053 int ins_nr = 0; 5054 int ret; 5055 5056 while (1) { 5057 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5058 if (ret < 0) 5059 return ret; 5060 if (ret > 0) { 5061 ret = 0; 5062 break; 5063 } 5064 again: 5065 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5066 if (min_key->objectid != max_key->objectid) 5067 break; 5068 if (min_key->type > max_key->type) 5069 break; 5070 5071 if (min_key->type == BTRFS_INODE_ITEM_KEY) 5072 *need_log_inode_item = false; 5073 5074 if ((min_key->type == BTRFS_INODE_REF_KEY || 5075 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5076 inode->generation == trans->transid && 5077 !recursive_logging) { 5078 u64 other_ino = 0; 5079 u64 other_parent = 0; 5080 5081 ret = btrfs_check_ref_name_override(path->nodes[0], 5082 path->slots[0], min_key, inode, 5083 &other_ino, &other_parent); 5084 if (ret < 0) { 5085 return ret; 5086 } else if (ret > 0 && ctx && 5087 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5088 if (ins_nr > 0) { 5089 ins_nr++; 5090 } else { 5091 ins_nr = 1; 5092 ins_start_slot = path->slots[0]; 5093 } 5094 ret = copy_items(trans, inode, dst_path, path, 5095 ins_start_slot, ins_nr, 5096 inode_only, logged_isize); 5097 if (ret < 0) 5098 return ret; 5099 ins_nr = 0; 5100 5101 ret = log_conflicting_inodes(trans, root, path, 5102 ctx, other_ino, other_parent); 5103 if (ret) 5104 return ret; 5105 btrfs_release_path(path); 5106 goto next_key; 5107 } 5108 } 5109 5110 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 5111 if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5112 if (ins_nr == 
0) 5113 goto next_slot; 5114 ret = copy_items(trans, inode, dst_path, path, 5115 ins_start_slot, 5116 ins_nr, inode_only, logged_isize); 5117 if (ret < 0) 5118 return ret; 5119 ins_nr = 0; 5120 goto next_slot; 5121 } 5122 5123 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5124 ins_nr++; 5125 goto next_slot; 5126 } else if (!ins_nr) { 5127 ins_start_slot = path->slots[0]; 5128 ins_nr = 1; 5129 goto next_slot; 5130 } 5131 5132 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5133 ins_nr, inode_only, logged_isize); 5134 if (ret < 0) 5135 return ret; 5136 ins_nr = 1; 5137 ins_start_slot = path->slots[0]; 5138 next_slot: 5139 path->slots[0]++; 5140 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5141 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5142 path->slots[0]); 5143 goto again; 5144 } 5145 if (ins_nr) { 5146 ret = copy_items(trans, inode, dst_path, path, 5147 ins_start_slot, ins_nr, inode_only, 5148 logged_isize); 5149 if (ret < 0) 5150 return ret; 5151 ins_nr = 0; 5152 } 5153 btrfs_release_path(path); 5154 next_key: 5155 if (min_key->offset < (u64)-1) { 5156 min_key->offset++; 5157 } else if (min_key->type < max_key->type) { 5158 min_key->type++; 5159 min_key->offset = 0; 5160 } else { 5161 break; 5162 } 5163 } 5164 if (ins_nr) 5165 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5166 ins_nr, inode_only, logged_isize); 5167 5168 return ret; 5169 } 5170 5171 /* log a single inode in the tree log. 5172 * At least one parent directory for this inode must exist in the tree 5173 * or be logged already. 5174 * 5175 * Any items from this inode changed by the current transaction are copied 5176 * to the log tree. An extra reference is taken on any extents in this 5177 * file, allowing us to avoid a whole pile of corner cases around logging 5178 * blocks that have been removed from the tree. 5179 * 5180 * See LOG_INODE_ALL and related defines for a description of what inode_only 5181 * does. 5182 * 5183 * This handles both files and directories. 5184 */ 5185 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5186 struct btrfs_root *root, struct btrfs_inode *inode, 5187 int inode_only, 5188 struct btrfs_log_ctx *ctx) 5189 { 5190 struct btrfs_path *path; 5191 struct btrfs_path *dst_path; 5192 struct btrfs_key min_key; 5193 struct btrfs_key max_key; 5194 struct btrfs_root *log = root->log_root; 5195 int err = 0; 5196 int ret = 0; 5197 bool fast_search = false; 5198 u64 ino = btrfs_ino(inode); 5199 struct extent_map_tree *em_tree = &inode->extent_tree; 5200 u64 logged_isize = 0; 5201 bool need_log_inode_item = true; 5202 bool xattrs_logged = false; 5203 bool recursive_logging = false; 5204 5205 path = btrfs_alloc_path(); 5206 if (!path) 5207 return -ENOMEM; 5208 dst_path = btrfs_alloc_path(); 5209 if (!dst_path) { 5210 btrfs_free_path(path); 5211 return -ENOMEM; 5212 } 5213 5214 min_key.objectid = ino; 5215 min_key.type = BTRFS_INODE_ITEM_KEY; 5216 min_key.offset = 0; 5217 5218 max_key.objectid = ino; 5219 5220 5221 /* today the code can only do partial logging of directories */ 5222 if (S_ISDIR(inode->vfs_inode.i_mode) || 5223 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5224 &inode->runtime_flags) && 5225 inode_only >= LOG_INODE_EXISTS)) 5226 max_key.type = BTRFS_XATTR_ITEM_KEY; 5227 else 5228 max_key.type = (u8)-1; 5229 max_key.offset = (u64)-1; 5230 5231 /* 5232 * Only run delayed items if we are a directory. 
We want to make sure 5233 * all directory indexes hit the fs/subvolume tree so we can find them 5234 * and figure out which index ranges have to be logged. 5235 * 5236 * Otherwise commit the delayed inode only if the full sync flag is set, 5237 * as we want to make sure an up to date version is in the subvolume 5238 * tree so copy_inode_items_to_log() / copy_items() can find it and copy 5239 * it to the log tree. For a non full sync, we always log the inode item 5240 * based on the in-memory struct btrfs_inode which is always up to date. 5241 */ 5242 if (S_ISDIR(inode->vfs_inode.i_mode)) 5243 ret = btrfs_commit_inode_delayed_items(trans, inode); 5244 else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) 5245 ret = btrfs_commit_inode_delayed_inode(inode); 5246 5247 if (ret) { 5248 btrfs_free_path(path); 5249 btrfs_free_path(dst_path); 5250 return ret; 5251 } 5252 5253 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { 5254 recursive_logging = true; 5255 if (inode_only == LOG_OTHER_INODE) 5256 inode_only = LOG_INODE_EXISTS; 5257 else 5258 inode_only = LOG_INODE_ALL; 5259 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 5260 } else { 5261 mutex_lock(&inode->log_mutex); 5262 } 5263 5264 /* 5265 * a brute force approach to making sure we get the most uptodate 5266 * copies of everything. 5267 */ 5268 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5269 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 5270 5271 if (inode_only == LOG_INODE_EXISTS) 5272 max_key_type = BTRFS_XATTR_ITEM_KEY; 5273 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 5274 } else { 5275 if (inode_only == LOG_INODE_EXISTS) { 5276 /* 5277 * Make sure the new inode item we write to the log has 5278 * the same isize as the current one (if it exists). 5279 * This is necessary to prevent data loss after log 5280 * replay, and also to prevent doing a wrong expanding 5281 * truncate - for e.g. create file, write 4K into offset 5282 * 0, fsync, write 4K into offset 4096, add hard link, 5283 * fsync some other file (to sync log), power fail - if 5284 * we use the inode's current i_size, after log replay 5285 * we get a 8Kb file, with the last 4Kb extent as a hole 5286 * (zeroes), as if an expanding truncate happened, 5287 * instead of getting a file of 4Kb only. 
5288 */ 5289 err = logged_inode_size(log, inode, path, &logged_isize); 5290 if (err) 5291 goto out_unlock; 5292 } 5293 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5294 &inode->runtime_flags)) { 5295 if (inode_only == LOG_INODE_EXISTS) { 5296 max_key.type = BTRFS_XATTR_ITEM_KEY; 5297 ret = drop_objectid_items(trans, log, path, ino, 5298 max_key.type); 5299 } else { 5300 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5301 &inode->runtime_flags); 5302 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5303 &inode->runtime_flags); 5304 while(1) { 5305 ret = btrfs_truncate_inode_items(trans, 5306 log, &inode->vfs_inode, 0, 0); 5307 if (ret != -EAGAIN) 5308 break; 5309 } 5310 } 5311 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5312 &inode->runtime_flags) || 5313 inode_only == LOG_INODE_EXISTS) { 5314 if (inode_only == LOG_INODE_ALL) 5315 fast_search = true; 5316 max_key.type = BTRFS_XATTR_ITEM_KEY; 5317 ret = drop_objectid_items(trans, log, path, ino, 5318 max_key.type); 5319 } else { 5320 if (inode_only == LOG_INODE_ALL) 5321 fast_search = true; 5322 goto log_extents; 5323 } 5324 5325 } 5326 if (ret) { 5327 err = ret; 5328 goto out_unlock; 5329 } 5330 5331 err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5332 path, dst_path, logged_isize, 5333 recursive_logging, inode_only, ctx, 5334 &need_log_inode_item); 5335 if (err) 5336 goto out_unlock; 5337 5338 btrfs_release_path(path); 5339 btrfs_release_path(dst_path); 5340 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5341 if (err) 5342 goto out_unlock; 5343 xattrs_logged = true; 5344 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5345 btrfs_release_path(path); 5346 btrfs_release_path(dst_path); 5347 err = btrfs_log_holes(trans, root, inode, path); 5348 if (err) 5349 goto out_unlock; 5350 } 5351 log_extents: 5352 btrfs_release_path(path); 5353 btrfs_release_path(dst_path); 5354 if (need_log_inode_item) { 5355 err = log_inode_item(trans, log, dst_path, inode); 5356 if (!err && !xattrs_logged) { 5357 err = btrfs_log_all_xattrs(trans, root, inode, path, 5358 dst_path); 5359 btrfs_release_path(path); 5360 } 5361 if (err) 5362 goto out_unlock; 5363 } 5364 if (fast_search) { 5365 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5366 ctx); 5367 if (ret) { 5368 err = ret; 5369 goto out_unlock; 5370 } 5371 } else if (inode_only == LOG_INODE_ALL) { 5372 struct extent_map *em, *n; 5373 5374 write_lock(&em_tree->lock); 5375 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 5376 list_del_init(&em->list); 5377 write_unlock(&em_tree->lock); 5378 } 5379 5380 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5381 ret = log_directory_changes(trans, root, inode, path, dst_path, 5382 ctx); 5383 if (ret) { 5384 err = ret; 5385 goto out_unlock; 5386 } 5387 } 5388 5389 /* 5390 * If we are logging that an ancestor inode exists as part of logging a 5391 * new name from a link or rename operation, don't mark the inode as 5392 * logged - otherwise if an explicit fsync is made against an ancestor, 5393 * the fsync considers the inode in the log and doesn't sync the log, 5394 * resulting in the ancestor missing after a power failure unless the 5395 * log was synced as part of an fsync against any other unrelated inode. 5396 * So keep it simple for this case and just don't flag the ancestors as 5397 * logged. 
5398 */ 5399 if (!ctx || 5400 !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && 5401 &inode->vfs_inode != ctx->inode)) { 5402 spin_lock(&inode->lock); 5403 inode->logged_trans = trans->transid; 5404 /* 5405 * Don't update last_log_commit if we logged that an inode exists 5406 * after it was loaded to memory (full_sync bit set). 5407 * This is to prevent data loss when we do a write to the inode, 5408 * then the inode gets evicted after all delalloc was flushed, 5409 * then we log it exists (due to a rename for example) and then 5410 * fsync it. This last fsync would do nothing (not logging the 5411 * extents previously written). 5412 */ 5413 if (inode_only != LOG_INODE_EXISTS || 5414 !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) 5415 inode->last_log_commit = inode->last_sub_trans; 5416 spin_unlock(&inode->lock); 5417 } 5418 out_unlock: 5419 mutex_unlock(&inode->log_mutex); 5420 5421 btrfs_free_path(path); 5422 btrfs_free_path(dst_path); 5423 return err; 5424 } 5425 5426 /* 5427 * Check if we must fallback to a transaction commit when logging an inode. 5428 * This must be called after logging the inode and is used only in the context 5429 * when fsyncing an inode requires the need to log some other inode - in which 5430 * case we can't lock the i_mutex of each other inode we need to log as that 5431 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5432 * log inodes up or down in the hierarchy) or rename operations for example. So 5433 * we take the log_mutex of the inode after we have logged it and then check for 5434 * its last_unlink_trans value - this is safe because any task setting 5435 * last_unlink_trans must take the log_mutex and it must do this before it does 5436 * the actual unlink operation, so if we do this check before a concurrent task 5437 * sets last_unlink_trans it means we've logged a consistent version/state of 5438 * all the inode items, otherwise we are not sure and must do a transaction 5439 * commit (the concurrent task might have only updated last_unlink_trans before 5440 * we logged the inode or it might have also done the unlink). 5441 */ 5442 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5443 struct btrfs_inode *inode) 5444 { 5445 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5446 bool ret = false; 5447 5448 mutex_lock(&inode->log_mutex); 5449 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5450 /* 5451 * Make sure any commits to the log are forced to be full 5452 * commits. 5453 */ 5454 btrfs_set_log_full_commit(trans); 5455 ret = true; 5456 } 5457 mutex_unlock(&inode->log_mutex); 5458 5459 return ret; 5460 } 5461 5462 /* 5463 * follow the dentry parent pointers up the chain and see if any 5464 * of the directories in it require a full commit before they can 5465 * be logged. Returns zero if nothing special needs to be done or 1 if 5466 * a full commit is required. 5467 */ 5468 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5469 struct btrfs_inode *inode, 5470 struct dentry *parent, 5471 struct super_block *sb, 5472 u64 last_committed) 5473 { 5474 int ret = 0; 5475 struct dentry *old_parent = NULL; 5476 5477 /* 5478 * for regular files, if its inode is already on disk, we don't 5479 * have to worry about the parents at all. This is because 5480 * we can use the last_unlink_trans field to record renames 5481 * and other fun in this file. 
5482 */ 5483 if (S_ISREG(inode->vfs_inode.i_mode) && 5484 inode->generation <= last_committed && 5485 inode->last_unlink_trans <= last_committed) 5486 goto out; 5487 5488 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5489 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5490 goto out; 5491 inode = BTRFS_I(d_inode(parent)); 5492 } 5493 5494 while (1) { 5495 if (btrfs_must_commit_transaction(trans, inode)) { 5496 ret = 1; 5497 break; 5498 } 5499 5500 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5501 break; 5502 5503 if (IS_ROOT(parent)) { 5504 inode = BTRFS_I(d_inode(parent)); 5505 if (btrfs_must_commit_transaction(trans, inode)) 5506 ret = 1; 5507 break; 5508 } 5509 5510 parent = dget_parent(parent); 5511 dput(old_parent); 5512 old_parent = parent; 5513 inode = BTRFS_I(d_inode(parent)); 5514 5515 } 5516 dput(old_parent); 5517 out: 5518 return ret; 5519 } 5520 5521 struct btrfs_dir_list { 5522 u64 ino; 5523 struct list_head list; 5524 }; 5525 5526 /* 5527 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5528 * details about the why it is needed. 5529 * This is a recursive operation - if an existing dentry corresponds to a 5530 * directory, that directory's new entries are logged too (same behaviour as 5531 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5532 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5533 * complains about the following circular lock dependency / possible deadlock: 5534 * 5535 * CPU0 CPU1 5536 * ---- ---- 5537 * lock(&type->i_mutex_dir_key#3/2); 5538 * lock(sb_internal#2); 5539 * lock(&type->i_mutex_dir_key#3/2); 5540 * lock(&sb->s_type->i_mutex_key#14); 5541 * 5542 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5543 * sb_start_intwrite() in btrfs_start_transaction(). 5544 * Not locking i_mutex of the inodes is still safe because: 5545 * 5546 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5547 * that while logging the inode new references (names) are added or removed 5548 * from the inode, leaving the logged inode item with a link count that does 5549 * not match the number of logged inode reference items. This is fine because 5550 * at log replay time we compute the real number of links and correct the 5551 * link count in the inode item (see replay_one_buffer() and 5552 * link_to_fixup_dir()); 5553 * 5554 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5555 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5556 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5557 * has a size that doesn't match the sum of the lengths of all the logged 5558 * names. This does not result in a problem because if a dir_item key is 5559 * logged but its matching dir_index key is not logged, at log replay time we 5560 * don't use it to replay the respective name (see replay_one_name()). On the 5561 * other hand if only the dir_index key ends up being logged, the respective 5562 * name is added to the fs/subvol tree with both the dir_item and dir_index 5563 * keys created (see replay_one_name()). 5564 * The directory's inode item with a wrong i_size is not a problem as well, 5565 * since we don't use it at log replay time to set the i_size in the inode 5566 * item of the fs/subvol tree (see overwrite_item()). 
5567 */ 5568 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5569 struct btrfs_root *root, 5570 struct btrfs_inode *start_inode, 5571 struct btrfs_log_ctx *ctx) 5572 { 5573 struct btrfs_fs_info *fs_info = root->fs_info; 5574 struct btrfs_root *log = root->log_root; 5575 struct btrfs_path *path; 5576 LIST_HEAD(dir_list); 5577 struct btrfs_dir_list *dir_elem; 5578 int ret = 0; 5579 5580 path = btrfs_alloc_path(); 5581 if (!path) 5582 return -ENOMEM; 5583 5584 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5585 if (!dir_elem) { 5586 btrfs_free_path(path); 5587 return -ENOMEM; 5588 } 5589 dir_elem->ino = btrfs_ino(start_inode); 5590 list_add_tail(&dir_elem->list, &dir_list); 5591 5592 while (!list_empty(&dir_list)) { 5593 struct extent_buffer *leaf; 5594 struct btrfs_key min_key; 5595 int nritems; 5596 int i; 5597 5598 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5599 list); 5600 if (ret) 5601 goto next_dir_inode; 5602 5603 min_key.objectid = dir_elem->ino; 5604 min_key.type = BTRFS_DIR_ITEM_KEY; 5605 min_key.offset = 0; 5606 again: 5607 btrfs_release_path(path); 5608 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5609 if (ret < 0) { 5610 goto next_dir_inode; 5611 } else if (ret > 0) { 5612 ret = 0; 5613 goto next_dir_inode; 5614 } 5615 5616 process_leaf: 5617 leaf = path->nodes[0]; 5618 nritems = btrfs_header_nritems(leaf); 5619 for (i = path->slots[0]; i < nritems; i++) { 5620 struct btrfs_dir_item *di; 5621 struct btrfs_key di_key; 5622 struct inode *di_inode; 5623 struct btrfs_dir_list *new_dir_elem; 5624 int log_mode = LOG_INODE_EXISTS; 5625 int type; 5626 5627 btrfs_item_key_to_cpu(leaf, &min_key, i); 5628 if (min_key.objectid != dir_elem->ino || 5629 min_key.type != BTRFS_DIR_ITEM_KEY) 5630 goto next_dir_inode; 5631 5632 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5633 type = btrfs_dir_type(leaf, di); 5634 if (btrfs_dir_transid(leaf, di) < trans->transid && 5635 type != BTRFS_FT_DIR) 5636 continue; 5637 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5638 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5639 continue; 5640 5641 btrfs_release_path(path); 5642 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 5643 if (IS_ERR(di_inode)) { 5644 ret = PTR_ERR(di_inode); 5645 goto next_dir_inode; 5646 } 5647 5648 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5649 btrfs_add_delayed_iput(di_inode); 5650 break; 5651 } 5652 5653 ctx->log_new_dentries = false; 5654 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5655 log_mode = LOG_INODE_ALL; 5656 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5657 log_mode, ctx); 5658 if (!ret && 5659 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5660 ret = 1; 5661 btrfs_add_delayed_iput(di_inode); 5662 if (ret) 5663 goto next_dir_inode; 5664 if (ctx->log_new_dentries) { 5665 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5666 GFP_NOFS); 5667 if (!new_dir_elem) { 5668 ret = -ENOMEM; 5669 goto next_dir_inode; 5670 } 5671 new_dir_elem->ino = di_key.objectid; 5672 list_add_tail(&new_dir_elem->list, &dir_list); 5673 } 5674 break; 5675 } 5676 if (i == nritems) { 5677 ret = btrfs_next_leaf(log, path); 5678 if (ret < 0) { 5679 goto next_dir_inode; 5680 } else if (ret > 0) { 5681 ret = 0; 5682 goto next_dir_inode; 5683 } 5684 goto process_leaf; 5685 } 5686 if (min_key.offset < (u64)-1) { 5687 min_key.offset++; 5688 goto again; 5689 } 5690 next_dir_inode: 5691 list_del(&dir_elem->list); 5692 kfree(dir_elem); 5693 } 5694 5695 btrfs_free_path(path); 5696 return ret; 5697 } 
5698 5699 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5700 struct btrfs_inode *inode, 5701 struct btrfs_log_ctx *ctx) 5702 { 5703 struct btrfs_fs_info *fs_info = trans->fs_info; 5704 int ret; 5705 struct btrfs_path *path; 5706 struct btrfs_key key; 5707 struct btrfs_root *root = inode->root; 5708 const u64 ino = btrfs_ino(inode); 5709 5710 path = btrfs_alloc_path(); 5711 if (!path) 5712 return -ENOMEM; 5713 path->skip_locking = 1; 5714 path->search_commit_root = 1; 5715 5716 key.objectid = ino; 5717 key.type = BTRFS_INODE_REF_KEY; 5718 key.offset = 0; 5719 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5720 if (ret < 0) 5721 goto out; 5722 5723 while (true) { 5724 struct extent_buffer *leaf = path->nodes[0]; 5725 int slot = path->slots[0]; 5726 u32 cur_offset = 0; 5727 u32 item_size; 5728 unsigned long ptr; 5729 5730 if (slot >= btrfs_header_nritems(leaf)) { 5731 ret = btrfs_next_leaf(root, path); 5732 if (ret < 0) 5733 goto out; 5734 else if (ret > 0) 5735 break; 5736 continue; 5737 } 5738 5739 btrfs_item_key_to_cpu(leaf, &key, slot); 5740 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5741 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5742 break; 5743 5744 item_size = btrfs_item_size_nr(leaf, slot); 5745 ptr = btrfs_item_ptr_offset(leaf, slot); 5746 while (cur_offset < item_size) { 5747 struct btrfs_key inode_key; 5748 struct inode *dir_inode; 5749 5750 inode_key.type = BTRFS_INODE_ITEM_KEY; 5751 inode_key.offset = 0; 5752 5753 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5754 struct btrfs_inode_extref *extref; 5755 5756 extref = (struct btrfs_inode_extref *) 5757 (ptr + cur_offset); 5758 inode_key.objectid = btrfs_inode_extref_parent( 5759 leaf, extref); 5760 cur_offset += sizeof(*extref); 5761 cur_offset += btrfs_inode_extref_name_len(leaf, 5762 extref); 5763 } else { 5764 inode_key.objectid = key.offset; 5765 cur_offset = item_size; 5766 } 5767 5768 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 5769 root); 5770 /* 5771 * If the parent inode was deleted, return an error to 5772 * fallback to a transaction commit. This is to prevent 5773 * getting an inode that was moved from one parent A to 5774 * a parent B, got its former parent A deleted and then 5775 * it got fsync'ed, from existing at both parents after 5776 * a log replay (and the old parent still existing). 5777 * Example: 5778 * 5779 * mkdir /mnt/A 5780 * mkdir /mnt/B 5781 * touch /mnt/B/bar 5782 * sync 5783 * mv /mnt/B/bar /mnt/A/bar 5784 * mv -T /mnt/A /mnt/B 5785 * fsync /mnt/B/bar 5786 * <power fail> 5787 * 5788 * If we ignore the old parent B which got deleted, 5789 * after a log replay we would have file bar linked 5790 * at both parents and the old parent B would still 5791 * exist. 
5792 */ 5793 if (IS_ERR(dir_inode)) { 5794 ret = PTR_ERR(dir_inode); 5795 goto out; 5796 } 5797 5798 if (ctx) 5799 ctx->log_new_dentries = false; 5800 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5801 LOG_INODE_ALL, ctx); 5802 if (!ret && 5803 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5804 ret = 1; 5805 if (!ret && ctx && ctx->log_new_dentries) 5806 ret = log_new_dir_dentries(trans, root, 5807 BTRFS_I(dir_inode), ctx); 5808 btrfs_add_delayed_iput(dir_inode); 5809 if (ret) 5810 goto out; 5811 } 5812 path->slots[0]++; 5813 } 5814 ret = 0; 5815 out: 5816 btrfs_free_path(path); 5817 return ret; 5818 } 5819 5820 static int log_new_ancestors(struct btrfs_trans_handle *trans, 5821 struct btrfs_root *root, 5822 struct btrfs_path *path, 5823 struct btrfs_log_ctx *ctx) 5824 { 5825 struct btrfs_key found_key; 5826 5827 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 5828 5829 while (true) { 5830 struct btrfs_fs_info *fs_info = root->fs_info; 5831 const u64 last_committed = fs_info->last_trans_committed; 5832 struct extent_buffer *leaf = path->nodes[0]; 5833 int slot = path->slots[0]; 5834 struct btrfs_key search_key; 5835 struct inode *inode; 5836 u64 ino; 5837 int ret = 0; 5838 5839 btrfs_release_path(path); 5840 5841 ino = found_key.offset; 5842 5843 search_key.objectid = found_key.offset; 5844 search_key.type = BTRFS_INODE_ITEM_KEY; 5845 search_key.offset = 0; 5846 inode = btrfs_iget(fs_info->sb, ino, root); 5847 if (IS_ERR(inode)) 5848 return PTR_ERR(inode); 5849 5850 if (BTRFS_I(inode)->generation > last_committed) 5851 ret = btrfs_log_inode(trans, root, BTRFS_I(inode), 5852 LOG_INODE_EXISTS, ctx); 5853 btrfs_add_delayed_iput(inode); 5854 if (ret) 5855 return ret; 5856 5857 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 5858 break; 5859 5860 search_key.type = BTRFS_INODE_REF_KEY; 5861 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 5862 if (ret < 0) 5863 return ret; 5864 5865 leaf = path->nodes[0]; 5866 slot = path->slots[0]; 5867 if (slot >= btrfs_header_nritems(leaf)) { 5868 ret = btrfs_next_leaf(root, path); 5869 if (ret < 0) 5870 return ret; 5871 else if (ret > 0) 5872 return -ENOENT; 5873 leaf = path->nodes[0]; 5874 slot = path->slots[0]; 5875 } 5876 5877 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5878 if (found_key.objectid != search_key.objectid || 5879 found_key.type != BTRFS_INODE_REF_KEY) 5880 return -ENOENT; 5881 } 5882 return 0; 5883 } 5884 5885 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 5886 struct btrfs_inode *inode, 5887 struct dentry *parent, 5888 struct btrfs_log_ctx *ctx) 5889 { 5890 struct btrfs_root *root = inode->root; 5891 struct btrfs_fs_info *fs_info = root->fs_info; 5892 struct dentry *old_parent = NULL; 5893 struct super_block *sb = inode->vfs_inode.i_sb; 5894 int ret = 0; 5895 5896 while (true) { 5897 if (!parent || d_really_is_negative(parent) || 5898 sb != parent->d_sb) 5899 break; 5900 5901 inode = BTRFS_I(d_inode(parent)); 5902 if (root != inode->root) 5903 break; 5904 5905 if (inode->generation > fs_info->last_trans_committed) { 5906 ret = btrfs_log_inode(trans, root, inode, 5907 LOG_INODE_EXISTS, ctx); 5908 if (ret) 5909 break; 5910 } 5911 if (IS_ROOT(parent)) 5912 break; 5913 5914 parent = dget_parent(parent); 5915 dput(old_parent); 5916 old_parent = parent; 5917 } 5918 dput(old_parent); 5919 5920 return ret; 5921 } 5922 5923 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 5924 struct btrfs_inode *inode, 5925 struct dentry *parent, 5926 struct 
static int log_new_ancestors(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_log_ctx *ctx)
{
	struct btrfs_key found_key;

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	while (true) {
		struct btrfs_fs_info *fs_info = root->fs_info;
		const u64 last_committed = fs_info->last_trans_committed;
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key search_key;
		struct inode *inode;
		u64 ino;
		int ret = 0;

		btrfs_release_path(path);

		ino = found_key.offset;

		search_key.objectid = found_key.offset;
		search_key.type = BTRFS_INODE_ITEM_KEY;
		search_key.offset = 0;
		inode = btrfs_iget(fs_info->sb, ino, root);
		if (IS_ERR(inode))
			return PTR_ERR(inode);

		if (BTRFS_I(inode)->generation > last_committed)
			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
					      LOG_INODE_EXISTS, ctx);
		btrfs_add_delayed_iput(inode);
		if (ret)
			return ret;

		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
			break;

		search_key.type = BTRFS_INODE_REF_KEY;
		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
		if (ret < 0)
			return ret;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				return -ENOENT;
			leaf = path->nodes[0];
			slot = path->slots[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != search_key.objectid ||
		    found_key.type != BTRFS_INODE_REF_KEY)
			return -ENOENT;
	}
	return 0;
}

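/*
 * Same purpose as log_new_ancestors(), but walks up through the dentry chain
 * instead of searching the fs/subvolume tree. This is only correct for inodes
 * with a single hard link, where the dentry chain covers the only path from
 * the inode up to the subvolume root.
 */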
static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct dentry *old_parent = NULL;
	struct super_block *sb = inode->vfs_inode.i_sb;
	int ret = 0;

	while (true) {
		if (!parent || d_really_is_negative(parent) ||
		    sb != parent->d_sb)
			break;

		inode = BTRFS_I(d_inode(parent));
		if (root != inode->root)
			break;

		if (inode->generation > fs_info->last_trans_committed) {
			ret = btrfs_log_inode(trans, root, inode,
					      LOG_INODE_EXISTS, ctx);
			if (ret)
				break;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	dput(old_parent);

	return ret;
}

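/*
 * Log all new ancestor directories of an inode, once for every path (hard
 * link) that leads to it. Inodes with a single hard link take the dentry
 * based fast path; otherwise we iterate the inode's reference items and walk
 * up from each parent. Extended references make us bail out with -EMLINK to
 * fall back to a transaction commit.
 */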
static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct dentry *parent,
				 struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	const u64 ino = btrfs_ino(inode);
	struct btrfs_path *path;
	struct btrfs_key search_key;
	int ret;

	/*
	 * For the single hard link case, go through a fast path that does not
	 * need to iterate the fs/subvolume tree.
	 */
	if (inode->vfs_inode.i_nlink < 2)
		return log_new_ancestors_fast(trans, inode, parent, ctx);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = 0;
again:
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 0)
		path->slots[0]++;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key found_key;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != ino ||
		    found_key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		/*
		 * Don't deal with extended references because they are rare
		 * and too complex to handle (we would need to keep track of
		 * which subitem we are processing for each item in this loop,
		 * etc). So just return some error to fall back to a
		 * transaction commit.
		 */
		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = -EMLINK;
			goto out;
		}

		/*
		 * Logging ancestors needs to do more searches on the fs/subvol
		 * tree, so it releases the path as needed to avoid deadlocks.
		 * Keep track of the last inode ref key and resume from that key
		 * after logging all new ancestors for the current hard link.
		 */
		memcpy(&search_key, &found_key, sizeof(search_key));

		ret = log_new_ancestors(trans, root, path, ctx);
		if (ret)
			goto out;
		btrfs_release_path(path);
		goto again;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log. Only minimal inode and backref
 * logging is done for any parent directories that are newer than the last
 * committed transaction
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  int inode_only,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct super_block *sb;
	int ret = 0;
	u64 last_committed = fs_info->last_trans_committed;
	bool log_dentries = false;

	sb = inode->vfs_inode.i_sb;

	if (btrfs_test_opt(fs_info, NOTREELOG)) {
		ret = 1;
		goto end_no_trans;
	}

	/*
	 * The previous transaction commit did not complete, so we need to do
	 * a full commit ourselves.
	 */
	if (fs_info->last_trans_log_full_commit >
	    fs_info->last_trans_committed) {
		ret = 1;
		goto end_no_trans;
	}

	if (btrfs_root_refs(&root->root_item) == 0) {
		ret = 1;
		goto end_no_trans;
	}

	ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
					 last_committed);
	if (ret)
		goto end_no_trans;

	/*
	 * Skip already logged inodes or inodes corresponding to tmpfiles
	 * (since logging them is pointless, a link count of 0 means they
	 * will never be accessible).
	 */
	if (btrfs_inode_in_log(inode, trans->transid) ||
	    inode->vfs_inode.i_nlink == 0) {
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
	if (ret)
		goto end_trans;

	/*
	 * for regular files, if the inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->vfs_inode.i_mode) &&
	    inode->generation <= last_committed &&
	    inode->last_unlink_trans <= last_committed) {
		ret = 0;
		goto end_trans;
	}

	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
		log_dentries = true;

	/*
	 * On unlink we must make sure all our current and old parent directory
	 * inodes are fully logged. This is to prevent leaving dangling
	 * directory index entries in directories that were our parents but are
	 * not anymore. Not doing this results in the old parent directory
	 * being impossible to delete after log replay (rmdir will always fail
	 * with error -ENOTEMPTY).
	 *
	 * Example 1:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * ln testdir/foo testdir/bar
	 * sync
	 * unlink testdir/bar
	 * xfs_io -c fsync testdir/foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * If we don't log the parent directory (testdir), after log replay the
	 * directory still has an entry pointing to the file inode using the bar
	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
	 * the file inode has a link count of 1.
	 *
	 * Example 2:
	 *
	 * mkdir testdir
	 * touch foo
	 * ln foo testdir/foo2
	 * ln foo testdir/foo3
	 * sync
	 * unlink testdir/foo3
	 * xfs_io -c fsync foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * Similar to the first example, after log replay the parent directory
	 * testdir still has an entry pointing to the file inode with the name
	 * foo3 but the file inode does not have a matching BTRFS_INODE_REF_KEY
	 * item and has a link count of 2.
	 */
	if (inode->last_unlink_trans > last_committed) {
		ret = btrfs_log_all_parents(trans, inode, ctx);
		if (ret)
			goto end_trans;
	}

	ret = log_all_new_ancestors(trans, inode, parent, ctx);
	if (ret)
		goto end_trans;

	if (log_dentries)
		ret = log_new_dir_dentries(trans, root, inode, ctx);
	else
		ret = 0;
end_trans:
	if (ret < 0) {
		btrfs_set_log_full_commit(trans);
		ret = 1;
	}

	if (ret)
		btrfs_remove_log_ctx(root, ctx);
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}

/*
 * it is not safe to log a dentry if the chunk root has added new
 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct dentry *dentry,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
				     LOG_INODE_ALL, ctx);
	dput(parent);

	return ret;
}

/*
 * should be called during mount to recover and replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = LOG_WALK_PIN_ONLY,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
			"Failed to pin buffers while recovering log root tree.");
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

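	/*
	 * Log roots live in the log root tree with objectid
	 * BTRFS_TREE_LOG_OBJECTID and the id of their subvolume as the key
	 * offset. Walk them from the highest offset down to the lowest,
	 * replaying each log into its subvolume tree.
	 */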
	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_tree_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read tree log root.");
			goto error;
		}

		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
						   true);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);

			/*
			 * We didn't find the subvol, likely because it was
			 * deleted. This is ok, simply skip this log and go to
			 * the next one.
			 *
			 * We need to exclude the root because we can't have
			 * other log replays overwriting this log as we'll read
			 * it back in a few more times. This will keep our
			 * block from being modified, and we'll just bail for
			 * each subsequent pass.
			 */
			if (ret == -ENOENT)
				ret = btrfs_pin_extent_for_log_replay(trans,
							log->node->start,
							log->node->len);
			btrfs_put_root(log);

			if (!ret)
				goto next;
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read target root for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			struct btrfs_root *root = wc.replay_dest;

			btrfs_release_path(path);

			/*
			 * We have just replayed everything, and the highest
			 * objectid of fs roots probably has changed in case
			 * some inode_item's got replayed.
			 *
			 * root->objectid_mutex is not acquired as log replay
			 * can only happen during mount.
			 */
			ret = btrfs_find_highest_objectid(root,
						  &root->highest_objectid);
		}

		wc.replay_dest->log_root = NULL;
		btrfs_put_root(wc.replay_dest);
		btrfs_put_root(log);

		if (ret)
			goto error;
next:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	log_root_tree->log_root = NULL;
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_put_root(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans);
	btrfs_free_path(path);
	return ret;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	mutex_lock(&inode->log_mutex);
	inode->last_unlink_trans = trans->transid;
	mutex_unlock(&inode->log_mutex);

	/*
	 * if this directory was already logged any new
	 * names for this file/dir will get recorded
	 */
	if (dir->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (inode->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Make sure that if someone attempts to fsync the parent directory of a deleted
 * snapshot, it ends up triggering a transaction commit. This is to guarantee
 * that after replaying the log tree of the parent directory's root we will not
 * see the snapshot anymore and at log replay time we will not see any log tree
 * corresponding to the deleted snapshot's root, which could lead to replaying
 * it after replaying the log tree of the parent directory (which would replay
 * the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and tree of tree roots trees, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 */
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
			struct dentry *parent)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_log_ctx ctx;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (!S_ISDIR(inode->vfs_inode.i_mode))
		inode->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming it
	 * from hasn't been logged, we don't need to log it
	 */
	if (inode->logged_trans <= fs_info->last_trans_committed &&
	    (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
		return;

	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
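	/*
	 * Mark the context so the logging code knows we are only recording
	 * the new name, not doing a full fsync of the inode.
	 */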
	ctx.logging_new_name = true;
	/*
	 * We don't care about the return value. If we fail to log the new name
	 * then we know the next attempt to sync the log will fall back to a
	 * full transaction commit (due to a call to
	 * btrfs_set_log_full_commit()), so we don't need to worry about
	 * getting a log committed that has an inconsistent state after a
	 * rename operation.
	 */
	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}