1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/blkdev.h> 9 #include <linux/list_sort.h> 10 #include <linux/iversion.h> 11 #include "misc.h" 12 #include "ctree.h" 13 #include "tree-log.h" 14 #include "disk-io.h" 15 #include "locking.h" 16 #include "print-tree.h" 17 #include "backref.h" 18 #include "compression.h" 19 #include "qgroup.h" 20 #include "inode-map.h" 21 #include "block-group.h" 22 #include "space-info.h" 23 24 /* magic values for the inode_only field in btrfs_log_inode: 25 * 26 * LOG_INODE_ALL means to log everything 27 * LOG_INODE_EXISTS means to log just enough to recreate the inode 28 * during log replay 29 */ 30 enum { 31 LOG_INODE_ALL, 32 LOG_INODE_EXISTS, 33 LOG_OTHER_INODE, 34 LOG_OTHER_INODE_ALL, 35 }; 36 37 /* 38 * directory trouble cases 39 * 40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 41 * log, we must force a full commit before doing an fsync of the directory 42 * where the unlink was done. 43 * ---> record transid of last unlink/rename per directory 44 * 45 * mkdir foo/some_dir 46 * normal commit 47 * rename foo/some_dir foo2/some_dir 48 * mkdir foo/some_dir 49 * fsync foo/some_dir/some_file 50 * 51 * The fsync above will unlink the original some_dir without recording 52 * it in its new location (foo2). After a crash, some_dir will be gone 53 * unless the fsync of some_file forces a full commit 54 * 55 * 2) we must log any new names for any file or dir that is in the fsync 56 * log. ---> check inode while renaming/linking. 57 * 58 * 2a) we must log any new names for any file or dir during rename 59 * when the directory they are being removed from was logged. 60 * ---> check inode and old parent dir during rename 61 * 62 * 2a is actually the more important variant. With the extra logging 63 * a crash might unlink the old name without recreating the new one 64 * 65 * 3) after a crash, we must go through any directories with a link count 66 * of zero and redo the rm -rf 67 * 68 * mkdir f1/foo 69 * normal commit 70 * rm -rf f1/foo 71 * fsync(f1) 72 * 73 * The directory f1 was fully removed from the FS, but fsync was never 74 * called on f1, only its parent dir. After a crash the rm -rf must 75 * be replayed. This must be able to recurse down the entire 76 * directory tree. The inode link count fixup code takes care of the 77 * ugly details. 78 */ 79 80 /* 81 * stages for the tree walking. The first 82 * stage (0) is to only pin down the blocks we find 83 * the second stage (1) is to make sure that all the inodes 84 * we find in the log are created in the subvolume. 85 * 86 * The last stage is to deal with directories and links and extents 87 * and all the other fun semantics 88 */ 89 enum { 90 LOG_WALK_PIN_ONLY, 91 LOG_WALK_REPLAY_INODES, 92 LOG_WALK_REPLAY_DIR_INDEX, 93 LOG_WALK_REPLAY_ALL, 94 }; 95 96 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 97 struct btrfs_root *root, struct btrfs_inode *inode, 98 int inode_only, 99 const loff_t start, 100 const loff_t end, 101 struct btrfs_log_ctx *ctx); 102 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 103 struct btrfs_root *root, 104 struct btrfs_path *path, u64 objectid); 105 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 106 struct btrfs_root *root, 107 struct btrfs_root *log, 108 struct btrfs_path *path, 109 u64 dirid, int del_all); 110 111 /* 112 * tree logging is a special write ahead log used to make sure that 113 * fsyncs and O_SYNCs can happen without doing full tree commits. 114 * 115 * Full tree commits are expensive because they require commonly 116 * modified blocks to be recowed, creating many dirty pages in the 117 * extent tree an 4x-6x higher write load than ext3. 118 * 119 * Instead of doing a tree commit on every fsync, we use the 120 * key ranges and transaction ids to find items for a given file or directory 121 * that have changed in this transaction. Those items are copied into 122 * a special tree (one per subvolume root), that tree is written to disk 123 * and then the fsync is considered complete. 124 * 125 * After a crash, items are copied out of the log-tree back into the 126 * subvolume tree. Any file data extents found are recorded in the extent 127 * allocation tree, and the log-tree freed. 128 * 129 * The log tree is read three times, once to pin down all the extents it is 130 * using in ram and once, once to create all the inodes logged in the tree 131 * and once to do all the other items. 132 */ 133 134 /* 135 * start a sub transaction and setup the log tree 136 * this increments the log tree writer count to make the people 137 * syncing the tree wait for us to finish 138 */ 139 static int start_log_trans(struct btrfs_trans_handle *trans, 140 struct btrfs_root *root, 141 struct btrfs_log_ctx *ctx) 142 { 143 struct btrfs_fs_info *fs_info = root->fs_info; 144 int ret = 0; 145 146 mutex_lock(&root->log_mutex); 147 148 if (root->log_root) { 149 if (btrfs_need_log_full_commit(trans)) { 150 ret = -EAGAIN; 151 goto out; 152 } 153 154 if (!root->log_start_pid) { 155 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 156 root->log_start_pid = current->pid; 157 } else if (root->log_start_pid != current->pid) { 158 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 159 } 160 } else { 161 mutex_lock(&fs_info->tree_log_mutex); 162 if (!fs_info->log_root_tree) 163 ret = btrfs_init_log_root_tree(trans, fs_info); 164 mutex_unlock(&fs_info->tree_log_mutex); 165 if (ret) 166 goto out; 167 168 ret = btrfs_add_log_tree(trans, root); 169 if (ret) 170 goto out; 171 172 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 173 root->log_start_pid = current->pid; 174 } 175 176 atomic_inc(&root->log_batch); 177 atomic_inc(&root->log_writers); 178 if (ctx) { 179 int index = root->log_transid % 2; 180 list_add_tail(&ctx->list, &root->log_ctxs[index]); 181 ctx->log_transid = root->log_transid; 182 } 183 184 out: 185 mutex_unlock(&root->log_mutex); 186 return ret; 187 } 188 189 /* 190 * returns 0 if there was a log transaction running and we were able 191 * to join, or returns -ENOENT if there were not transactions 192 * in progress 193 */ 194 static int join_running_log_trans(struct btrfs_root *root) 195 { 196 int ret = -ENOENT; 197 198 mutex_lock(&root->log_mutex); 199 if (root->log_root) { 200 ret = 0; 201 atomic_inc(&root->log_writers); 202 } 203 mutex_unlock(&root->log_mutex); 204 return ret; 205 } 206 207 /* 208 * This either makes the current running log transaction wait 209 * until you call btrfs_end_log_trans() or it makes any future 210 * log transactions wait until you call btrfs_end_log_trans() 211 */ 212 void btrfs_pin_log_trans(struct btrfs_root *root) 213 { 214 mutex_lock(&root->log_mutex); 215 atomic_inc(&root->log_writers); 216 mutex_unlock(&root->log_mutex); 217 } 218 219 /* 220 * indicate we're done making changes to the log tree 221 * and wake up anyone waiting to do a sync 222 */ 223 void btrfs_end_log_trans(struct btrfs_root *root) 224 { 225 if (atomic_dec_and_test(&root->log_writers)) { 226 /* atomic_dec_and_test implies a barrier */ 227 cond_wake_up_nomb(&root->log_writer_wait); 228 } 229 } 230 231 static int btrfs_write_tree_block(struct extent_buffer *buf) 232 { 233 return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, 234 buf->start + buf->len - 1); 235 } 236 237 static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 238 { 239 filemap_fdatawait_range(buf->pages[0]->mapping, 240 buf->start, buf->start + buf->len - 1); 241 } 242 243 /* 244 * the walk control struct is used to pass state down the chain when 245 * processing the log tree. The stage field tells us which part 246 * of the log tree processing we are currently doing. The others 247 * are state fields used for that specific part 248 */ 249 struct walk_control { 250 /* should we free the extent on disk when done? This is used 251 * at transaction commit time while freeing a log tree 252 */ 253 int free; 254 255 /* should we write out the extent buffer? This is used 256 * while flushing the log tree to disk during a sync 257 */ 258 int write; 259 260 /* should we wait for the extent buffer io to finish? Also used 261 * while flushing the log tree to disk for a sync 262 */ 263 int wait; 264 265 /* pin only walk, we record which extents on disk belong to the 266 * log trees 267 */ 268 int pin; 269 270 /* what stage of the replay code we're currently in */ 271 int stage; 272 273 /* 274 * Ignore any items from the inode currently being processed. Needs 275 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in 276 * the LOG_WALK_REPLAY_INODES stage. 277 */ 278 bool ignore_cur_inode; 279 280 /* the root we are currently replaying */ 281 struct btrfs_root *replay_dest; 282 283 /* the trans handle for the current replay */ 284 struct btrfs_trans_handle *trans; 285 286 /* the function that gets used to process blocks we find in the 287 * tree. Note the extent_buffer might not be up to date when it is 288 * passed in, and it must be checked or read if you need the data 289 * inside it 290 */ 291 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 292 struct walk_control *wc, u64 gen, int level); 293 }; 294 295 /* 296 * process_func used to pin down extents, write them or wait on them 297 */ 298 static int process_one_buffer(struct btrfs_root *log, 299 struct extent_buffer *eb, 300 struct walk_control *wc, u64 gen, int level) 301 { 302 struct btrfs_fs_info *fs_info = log->fs_info; 303 int ret = 0; 304 305 /* 306 * If this fs is mixed then we need to be able to process the leaves to 307 * pin down any logged extents, so we have to read the block. 308 */ 309 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 310 ret = btrfs_read_buffer(eb, gen, level, NULL); 311 if (ret) 312 return ret; 313 } 314 315 if (wc->pin) 316 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, 317 eb->len); 318 319 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 320 if (wc->pin && btrfs_header_level(eb) == 0) 321 ret = btrfs_exclude_logged_extents(eb); 322 if (wc->write) 323 btrfs_write_tree_block(eb); 324 if (wc->wait) 325 btrfs_wait_tree_block_writeback(eb); 326 } 327 return ret; 328 } 329 330 /* 331 * Item overwrite used by replay and tree logging. eb, slot and key all refer 332 * to the src data we are copying out. 333 * 334 * root is the tree we are copying into, and path is a scratch 335 * path for use in this function (it should be released on entry and 336 * will be released on exit). 337 * 338 * If the key is already in the destination tree the existing item is 339 * overwritten. If the existing item isn't big enough, it is extended. 340 * If it is too large, it is truncated. 341 * 342 * If the key isn't in the destination yet, a new item is inserted. 343 */ 344 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 345 struct btrfs_root *root, 346 struct btrfs_path *path, 347 struct extent_buffer *eb, int slot, 348 struct btrfs_key *key) 349 { 350 int ret; 351 u32 item_size; 352 u64 saved_i_size = 0; 353 int save_old_i_size = 0; 354 unsigned long src_ptr; 355 unsigned long dst_ptr; 356 int overwrite_root = 0; 357 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 358 359 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 360 overwrite_root = 1; 361 362 item_size = btrfs_item_size_nr(eb, slot); 363 src_ptr = btrfs_item_ptr_offset(eb, slot); 364 365 /* look for the key in the destination tree */ 366 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 367 if (ret < 0) 368 return ret; 369 370 if (ret == 0) { 371 char *src_copy; 372 char *dst_copy; 373 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 374 path->slots[0]); 375 if (dst_size != item_size) 376 goto insert; 377 378 if (item_size == 0) { 379 btrfs_release_path(path); 380 return 0; 381 } 382 dst_copy = kmalloc(item_size, GFP_NOFS); 383 src_copy = kmalloc(item_size, GFP_NOFS); 384 if (!dst_copy || !src_copy) { 385 btrfs_release_path(path); 386 kfree(dst_copy); 387 kfree(src_copy); 388 return -ENOMEM; 389 } 390 391 read_extent_buffer(eb, src_copy, src_ptr, item_size); 392 393 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 394 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 395 item_size); 396 ret = memcmp(dst_copy, src_copy, item_size); 397 398 kfree(dst_copy); 399 kfree(src_copy); 400 /* 401 * they have the same contents, just return, this saves 402 * us from cowing blocks in the destination tree and doing 403 * extra writes that may not have been done by a previous 404 * sync 405 */ 406 if (ret == 0) { 407 btrfs_release_path(path); 408 return 0; 409 } 410 411 /* 412 * We need to load the old nbytes into the inode so when we 413 * replay the extents we've logged we get the right nbytes. 414 */ 415 if (inode_item) { 416 struct btrfs_inode_item *item; 417 u64 nbytes; 418 u32 mode; 419 420 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 421 struct btrfs_inode_item); 422 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 423 item = btrfs_item_ptr(eb, slot, 424 struct btrfs_inode_item); 425 btrfs_set_inode_nbytes(eb, item, nbytes); 426 427 /* 428 * If this is a directory we need to reset the i_size to 429 * 0 so that we can set it up properly when replaying 430 * the rest of the items in this log. 431 */ 432 mode = btrfs_inode_mode(eb, item); 433 if (S_ISDIR(mode)) 434 btrfs_set_inode_size(eb, item, 0); 435 } 436 } else if (inode_item) { 437 struct btrfs_inode_item *item; 438 u32 mode; 439 440 /* 441 * New inode, set nbytes to 0 so that the nbytes comes out 442 * properly when we replay the extents. 443 */ 444 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 445 btrfs_set_inode_nbytes(eb, item, 0); 446 447 /* 448 * If this is a directory we need to reset the i_size to 0 so 449 * that we can set it up properly when replaying the rest of 450 * the items in this log. 451 */ 452 mode = btrfs_inode_mode(eb, item); 453 if (S_ISDIR(mode)) 454 btrfs_set_inode_size(eb, item, 0); 455 } 456 insert: 457 btrfs_release_path(path); 458 /* try to insert the key into the destination tree */ 459 path->skip_release_on_error = 1; 460 ret = btrfs_insert_empty_item(trans, root, path, 461 key, item_size); 462 path->skip_release_on_error = 0; 463 464 /* make sure any existing item is the correct size */ 465 if (ret == -EEXIST || ret == -EOVERFLOW) { 466 u32 found_size; 467 found_size = btrfs_item_size_nr(path->nodes[0], 468 path->slots[0]); 469 if (found_size > item_size) 470 btrfs_truncate_item(path, item_size, 1); 471 else if (found_size < item_size) 472 btrfs_extend_item(path, item_size - found_size); 473 } else if (ret) { 474 return ret; 475 } 476 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 477 path->slots[0]); 478 479 /* don't overwrite an existing inode if the generation number 480 * was logged as zero. This is done when the tree logging code 481 * is just logging an inode to make sure it exists after recovery. 482 * 483 * Also, don't overwrite i_size on directories during replay. 484 * log replay inserts and removes directory items based on the 485 * state of the tree found in the subvolume, and i_size is modified 486 * as it goes 487 */ 488 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 489 struct btrfs_inode_item *src_item; 490 struct btrfs_inode_item *dst_item; 491 492 src_item = (struct btrfs_inode_item *)src_ptr; 493 dst_item = (struct btrfs_inode_item *)dst_ptr; 494 495 if (btrfs_inode_generation(eb, src_item) == 0) { 496 struct extent_buffer *dst_eb = path->nodes[0]; 497 const u64 ino_size = btrfs_inode_size(eb, src_item); 498 499 /* 500 * For regular files an ino_size == 0 is used only when 501 * logging that an inode exists, as part of a directory 502 * fsync, and the inode wasn't fsynced before. In this 503 * case don't set the size of the inode in the fs/subvol 504 * tree, otherwise we would be throwing valid data away. 505 */ 506 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 507 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 508 ino_size != 0) 509 btrfs_set_inode_size(dst_eb, dst_item, ino_size); 510 goto no_copy; 511 } 512 513 if (overwrite_root && 514 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 515 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 516 save_old_i_size = 1; 517 saved_i_size = btrfs_inode_size(path->nodes[0], 518 dst_item); 519 } 520 } 521 522 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 523 src_ptr, item_size); 524 525 if (save_old_i_size) { 526 struct btrfs_inode_item *dst_item; 527 dst_item = (struct btrfs_inode_item *)dst_ptr; 528 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 529 } 530 531 /* make sure the generation is filled in */ 532 if (key->type == BTRFS_INODE_ITEM_KEY) { 533 struct btrfs_inode_item *dst_item; 534 dst_item = (struct btrfs_inode_item *)dst_ptr; 535 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 536 btrfs_set_inode_generation(path->nodes[0], dst_item, 537 trans->transid); 538 } 539 } 540 no_copy: 541 btrfs_mark_buffer_dirty(path->nodes[0]); 542 btrfs_release_path(path); 543 return 0; 544 } 545 546 /* 547 * simple helper to read an inode off the disk from a given root 548 * This can only be called for subvolume roots and not for the log 549 */ 550 static noinline struct inode *read_one_inode(struct btrfs_root *root, 551 u64 objectid) 552 { 553 struct inode *inode; 554 555 inode = btrfs_iget(root->fs_info->sb, objectid, root); 556 if (IS_ERR(inode)) 557 inode = NULL; 558 return inode; 559 } 560 561 /* replays a single extent in 'eb' at 'slot' with 'key' into the 562 * subvolume 'root'. path is released on entry and should be released 563 * on exit. 564 * 565 * extents in the log tree have not been allocated out of the extent 566 * tree yet. So, this completes the allocation, taking a reference 567 * as required if the extent already exists or creating a new extent 568 * if it isn't in the extent allocation tree yet. 569 * 570 * The extent is inserted into the file, dropping any existing extents 571 * from the file that overlap the new one. 572 */ 573 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 574 struct btrfs_root *root, 575 struct btrfs_path *path, 576 struct extent_buffer *eb, int slot, 577 struct btrfs_key *key) 578 { 579 struct btrfs_fs_info *fs_info = root->fs_info; 580 int found_type; 581 u64 extent_end; 582 u64 start = key->offset; 583 u64 nbytes = 0; 584 struct btrfs_file_extent_item *item; 585 struct inode *inode = NULL; 586 unsigned long size; 587 int ret = 0; 588 589 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 590 found_type = btrfs_file_extent_type(eb, item); 591 592 if (found_type == BTRFS_FILE_EXTENT_REG || 593 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 594 nbytes = btrfs_file_extent_num_bytes(eb, item); 595 extent_end = start + nbytes; 596 597 /* 598 * We don't add to the inodes nbytes if we are prealloc or a 599 * hole. 600 */ 601 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 602 nbytes = 0; 603 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 604 size = btrfs_file_extent_ram_bytes(eb, item); 605 nbytes = btrfs_file_extent_ram_bytes(eb, item); 606 extent_end = ALIGN(start + size, 607 fs_info->sectorsize); 608 } else { 609 ret = 0; 610 goto out; 611 } 612 613 inode = read_one_inode(root, key->objectid); 614 if (!inode) { 615 ret = -EIO; 616 goto out; 617 } 618 619 /* 620 * first check to see if we already have this extent in the 621 * file. This must be done before the btrfs_drop_extents run 622 * so we don't try to drop this extent. 623 */ 624 ret = btrfs_lookup_file_extent(trans, root, path, 625 btrfs_ino(BTRFS_I(inode)), start, 0); 626 627 if (ret == 0 && 628 (found_type == BTRFS_FILE_EXTENT_REG || 629 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 630 struct btrfs_file_extent_item cmp1; 631 struct btrfs_file_extent_item cmp2; 632 struct btrfs_file_extent_item *existing; 633 struct extent_buffer *leaf; 634 635 leaf = path->nodes[0]; 636 existing = btrfs_item_ptr(leaf, path->slots[0], 637 struct btrfs_file_extent_item); 638 639 read_extent_buffer(eb, &cmp1, (unsigned long)item, 640 sizeof(cmp1)); 641 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 642 sizeof(cmp2)); 643 644 /* 645 * we already have a pointer to this exact extent, 646 * we don't have to do anything 647 */ 648 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 649 btrfs_release_path(path); 650 goto out; 651 } 652 } 653 btrfs_release_path(path); 654 655 /* drop any overlapping extents */ 656 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); 657 if (ret) 658 goto out; 659 660 if (found_type == BTRFS_FILE_EXTENT_REG || 661 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 662 u64 offset; 663 unsigned long dest_offset; 664 struct btrfs_key ins; 665 666 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && 667 btrfs_fs_incompat(fs_info, NO_HOLES)) 668 goto update_inode; 669 670 ret = btrfs_insert_empty_item(trans, root, path, key, 671 sizeof(*item)); 672 if (ret) 673 goto out; 674 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 675 path->slots[0]); 676 copy_extent_buffer(path->nodes[0], eb, dest_offset, 677 (unsigned long)item, sizeof(*item)); 678 679 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 680 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 681 ins.type = BTRFS_EXTENT_ITEM_KEY; 682 offset = key->offset - btrfs_file_extent_offset(eb, item); 683 684 /* 685 * Manually record dirty extent, as here we did a shallow 686 * file extent item copy and skip normal backref update, 687 * but modifying extent tree all by ourselves. 688 * So need to manually record dirty extent for qgroup, 689 * as the owner of the file extent changed from log tree 690 * (doesn't affect qgroup) to fs/file tree(affects qgroup) 691 */ 692 ret = btrfs_qgroup_trace_extent(trans, 693 btrfs_file_extent_disk_bytenr(eb, item), 694 btrfs_file_extent_disk_num_bytes(eb, item), 695 GFP_NOFS); 696 if (ret < 0) 697 goto out; 698 699 if (ins.objectid > 0) { 700 struct btrfs_ref ref = { 0 }; 701 u64 csum_start; 702 u64 csum_end; 703 LIST_HEAD(ordered_sums); 704 705 /* 706 * is this extent already allocated in the extent 707 * allocation tree? If so, just add a reference 708 */ 709 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, 710 ins.offset); 711 if (ret == 0) { 712 btrfs_init_generic_ref(&ref, 713 BTRFS_ADD_DELAYED_REF, 714 ins.objectid, ins.offset, 0); 715 btrfs_init_data_ref(&ref, 716 root->root_key.objectid, 717 key->objectid, offset); 718 ret = btrfs_inc_extent_ref(trans, &ref); 719 if (ret) 720 goto out; 721 } else { 722 /* 723 * insert the extent pointer in the extent 724 * allocation tree 725 */ 726 ret = btrfs_alloc_logged_file_extent(trans, 727 root->root_key.objectid, 728 key->objectid, offset, &ins); 729 if (ret) 730 goto out; 731 } 732 btrfs_release_path(path); 733 734 if (btrfs_file_extent_compression(eb, item)) { 735 csum_start = ins.objectid; 736 csum_end = csum_start + ins.offset; 737 } else { 738 csum_start = ins.objectid + 739 btrfs_file_extent_offset(eb, item); 740 csum_end = csum_start + 741 btrfs_file_extent_num_bytes(eb, item); 742 } 743 744 ret = btrfs_lookup_csums_range(root->log_root, 745 csum_start, csum_end - 1, 746 &ordered_sums, 0); 747 if (ret) 748 goto out; 749 /* 750 * Now delete all existing cums in the csum root that 751 * cover our range. We do this because we can have an 752 * extent that is completely referenced by one file 753 * extent item and partially referenced by another 754 * file extent item (like after using the clone or 755 * extent_same ioctls). In this case if we end up doing 756 * the replay of the one that partially references the 757 * extent first, and we do not do the csum deletion 758 * below, we can get 2 csum items in the csum tree that 759 * overlap each other. For example, imagine our log has 760 * the two following file extent items: 761 * 762 * key (257 EXTENT_DATA 409600) 763 * extent data disk byte 12845056 nr 102400 764 * extent data offset 20480 nr 20480 ram 102400 765 * 766 * key (257 EXTENT_DATA 819200) 767 * extent data disk byte 12845056 nr 102400 768 * extent data offset 0 nr 102400 ram 102400 769 * 770 * Where the second one fully references the 100K extent 771 * that starts at disk byte 12845056, and the log tree 772 * has a single csum item that covers the entire range 773 * of the extent: 774 * 775 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 776 * 777 * After the first file extent item is replayed, the 778 * csum tree gets the following csum item: 779 * 780 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 781 * 782 * Which covers the 20K sub-range starting at offset 20K 783 * of our extent. Now when we replay the second file 784 * extent item, if we do not delete existing csum items 785 * that cover any of its blocks, we end up getting two 786 * csum items in our csum tree that overlap each other: 787 * 788 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 789 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 790 * 791 * Which is a problem, because after this anyone trying 792 * to lookup up for the checksum of any block of our 793 * extent starting at an offset of 40K or higher, will 794 * end up looking at the second csum item only, which 795 * does not contain the checksum for any block starting 796 * at offset 40K or higher of our extent. 797 */ 798 while (!list_empty(&ordered_sums)) { 799 struct btrfs_ordered_sum *sums; 800 sums = list_entry(ordered_sums.next, 801 struct btrfs_ordered_sum, 802 list); 803 if (!ret) 804 ret = btrfs_del_csums(trans, 805 fs_info->csum_root, 806 sums->bytenr, 807 sums->len); 808 if (!ret) 809 ret = btrfs_csum_file_blocks(trans, 810 fs_info->csum_root, sums); 811 list_del(&sums->list); 812 kfree(sums); 813 } 814 if (ret) 815 goto out; 816 } else { 817 btrfs_release_path(path); 818 } 819 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 820 /* inline extents are easy, we just overwrite them */ 821 ret = overwrite_item(trans, root, path, eb, slot, key); 822 if (ret) 823 goto out; 824 } 825 826 ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, 827 extent_end - start); 828 if (ret) 829 goto out; 830 831 inode_add_bytes(inode, nbytes); 832 update_inode: 833 ret = btrfs_update_inode(trans, root, inode); 834 out: 835 if (inode) 836 iput(inode); 837 return ret; 838 } 839 840 /* 841 * when cleaning up conflicts between the directory names in the 842 * subvolume, directory names in the log and directory names in the 843 * inode back references, we may have to unlink inodes from directories. 844 * 845 * This is a helper function to do the unlink of a specific directory 846 * item 847 */ 848 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 849 struct btrfs_root *root, 850 struct btrfs_path *path, 851 struct btrfs_inode *dir, 852 struct btrfs_dir_item *di) 853 { 854 struct inode *inode; 855 char *name; 856 int name_len; 857 struct extent_buffer *leaf; 858 struct btrfs_key location; 859 int ret; 860 861 leaf = path->nodes[0]; 862 863 btrfs_dir_item_key_to_cpu(leaf, di, &location); 864 name_len = btrfs_dir_name_len(leaf, di); 865 name = kmalloc(name_len, GFP_NOFS); 866 if (!name) 867 return -ENOMEM; 868 869 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 870 btrfs_release_path(path); 871 872 inode = read_one_inode(root, location.objectid); 873 if (!inode) { 874 ret = -EIO; 875 goto out; 876 } 877 878 ret = link_to_fixup_dir(trans, root, path, location.objectid); 879 if (ret) 880 goto out; 881 882 ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name, 883 name_len); 884 if (ret) 885 goto out; 886 else 887 ret = btrfs_run_delayed_items(trans); 888 out: 889 kfree(name); 890 iput(inode); 891 return ret; 892 } 893 894 /* 895 * helper function to see if a given name and sequence number found 896 * in an inode back reference are already in a directory and correctly 897 * point to this inode 898 */ 899 static noinline int inode_in_dir(struct btrfs_root *root, 900 struct btrfs_path *path, 901 u64 dirid, u64 objectid, u64 index, 902 const char *name, int name_len) 903 { 904 struct btrfs_dir_item *di; 905 struct btrfs_key location; 906 int match = 0; 907 908 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 909 index, name, name_len, 0); 910 if (di && !IS_ERR(di)) { 911 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 912 if (location.objectid != objectid) 913 goto out; 914 } else 915 goto out; 916 btrfs_release_path(path); 917 918 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 919 if (di && !IS_ERR(di)) { 920 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 921 if (location.objectid != objectid) 922 goto out; 923 } else 924 goto out; 925 match = 1; 926 out: 927 btrfs_release_path(path); 928 return match; 929 } 930 931 /* 932 * helper function to check a log tree for a named back reference in 933 * an inode. This is used to decide if a back reference that is 934 * found in the subvolume conflicts with what we find in the log. 935 * 936 * inode backreferences may have multiple refs in a single item, 937 * during replay we process one reference at a time, and we don't 938 * want to delete valid links to a file from the subvolume if that 939 * link is also in the log. 940 */ 941 static noinline int backref_in_log(struct btrfs_root *log, 942 struct btrfs_key *key, 943 u64 ref_objectid, 944 const char *name, int namelen) 945 { 946 struct btrfs_path *path; 947 int ret; 948 949 path = btrfs_alloc_path(); 950 if (!path) 951 return -ENOMEM; 952 953 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 954 if (ret < 0) { 955 goto out; 956 } else if (ret == 1) { 957 ret = 0; 958 goto out; 959 } 960 961 if (key->type == BTRFS_INODE_EXTREF_KEY) 962 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], 963 path->slots[0], 964 ref_objectid, 965 name, namelen); 966 else 967 ret = !!btrfs_find_name_in_backref(path->nodes[0], 968 path->slots[0], 969 name, namelen); 970 out: 971 btrfs_free_path(path); 972 return ret; 973 } 974 975 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 976 struct btrfs_root *root, 977 struct btrfs_path *path, 978 struct btrfs_root *log_root, 979 struct btrfs_inode *dir, 980 struct btrfs_inode *inode, 981 u64 inode_objectid, u64 parent_objectid, 982 u64 ref_index, char *name, int namelen, 983 int *search_done) 984 { 985 int ret; 986 char *victim_name; 987 int victim_name_len; 988 struct extent_buffer *leaf; 989 struct btrfs_dir_item *di; 990 struct btrfs_key search_key; 991 struct btrfs_inode_extref *extref; 992 993 again: 994 /* Search old style refs */ 995 search_key.objectid = inode_objectid; 996 search_key.type = BTRFS_INODE_REF_KEY; 997 search_key.offset = parent_objectid; 998 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 999 if (ret == 0) { 1000 struct btrfs_inode_ref *victim_ref; 1001 unsigned long ptr; 1002 unsigned long ptr_end; 1003 1004 leaf = path->nodes[0]; 1005 1006 /* are we trying to overwrite a back ref for the root directory 1007 * if so, just jump out, we're done 1008 */ 1009 if (search_key.objectid == search_key.offset) 1010 return 1; 1011 1012 /* check all the names in this back reference to see 1013 * if they are in the log. if so, we allow them to stay 1014 * otherwise they must be unlinked as a conflict 1015 */ 1016 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1017 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 1018 while (ptr < ptr_end) { 1019 victim_ref = (struct btrfs_inode_ref *)ptr; 1020 victim_name_len = btrfs_inode_ref_name_len(leaf, 1021 victim_ref); 1022 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1023 if (!victim_name) 1024 return -ENOMEM; 1025 1026 read_extent_buffer(leaf, victim_name, 1027 (unsigned long)(victim_ref + 1), 1028 victim_name_len); 1029 1030 ret = backref_in_log(log_root, &search_key, 1031 parent_objectid, victim_name, 1032 victim_name_len); 1033 if (ret < 0) { 1034 kfree(victim_name); 1035 return ret; 1036 } else if (!ret) { 1037 inc_nlink(&inode->vfs_inode); 1038 btrfs_release_path(path); 1039 1040 ret = btrfs_unlink_inode(trans, root, dir, inode, 1041 victim_name, victim_name_len); 1042 kfree(victim_name); 1043 if (ret) 1044 return ret; 1045 ret = btrfs_run_delayed_items(trans); 1046 if (ret) 1047 return ret; 1048 *search_done = 1; 1049 goto again; 1050 } 1051 kfree(victim_name); 1052 1053 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 1054 } 1055 1056 /* 1057 * NOTE: we have searched root tree and checked the 1058 * corresponding ref, it does not need to check again. 1059 */ 1060 *search_done = 1; 1061 } 1062 btrfs_release_path(path); 1063 1064 /* Same search but for extended refs */ 1065 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 1066 inode_objectid, parent_objectid, 0, 1067 0); 1068 if (!IS_ERR_OR_NULL(extref)) { 1069 u32 item_size; 1070 u32 cur_offset = 0; 1071 unsigned long base; 1072 struct inode *victim_parent; 1073 1074 leaf = path->nodes[0]; 1075 1076 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1077 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1078 1079 while (cur_offset < item_size) { 1080 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1081 1082 victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 1083 1084 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1085 goto next; 1086 1087 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1088 if (!victim_name) 1089 return -ENOMEM; 1090 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 1091 victim_name_len); 1092 1093 search_key.objectid = inode_objectid; 1094 search_key.type = BTRFS_INODE_EXTREF_KEY; 1095 search_key.offset = btrfs_extref_hash(parent_objectid, 1096 victim_name, 1097 victim_name_len); 1098 ret = backref_in_log(log_root, &search_key, 1099 parent_objectid, victim_name, 1100 victim_name_len); 1101 if (ret < 0) { 1102 return ret; 1103 } else if (!ret) { 1104 ret = -ENOENT; 1105 victim_parent = read_one_inode(root, 1106 parent_objectid); 1107 if (victim_parent) { 1108 inc_nlink(&inode->vfs_inode); 1109 btrfs_release_path(path); 1110 1111 ret = btrfs_unlink_inode(trans, root, 1112 BTRFS_I(victim_parent), 1113 inode, 1114 victim_name, 1115 victim_name_len); 1116 if (!ret) 1117 ret = btrfs_run_delayed_items( 1118 trans); 1119 } 1120 iput(victim_parent); 1121 kfree(victim_name); 1122 if (ret) 1123 return ret; 1124 *search_done = 1; 1125 goto again; 1126 } 1127 kfree(victim_name); 1128 next: 1129 cur_offset += victim_name_len + sizeof(*extref); 1130 } 1131 *search_done = 1; 1132 } 1133 btrfs_release_path(path); 1134 1135 /* look for a conflicting sequence number */ 1136 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1137 ref_index, name, namelen, 0); 1138 if (di && !IS_ERR(di)) { 1139 ret = drop_one_dir_item(trans, root, path, dir, di); 1140 if (ret) 1141 return ret; 1142 } 1143 btrfs_release_path(path); 1144 1145 /* look for a conflicting name */ 1146 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), 1147 name, namelen, 0); 1148 if (di && !IS_ERR(di)) { 1149 ret = drop_one_dir_item(trans, root, path, dir, di); 1150 if (ret) 1151 return ret; 1152 } 1153 btrfs_release_path(path); 1154 1155 return 0; 1156 } 1157 1158 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1159 u32 *namelen, char **name, u64 *index, 1160 u64 *parent_objectid) 1161 { 1162 struct btrfs_inode_extref *extref; 1163 1164 extref = (struct btrfs_inode_extref *)ref_ptr; 1165 1166 *namelen = btrfs_inode_extref_name_len(eb, extref); 1167 *name = kmalloc(*namelen, GFP_NOFS); 1168 if (*name == NULL) 1169 return -ENOMEM; 1170 1171 read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1172 *namelen); 1173 1174 if (index) 1175 *index = btrfs_inode_extref_index(eb, extref); 1176 if (parent_objectid) 1177 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1178 1179 return 0; 1180 } 1181 1182 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1183 u32 *namelen, char **name, u64 *index) 1184 { 1185 struct btrfs_inode_ref *ref; 1186 1187 ref = (struct btrfs_inode_ref *)ref_ptr; 1188 1189 *namelen = btrfs_inode_ref_name_len(eb, ref); 1190 *name = kmalloc(*namelen, GFP_NOFS); 1191 if (*name == NULL) 1192 return -ENOMEM; 1193 1194 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1195 1196 if (index) 1197 *index = btrfs_inode_ref_index(eb, ref); 1198 1199 return 0; 1200 } 1201 1202 /* 1203 * Take an inode reference item from the log tree and iterate all names from the 1204 * inode reference item in the subvolume tree with the same key (if it exists). 1205 * For any name that is not in the inode reference item from the log tree, do a 1206 * proper unlink of that name (that is, remove its entry from the inode 1207 * reference item and both dir index keys). 1208 */ 1209 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, 1210 struct btrfs_root *root, 1211 struct btrfs_path *path, 1212 struct btrfs_inode *inode, 1213 struct extent_buffer *log_eb, 1214 int log_slot, 1215 struct btrfs_key *key) 1216 { 1217 int ret; 1218 unsigned long ref_ptr; 1219 unsigned long ref_end; 1220 struct extent_buffer *eb; 1221 1222 again: 1223 btrfs_release_path(path); 1224 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 1225 if (ret > 0) { 1226 ret = 0; 1227 goto out; 1228 } 1229 if (ret < 0) 1230 goto out; 1231 1232 eb = path->nodes[0]; 1233 ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 1234 ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); 1235 while (ref_ptr < ref_end) { 1236 char *name = NULL; 1237 int namelen; 1238 u64 parent_id; 1239 1240 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1241 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1242 NULL, &parent_id); 1243 } else { 1244 parent_id = key->offset; 1245 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1246 NULL); 1247 } 1248 if (ret) 1249 goto out; 1250 1251 if (key->type == BTRFS_INODE_EXTREF_KEY) 1252 ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, 1253 parent_id, name, 1254 namelen); 1255 else 1256 ret = !!btrfs_find_name_in_backref(log_eb, log_slot, 1257 name, namelen); 1258 1259 if (!ret) { 1260 struct inode *dir; 1261 1262 btrfs_release_path(path); 1263 dir = read_one_inode(root, parent_id); 1264 if (!dir) { 1265 ret = -ENOENT; 1266 kfree(name); 1267 goto out; 1268 } 1269 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 1270 inode, name, namelen); 1271 kfree(name); 1272 iput(dir); 1273 if (ret) 1274 goto out; 1275 goto again; 1276 } 1277 1278 kfree(name); 1279 ref_ptr += namelen; 1280 if (key->type == BTRFS_INODE_EXTREF_KEY) 1281 ref_ptr += sizeof(struct btrfs_inode_extref); 1282 else 1283 ref_ptr += sizeof(struct btrfs_inode_ref); 1284 } 1285 ret = 0; 1286 out: 1287 btrfs_release_path(path); 1288 return ret; 1289 } 1290 1291 static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, 1292 const u8 ref_type, const char *name, 1293 const int namelen) 1294 { 1295 struct btrfs_key key; 1296 struct btrfs_path *path; 1297 const u64 parent_id = btrfs_ino(BTRFS_I(dir)); 1298 int ret; 1299 1300 path = btrfs_alloc_path(); 1301 if (!path) 1302 return -ENOMEM; 1303 1304 key.objectid = btrfs_ino(BTRFS_I(inode)); 1305 key.type = ref_type; 1306 if (key.type == BTRFS_INODE_REF_KEY) 1307 key.offset = parent_id; 1308 else 1309 key.offset = btrfs_extref_hash(parent_id, name, namelen); 1310 1311 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0); 1312 if (ret < 0) 1313 goto out; 1314 if (ret > 0) { 1315 ret = 0; 1316 goto out; 1317 } 1318 if (key.type == BTRFS_INODE_EXTREF_KEY) 1319 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], 1320 path->slots[0], parent_id, name, namelen); 1321 else 1322 ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], 1323 name, namelen); 1324 1325 out: 1326 btrfs_free_path(path); 1327 return ret; 1328 } 1329 1330 static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1331 struct inode *dir, struct inode *inode, const char *name, 1332 int namelen, u64 ref_index) 1333 { 1334 struct btrfs_dir_item *dir_item; 1335 struct btrfs_key key; 1336 struct btrfs_path *path; 1337 struct inode *other_inode = NULL; 1338 int ret; 1339 1340 path = btrfs_alloc_path(); 1341 if (!path) 1342 return -ENOMEM; 1343 1344 dir_item = btrfs_lookup_dir_item(NULL, root, path, 1345 btrfs_ino(BTRFS_I(dir)), 1346 name, namelen, 0); 1347 if (!dir_item) { 1348 btrfs_release_path(path); 1349 goto add_link; 1350 } else if (IS_ERR(dir_item)) { 1351 ret = PTR_ERR(dir_item); 1352 goto out; 1353 } 1354 1355 /* 1356 * Our inode's dentry collides with the dentry of another inode which is 1357 * in the log but not yet processed since it has a higher inode number. 1358 * So delete that other dentry. 1359 */ 1360 btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key); 1361 btrfs_release_path(path); 1362 other_inode = read_one_inode(root, key.objectid); 1363 if (!other_inode) { 1364 ret = -ENOENT; 1365 goto out; 1366 } 1367 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode), 1368 name, namelen); 1369 if (ret) 1370 goto out; 1371 /* 1372 * If we dropped the link count to 0, bump it so that later the iput() 1373 * on the inode will not free it. We will fixup the link count later. 1374 */ 1375 if (other_inode->i_nlink == 0) 1376 inc_nlink(other_inode); 1377 1378 ret = btrfs_run_delayed_items(trans); 1379 if (ret) 1380 goto out; 1381 add_link: 1382 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 1383 name, namelen, 0, ref_index); 1384 out: 1385 iput(other_inode); 1386 btrfs_free_path(path); 1387 1388 return ret; 1389 } 1390 1391 /* 1392 * replay one inode back reference item found in the log tree. 1393 * eb, slot and key refer to the buffer and key found in the log tree. 1394 * root is the destination we are replaying into, and path is for temp 1395 * use by this function. (it should be released on return). 1396 */ 1397 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1398 struct btrfs_root *root, 1399 struct btrfs_root *log, 1400 struct btrfs_path *path, 1401 struct extent_buffer *eb, int slot, 1402 struct btrfs_key *key) 1403 { 1404 struct inode *dir = NULL; 1405 struct inode *inode = NULL; 1406 unsigned long ref_ptr; 1407 unsigned long ref_end; 1408 char *name = NULL; 1409 int namelen; 1410 int ret; 1411 int search_done = 0; 1412 int log_ref_ver = 0; 1413 u64 parent_objectid; 1414 u64 inode_objectid; 1415 u64 ref_index = 0; 1416 int ref_struct_size; 1417 1418 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1419 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 1420 1421 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1422 struct btrfs_inode_extref *r; 1423 1424 ref_struct_size = sizeof(struct btrfs_inode_extref); 1425 log_ref_ver = 1; 1426 r = (struct btrfs_inode_extref *)ref_ptr; 1427 parent_objectid = btrfs_inode_extref_parent(eb, r); 1428 } else { 1429 ref_struct_size = sizeof(struct btrfs_inode_ref); 1430 parent_objectid = key->offset; 1431 } 1432 inode_objectid = key->objectid; 1433 1434 /* 1435 * it is possible that we didn't log all the parent directories 1436 * for a given inode. If we don't find the dir, just don't 1437 * copy the back ref in. The link count fixup code will take 1438 * care of the rest 1439 */ 1440 dir = read_one_inode(root, parent_objectid); 1441 if (!dir) { 1442 ret = -ENOENT; 1443 goto out; 1444 } 1445 1446 inode = read_one_inode(root, inode_objectid); 1447 if (!inode) { 1448 ret = -EIO; 1449 goto out; 1450 } 1451 1452 while (ref_ptr < ref_end) { 1453 if (log_ref_ver) { 1454 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1455 &ref_index, &parent_objectid); 1456 /* 1457 * parent object can change from one array 1458 * item to another. 1459 */ 1460 if (!dir) 1461 dir = read_one_inode(root, parent_objectid); 1462 if (!dir) { 1463 ret = -ENOENT; 1464 goto out; 1465 } 1466 } else { 1467 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1468 &ref_index); 1469 } 1470 if (ret) 1471 goto out; 1472 1473 /* if we already have a perfect match, we're done */ 1474 if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), 1475 btrfs_ino(BTRFS_I(inode)), ref_index, 1476 name, namelen)) { 1477 /* 1478 * look for a conflicting back reference in the 1479 * metadata. if we find one we have to unlink that name 1480 * of the file before we add our new link. Later on, we 1481 * overwrite any existing back reference, and we don't 1482 * want to create dangling pointers in the directory. 1483 */ 1484 1485 if (!search_done) { 1486 ret = __add_inode_ref(trans, root, path, log, 1487 BTRFS_I(dir), 1488 BTRFS_I(inode), 1489 inode_objectid, 1490 parent_objectid, 1491 ref_index, name, namelen, 1492 &search_done); 1493 if (ret) { 1494 if (ret == 1) 1495 ret = 0; 1496 goto out; 1497 } 1498 } 1499 1500 /* 1501 * If a reference item already exists for this inode 1502 * with the same parent and name, but different index, 1503 * drop it and the corresponding directory index entries 1504 * from the parent before adding the new reference item 1505 * and dir index entries, otherwise we would fail with 1506 * -EEXIST returned from btrfs_add_link() below. 1507 */ 1508 ret = btrfs_inode_ref_exists(inode, dir, key->type, 1509 name, namelen); 1510 if (ret > 0) { 1511 ret = btrfs_unlink_inode(trans, root, 1512 BTRFS_I(dir), 1513 BTRFS_I(inode), 1514 name, namelen); 1515 /* 1516 * If we dropped the link count to 0, bump it so 1517 * that later the iput() on the inode will not 1518 * free it. We will fixup the link count later. 1519 */ 1520 if (!ret && inode->i_nlink == 0) 1521 inc_nlink(inode); 1522 } 1523 if (ret < 0) 1524 goto out; 1525 1526 /* insert our name */ 1527 ret = add_link(trans, root, dir, inode, name, namelen, 1528 ref_index); 1529 if (ret) 1530 goto out; 1531 1532 btrfs_update_inode(trans, root, inode); 1533 } 1534 1535 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1536 kfree(name); 1537 name = NULL; 1538 if (log_ref_ver) { 1539 iput(dir); 1540 dir = NULL; 1541 } 1542 } 1543 1544 /* 1545 * Before we overwrite the inode reference item in the subvolume tree 1546 * with the item from the log tree, we must unlink all names from the 1547 * parent directory that are in the subvolume's tree inode reference 1548 * item, otherwise we end up with an inconsistent subvolume tree where 1549 * dir index entries exist for a name but there is no inode reference 1550 * item with the same name. 1551 */ 1552 ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, 1553 key); 1554 if (ret) 1555 goto out; 1556 1557 /* finally write the back reference in the inode */ 1558 ret = overwrite_item(trans, root, path, eb, slot, key); 1559 out: 1560 btrfs_release_path(path); 1561 kfree(name); 1562 iput(dir); 1563 iput(inode); 1564 return ret; 1565 } 1566 1567 static int insert_orphan_item(struct btrfs_trans_handle *trans, 1568 struct btrfs_root *root, u64 ino) 1569 { 1570 int ret; 1571 1572 ret = btrfs_insert_orphan_item(trans, root, ino); 1573 if (ret == -EEXIST) 1574 ret = 0; 1575 1576 return ret; 1577 } 1578 1579 static int count_inode_extrefs(struct btrfs_root *root, 1580 struct btrfs_inode *inode, struct btrfs_path *path) 1581 { 1582 int ret = 0; 1583 int name_len; 1584 unsigned int nlink = 0; 1585 u32 item_size; 1586 u32 cur_offset = 0; 1587 u64 inode_objectid = btrfs_ino(inode); 1588 u64 offset = 0; 1589 unsigned long ptr; 1590 struct btrfs_inode_extref *extref; 1591 struct extent_buffer *leaf; 1592 1593 while (1) { 1594 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1595 &extref, &offset); 1596 if (ret) 1597 break; 1598 1599 leaf = path->nodes[0]; 1600 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1601 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1602 cur_offset = 0; 1603 1604 while (cur_offset < item_size) { 1605 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1606 name_len = btrfs_inode_extref_name_len(leaf, extref); 1607 1608 nlink++; 1609 1610 cur_offset += name_len + sizeof(*extref); 1611 } 1612 1613 offset++; 1614 btrfs_release_path(path); 1615 } 1616 btrfs_release_path(path); 1617 1618 if (ret < 0 && ret != -ENOENT) 1619 return ret; 1620 return nlink; 1621 } 1622 1623 static int count_inode_refs(struct btrfs_root *root, 1624 struct btrfs_inode *inode, struct btrfs_path *path) 1625 { 1626 int ret; 1627 struct btrfs_key key; 1628 unsigned int nlink = 0; 1629 unsigned long ptr; 1630 unsigned long ptr_end; 1631 int name_len; 1632 u64 ino = btrfs_ino(inode); 1633 1634 key.objectid = ino; 1635 key.type = BTRFS_INODE_REF_KEY; 1636 key.offset = (u64)-1; 1637 1638 while (1) { 1639 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1640 if (ret < 0) 1641 break; 1642 if (ret > 0) { 1643 if (path->slots[0] == 0) 1644 break; 1645 path->slots[0]--; 1646 } 1647 process_slot: 1648 btrfs_item_key_to_cpu(path->nodes[0], &key, 1649 path->slots[0]); 1650 if (key.objectid != ino || 1651 key.type != BTRFS_INODE_REF_KEY) 1652 break; 1653 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1654 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 1655 path->slots[0]); 1656 while (ptr < ptr_end) { 1657 struct btrfs_inode_ref *ref; 1658 1659 ref = (struct btrfs_inode_ref *)ptr; 1660 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1661 ref); 1662 ptr = (unsigned long)(ref + 1) + name_len; 1663 nlink++; 1664 } 1665 1666 if (key.offset == 0) 1667 break; 1668 if (path->slots[0] > 0) { 1669 path->slots[0]--; 1670 goto process_slot; 1671 } 1672 key.offset--; 1673 btrfs_release_path(path); 1674 } 1675 btrfs_release_path(path); 1676 1677 return nlink; 1678 } 1679 1680 /* 1681 * There are a few corners where the link count of the file can't 1682 * be properly maintained during replay. So, instead of adding 1683 * lots of complexity to the log code, we just scan the backrefs 1684 * for any file that has been through replay. 1685 * 1686 * The scan will update the link count on the inode to reflect the 1687 * number of back refs found. If it goes down to zero, the iput 1688 * will free the inode. 1689 */ 1690 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1691 struct btrfs_root *root, 1692 struct inode *inode) 1693 { 1694 struct btrfs_path *path; 1695 int ret; 1696 u64 nlink = 0; 1697 u64 ino = btrfs_ino(BTRFS_I(inode)); 1698 1699 path = btrfs_alloc_path(); 1700 if (!path) 1701 return -ENOMEM; 1702 1703 ret = count_inode_refs(root, BTRFS_I(inode), path); 1704 if (ret < 0) 1705 goto out; 1706 1707 nlink = ret; 1708 1709 ret = count_inode_extrefs(root, BTRFS_I(inode), path); 1710 if (ret < 0) 1711 goto out; 1712 1713 nlink += ret; 1714 1715 ret = 0; 1716 1717 if (nlink != inode->i_nlink) { 1718 set_nlink(inode, nlink); 1719 btrfs_update_inode(trans, root, inode); 1720 } 1721 BTRFS_I(inode)->index_cnt = (u64)-1; 1722 1723 if (inode->i_nlink == 0) { 1724 if (S_ISDIR(inode->i_mode)) { 1725 ret = replay_dir_deletes(trans, root, NULL, path, 1726 ino, 1); 1727 if (ret) 1728 goto out; 1729 } 1730 ret = insert_orphan_item(trans, root, ino); 1731 } 1732 1733 out: 1734 btrfs_free_path(path); 1735 return ret; 1736 } 1737 1738 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1739 struct btrfs_root *root, 1740 struct btrfs_path *path) 1741 { 1742 int ret; 1743 struct btrfs_key key; 1744 struct inode *inode; 1745 1746 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1747 key.type = BTRFS_ORPHAN_ITEM_KEY; 1748 key.offset = (u64)-1; 1749 while (1) { 1750 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1751 if (ret < 0) 1752 break; 1753 1754 if (ret == 1) { 1755 if (path->slots[0] == 0) 1756 break; 1757 path->slots[0]--; 1758 } 1759 1760 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1761 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1762 key.type != BTRFS_ORPHAN_ITEM_KEY) 1763 break; 1764 1765 ret = btrfs_del_item(trans, root, path); 1766 if (ret) 1767 goto out; 1768 1769 btrfs_release_path(path); 1770 inode = read_one_inode(root, key.offset); 1771 if (!inode) 1772 return -EIO; 1773 1774 ret = fixup_inode_link_count(trans, root, inode); 1775 iput(inode); 1776 if (ret) 1777 goto out; 1778 1779 /* 1780 * fixup on a directory may create new entries, 1781 * make sure we always look for the highset possible 1782 * offset 1783 */ 1784 key.offset = (u64)-1; 1785 } 1786 ret = 0; 1787 out: 1788 btrfs_release_path(path); 1789 return ret; 1790 } 1791 1792 1793 /* 1794 * record a given inode in the fixup dir so we can check its link 1795 * count when replay is done. The link count is incremented here 1796 * so the inode won't go away until we check it 1797 */ 1798 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1799 struct btrfs_root *root, 1800 struct btrfs_path *path, 1801 u64 objectid) 1802 { 1803 struct btrfs_key key; 1804 int ret = 0; 1805 struct inode *inode; 1806 1807 inode = read_one_inode(root, objectid); 1808 if (!inode) 1809 return -EIO; 1810 1811 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1812 key.type = BTRFS_ORPHAN_ITEM_KEY; 1813 key.offset = objectid; 1814 1815 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1816 1817 btrfs_release_path(path); 1818 if (ret == 0) { 1819 if (!inode->i_nlink) 1820 set_nlink(inode, 1); 1821 else 1822 inc_nlink(inode); 1823 ret = btrfs_update_inode(trans, root, inode); 1824 } else if (ret == -EEXIST) { 1825 ret = 0; 1826 } else { 1827 BUG(); /* Logic Error */ 1828 } 1829 iput(inode); 1830 1831 return ret; 1832 } 1833 1834 /* 1835 * when replaying the log for a directory, we only insert names 1836 * for inodes that actually exist. This means an fsync on a directory 1837 * does not implicitly fsync all the new files in it 1838 */ 1839 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1840 struct btrfs_root *root, 1841 u64 dirid, u64 index, 1842 char *name, int name_len, 1843 struct btrfs_key *location) 1844 { 1845 struct inode *inode; 1846 struct inode *dir; 1847 int ret; 1848 1849 inode = read_one_inode(root, location->objectid); 1850 if (!inode) 1851 return -ENOENT; 1852 1853 dir = read_one_inode(root, dirid); 1854 if (!dir) { 1855 iput(inode); 1856 return -EIO; 1857 } 1858 1859 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 1860 name_len, 1, index); 1861 1862 /* FIXME, put inode into FIXUP list */ 1863 1864 iput(inode); 1865 iput(dir); 1866 return ret; 1867 } 1868 1869 /* 1870 * take a single entry in a log directory item and replay it into 1871 * the subvolume. 1872 * 1873 * if a conflicting item exists in the subdirectory already, 1874 * the inode it points to is unlinked and put into the link count 1875 * fix up tree. 1876 * 1877 * If a name from the log points to a file or directory that does 1878 * not exist in the FS, it is skipped. fsyncs on directories 1879 * do not force down inodes inside that directory, just changes to the 1880 * names or unlinks in a directory. 1881 * 1882 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1883 * non-existing inode) and 1 if the name was replayed. 1884 */ 1885 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1886 struct btrfs_root *root, 1887 struct btrfs_path *path, 1888 struct extent_buffer *eb, 1889 struct btrfs_dir_item *di, 1890 struct btrfs_key *key) 1891 { 1892 char *name; 1893 int name_len; 1894 struct btrfs_dir_item *dst_di; 1895 struct btrfs_key found_key; 1896 struct btrfs_key log_key; 1897 struct inode *dir; 1898 u8 log_type; 1899 int exists; 1900 int ret = 0; 1901 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); 1902 bool name_added = false; 1903 1904 dir = read_one_inode(root, key->objectid); 1905 if (!dir) 1906 return -EIO; 1907 1908 name_len = btrfs_dir_name_len(eb, di); 1909 name = kmalloc(name_len, GFP_NOFS); 1910 if (!name) { 1911 ret = -ENOMEM; 1912 goto out; 1913 } 1914 1915 log_type = btrfs_dir_type(eb, di); 1916 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1917 name_len); 1918 1919 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1920 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1921 if (exists == 0) 1922 exists = 1; 1923 else 1924 exists = 0; 1925 btrfs_release_path(path); 1926 1927 if (key->type == BTRFS_DIR_ITEM_KEY) { 1928 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1929 name, name_len, 1); 1930 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1931 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1932 key->objectid, 1933 key->offset, name, 1934 name_len, 1); 1935 } else { 1936 /* Corruption */ 1937 ret = -EINVAL; 1938 goto out; 1939 } 1940 if (IS_ERR_OR_NULL(dst_di)) { 1941 /* we need a sequence number to insert, so we only 1942 * do inserts for the BTRFS_DIR_INDEX_KEY types 1943 */ 1944 if (key->type != BTRFS_DIR_INDEX_KEY) 1945 goto out; 1946 goto insert; 1947 } 1948 1949 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1950 /* the existing item matches the logged item */ 1951 if (found_key.objectid == log_key.objectid && 1952 found_key.type == log_key.type && 1953 found_key.offset == log_key.offset && 1954 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1955 update_size = false; 1956 goto out; 1957 } 1958 1959 /* 1960 * don't drop the conflicting directory entry if the inode 1961 * for the new entry doesn't exist 1962 */ 1963 if (!exists) 1964 goto out; 1965 1966 ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di); 1967 if (ret) 1968 goto out; 1969 1970 if (key->type == BTRFS_DIR_INDEX_KEY) 1971 goto insert; 1972 out: 1973 btrfs_release_path(path); 1974 if (!ret && update_size) { 1975 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); 1976 ret = btrfs_update_inode(trans, root, dir); 1977 } 1978 kfree(name); 1979 iput(dir); 1980 if (!ret && name_added) 1981 ret = 1; 1982 return ret; 1983 1984 insert: 1985 /* 1986 * Check if the inode reference exists in the log for the given name, 1987 * inode and parent inode 1988 */ 1989 found_key.objectid = log_key.objectid; 1990 found_key.type = BTRFS_INODE_REF_KEY; 1991 found_key.offset = key->objectid; 1992 ret = backref_in_log(root->log_root, &found_key, 0, name, name_len); 1993 if (ret < 0) { 1994 goto out; 1995 } else if (ret) { 1996 /* The dentry will be added later. */ 1997 ret = 0; 1998 update_size = false; 1999 goto out; 2000 } 2001 2002 found_key.objectid = log_key.objectid; 2003 found_key.type = BTRFS_INODE_EXTREF_KEY; 2004 found_key.offset = key->objectid; 2005 ret = backref_in_log(root->log_root, &found_key, key->objectid, name, 2006 name_len); 2007 if (ret < 0) { 2008 goto out; 2009 } else if (ret) { 2010 /* The dentry will be added later. */ 2011 ret = 0; 2012 update_size = false; 2013 goto out; 2014 } 2015 btrfs_release_path(path); 2016 ret = insert_one_name(trans, root, key->objectid, key->offset, 2017 name, name_len, &log_key); 2018 if (ret && ret != -ENOENT && ret != -EEXIST) 2019 goto out; 2020 if (!ret) 2021 name_added = true; 2022 update_size = false; 2023 ret = 0; 2024 goto out; 2025 } 2026 2027 /* 2028 * find all the names in a directory item and reconcile them into 2029 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 2030 * one name in a directory item, but the same code gets used for 2031 * both directory index types 2032 */ 2033 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 2034 struct btrfs_root *root, 2035 struct btrfs_path *path, 2036 struct extent_buffer *eb, int slot, 2037 struct btrfs_key *key) 2038 { 2039 int ret = 0; 2040 u32 item_size = btrfs_item_size_nr(eb, slot); 2041 struct btrfs_dir_item *di; 2042 int name_len; 2043 unsigned long ptr; 2044 unsigned long ptr_end; 2045 struct btrfs_path *fixup_path = NULL; 2046 2047 ptr = btrfs_item_ptr_offset(eb, slot); 2048 ptr_end = ptr + item_size; 2049 while (ptr < ptr_end) { 2050 di = (struct btrfs_dir_item *)ptr; 2051 name_len = btrfs_dir_name_len(eb, di); 2052 ret = replay_one_name(trans, root, path, eb, di, key); 2053 if (ret < 0) 2054 break; 2055 ptr = (unsigned long)(di + 1); 2056 ptr += name_len; 2057 2058 /* 2059 * If this entry refers to a non-directory (directories can not 2060 * have a link count > 1) and it was added in the transaction 2061 * that was not committed, make sure we fixup the link count of 2062 * the inode it the entry points to. Otherwise something like 2063 * the following would result in a directory pointing to an 2064 * inode with a wrong link that does not account for this dir 2065 * entry: 2066 * 2067 * mkdir testdir 2068 * touch testdir/foo 2069 * touch testdir/bar 2070 * sync 2071 * 2072 * ln testdir/bar testdir/bar_link 2073 * ln testdir/foo testdir/foo_link 2074 * xfs_io -c "fsync" testdir/bar 2075 * 2076 * <power failure> 2077 * 2078 * mount fs, log replay happens 2079 * 2080 * File foo would remain with a link count of 1 when it has two 2081 * entries pointing to it in the directory testdir. This would 2082 * make it impossible to ever delete the parent directory has 2083 * it would result in stale dentries that can never be deleted. 2084 */ 2085 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 2086 struct btrfs_key di_key; 2087 2088 if (!fixup_path) { 2089 fixup_path = btrfs_alloc_path(); 2090 if (!fixup_path) { 2091 ret = -ENOMEM; 2092 break; 2093 } 2094 } 2095 2096 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2097 ret = link_to_fixup_dir(trans, root, fixup_path, 2098 di_key.objectid); 2099 if (ret) 2100 break; 2101 } 2102 ret = 0; 2103 } 2104 btrfs_free_path(fixup_path); 2105 return ret; 2106 } 2107 2108 /* 2109 * directory replay has two parts. There are the standard directory 2110 * items in the log copied from the subvolume, and range items 2111 * created in the log while the subvolume was logged. 2112 * 2113 * The range items tell us which parts of the key space the log 2114 * is authoritative for. During replay, if a key in the subvolume 2115 * directory is in a logged range item, but not actually in the log 2116 * that means it was deleted from the directory before the fsync 2117 * and should be removed. 2118 */ 2119 static noinline int find_dir_range(struct btrfs_root *root, 2120 struct btrfs_path *path, 2121 u64 dirid, int key_type, 2122 u64 *start_ret, u64 *end_ret) 2123 { 2124 struct btrfs_key key; 2125 u64 found_end; 2126 struct btrfs_dir_log_item *item; 2127 int ret; 2128 int nritems; 2129 2130 if (*start_ret == (u64)-1) 2131 return 1; 2132 2133 key.objectid = dirid; 2134 key.type = key_type; 2135 key.offset = *start_ret; 2136 2137 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2138 if (ret < 0) 2139 goto out; 2140 if (ret > 0) { 2141 if (path->slots[0] == 0) 2142 goto out; 2143 path->slots[0]--; 2144 } 2145 if (ret != 0) 2146 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2147 2148 if (key.type != key_type || key.objectid != dirid) { 2149 ret = 1; 2150 goto next; 2151 } 2152 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2153 struct btrfs_dir_log_item); 2154 found_end = btrfs_dir_log_end(path->nodes[0], item); 2155 2156 if (*start_ret >= key.offset && *start_ret <= found_end) { 2157 ret = 0; 2158 *start_ret = key.offset; 2159 *end_ret = found_end; 2160 goto out; 2161 } 2162 ret = 1; 2163 next: 2164 /* check the next slot in the tree to see if it is a valid item */ 2165 nritems = btrfs_header_nritems(path->nodes[0]); 2166 path->slots[0]++; 2167 if (path->slots[0] >= nritems) { 2168 ret = btrfs_next_leaf(root, path); 2169 if (ret) 2170 goto out; 2171 } 2172 2173 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2174 2175 if (key.type != key_type || key.objectid != dirid) { 2176 ret = 1; 2177 goto out; 2178 } 2179 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2180 struct btrfs_dir_log_item); 2181 found_end = btrfs_dir_log_end(path->nodes[0], item); 2182 *start_ret = key.offset; 2183 *end_ret = found_end; 2184 ret = 0; 2185 out: 2186 btrfs_release_path(path); 2187 return ret; 2188 } 2189 2190 /* 2191 * this looks for a given directory item in the log. If the directory 2192 * item is not in the log, the item is removed and the inode it points 2193 * to is unlinked 2194 */ 2195 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2196 struct btrfs_root *root, 2197 struct btrfs_root *log, 2198 struct btrfs_path *path, 2199 struct btrfs_path *log_path, 2200 struct inode *dir, 2201 struct btrfs_key *dir_key) 2202 { 2203 int ret; 2204 struct extent_buffer *eb; 2205 int slot; 2206 u32 item_size; 2207 struct btrfs_dir_item *di; 2208 struct btrfs_dir_item *log_di; 2209 int name_len; 2210 unsigned long ptr; 2211 unsigned long ptr_end; 2212 char *name; 2213 struct inode *inode; 2214 struct btrfs_key location; 2215 2216 again: 2217 eb = path->nodes[0]; 2218 slot = path->slots[0]; 2219 item_size = btrfs_item_size_nr(eb, slot); 2220 ptr = btrfs_item_ptr_offset(eb, slot); 2221 ptr_end = ptr + item_size; 2222 while (ptr < ptr_end) { 2223 di = (struct btrfs_dir_item *)ptr; 2224 name_len = btrfs_dir_name_len(eb, di); 2225 name = kmalloc(name_len, GFP_NOFS); 2226 if (!name) { 2227 ret = -ENOMEM; 2228 goto out; 2229 } 2230 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2231 name_len); 2232 log_di = NULL; 2233 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2234 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2235 dir_key->objectid, 2236 name, name_len, 0); 2237 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2238 log_di = btrfs_lookup_dir_index_item(trans, log, 2239 log_path, 2240 dir_key->objectid, 2241 dir_key->offset, 2242 name, name_len, 0); 2243 } 2244 if (!log_di || log_di == ERR_PTR(-ENOENT)) { 2245 btrfs_dir_item_key_to_cpu(eb, di, &location); 2246 btrfs_release_path(path); 2247 btrfs_release_path(log_path); 2248 inode = read_one_inode(root, location.objectid); 2249 if (!inode) { 2250 kfree(name); 2251 return -EIO; 2252 } 2253 2254 ret = link_to_fixup_dir(trans, root, 2255 path, location.objectid); 2256 if (ret) { 2257 kfree(name); 2258 iput(inode); 2259 goto out; 2260 } 2261 2262 inc_nlink(inode); 2263 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2264 BTRFS_I(inode), name, name_len); 2265 if (!ret) 2266 ret = btrfs_run_delayed_items(trans); 2267 kfree(name); 2268 iput(inode); 2269 if (ret) 2270 goto out; 2271 2272 /* there might still be more names under this key 2273 * check and repeat if required 2274 */ 2275 ret = btrfs_search_slot(NULL, root, dir_key, path, 2276 0, 0); 2277 if (ret == 0) 2278 goto again; 2279 ret = 0; 2280 goto out; 2281 } else if (IS_ERR(log_di)) { 2282 kfree(name); 2283 return PTR_ERR(log_di); 2284 } 2285 btrfs_release_path(log_path); 2286 kfree(name); 2287 2288 ptr = (unsigned long)(di + 1); 2289 ptr += name_len; 2290 } 2291 ret = 0; 2292 out: 2293 btrfs_release_path(path); 2294 btrfs_release_path(log_path); 2295 return ret; 2296 } 2297 2298 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2299 struct btrfs_root *root, 2300 struct btrfs_root *log, 2301 struct btrfs_path *path, 2302 const u64 ino) 2303 { 2304 struct btrfs_key search_key; 2305 struct btrfs_path *log_path; 2306 int i; 2307 int nritems; 2308 int ret; 2309 2310 log_path = btrfs_alloc_path(); 2311 if (!log_path) 2312 return -ENOMEM; 2313 2314 search_key.objectid = ino; 2315 search_key.type = BTRFS_XATTR_ITEM_KEY; 2316 search_key.offset = 0; 2317 again: 2318 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2319 if (ret < 0) 2320 goto out; 2321 process_leaf: 2322 nritems = btrfs_header_nritems(path->nodes[0]); 2323 for (i = path->slots[0]; i < nritems; i++) { 2324 struct btrfs_key key; 2325 struct btrfs_dir_item *di; 2326 struct btrfs_dir_item *log_di; 2327 u32 total_size; 2328 u32 cur; 2329 2330 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2331 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2332 ret = 0; 2333 goto out; 2334 } 2335 2336 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2337 total_size = btrfs_item_size_nr(path->nodes[0], i); 2338 cur = 0; 2339 while (cur < total_size) { 2340 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2341 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2342 u32 this_len = sizeof(*di) + name_len + data_len; 2343 char *name; 2344 2345 name = kmalloc(name_len, GFP_NOFS); 2346 if (!name) { 2347 ret = -ENOMEM; 2348 goto out; 2349 } 2350 read_extent_buffer(path->nodes[0], name, 2351 (unsigned long)(di + 1), name_len); 2352 2353 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2354 name, name_len, 0); 2355 btrfs_release_path(log_path); 2356 if (!log_di) { 2357 /* Doesn't exist in log tree, so delete it. */ 2358 btrfs_release_path(path); 2359 di = btrfs_lookup_xattr(trans, root, path, ino, 2360 name, name_len, -1); 2361 kfree(name); 2362 if (IS_ERR(di)) { 2363 ret = PTR_ERR(di); 2364 goto out; 2365 } 2366 ASSERT(di); 2367 ret = btrfs_delete_one_dir_name(trans, root, 2368 path, di); 2369 if (ret) 2370 goto out; 2371 btrfs_release_path(path); 2372 search_key = key; 2373 goto again; 2374 } 2375 kfree(name); 2376 if (IS_ERR(log_di)) { 2377 ret = PTR_ERR(log_di); 2378 goto out; 2379 } 2380 cur += this_len; 2381 di = (struct btrfs_dir_item *)((char *)di + this_len); 2382 } 2383 } 2384 ret = btrfs_next_leaf(root, path); 2385 if (ret > 0) 2386 ret = 0; 2387 else if (ret == 0) 2388 goto process_leaf; 2389 out: 2390 btrfs_free_path(log_path); 2391 btrfs_release_path(path); 2392 return ret; 2393 } 2394 2395 2396 /* 2397 * deletion replay happens before we copy any new directory items 2398 * out of the log or out of backreferences from inodes. It 2399 * scans the log to find ranges of keys that log is authoritative for, 2400 * and then scans the directory to find items in those ranges that are 2401 * not present in the log. 2402 * 2403 * Anything we don't find in the log is unlinked and removed from the 2404 * directory. 2405 */ 2406 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2407 struct btrfs_root *root, 2408 struct btrfs_root *log, 2409 struct btrfs_path *path, 2410 u64 dirid, int del_all) 2411 { 2412 u64 range_start; 2413 u64 range_end; 2414 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2415 int ret = 0; 2416 struct btrfs_key dir_key; 2417 struct btrfs_key found_key; 2418 struct btrfs_path *log_path; 2419 struct inode *dir; 2420 2421 dir_key.objectid = dirid; 2422 dir_key.type = BTRFS_DIR_ITEM_KEY; 2423 log_path = btrfs_alloc_path(); 2424 if (!log_path) 2425 return -ENOMEM; 2426 2427 dir = read_one_inode(root, dirid); 2428 /* it isn't an error if the inode isn't there, that can happen 2429 * because we replay the deletes before we copy in the inode item 2430 * from the log 2431 */ 2432 if (!dir) { 2433 btrfs_free_path(log_path); 2434 return 0; 2435 } 2436 again: 2437 range_start = 0; 2438 range_end = 0; 2439 while (1) { 2440 if (del_all) 2441 range_end = (u64)-1; 2442 else { 2443 ret = find_dir_range(log, path, dirid, key_type, 2444 &range_start, &range_end); 2445 if (ret != 0) 2446 break; 2447 } 2448 2449 dir_key.offset = range_start; 2450 while (1) { 2451 int nritems; 2452 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2453 0, 0); 2454 if (ret < 0) 2455 goto out; 2456 2457 nritems = btrfs_header_nritems(path->nodes[0]); 2458 if (path->slots[0] >= nritems) { 2459 ret = btrfs_next_leaf(root, path); 2460 if (ret == 1) 2461 break; 2462 else if (ret < 0) 2463 goto out; 2464 } 2465 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2466 path->slots[0]); 2467 if (found_key.objectid != dirid || 2468 found_key.type != dir_key.type) 2469 goto next_type; 2470 2471 if (found_key.offset > range_end) 2472 break; 2473 2474 ret = check_item_in_log(trans, root, log, path, 2475 log_path, dir, 2476 &found_key); 2477 if (ret) 2478 goto out; 2479 if (found_key.offset == (u64)-1) 2480 break; 2481 dir_key.offset = found_key.offset + 1; 2482 } 2483 btrfs_release_path(path); 2484 if (range_end == (u64)-1) 2485 break; 2486 range_start = range_end + 1; 2487 } 2488 2489 next_type: 2490 ret = 0; 2491 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2492 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2493 dir_key.type = BTRFS_DIR_INDEX_KEY; 2494 btrfs_release_path(path); 2495 goto again; 2496 } 2497 out: 2498 btrfs_release_path(path); 2499 btrfs_free_path(log_path); 2500 iput(dir); 2501 return ret; 2502 } 2503 2504 /* 2505 * the process_func used to replay items from the log tree. This 2506 * gets called in two different stages. The first stage just looks 2507 * for inodes and makes sure they are all copied into the subvolume. 2508 * 2509 * The second stage copies all the other item types from the log into 2510 * the subvolume. The two stage approach is slower, but gets rid of 2511 * lots of complexity around inodes referencing other inodes that exist 2512 * only in the log (references come from either directory items or inode 2513 * back refs). 2514 */ 2515 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2516 struct walk_control *wc, u64 gen, int level) 2517 { 2518 int nritems; 2519 struct btrfs_path *path; 2520 struct btrfs_root *root = wc->replay_dest; 2521 struct btrfs_key key; 2522 int i; 2523 int ret; 2524 2525 ret = btrfs_read_buffer(eb, gen, level, NULL); 2526 if (ret) 2527 return ret; 2528 2529 level = btrfs_header_level(eb); 2530 2531 if (level != 0) 2532 return 0; 2533 2534 path = btrfs_alloc_path(); 2535 if (!path) 2536 return -ENOMEM; 2537 2538 nritems = btrfs_header_nritems(eb); 2539 for (i = 0; i < nritems; i++) { 2540 btrfs_item_key_to_cpu(eb, &key, i); 2541 2542 /* inode keys are done during the first stage */ 2543 if (key.type == BTRFS_INODE_ITEM_KEY && 2544 wc->stage == LOG_WALK_REPLAY_INODES) { 2545 struct btrfs_inode_item *inode_item; 2546 u32 mode; 2547 2548 inode_item = btrfs_item_ptr(eb, i, 2549 struct btrfs_inode_item); 2550 /* 2551 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2552 * and never got linked before the fsync, skip it, as 2553 * replaying it is pointless since it would be deleted 2554 * later. We skip logging tmpfiles, but it's always 2555 * possible we are replaying a log created with a kernel 2556 * that used to log tmpfiles. 2557 */ 2558 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2559 wc->ignore_cur_inode = true; 2560 continue; 2561 } else { 2562 wc->ignore_cur_inode = false; 2563 } 2564 ret = replay_xattr_deletes(wc->trans, root, log, 2565 path, key.objectid); 2566 if (ret) 2567 break; 2568 mode = btrfs_inode_mode(eb, inode_item); 2569 if (S_ISDIR(mode)) { 2570 ret = replay_dir_deletes(wc->trans, 2571 root, log, path, key.objectid, 0); 2572 if (ret) 2573 break; 2574 } 2575 ret = overwrite_item(wc->trans, root, path, 2576 eb, i, &key); 2577 if (ret) 2578 break; 2579 2580 /* 2581 * Before replaying extents, truncate the inode to its 2582 * size. We need to do it now and not after log replay 2583 * because before an fsync we can have prealloc extents 2584 * added beyond the inode's i_size. If we did it after, 2585 * through orphan cleanup for example, we would drop 2586 * those prealloc extents just after replaying them. 2587 */ 2588 if (S_ISREG(mode)) { 2589 struct inode *inode; 2590 u64 from; 2591 2592 inode = read_one_inode(root, key.objectid); 2593 if (!inode) { 2594 ret = -EIO; 2595 break; 2596 } 2597 from = ALIGN(i_size_read(inode), 2598 root->fs_info->sectorsize); 2599 ret = btrfs_drop_extents(wc->trans, root, inode, 2600 from, (u64)-1, 1); 2601 if (!ret) { 2602 /* Update the inode's nbytes. */ 2603 ret = btrfs_update_inode(wc->trans, 2604 root, inode); 2605 } 2606 iput(inode); 2607 if (ret) 2608 break; 2609 } 2610 2611 ret = link_to_fixup_dir(wc->trans, root, 2612 path, key.objectid); 2613 if (ret) 2614 break; 2615 } 2616 2617 if (wc->ignore_cur_inode) 2618 continue; 2619 2620 if (key.type == BTRFS_DIR_INDEX_KEY && 2621 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2622 ret = replay_one_dir_item(wc->trans, root, path, 2623 eb, i, &key); 2624 if (ret) 2625 break; 2626 } 2627 2628 if (wc->stage < LOG_WALK_REPLAY_ALL) 2629 continue; 2630 2631 /* these keys are simply copied */ 2632 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2633 ret = overwrite_item(wc->trans, root, path, 2634 eb, i, &key); 2635 if (ret) 2636 break; 2637 } else if (key.type == BTRFS_INODE_REF_KEY || 2638 key.type == BTRFS_INODE_EXTREF_KEY) { 2639 ret = add_inode_ref(wc->trans, root, log, path, 2640 eb, i, &key); 2641 if (ret && ret != -ENOENT) 2642 break; 2643 ret = 0; 2644 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2645 ret = replay_one_extent(wc->trans, root, path, 2646 eb, i, &key); 2647 if (ret) 2648 break; 2649 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2650 ret = replay_one_dir_item(wc->trans, root, path, 2651 eb, i, &key); 2652 if (ret) 2653 break; 2654 } 2655 } 2656 btrfs_free_path(path); 2657 return ret; 2658 } 2659 2660 /* 2661 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2662 */ 2663 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2664 { 2665 struct btrfs_block_group *cache; 2666 2667 cache = btrfs_lookup_block_group(fs_info, start); 2668 if (!cache) { 2669 btrfs_err(fs_info, "unable to find block group for %llu", start); 2670 return; 2671 } 2672 2673 spin_lock(&cache->space_info->lock); 2674 spin_lock(&cache->lock); 2675 cache->reserved -= fs_info->nodesize; 2676 cache->space_info->bytes_reserved -= fs_info->nodesize; 2677 spin_unlock(&cache->lock); 2678 spin_unlock(&cache->space_info->lock); 2679 2680 btrfs_put_block_group(cache); 2681 } 2682 2683 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2684 struct btrfs_root *root, 2685 struct btrfs_path *path, int *level, 2686 struct walk_control *wc) 2687 { 2688 struct btrfs_fs_info *fs_info = root->fs_info; 2689 u64 bytenr; 2690 u64 ptr_gen; 2691 struct extent_buffer *next; 2692 struct extent_buffer *cur; 2693 u32 blocksize; 2694 int ret = 0; 2695 2696 while (*level > 0) { 2697 struct btrfs_key first_key; 2698 2699 cur = path->nodes[*level]; 2700 2701 WARN_ON(btrfs_header_level(cur) != *level); 2702 2703 if (path->slots[*level] >= 2704 btrfs_header_nritems(cur)) 2705 break; 2706 2707 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2708 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2709 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2710 blocksize = fs_info->nodesize; 2711 2712 next = btrfs_find_create_tree_block(fs_info, bytenr); 2713 if (IS_ERR(next)) 2714 return PTR_ERR(next); 2715 2716 if (*level == 1) { 2717 ret = wc->process_func(root, next, wc, ptr_gen, 2718 *level - 1); 2719 if (ret) { 2720 free_extent_buffer(next); 2721 return ret; 2722 } 2723 2724 path->slots[*level]++; 2725 if (wc->free) { 2726 ret = btrfs_read_buffer(next, ptr_gen, 2727 *level - 1, &first_key); 2728 if (ret) { 2729 free_extent_buffer(next); 2730 return ret; 2731 } 2732 2733 if (trans) { 2734 btrfs_tree_lock(next); 2735 btrfs_set_lock_blocking_write(next); 2736 btrfs_clean_tree_block(next); 2737 btrfs_wait_tree_block_writeback(next); 2738 btrfs_tree_unlock(next); 2739 ret = btrfs_pin_reserved_extent(trans, 2740 bytenr, blocksize); 2741 if (ret) { 2742 free_extent_buffer(next); 2743 return ret; 2744 } 2745 } else { 2746 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2747 clear_extent_buffer_dirty(next); 2748 unaccount_log_buffer(fs_info, bytenr); 2749 } 2750 } 2751 free_extent_buffer(next); 2752 continue; 2753 } 2754 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2755 if (ret) { 2756 free_extent_buffer(next); 2757 return ret; 2758 } 2759 2760 if (path->nodes[*level-1]) 2761 free_extent_buffer(path->nodes[*level-1]); 2762 path->nodes[*level-1] = next; 2763 *level = btrfs_header_level(next); 2764 path->slots[*level] = 0; 2765 cond_resched(); 2766 } 2767 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2768 2769 cond_resched(); 2770 return 0; 2771 } 2772 2773 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2774 struct btrfs_root *root, 2775 struct btrfs_path *path, int *level, 2776 struct walk_control *wc) 2777 { 2778 struct btrfs_fs_info *fs_info = root->fs_info; 2779 int i; 2780 int slot; 2781 int ret; 2782 2783 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2784 slot = path->slots[i]; 2785 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2786 path->slots[i]++; 2787 *level = i; 2788 WARN_ON(*level == 0); 2789 return 0; 2790 } else { 2791 ret = wc->process_func(root, path->nodes[*level], wc, 2792 btrfs_header_generation(path->nodes[*level]), 2793 *level); 2794 if (ret) 2795 return ret; 2796 2797 if (wc->free) { 2798 struct extent_buffer *next; 2799 2800 next = path->nodes[*level]; 2801 2802 if (trans) { 2803 btrfs_tree_lock(next); 2804 btrfs_set_lock_blocking_write(next); 2805 btrfs_clean_tree_block(next); 2806 btrfs_wait_tree_block_writeback(next); 2807 btrfs_tree_unlock(next); 2808 ret = btrfs_pin_reserved_extent(trans, 2809 path->nodes[*level]->start, 2810 path->nodes[*level]->len); 2811 if (ret) 2812 return ret; 2813 } else { 2814 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2815 clear_extent_buffer_dirty(next); 2816 2817 unaccount_log_buffer(fs_info, 2818 path->nodes[*level]->start); 2819 } 2820 } 2821 free_extent_buffer(path->nodes[*level]); 2822 path->nodes[*level] = NULL; 2823 *level = i + 1; 2824 } 2825 } 2826 return 1; 2827 } 2828 2829 /* 2830 * drop the reference count on the tree rooted at 'snap'. This traverses 2831 * the tree freeing any blocks that have a ref count of zero after being 2832 * decremented. 2833 */ 2834 static int walk_log_tree(struct btrfs_trans_handle *trans, 2835 struct btrfs_root *log, struct walk_control *wc) 2836 { 2837 struct btrfs_fs_info *fs_info = log->fs_info; 2838 int ret = 0; 2839 int wret; 2840 int level; 2841 struct btrfs_path *path; 2842 int orig_level; 2843 2844 path = btrfs_alloc_path(); 2845 if (!path) 2846 return -ENOMEM; 2847 2848 level = btrfs_header_level(log->node); 2849 orig_level = level; 2850 path->nodes[level] = log->node; 2851 atomic_inc(&log->node->refs); 2852 path->slots[level] = 0; 2853 2854 while (1) { 2855 wret = walk_down_log_tree(trans, log, path, &level, wc); 2856 if (wret > 0) 2857 break; 2858 if (wret < 0) { 2859 ret = wret; 2860 goto out; 2861 } 2862 2863 wret = walk_up_log_tree(trans, log, path, &level, wc); 2864 if (wret > 0) 2865 break; 2866 if (wret < 0) { 2867 ret = wret; 2868 goto out; 2869 } 2870 } 2871 2872 /* was the root node processed? if not, catch it here */ 2873 if (path->nodes[orig_level]) { 2874 ret = wc->process_func(log, path->nodes[orig_level], wc, 2875 btrfs_header_generation(path->nodes[orig_level]), 2876 orig_level); 2877 if (ret) 2878 goto out; 2879 if (wc->free) { 2880 struct extent_buffer *next; 2881 2882 next = path->nodes[orig_level]; 2883 2884 if (trans) { 2885 btrfs_tree_lock(next); 2886 btrfs_set_lock_blocking_write(next); 2887 btrfs_clean_tree_block(next); 2888 btrfs_wait_tree_block_writeback(next); 2889 btrfs_tree_unlock(next); 2890 ret = btrfs_pin_reserved_extent(trans, 2891 next->start, next->len); 2892 if (ret) 2893 goto out; 2894 } else { 2895 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2896 clear_extent_buffer_dirty(next); 2897 unaccount_log_buffer(fs_info, next->start); 2898 } 2899 } 2900 } 2901 2902 out: 2903 btrfs_free_path(path); 2904 return ret; 2905 } 2906 2907 /* 2908 * helper function to update the item for a given subvolumes log root 2909 * in the tree of log roots 2910 */ 2911 static int update_log_root(struct btrfs_trans_handle *trans, 2912 struct btrfs_root *log, 2913 struct btrfs_root_item *root_item) 2914 { 2915 struct btrfs_fs_info *fs_info = log->fs_info; 2916 int ret; 2917 2918 if (log->log_transid == 1) { 2919 /* insert root item on the first sync */ 2920 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2921 &log->root_key, root_item); 2922 } else { 2923 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2924 &log->root_key, root_item); 2925 } 2926 return ret; 2927 } 2928 2929 static void wait_log_commit(struct btrfs_root *root, int transid) 2930 { 2931 DEFINE_WAIT(wait); 2932 int index = transid % 2; 2933 2934 /* 2935 * we only allow two pending log transactions at a time, 2936 * so we know that if ours is more than 2 older than the 2937 * current transaction, we're done 2938 */ 2939 for (;;) { 2940 prepare_to_wait(&root->log_commit_wait[index], 2941 &wait, TASK_UNINTERRUPTIBLE); 2942 2943 if (!(root->log_transid_committed < transid && 2944 atomic_read(&root->log_commit[index]))) 2945 break; 2946 2947 mutex_unlock(&root->log_mutex); 2948 schedule(); 2949 mutex_lock(&root->log_mutex); 2950 } 2951 finish_wait(&root->log_commit_wait[index], &wait); 2952 } 2953 2954 static void wait_for_writer(struct btrfs_root *root) 2955 { 2956 DEFINE_WAIT(wait); 2957 2958 for (;;) { 2959 prepare_to_wait(&root->log_writer_wait, &wait, 2960 TASK_UNINTERRUPTIBLE); 2961 if (!atomic_read(&root->log_writers)) 2962 break; 2963 2964 mutex_unlock(&root->log_mutex); 2965 schedule(); 2966 mutex_lock(&root->log_mutex); 2967 } 2968 finish_wait(&root->log_writer_wait, &wait); 2969 } 2970 2971 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2972 struct btrfs_log_ctx *ctx) 2973 { 2974 if (!ctx) 2975 return; 2976 2977 mutex_lock(&root->log_mutex); 2978 list_del_init(&ctx->list); 2979 mutex_unlock(&root->log_mutex); 2980 } 2981 2982 /* 2983 * Invoked in log mutex context, or be sure there is no other task which 2984 * can access the list. 2985 */ 2986 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2987 int index, int error) 2988 { 2989 struct btrfs_log_ctx *ctx; 2990 struct btrfs_log_ctx *safe; 2991 2992 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2993 list_del_init(&ctx->list); 2994 ctx->log_ret = error; 2995 } 2996 2997 INIT_LIST_HEAD(&root->log_ctxs[index]); 2998 } 2999 3000 /* 3001 * btrfs_sync_log does sends a given tree log down to the disk and 3002 * updates the super blocks to record it. When this call is done, 3003 * you know that any inodes previously logged are safely on disk only 3004 * if it returns 0. 3005 * 3006 * Any other return value means you need to call btrfs_commit_transaction. 3007 * Some of the edge cases for fsyncing directories that have had unlinks 3008 * or renames done in the past mean that sometimes the only safe 3009 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3010 * that has happened. 3011 */ 3012 int btrfs_sync_log(struct btrfs_trans_handle *trans, 3013 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3014 { 3015 int index1; 3016 int index2; 3017 int mark; 3018 int ret; 3019 struct btrfs_fs_info *fs_info = root->fs_info; 3020 struct btrfs_root *log = root->log_root; 3021 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3022 struct btrfs_root_item new_root_item; 3023 int log_transid = 0; 3024 struct btrfs_log_ctx root_log_ctx; 3025 struct blk_plug plug; 3026 3027 mutex_lock(&root->log_mutex); 3028 log_transid = ctx->log_transid; 3029 if (root->log_transid_committed >= log_transid) { 3030 mutex_unlock(&root->log_mutex); 3031 return ctx->log_ret; 3032 } 3033 3034 index1 = log_transid % 2; 3035 if (atomic_read(&root->log_commit[index1])) { 3036 wait_log_commit(root, log_transid); 3037 mutex_unlock(&root->log_mutex); 3038 return ctx->log_ret; 3039 } 3040 ASSERT(log_transid == root->log_transid); 3041 atomic_set(&root->log_commit[index1], 1); 3042 3043 /* wait for previous tree log sync to complete */ 3044 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3045 wait_log_commit(root, log_transid - 1); 3046 3047 while (1) { 3048 int batch = atomic_read(&root->log_batch); 3049 /* when we're on an ssd, just kick the log commit out */ 3050 if (!btrfs_test_opt(fs_info, SSD) && 3051 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3052 mutex_unlock(&root->log_mutex); 3053 schedule_timeout_uninterruptible(1); 3054 mutex_lock(&root->log_mutex); 3055 } 3056 wait_for_writer(root); 3057 if (batch == atomic_read(&root->log_batch)) 3058 break; 3059 } 3060 3061 /* bail out if we need to do a full commit */ 3062 if (btrfs_need_log_full_commit(trans)) { 3063 ret = -EAGAIN; 3064 mutex_unlock(&root->log_mutex); 3065 goto out; 3066 } 3067 3068 if (log_transid % 2 == 0) 3069 mark = EXTENT_DIRTY; 3070 else 3071 mark = EXTENT_NEW; 3072 3073 /* we start IO on all the marked extents here, but we don't actually 3074 * wait for them until later. 3075 */ 3076 blk_start_plug(&plug); 3077 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3078 if (ret) { 3079 blk_finish_plug(&plug); 3080 btrfs_abort_transaction(trans, ret); 3081 btrfs_set_log_full_commit(trans); 3082 mutex_unlock(&root->log_mutex); 3083 goto out; 3084 } 3085 3086 /* 3087 * We _must_ update under the root->log_mutex in order to make sure we 3088 * have a consistent view of the log root we are trying to commit at 3089 * this moment. 3090 * 3091 * We _must_ copy this into a local copy, because we are not holding the 3092 * log_root_tree->log_mutex yet. This is important because when we 3093 * commit the log_root_tree we must have a consistent view of the 3094 * log_root_tree when we update the super block to point at the 3095 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3096 * with the commit and possibly point at the new block which we may not 3097 * have written out. 3098 */ 3099 btrfs_set_root_node(&log->root_item, log->node); 3100 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3101 3102 root->log_transid++; 3103 log->log_transid = root->log_transid; 3104 root->log_start_pid = 0; 3105 /* 3106 * IO has been started, blocks of the log tree have WRITTEN flag set 3107 * in their headers. new modifications of the log will be written to 3108 * new positions. so it's safe to allow log writers to go in. 3109 */ 3110 mutex_unlock(&root->log_mutex); 3111 3112 btrfs_init_log_ctx(&root_log_ctx, NULL); 3113 3114 mutex_lock(&log_root_tree->log_mutex); 3115 atomic_inc(&log_root_tree->log_batch); 3116 atomic_inc(&log_root_tree->log_writers); 3117 3118 index2 = log_root_tree->log_transid % 2; 3119 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3120 root_log_ctx.log_transid = log_root_tree->log_transid; 3121 3122 mutex_unlock(&log_root_tree->log_mutex); 3123 3124 mutex_lock(&log_root_tree->log_mutex); 3125 3126 /* 3127 * Now we are safe to update the log_root_tree because we're under the 3128 * log_mutex, and we're a current writer so we're holding the commit 3129 * open until we drop the log_mutex. 3130 */ 3131 ret = update_log_root(trans, log, &new_root_item); 3132 3133 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 3134 /* atomic_dec_and_test implies a barrier */ 3135 cond_wake_up_nomb(&log_root_tree->log_writer_wait); 3136 } 3137 3138 if (ret) { 3139 if (!list_empty(&root_log_ctx.list)) 3140 list_del_init(&root_log_ctx.list); 3141 3142 blk_finish_plug(&plug); 3143 btrfs_set_log_full_commit(trans); 3144 3145 if (ret != -ENOSPC) { 3146 btrfs_abort_transaction(trans, ret); 3147 mutex_unlock(&log_root_tree->log_mutex); 3148 goto out; 3149 } 3150 btrfs_wait_tree_log_extents(log, mark); 3151 mutex_unlock(&log_root_tree->log_mutex); 3152 ret = -EAGAIN; 3153 goto out; 3154 } 3155 3156 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3157 blk_finish_plug(&plug); 3158 list_del_init(&root_log_ctx.list); 3159 mutex_unlock(&log_root_tree->log_mutex); 3160 ret = root_log_ctx.log_ret; 3161 goto out; 3162 } 3163 3164 index2 = root_log_ctx.log_transid % 2; 3165 if (atomic_read(&log_root_tree->log_commit[index2])) { 3166 blk_finish_plug(&plug); 3167 ret = btrfs_wait_tree_log_extents(log, mark); 3168 wait_log_commit(log_root_tree, 3169 root_log_ctx.log_transid); 3170 mutex_unlock(&log_root_tree->log_mutex); 3171 if (!ret) 3172 ret = root_log_ctx.log_ret; 3173 goto out; 3174 } 3175 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3176 atomic_set(&log_root_tree->log_commit[index2], 1); 3177 3178 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3179 wait_log_commit(log_root_tree, 3180 root_log_ctx.log_transid - 1); 3181 } 3182 3183 wait_for_writer(log_root_tree); 3184 3185 /* 3186 * now that we've moved on to the tree of log tree roots, 3187 * check the full commit flag again 3188 */ 3189 if (btrfs_need_log_full_commit(trans)) { 3190 blk_finish_plug(&plug); 3191 btrfs_wait_tree_log_extents(log, mark); 3192 mutex_unlock(&log_root_tree->log_mutex); 3193 ret = -EAGAIN; 3194 goto out_wake_log_root; 3195 } 3196 3197 ret = btrfs_write_marked_extents(fs_info, 3198 &log_root_tree->dirty_log_pages, 3199 EXTENT_DIRTY | EXTENT_NEW); 3200 blk_finish_plug(&plug); 3201 if (ret) { 3202 btrfs_set_log_full_commit(trans); 3203 btrfs_abort_transaction(trans, ret); 3204 mutex_unlock(&log_root_tree->log_mutex); 3205 goto out_wake_log_root; 3206 } 3207 ret = btrfs_wait_tree_log_extents(log, mark); 3208 if (!ret) 3209 ret = btrfs_wait_tree_log_extents(log_root_tree, 3210 EXTENT_NEW | EXTENT_DIRTY); 3211 if (ret) { 3212 btrfs_set_log_full_commit(trans); 3213 mutex_unlock(&log_root_tree->log_mutex); 3214 goto out_wake_log_root; 3215 } 3216 3217 btrfs_set_super_log_root(fs_info->super_for_commit, 3218 log_root_tree->node->start); 3219 btrfs_set_super_log_root_level(fs_info->super_for_commit, 3220 btrfs_header_level(log_root_tree->node)); 3221 3222 log_root_tree->log_transid++; 3223 mutex_unlock(&log_root_tree->log_mutex); 3224 3225 /* 3226 * Nobody else is going to jump in and write the ctree 3227 * super here because the log_commit atomic below is protecting 3228 * us. We must be called with a transaction handle pinning 3229 * the running transaction open, so a full commit can't hop 3230 * in and cause problems either. 3231 */ 3232 ret = write_all_supers(fs_info, 1); 3233 if (ret) { 3234 btrfs_set_log_full_commit(trans); 3235 btrfs_abort_transaction(trans, ret); 3236 goto out_wake_log_root; 3237 } 3238 3239 mutex_lock(&root->log_mutex); 3240 if (root->last_log_commit < log_transid) 3241 root->last_log_commit = log_transid; 3242 mutex_unlock(&root->log_mutex); 3243 3244 out_wake_log_root: 3245 mutex_lock(&log_root_tree->log_mutex); 3246 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3247 3248 log_root_tree->log_transid_committed++; 3249 atomic_set(&log_root_tree->log_commit[index2], 0); 3250 mutex_unlock(&log_root_tree->log_mutex); 3251 3252 /* 3253 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3254 * all the updates above are seen by the woken threads. It might not be 3255 * necessary, but proving that seems to be hard. 3256 */ 3257 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3258 out: 3259 mutex_lock(&root->log_mutex); 3260 btrfs_remove_all_log_ctxs(root, index1, ret); 3261 root->log_transid_committed++; 3262 atomic_set(&root->log_commit[index1], 0); 3263 mutex_unlock(&root->log_mutex); 3264 3265 /* 3266 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3267 * all the updates above are seen by the woken threads. It might not be 3268 * necessary, but proving that seems to be hard. 3269 */ 3270 cond_wake_up(&root->log_commit_wait[index1]); 3271 return ret; 3272 } 3273 3274 static void free_log_tree(struct btrfs_trans_handle *trans, 3275 struct btrfs_root *log) 3276 { 3277 int ret; 3278 struct walk_control wc = { 3279 .free = 1, 3280 .process_func = process_one_buffer 3281 }; 3282 3283 ret = walk_log_tree(trans, log, &wc); 3284 if (ret) { 3285 if (trans) 3286 btrfs_abort_transaction(trans, ret); 3287 else 3288 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3289 } 3290 3291 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3292 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3293 extent_io_tree_release(&log->log_csum_range); 3294 btrfs_put_root(log); 3295 } 3296 3297 /* 3298 * free all the extents used by the tree log. This should be called 3299 * at commit time of the full transaction 3300 */ 3301 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3302 { 3303 if (root->log_root) { 3304 free_log_tree(trans, root->log_root); 3305 root->log_root = NULL; 3306 } 3307 return 0; 3308 } 3309 3310 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3311 struct btrfs_fs_info *fs_info) 3312 { 3313 if (fs_info->log_root_tree) { 3314 free_log_tree(trans, fs_info->log_root_tree); 3315 fs_info->log_root_tree = NULL; 3316 } 3317 return 0; 3318 } 3319 3320 /* 3321 * Check if an inode was logged in the current transaction. We can't always rely 3322 * on an inode's logged_trans value, because it's an in-memory only field and 3323 * therefore not persisted. This means that its value is lost if the inode gets 3324 * evicted and loaded again from disk (in which case it has a value of 0, and 3325 * certainly it is smaller then any possible transaction ID), when that happens 3326 * the full_sync flag is set in the inode's runtime flags, so on that case we 3327 * assume eviction happened and ignore the logged_trans value, assuming the 3328 * worst case, that the inode was logged before in the current transaction. 3329 */ 3330 static bool inode_logged(struct btrfs_trans_handle *trans, 3331 struct btrfs_inode *inode) 3332 { 3333 if (inode->logged_trans == trans->transid) 3334 return true; 3335 3336 if (inode->last_trans == trans->transid && 3337 test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && 3338 !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) 3339 return true; 3340 3341 return false; 3342 } 3343 3344 /* 3345 * If both a file and directory are logged, and unlinks or renames are 3346 * mixed in, we have a few interesting corners: 3347 * 3348 * create file X in dir Y 3349 * link file X to X.link in dir Y 3350 * fsync file X 3351 * unlink file X but leave X.link 3352 * fsync dir Y 3353 * 3354 * After a crash we would expect only X.link to exist. But file X 3355 * didn't get fsync'd again so the log has back refs for X and X.link. 3356 * 3357 * We solve this by removing directory entries and inode backrefs from the 3358 * log when a file that was logged in the current transaction is 3359 * unlinked. Any later fsync will include the updated log entries, and 3360 * we'll be able to reconstruct the proper directory items from backrefs. 3361 * 3362 * This optimizations allows us to avoid relogging the entire inode 3363 * or the entire directory. 3364 */ 3365 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3366 struct btrfs_root *root, 3367 const char *name, int name_len, 3368 struct btrfs_inode *dir, u64 index) 3369 { 3370 struct btrfs_root *log; 3371 struct btrfs_dir_item *di; 3372 struct btrfs_path *path; 3373 int ret; 3374 int err = 0; 3375 int bytes_del = 0; 3376 u64 dir_ino = btrfs_ino(dir); 3377 3378 if (!inode_logged(trans, dir)) 3379 return 0; 3380 3381 ret = join_running_log_trans(root); 3382 if (ret) 3383 return 0; 3384 3385 mutex_lock(&dir->log_mutex); 3386 3387 log = root->log_root; 3388 path = btrfs_alloc_path(); 3389 if (!path) { 3390 err = -ENOMEM; 3391 goto out_unlock; 3392 } 3393 3394 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3395 name, name_len, -1); 3396 if (IS_ERR(di)) { 3397 err = PTR_ERR(di); 3398 goto fail; 3399 } 3400 if (di) { 3401 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3402 bytes_del += name_len; 3403 if (ret) { 3404 err = ret; 3405 goto fail; 3406 } 3407 } 3408 btrfs_release_path(path); 3409 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3410 index, name, name_len, -1); 3411 if (IS_ERR(di)) { 3412 err = PTR_ERR(di); 3413 goto fail; 3414 } 3415 if (di) { 3416 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3417 bytes_del += name_len; 3418 if (ret) { 3419 err = ret; 3420 goto fail; 3421 } 3422 } 3423 3424 /* update the directory size in the log to reflect the names 3425 * we have removed 3426 */ 3427 if (bytes_del) { 3428 struct btrfs_key key; 3429 3430 key.objectid = dir_ino; 3431 key.offset = 0; 3432 key.type = BTRFS_INODE_ITEM_KEY; 3433 btrfs_release_path(path); 3434 3435 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3436 if (ret < 0) { 3437 err = ret; 3438 goto fail; 3439 } 3440 if (ret == 0) { 3441 struct btrfs_inode_item *item; 3442 u64 i_size; 3443 3444 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3445 struct btrfs_inode_item); 3446 i_size = btrfs_inode_size(path->nodes[0], item); 3447 if (i_size > bytes_del) 3448 i_size -= bytes_del; 3449 else 3450 i_size = 0; 3451 btrfs_set_inode_size(path->nodes[0], item, i_size); 3452 btrfs_mark_buffer_dirty(path->nodes[0]); 3453 } else 3454 ret = 0; 3455 btrfs_release_path(path); 3456 } 3457 fail: 3458 btrfs_free_path(path); 3459 out_unlock: 3460 mutex_unlock(&dir->log_mutex); 3461 if (ret == -ENOSPC) { 3462 btrfs_set_log_full_commit(trans); 3463 ret = 0; 3464 } else if (ret < 0) 3465 btrfs_abort_transaction(trans, ret); 3466 3467 btrfs_end_log_trans(root); 3468 3469 return err; 3470 } 3471 3472 /* see comments for btrfs_del_dir_entries_in_log */ 3473 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3474 struct btrfs_root *root, 3475 const char *name, int name_len, 3476 struct btrfs_inode *inode, u64 dirid) 3477 { 3478 struct btrfs_root *log; 3479 u64 index; 3480 int ret; 3481 3482 if (!inode_logged(trans, inode)) 3483 return 0; 3484 3485 ret = join_running_log_trans(root); 3486 if (ret) 3487 return 0; 3488 log = root->log_root; 3489 mutex_lock(&inode->log_mutex); 3490 3491 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3492 dirid, &index); 3493 mutex_unlock(&inode->log_mutex); 3494 if (ret == -ENOSPC) { 3495 btrfs_set_log_full_commit(trans); 3496 ret = 0; 3497 } else if (ret < 0 && ret != -ENOENT) 3498 btrfs_abort_transaction(trans, ret); 3499 btrfs_end_log_trans(root); 3500 3501 return ret; 3502 } 3503 3504 /* 3505 * creates a range item in the log for 'dirid'. first_offset and 3506 * last_offset tell us which parts of the key space the log should 3507 * be considered authoritative for. 3508 */ 3509 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3510 struct btrfs_root *log, 3511 struct btrfs_path *path, 3512 int key_type, u64 dirid, 3513 u64 first_offset, u64 last_offset) 3514 { 3515 int ret; 3516 struct btrfs_key key; 3517 struct btrfs_dir_log_item *item; 3518 3519 key.objectid = dirid; 3520 key.offset = first_offset; 3521 if (key_type == BTRFS_DIR_ITEM_KEY) 3522 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3523 else 3524 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3525 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3526 if (ret) 3527 return ret; 3528 3529 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3530 struct btrfs_dir_log_item); 3531 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3532 btrfs_mark_buffer_dirty(path->nodes[0]); 3533 btrfs_release_path(path); 3534 return 0; 3535 } 3536 3537 /* 3538 * log all the items included in the current transaction for a given 3539 * directory. This also creates the range items in the log tree required 3540 * to replay anything deleted before the fsync 3541 */ 3542 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3543 struct btrfs_root *root, struct btrfs_inode *inode, 3544 struct btrfs_path *path, 3545 struct btrfs_path *dst_path, int key_type, 3546 struct btrfs_log_ctx *ctx, 3547 u64 min_offset, u64 *last_offset_ret) 3548 { 3549 struct btrfs_key min_key; 3550 struct btrfs_root *log = root->log_root; 3551 struct extent_buffer *src; 3552 int err = 0; 3553 int ret; 3554 int i; 3555 int nritems; 3556 u64 first_offset = min_offset; 3557 u64 last_offset = (u64)-1; 3558 u64 ino = btrfs_ino(inode); 3559 3560 log = root->log_root; 3561 3562 min_key.objectid = ino; 3563 min_key.type = key_type; 3564 min_key.offset = min_offset; 3565 3566 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3567 3568 /* 3569 * we didn't find anything from this transaction, see if there 3570 * is anything at all 3571 */ 3572 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3573 min_key.objectid = ino; 3574 min_key.type = key_type; 3575 min_key.offset = (u64)-1; 3576 btrfs_release_path(path); 3577 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3578 if (ret < 0) { 3579 btrfs_release_path(path); 3580 return ret; 3581 } 3582 ret = btrfs_previous_item(root, path, ino, key_type); 3583 3584 /* if ret == 0 there are items for this type, 3585 * create a range to tell us the last key of this type. 3586 * otherwise, there are no items in this directory after 3587 * *min_offset, and we create a range to indicate that. 3588 */ 3589 if (ret == 0) { 3590 struct btrfs_key tmp; 3591 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3592 path->slots[0]); 3593 if (key_type == tmp.type) 3594 first_offset = max(min_offset, tmp.offset) + 1; 3595 } 3596 goto done; 3597 } 3598 3599 /* go backward to find any previous key */ 3600 ret = btrfs_previous_item(root, path, ino, key_type); 3601 if (ret == 0) { 3602 struct btrfs_key tmp; 3603 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3604 if (key_type == tmp.type) { 3605 first_offset = tmp.offset; 3606 ret = overwrite_item(trans, log, dst_path, 3607 path->nodes[0], path->slots[0], 3608 &tmp); 3609 if (ret) { 3610 err = ret; 3611 goto done; 3612 } 3613 } 3614 } 3615 btrfs_release_path(path); 3616 3617 /* 3618 * Find the first key from this transaction again. See the note for 3619 * log_new_dir_dentries, if we're logging a directory recursively we 3620 * won't be holding its i_mutex, which means we can modify the directory 3621 * while we're logging it. If we remove an entry between our first 3622 * search and this search we'll not find the key again and can just 3623 * bail. 3624 */ 3625 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3626 if (ret != 0) 3627 goto done; 3628 3629 /* 3630 * we have a block from this transaction, log every item in it 3631 * from our directory 3632 */ 3633 while (1) { 3634 struct btrfs_key tmp; 3635 src = path->nodes[0]; 3636 nritems = btrfs_header_nritems(src); 3637 for (i = path->slots[0]; i < nritems; i++) { 3638 struct btrfs_dir_item *di; 3639 3640 btrfs_item_key_to_cpu(src, &min_key, i); 3641 3642 if (min_key.objectid != ino || min_key.type != key_type) 3643 goto done; 3644 ret = overwrite_item(trans, log, dst_path, src, i, 3645 &min_key); 3646 if (ret) { 3647 err = ret; 3648 goto done; 3649 } 3650 3651 /* 3652 * We must make sure that when we log a directory entry, 3653 * the corresponding inode, after log replay, has a 3654 * matching link count. For example: 3655 * 3656 * touch foo 3657 * mkdir mydir 3658 * sync 3659 * ln foo mydir/bar 3660 * xfs_io -c "fsync" mydir 3661 * <crash> 3662 * <mount fs and log replay> 3663 * 3664 * Would result in a fsync log that when replayed, our 3665 * file inode would have a link count of 1, but we get 3666 * two directory entries pointing to the same inode. 3667 * After removing one of the names, it would not be 3668 * possible to remove the other name, which resulted 3669 * always in stale file handle errors, and would not 3670 * be possible to rmdir the parent directory, since 3671 * its i_size could never decrement to the value 3672 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 3673 */ 3674 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3675 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3676 if (ctx && 3677 (btrfs_dir_transid(src, di) == trans->transid || 3678 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3679 tmp.type != BTRFS_ROOT_ITEM_KEY) 3680 ctx->log_new_dentries = true; 3681 } 3682 path->slots[0] = nritems; 3683 3684 /* 3685 * look ahead to the next item and see if it is also 3686 * from this directory and from this transaction 3687 */ 3688 ret = btrfs_next_leaf(root, path); 3689 if (ret) { 3690 if (ret == 1) 3691 last_offset = (u64)-1; 3692 else 3693 err = ret; 3694 goto done; 3695 } 3696 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3697 if (tmp.objectid != ino || tmp.type != key_type) { 3698 last_offset = (u64)-1; 3699 goto done; 3700 } 3701 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3702 ret = overwrite_item(trans, log, dst_path, 3703 path->nodes[0], path->slots[0], 3704 &tmp); 3705 if (ret) 3706 err = ret; 3707 else 3708 last_offset = tmp.offset; 3709 goto done; 3710 } 3711 } 3712 done: 3713 btrfs_release_path(path); 3714 btrfs_release_path(dst_path); 3715 3716 if (err == 0) { 3717 *last_offset_ret = last_offset; 3718 /* 3719 * insert the log range keys to indicate where the log 3720 * is valid 3721 */ 3722 ret = insert_dir_log_key(trans, log, path, key_type, 3723 ino, first_offset, last_offset); 3724 if (ret) 3725 err = ret; 3726 } 3727 return err; 3728 } 3729 3730 /* 3731 * logging directories is very similar to logging inodes, We find all the items 3732 * from the current transaction and write them to the log. 3733 * 3734 * The recovery code scans the directory in the subvolume, and if it finds a 3735 * key in the range logged that is not present in the log tree, then it means 3736 * that dir entry was unlinked during the transaction. 3737 * 3738 * In order for that scan to work, we must include one key smaller than 3739 * the smallest logged by this transaction and one key larger than the largest 3740 * key logged by this transaction. 3741 */ 3742 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3743 struct btrfs_root *root, struct btrfs_inode *inode, 3744 struct btrfs_path *path, 3745 struct btrfs_path *dst_path, 3746 struct btrfs_log_ctx *ctx) 3747 { 3748 u64 min_key; 3749 u64 max_key; 3750 int ret; 3751 int key_type = BTRFS_DIR_ITEM_KEY; 3752 3753 again: 3754 min_key = 0; 3755 max_key = 0; 3756 while (1) { 3757 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3758 ctx, min_key, &max_key); 3759 if (ret) 3760 return ret; 3761 if (max_key == (u64)-1) 3762 break; 3763 min_key = max_key + 1; 3764 } 3765 3766 if (key_type == BTRFS_DIR_ITEM_KEY) { 3767 key_type = BTRFS_DIR_INDEX_KEY; 3768 goto again; 3769 } 3770 return 0; 3771 } 3772 3773 /* 3774 * a helper function to drop items from the log before we relog an 3775 * inode. max_key_type indicates the highest item type to remove. 3776 * This cannot be run for file data extents because it does not 3777 * free the extents they point to. 3778 */ 3779 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3780 struct btrfs_root *log, 3781 struct btrfs_path *path, 3782 u64 objectid, int max_key_type) 3783 { 3784 int ret; 3785 struct btrfs_key key; 3786 struct btrfs_key found_key; 3787 int start_slot; 3788 3789 key.objectid = objectid; 3790 key.type = max_key_type; 3791 key.offset = (u64)-1; 3792 3793 while (1) { 3794 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3795 BUG_ON(ret == 0); /* Logic error */ 3796 if (ret < 0) 3797 break; 3798 3799 if (path->slots[0] == 0) 3800 break; 3801 3802 path->slots[0]--; 3803 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3804 path->slots[0]); 3805 3806 if (found_key.objectid != objectid) 3807 break; 3808 3809 found_key.offset = 0; 3810 found_key.type = 0; 3811 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); 3812 if (ret < 0) 3813 break; 3814 3815 ret = btrfs_del_items(trans, log, path, start_slot, 3816 path->slots[0] - start_slot + 1); 3817 /* 3818 * If start slot isn't 0 then we don't need to re-search, we've 3819 * found the last guy with the objectid in this tree. 3820 */ 3821 if (ret || start_slot != 0) 3822 break; 3823 btrfs_release_path(path); 3824 } 3825 btrfs_release_path(path); 3826 if (ret > 0) 3827 ret = 0; 3828 return ret; 3829 } 3830 3831 static void fill_inode_item(struct btrfs_trans_handle *trans, 3832 struct extent_buffer *leaf, 3833 struct btrfs_inode_item *item, 3834 struct inode *inode, int log_inode_only, 3835 u64 logged_isize) 3836 { 3837 struct btrfs_map_token token; 3838 3839 btrfs_init_map_token(&token, leaf); 3840 3841 if (log_inode_only) { 3842 /* set the generation to zero so the recover code 3843 * can tell the difference between an logging 3844 * just to say 'this inode exists' and a logging 3845 * to say 'update this inode with these values' 3846 */ 3847 btrfs_set_token_inode_generation(&token, item, 0); 3848 btrfs_set_token_inode_size(&token, item, logged_isize); 3849 } else { 3850 btrfs_set_token_inode_generation(&token, item, 3851 BTRFS_I(inode)->generation); 3852 btrfs_set_token_inode_size(&token, item, inode->i_size); 3853 } 3854 3855 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 3856 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 3857 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 3858 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 3859 3860 btrfs_set_token_timespec_sec(&token, &item->atime, 3861 inode->i_atime.tv_sec); 3862 btrfs_set_token_timespec_nsec(&token, &item->atime, 3863 inode->i_atime.tv_nsec); 3864 3865 btrfs_set_token_timespec_sec(&token, &item->mtime, 3866 inode->i_mtime.tv_sec); 3867 btrfs_set_token_timespec_nsec(&token, &item->mtime, 3868 inode->i_mtime.tv_nsec); 3869 3870 btrfs_set_token_timespec_sec(&token, &item->ctime, 3871 inode->i_ctime.tv_sec); 3872 btrfs_set_token_timespec_nsec(&token, &item->ctime, 3873 inode->i_ctime.tv_nsec); 3874 3875 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); 3876 3877 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 3878 btrfs_set_token_inode_transid(&token, item, trans->transid); 3879 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 3880 btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); 3881 btrfs_set_token_inode_block_group(&token, item, 0); 3882 } 3883 3884 static int log_inode_item(struct btrfs_trans_handle *trans, 3885 struct btrfs_root *log, struct btrfs_path *path, 3886 struct btrfs_inode *inode) 3887 { 3888 struct btrfs_inode_item *inode_item; 3889 int ret; 3890 3891 ret = btrfs_insert_empty_item(trans, log, path, 3892 &inode->location, sizeof(*inode_item)); 3893 if (ret && ret != -EEXIST) 3894 return ret; 3895 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3896 struct btrfs_inode_item); 3897 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3898 0, 0); 3899 btrfs_release_path(path); 3900 return 0; 3901 } 3902 3903 static int log_csums(struct btrfs_trans_handle *trans, 3904 struct btrfs_root *log_root, 3905 struct btrfs_ordered_sum *sums) 3906 { 3907 const u64 lock_end = sums->bytenr + sums->len - 1; 3908 struct extent_state *cached_state = NULL; 3909 int ret; 3910 3911 /* 3912 * Serialize logging for checksums. This is to avoid racing with the 3913 * same checksum being logged by another task that is logging another 3914 * file which happens to refer to the same extent as well. Such races 3915 * can leave checksum items in the log with overlapping ranges. 3916 */ 3917 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 3918 lock_end, &cached_state); 3919 if (ret) 3920 return ret; 3921 /* 3922 * Due to extent cloning, we might have logged a csum item that covers a 3923 * subrange of a cloned extent, and later we can end up logging a csum 3924 * item for a larger subrange of the same extent or the entire range. 3925 * This would leave csum items in the log tree that cover the same range 3926 * and break the searches for checksums in the log tree, resulting in 3927 * some checksums missing in the fs/subvolume tree. So just delete (or 3928 * trim and adjust) any existing csum items in the log for this range. 3929 */ 3930 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 3931 if (!ret) 3932 ret = btrfs_csum_file_blocks(trans, log_root, sums); 3933 3934 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 3935 &cached_state); 3936 3937 return ret; 3938 } 3939 3940 static noinline int copy_items(struct btrfs_trans_handle *trans, 3941 struct btrfs_inode *inode, 3942 struct btrfs_path *dst_path, 3943 struct btrfs_path *src_path, 3944 int start_slot, int nr, int inode_only, 3945 u64 logged_isize) 3946 { 3947 struct btrfs_fs_info *fs_info = trans->fs_info; 3948 unsigned long src_offset; 3949 unsigned long dst_offset; 3950 struct btrfs_root *log = inode->root->log_root; 3951 struct btrfs_file_extent_item *extent; 3952 struct btrfs_inode_item *inode_item; 3953 struct extent_buffer *src = src_path->nodes[0]; 3954 int ret; 3955 struct btrfs_key *ins_keys; 3956 u32 *ins_sizes; 3957 char *ins_data; 3958 int i; 3959 struct list_head ordered_sums; 3960 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3961 3962 INIT_LIST_HEAD(&ordered_sums); 3963 3964 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3965 nr * sizeof(u32), GFP_NOFS); 3966 if (!ins_data) 3967 return -ENOMEM; 3968 3969 ins_sizes = (u32 *)ins_data; 3970 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3971 3972 for (i = 0; i < nr; i++) { 3973 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3974 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3975 } 3976 ret = btrfs_insert_empty_items(trans, log, dst_path, 3977 ins_keys, ins_sizes, nr); 3978 if (ret) { 3979 kfree(ins_data); 3980 return ret; 3981 } 3982 3983 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3984 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3985 dst_path->slots[0]); 3986 3987 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3988 3989 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3990 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3991 dst_path->slots[0], 3992 struct btrfs_inode_item); 3993 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3994 &inode->vfs_inode, 3995 inode_only == LOG_INODE_EXISTS, 3996 logged_isize); 3997 } else { 3998 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3999 src_offset, ins_sizes[i]); 4000 } 4001 4002 /* take a reference on file data extents so that truncates 4003 * or deletes of this inode don't have to relog the inode 4004 * again 4005 */ 4006 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 4007 !skip_csum) { 4008 int found_type; 4009 extent = btrfs_item_ptr(src, start_slot + i, 4010 struct btrfs_file_extent_item); 4011 4012 if (btrfs_file_extent_generation(src, extent) < trans->transid) 4013 continue; 4014 4015 found_type = btrfs_file_extent_type(src, extent); 4016 if (found_type == BTRFS_FILE_EXTENT_REG) { 4017 u64 ds, dl, cs, cl; 4018 ds = btrfs_file_extent_disk_bytenr(src, 4019 extent); 4020 /* ds == 0 is a hole */ 4021 if (ds == 0) 4022 continue; 4023 4024 dl = btrfs_file_extent_disk_num_bytes(src, 4025 extent); 4026 cs = btrfs_file_extent_offset(src, extent); 4027 cl = btrfs_file_extent_num_bytes(src, 4028 extent); 4029 if (btrfs_file_extent_compression(src, 4030 extent)) { 4031 cs = 0; 4032 cl = dl; 4033 } 4034 4035 ret = btrfs_lookup_csums_range( 4036 fs_info->csum_root, 4037 ds + cs, ds + cs + cl - 1, 4038 &ordered_sums, 0); 4039 if (ret) { 4040 btrfs_release_path(dst_path); 4041 kfree(ins_data); 4042 return ret; 4043 } 4044 } 4045 } 4046 } 4047 4048 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4049 btrfs_release_path(dst_path); 4050 kfree(ins_data); 4051 4052 /* 4053 * we have to do this after the loop above to avoid changing the 4054 * log tree while trying to change the log tree. 4055 */ 4056 ret = 0; 4057 while (!list_empty(&ordered_sums)) { 4058 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4059 struct btrfs_ordered_sum, 4060 list); 4061 if (!ret) 4062 ret = log_csums(trans, log, sums); 4063 list_del(&sums->list); 4064 kfree(sums); 4065 } 4066 4067 return ret; 4068 } 4069 4070 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 4071 { 4072 struct extent_map *em1, *em2; 4073 4074 em1 = list_entry(a, struct extent_map, list); 4075 em2 = list_entry(b, struct extent_map, list); 4076 4077 if (em1->start < em2->start) 4078 return -1; 4079 else if (em1->start > em2->start) 4080 return 1; 4081 return 0; 4082 } 4083 4084 static int log_extent_csums(struct btrfs_trans_handle *trans, 4085 struct btrfs_inode *inode, 4086 struct btrfs_root *log_root, 4087 const struct extent_map *em) 4088 { 4089 u64 csum_offset; 4090 u64 csum_len; 4091 LIST_HEAD(ordered_sums); 4092 int ret = 0; 4093 4094 if (inode->flags & BTRFS_INODE_NODATASUM || 4095 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4096 em->block_start == EXTENT_MAP_HOLE) 4097 return 0; 4098 4099 /* If we're compressed we have to save the entire range of csums. */ 4100 if (em->compress_type) { 4101 csum_offset = 0; 4102 csum_len = max(em->block_len, em->orig_block_len); 4103 } else { 4104 csum_offset = em->mod_start - em->start; 4105 csum_len = em->mod_len; 4106 } 4107 4108 /* block start is already adjusted for the file extent offset. */ 4109 ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, 4110 em->block_start + csum_offset, 4111 em->block_start + csum_offset + 4112 csum_len - 1, &ordered_sums, 0); 4113 if (ret) 4114 return ret; 4115 4116 while (!list_empty(&ordered_sums)) { 4117 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4118 struct btrfs_ordered_sum, 4119 list); 4120 if (!ret) 4121 ret = log_csums(trans, log_root, sums); 4122 list_del(&sums->list); 4123 kfree(sums); 4124 } 4125 4126 return ret; 4127 } 4128 4129 static int log_one_extent(struct btrfs_trans_handle *trans, 4130 struct btrfs_inode *inode, struct btrfs_root *root, 4131 const struct extent_map *em, 4132 struct btrfs_path *path, 4133 struct btrfs_log_ctx *ctx) 4134 { 4135 struct btrfs_root *log = root->log_root; 4136 struct btrfs_file_extent_item *fi; 4137 struct extent_buffer *leaf; 4138 struct btrfs_map_token token; 4139 struct btrfs_key key; 4140 u64 extent_offset = em->start - em->orig_start; 4141 u64 block_len; 4142 int ret; 4143 int extent_inserted = 0; 4144 4145 ret = log_extent_csums(trans, inode, log, em); 4146 if (ret) 4147 return ret; 4148 4149 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4150 em->start + em->len, NULL, 0, 1, 4151 sizeof(*fi), &extent_inserted); 4152 if (ret) 4153 return ret; 4154 4155 if (!extent_inserted) { 4156 key.objectid = btrfs_ino(inode); 4157 key.type = BTRFS_EXTENT_DATA_KEY; 4158 key.offset = em->start; 4159 4160 ret = btrfs_insert_empty_item(trans, log, path, &key, 4161 sizeof(*fi)); 4162 if (ret) 4163 return ret; 4164 } 4165 leaf = path->nodes[0]; 4166 btrfs_init_map_token(&token, leaf); 4167 fi = btrfs_item_ptr(leaf, path->slots[0], 4168 struct btrfs_file_extent_item); 4169 4170 btrfs_set_token_file_extent_generation(&token, fi, trans->transid); 4171 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4172 btrfs_set_token_file_extent_type(&token, fi, 4173 BTRFS_FILE_EXTENT_PREALLOC); 4174 else 4175 btrfs_set_token_file_extent_type(&token, fi, 4176 BTRFS_FILE_EXTENT_REG); 4177 4178 block_len = max(em->block_len, em->orig_block_len); 4179 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4180 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4181 em->block_start); 4182 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4183 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4184 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4185 em->block_start - 4186 extent_offset); 4187 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4188 } else { 4189 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); 4190 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); 4191 } 4192 4193 btrfs_set_token_file_extent_offset(&token, fi, extent_offset); 4194 btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); 4195 btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); 4196 btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); 4197 btrfs_set_token_file_extent_encryption(&token, fi, 0); 4198 btrfs_set_token_file_extent_other_encoding(&token, fi, 0); 4199 btrfs_mark_buffer_dirty(leaf); 4200 4201 btrfs_release_path(path); 4202 4203 return ret; 4204 } 4205 4206 /* 4207 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4208 * lose them after doing a fast fsync and replaying the log. We scan the 4209 * subvolume's root instead of iterating the inode's extent map tree because 4210 * otherwise we can log incorrect extent items based on extent map conversion. 4211 * That can happen due to the fact that extent maps are merged when they 4212 * are not in the extent map tree's list of modified extents. 4213 */ 4214 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4215 struct btrfs_inode *inode, 4216 struct btrfs_path *path) 4217 { 4218 struct btrfs_root *root = inode->root; 4219 struct btrfs_key key; 4220 const u64 i_size = i_size_read(&inode->vfs_inode); 4221 const u64 ino = btrfs_ino(inode); 4222 struct btrfs_path *dst_path = NULL; 4223 bool dropped_extents = false; 4224 u64 truncate_offset = i_size; 4225 struct extent_buffer *leaf; 4226 int slot; 4227 int ins_nr = 0; 4228 int start_slot; 4229 int ret; 4230 4231 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4232 return 0; 4233 4234 key.objectid = ino; 4235 key.type = BTRFS_EXTENT_DATA_KEY; 4236 key.offset = i_size; 4237 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4238 if (ret < 0) 4239 goto out; 4240 4241 /* 4242 * We must check if there is a prealloc extent that starts before the 4243 * i_size and crosses the i_size boundary. This is to ensure later we 4244 * truncate down to the end of that extent and not to the i_size, as 4245 * otherwise we end up losing part of the prealloc extent after a log 4246 * replay and with an implicit hole if there is another prealloc extent 4247 * that starts at an offset beyond i_size. 4248 */ 4249 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4250 if (ret < 0) 4251 goto out; 4252 4253 if (ret == 0) { 4254 struct btrfs_file_extent_item *ei; 4255 4256 leaf = path->nodes[0]; 4257 slot = path->slots[0]; 4258 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4259 4260 if (btrfs_file_extent_type(leaf, ei) == 4261 BTRFS_FILE_EXTENT_PREALLOC) { 4262 u64 extent_end; 4263 4264 btrfs_item_key_to_cpu(leaf, &key, slot); 4265 extent_end = key.offset + 4266 btrfs_file_extent_num_bytes(leaf, ei); 4267 4268 if (extent_end > i_size) 4269 truncate_offset = extent_end; 4270 } 4271 } else { 4272 ret = 0; 4273 } 4274 4275 while (true) { 4276 leaf = path->nodes[0]; 4277 slot = path->slots[0]; 4278 4279 if (slot >= btrfs_header_nritems(leaf)) { 4280 if (ins_nr > 0) { 4281 ret = copy_items(trans, inode, dst_path, path, 4282 start_slot, ins_nr, 1, 0); 4283 if (ret < 0) 4284 goto out; 4285 ins_nr = 0; 4286 } 4287 ret = btrfs_next_leaf(root, path); 4288 if (ret < 0) 4289 goto out; 4290 if (ret > 0) { 4291 ret = 0; 4292 break; 4293 } 4294 continue; 4295 } 4296 4297 btrfs_item_key_to_cpu(leaf, &key, slot); 4298 if (key.objectid > ino) 4299 break; 4300 if (WARN_ON_ONCE(key.objectid < ino) || 4301 key.type < BTRFS_EXTENT_DATA_KEY || 4302 key.offset < i_size) { 4303 path->slots[0]++; 4304 continue; 4305 } 4306 if (!dropped_extents) { 4307 /* 4308 * Avoid logging extent items logged in past fsync calls 4309 * and leading to duplicate keys in the log tree. 4310 */ 4311 do { 4312 ret = btrfs_truncate_inode_items(trans, 4313 root->log_root, 4314 &inode->vfs_inode, 4315 truncate_offset, 4316 BTRFS_EXTENT_DATA_KEY); 4317 } while (ret == -EAGAIN); 4318 if (ret) 4319 goto out; 4320 dropped_extents = true; 4321 } 4322 if (ins_nr == 0) 4323 start_slot = slot; 4324 ins_nr++; 4325 path->slots[0]++; 4326 if (!dst_path) { 4327 dst_path = btrfs_alloc_path(); 4328 if (!dst_path) { 4329 ret = -ENOMEM; 4330 goto out; 4331 } 4332 } 4333 } 4334 if (ins_nr > 0) 4335 ret = copy_items(trans, inode, dst_path, path, 4336 start_slot, ins_nr, 1, 0); 4337 out: 4338 btrfs_release_path(path); 4339 btrfs_free_path(dst_path); 4340 return ret; 4341 } 4342 4343 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4344 struct btrfs_root *root, 4345 struct btrfs_inode *inode, 4346 struct btrfs_path *path, 4347 struct btrfs_log_ctx *ctx, 4348 const u64 start, 4349 const u64 end) 4350 { 4351 struct extent_map *em, *n; 4352 struct list_head extents; 4353 struct extent_map_tree *tree = &inode->extent_tree; 4354 u64 test_gen; 4355 int ret = 0; 4356 int num = 0; 4357 4358 INIT_LIST_HEAD(&extents); 4359 4360 write_lock(&tree->lock); 4361 test_gen = root->fs_info->last_trans_committed; 4362 4363 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4364 /* 4365 * Skip extents outside our logging range. It's important to do 4366 * it for correctness because if we don't ignore them, we may 4367 * log them before their ordered extent completes, and therefore 4368 * we could log them without logging their respective checksums 4369 * (the checksum items are added to the csum tree at the very 4370 * end of btrfs_finish_ordered_io()). Also leave such extents 4371 * outside of our range in the list, since we may have another 4372 * ranged fsync in the near future that needs them. If an extent 4373 * outside our range corresponds to a hole, log it to avoid 4374 * leaving gaps between extents (fsck will complain when we are 4375 * not using the NO_HOLES feature). 4376 */ 4377 if ((em->start > end || em->start + em->len <= start) && 4378 em->block_start != EXTENT_MAP_HOLE) 4379 continue; 4380 4381 list_del_init(&em->list); 4382 /* 4383 * Just an arbitrary number, this can be really CPU intensive 4384 * once we start getting a lot of extents, and really once we 4385 * have a bunch of extents we just want to commit since it will 4386 * be faster. 4387 */ 4388 if (++num > 32768) { 4389 list_del_init(&tree->modified_extents); 4390 ret = -EFBIG; 4391 goto process; 4392 } 4393 4394 if (em->generation <= test_gen) 4395 continue; 4396 4397 /* We log prealloc extents beyond eof later. */ 4398 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4399 em->start >= i_size_read(&inode->vfs_inode)) 4400 continue; 4401 4402 /* Need a ref to keep it from getting evicted from cache */ 4403 refcount_inc(&em->refs); 4404 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4405 list_add_tail(&em->list, &extents); 4406 num++; 4407 } 4408 4409 list_sort(NULL, &extents, extent_cmp); 4410 process: 4411 while (!list_empty(&extents)) { 4412 em = list_entry(extents.next, struct extent_map, list); 4413 4414 list_del_init(&em->list); 4415 4416 /* 4417 * If we had an error we just need to delete everybody from our 4418 * private list. 4419 */ 4420 if (ret) { 4421 clear_em_logging(tree, em); 4422 free_extent_map(em); 4423 continue; 4424 } 4425 4426 write_unlock(&tree->lock); 4427 4428 ret = log_one_extent(trans, inode, root, em, path, ctx); 4429 write_lock(&tree->lock); 4430 clear_em_logging(tree, em); 4431 free_extent_map(em); 4432 } 4433 WARN_ON(!list_empty(&extents)); 4434 write_unlock(&tree->lock); 4435 4436 btrfs_release_path(path); 4437 if (!ret) 4438 ret = btrfs_log_prealloc_extents(trans, inode, path); 4439 4440 return ret; 4441 } 4442 4443 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4444 struct btrfs_path *path, u64 *size_ret) 4445 { 4446 struct btrfs_key key; 4447 int ret; 4448 4449 key.objectid = btrfs_ino(inode); 4450 key.type = BTRFS_INODE_ITEM_KEY; 4451 key.offset = 0; 4452 4453 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4454 if (ret < 0) { 4455 return ret; 4456 } else if (ret > 0) { 4457 *size_ret = 0; 4458 } else { 4459 struct btrfs_inode_item *item; 4460 4461 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4462 struct btrfs_inode_item); 4463 *size_ret = btrfs_inode_size(path->nodes[0], item); 4464 /* 4465 * If the in-memory inode's i_size is smaller then the inode 4466 * size stored in the btree, return the inode's i_size, so 4467 * that we get a correct inode size after replaying the log 4468 * when before a power failure we had a shrinking truncate 4469 * followed by addition of a new name (rename / new hard link). 4470 * Otherwise return the inode size from the btree, to avoid 4471 * data loss when replaying a log due to previously doing a 4472 * write that expands the inode's size and logging a new name 4473 * immediately after. 4474 */ 4475 if (*size_ret > inode->vfs_inode.i_size) 4476 *size_ret = inode->vfs_inode.i_size; 4477 } 4478 4479 btrfs_release_path(path); 4480 return 0; 4481 } 4482 4483 /* 4484 * At the moment we always log all xattrs. This is to figure out at log replay 4485 * time which xattrs must have their deletion replayed. If a xattr is missing 4486 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4487 * because if a xattr is deleted, the inode is fsynced and a power failure 4488 * happens, causing the log to be replayed the next time the fs is mounted, 4489 * we want the xattr to not exist anymore (same behaviour as other filesystems 4490 * with a journal, ext3/4, xfs, f2fs, etc). 4491 */ 4492 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4493 struct btrfs_root *root, 4494 struct btrfs_inode *inode, 4495 struct btrfs_path *path, 4496 struct btrfs_path *dst_path) 4497 { 4498 int ret; 4499 struct btrfs_key key; 4500 const u64 ino = btrfs_ino(inode); 4501 int ins_nr = 0; 4502 int start_slot = 0; 4503 4504 key.objectid = ino; 4505 key.type = BTRFS_XATTR_ITEM_KEY; 4506 key.offset = 0; 4507 4508 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4509 if (ret < 0) 4510 return ret; 4511 4512 while (true) { 4513 int slot = path->slots[0]; 4514 struct extent_buffer *leaf = path->nodes[0]; 4515 int nritems = btrfs_header_nritems(leaf); 4516 4517 if (slot >= nritems) { 4518 if (ins_nr > 0) { 4519 ret = copy_items(trans, inode, dst_path, path, 4520 start_slot, ins_nr, 1, 0); 4521 if (ret < 0) 4522 return ret; 4523 ins_nr = 0; 4524 } 4525 ret = btrfs_next_leaf(root, path); 4526 if (ret < 0) 4527 return ret; 4528 else if (ret > 0) 4529 break; 4530 continue; 4531 } 4532 4533 btrfs_item_key_to_cpu(leaf, &key, slot); 4534 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4535 break; 4536 4537 if (ins_nr == 0) 4538 start_slot = slot; 4539 ins_nr++; 4540 path->slots[0]++; 4541 cond_resched(); 4542 } 4543 if (ins_nr > 0) { 4544 ret = copy_items(trans, inode, dst_path, path, 4545 start_slot, ins_nr, 1, 0); 4546 if (ret < 0) 4547 return ret; 4548 } 4549 4550 return 0; 4551 } 4552 4553 /* 4554 * When using the NO_HOLES feature if we punched a hole that causes the 4555 * deletion of entire leafs or all the extent items of the first leaf (the one 4556 * that contains the inode item and references) we may end up not processing 4557 * any extents, because there are no leafs with a generation matching the 4558 * current transaction that have extent items for our inode. So we need to find 4559 * if any holes exist and then log them. We also need to log holes after any 4560 * truncate operation that changes the inode's size. 4561 */ 4562 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 4563 struct btrfs_root *root, 4564 struct btrfs_inode *inode, 4565 struct btrfs_path *path) 4566 { 4567 struct btrfs_fs_info *fs_info = root->fs_info; 4568 struct btrfs_key key; 4569 const u64 ino = btrfs_ino(inode); 4570 const u64 i_size = i_size_read(&inode->vfs_inode); 4571 u64 prev_extent_end = 0; 4572 int ret; 4573 4574 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 4575 return 0; 4576 4577 key.objectid = ino; 4578 key.type = BTRFS_EXTENT_DATA_KEY; 4579 key.offset = 0; 4580 4581 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4582 if (ret < 0) 4583 return ret; 4584 4585 while (true) { 4586 struct extent_buffer *leaf = path->nodes[0]; 4587 4588 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 4589 ret = btrfs_next_leaf(root, path); 4590 if (ret < 0) 4591 return ret; 4592 if (ret > 0) { 4593 ret = 0; 4594 break; 4595 } 4596 leaf = path->nodes[0]; 4597 } 4598 4599 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4600 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 4601 break; 4602 4603 /* We have a hole, log it. */ 4604 if (prev_extent_end < key.offset) { 4605 const u64 hole_len = key.offset - prev_extent_end; 4606 4607 /* 4608 * Release the path to avoid deadlocks with other code 4609 * paths that search the root while holding locks on 4610 * leafs from the log root. 4611 */ 4612 btrfs_release_path(path); 4613 ret = btrfs_insert_file_extent(trans, root->log_root, 4614 ino, prev_extent_end, 0, 4615 0, hole_len, 0, hole_len, 4616 0, 0, 0); 4617 if (ret < 0) 4618 return ret; 4619 4620 /* 4621 * Search for the same key again in the root. Since it's 4622 * an extent item and we are holding the inode lock, the 4623 * key must still exist. If it doesn't just emit warning 4624 * and return an error to fall back to a transaction 4625 * commit. 4626 */ 4627 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4628 if (ret < 0) 4629 return ret; 4630 if (WARN_ON(ret > 0)) 4631 return -ENOENT; 4632 leaf = path->nodes[0]; 4633 } 4634 4635 prev_extent_end = btrfs_file_extent_end(path); 4636 path->slots[0]++; 4637 cond_resched(); 4638 } 4639 4640 if (prev_extent_end < i_size) { 4641 u64 hole_len; 4642 4643 btrfs_release_path(path); 4644 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 4645 ret = btrfs_insert_file_extent(trans, root->log_root, 4646 ino, prev_extent_end, 0, 0, 4647 hole_len, 0, hole_len, 4648 0, 0, 0); 4649 if (ret < 0) 4650 return ret; 4651 } 4652 4653 return 0; 4654 } 4655 4656 /* 4657 * When we are logging a new inode X, check if it doesn't have a reference that 4658 * matches the reference from some other inode Y created in a past transaction 4659 * and that was renamed in the current transaction. If we don't do this, then at 4660 * log replay time we can lose inode Y (and all its files if it's a directory): 4661 * 4662 * mkdir /mnt/x 4663 * echo "hello world" > /mnt/x/foobar 4664 * sync 4665 * mv /mnt/x /mnt/y 4666 * mkdir /mnt/x # or touch /mnt/x 4667 * xfs_io -c fsync /mnt/x 4668 * <power fail> 4669 * mount fs, trigger log replay 4670 * 4671 * After the log replay procedure, we would lose the first directory and all its 4672 * files (file foobar). 4673 * For the case where inode Y is not a directory we simply end up losing it: 4674 * 4675 * echo "123" > /mnt/foo 4676 * sync 4677 * mv /mnt/foo /mnt/bar 4678 * echo "abc" > /mnt/foo 4679 * xfs_io -c fsync /mnt/foo 4680 * <power fail> 4681 * 4682 * We also need this for cases where a snapshot entry is replaced by some other 4683 * entry (file or directory) otherwise we end up with an unreplayable log due to 4684 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4685 * if it were a regular entry: 4686 * 4687 * mkdir /mnt/x 4688 * btrfs subvolume snapshot /mnt /mnt/x/snap 4689 * btrfs subvolume delete /mnt/x/snap 4690 * rmdir /mnt/x 4691 * mkdir /mnt/x 4692 * fsync /mnt/x or fsync some new file inside it 4693 * <power fail> 4694 * 4695 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4696 * the same transaction. 4697 */ 4698 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4699 const int slot, 4700 const struct btrfs_key *key, 4701 struct btrfs_inode *inode, 4702 u64 *other_ino, u64 *other_parent) 4703 { 4704 int ret; 4705 struct btrfs_path *search_path; 4706 char *name = NULL; 4707 u32 name_len = 0; 4708 u32 item_size = btrfs_item_size_nr(eb, slot); 4709 u32 cur_offset = 0; 4710 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4711 4712 search_path = btrfs_alloc_path(); 4713 if (!search_path) 4714 return -ENOMEM; 4715 search_path->search_commit_root = 1; 4716 search_path->skip_locking = 1; 4717 4718 while (cur_offset < item_size) { 4719 u64 parent; 4720 u32 this_name_len; 4721 u32 this_len; 4722 unsigned long name_ptr; 4723 struct btrfs_dir_item *di; 4724 4725 if (key->type == BTRFS_INODE_REF_KEY) { 4726 struct btrfs_inode_ref *iref; 4727 4728 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4729 parent = key->offset; 4730 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4731 name_ptr = (unsigned long)(iref + 1); 4732 this_len = sizeof(*iref) + this_name_len; 4733 } else { 4734 struct btrfs_inode_extref *extref; 4735 4736 extref = (struct btrfs_inode_extref *)(ptr + 4737 cur_offset); 4738 parent = btrfs_inode_extref_parent(eb, extref); 4739 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4740 name_ptr = (unsigned long)&extref->name; 4741 this_len = sizeof(*extref) + this_name_len; 4742 } 4743 4744 if (this_name_len > name_len) { 4745 char *new_name; 4746 4747 new_name = krealloc(name, this_name_len, GFP_NOFS); 4748 if (!new_name) { 4749 ret = -ENOMEM; 4750 goto out; 4751 } 4752 name_len = this_name_len; 4753 name = new_name; 4754 } 4755 4756 read_extent_buffer(eb, name, name_ptr, this_name_len); 4757 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4758 parent, name, this_name_len, 0); 4759 if (di && !IS_ERR(di)) { 4760 struct btrfs_key di_key; 4761 4762 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4763 di, &di_key); 4764 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4765 if (di_key.objectid != key->objectid) { 4766 ret = 1; 4767 *other_ino = di_key.objectid; 4768 *other_parent = parent; 4769 } else { 4770 ret = 0; 4771 } 4772 } else { 4773 ret = -EAGAIN; 4774 } 4775 goto out; 4776 } else if (IS_ERR(di)) { 4777 ret = PTR_ERR(di); 4778 goto out; 4779 } 4780 btrfs_release_path(search_path); 4781 4782 cur_offset += this_len; 4783 } 4784 ret = 0; 4785 out: 4786 btrfs_free_path(search_path); 4787 kfree(name); 4788 return ret; 4789 } 4790 4791 struct btrfs_ino_list { 4792 u64 ino; 4793 u64 parent; 4794 struct list_head list; 4795 }; 4796 4797 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 4798 struct btrfs_root *root, 4799 struct btrfs_path *path, 4800 struct btrfs_log_ctx *ctx, 4801 u64 ino, u64 parent) 4802 { 4803 struct btrfs_ino_list *ino_elem; 4804 LIST_HEAD(inode_list); 4805 int ret = 0; 4806 4807 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 4808 if (!ino_elem) 4809 return -ENOMEM; 4810 ino_elem->ino = ino; 4811 ino_elem->parent = parent; 4812 list_add_tail(&ino_elem->list, &inode_list); 4813 4814 while (!list_empty(&inode_list)) { 4815 struct btrfs_fs_info *fs_info = root->fs_info; 4816 struct btrfs_key key; 4817 struct inode *inode; 4818 4819 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 4820 list); 4821 ino = ino_elem->ino; 4822 parent = ino_elem->parent; 4823 list_del(&ino_elem->list); 4824 kfree(ino_elem); 4825 if (ret) 4826 continue; 4827 4828 btrfs_release_path(path); 4829 4830 inode = btrfs_iget(fs_info->sb, ino, root); 4831 /* 4832 * If the other inode that had a conflicting dir entry was 4833 * deleted in the current transaction, we need to log its parent 4834 * directory. 4835 */ 4836 if (IS_ERR(inode)) { 4837 ret = PTR_ERR(inode); 4838 if (ret == -ENOENT) { 4839 inode = btrfs_iget(fs_info->sb, parent, root); 4840 if (IS_ERR(inode)) { 4841 ret = PTR_ERR(inode); 4842 } else { 4843 ret = btrfs_log_inode(trans, root, 4844 BTRFS_I(inode), 4845 LOG_OTHER_INODE_ALL, 4846 0, LLONG_MAX, ctx); 4847 btrfs_add_delayed_iput(inode); 4848 } 4849 } 4850 continue; 4851 } 4852 /* 4853 * If the inode was already logged skip it - otherwise we can 4854 * hit an infinite loop. Example: 4855 * 4856 * From the commit root (previous transaction) we have the 4857 * following inodes: 4858 * 4859 * inode 257 a directory 4860 * inode 258 with references "zz" and "zz_link" on inode 257 4861 * inode 259 with reference "a" on inode 257 4862 * 4863 * And in the current (uncommitted) transaction we have: 4864 * 4865 * inode 257 a directory, unchanged 4866 * inode 258 with references "a" and "a2" on inode 257 4867 * inode 259 with reference "zz_link" on inode 257 4868 * inode 261 with reference "zz" on inode 257 4869 * 4870 * When logging inode 261 the following infinite loop could 4871 * happen if we don't skip already logged inodes: 4872 * 4873 * - we detect inode 258 as a conflicting inode, with inode 261 4874 * on reference "zz", and log it; 4875 * 4876 * - we detect inode 259 as a conflicting inode, with inode 258 4877 * on reference "a", and log it; 4878 * 4879 * - we detect inode 258 as a conflicting inode, with inode 259 4880 * on reference "zz_link", and log it - again! After this we 4881 * repeat the above steps forever. 4882 */ 4883 spin_lock(&BTRFS_I(inode)->lock); 4884 /* 4885 * Check the inode's logged_trans only instead of 4886 * btrfs_inode_in_log(). This is because the last_log_commit of 4887 * the inode is not updated when we only log that it exists and 4888 * and it has the full sync bit set (see btrfs_log_inode()). 4889 */ 4890 if (BTRFS_I(inode)->logged_trans == trans->transid) { 4891 spin_unlock(&BTRFS_I(inode)->lock); 4892 btrfs_add_delayed_iput(inode); 4893 continue; 4894 } 4895 spin_unlock(&BTRFS_I(inode)->lock); 4896 /* 4897 * We are safe logging the other inode without acquiring its 4898 * lock as long as we log with the LOG_INODE_EXISTS mode. We 4899 * are safe against concurrent renames of the other inode as 4900 * well because during a rename we pin the log and update the 4901 * log with the new name before we unpin it. 4902 */ 4903 ret = btrfs_log_inode(trans, root, BTRFS_I(inode), 4904 LOG_OTHER_INODE, 0, LLONG_MAX, ctx); 4905 if (ret) { 4906 btrfs_add_delayed_iput(inode); 4907 continue; 4908 } 4909 4910 key.objectid = ino; 4911 key.type = BTRFS_INODE_REF_KEY; 4912 key.offset = 0; 4913 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4914 if (ret < 0) { 4915 btrfs_add_delayed_iput(inode); 4916 continue; 4917 } 4918 4919 while (true) { 4920 struct extent_buffer *leaf = path->nodes[0]; 4921 int slot = path->slots[0]; 4922 u64 other_ino = 0; 4923 u64 other_parent = 0; 4924 4925 if (slot >= btrfs_header_nritems(leaf)) { 4926 ret = btrfs_next_leaf(root, path); 4927 if (ret < 0) { 4928 break; 4929 } else if (ret > 0) { 4930 ret = 0; 4931 break; 4932 } 4933 continue; 4934 } 4935 4936 btrfs_item_key_to_cpu(leaf, &key, slot); 4937 if (key.objectid != ino || 4938 (key.type != BTRFS_INODE_REF_KEY && 4939 key.type != BTRFS_INODE_EXTREF_KEY)) { 4940 ret = 0; 4941 break; 4942 } 4943 4944 ret = btrfs_check_ref_name_override(leaf, slot, &key, 4945 BTRFS_I(inode), &other_ino, 4946 &other_parent); 4947 if (ret < 0) 4948 break; 4949 if (ret > 0) { 4950 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 4951 if (!ino_elem) { 4952 ret = -ENOMEM; 4953 break; 4954 } 4955 ino_elem->ino = other_ino; 4956 ino_elem->parent = other_parent; 4957 list_add_tail(&ino_elem->list, &inode_list); 4958 ret = 0; 4959 } 4960 path->slots[0]++; 4961 } 4962 btrfs_add_delayed_iput(inode); 4963 } 4964 4965 return ret; 4966 } 4967 4968 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 4969 struct btrfs_inode *inode, 4970 struct btrfs_key *min_key, 4971 const struct btrfs_key *max_key, 4972 struct btrfs_path *path, 4973 struct btrfs_path *dst_path, 4974 const u64 logged_isize, 4975 const bool recursive_logging, 4976 const int inode_only, 4977 struct btrfs_log_ctx *ctx, 4978 bool *need_log_inode_item) 4979 { 4980 struct btrfs_root *root = inode->root; 4981 int ins_start_slot = 0; 4982 int ins_nr = 0; 4983 int ret; 4984 4985 while (1) { 4986 ret = btrfs_search_forward(root, min_key, path, trans->transid); 4987 if (ret < 0) 4988 return ret; 4989 if (ret > 0) { 4990 ret = 0; 4991 break; 4992 } 4993 again: 4994 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 4995 if (min_key->objectid != max_key->objectid) 4996 break; 4997 if (min_key->type > max_key->type) 4998 break; 4999 5000 if (min_key->type == BTRFS_INODE_ITEM_KEY) 5001 *need_log_inode_item = false; 5002 5003 if ((min_key->type == BTRFS_INODE_REF_KEY || 5004 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5005 inode->generation == trans->transid && 5006 !recursive_logging) { 5007 u64 other_ino = 0; 5008 u64 other_parent = 0; 5009 5010 ret = btrfs_check_ref_name_override(path->nodes[0], 5011 path->slots[0], min_key, inode, 5012 &other_ino, &other_parent); 5013 if (ret < 0) { 5014 return ret; 5015 } else if (ret > 0 && ctx && 5016 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5017 if (ins_nr > 0) { 5018 ins_nr++; 5019 } else { 5020 ins_nr = 1; 5021 ins_start_slot = path->slots[0]; 5022 } 5023 ret = copy_items(trans, inode, dst_path, path, 5024 ins_start_slot, ins_nr, 5025 inode_only, logged_isize); 5026 if (ret < 0) 5027 return ret; 5028 ins_nr = 0; 5029 5030 ret = log_conflicting_inodes(trans, root, path, 5031 ctx, other_ino, other_parent); 5032 if (ret) 5033 return ret; 5034 btrfs_release_path(path); 5035 goto next_key; 5036 } 5037 } 5038 5039 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 5040 if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5041 if (ins_nr == 0) 5042 goto next_slot; 5043 ret = copy_items(trans, inode, dst_path, path, 5044 ins_start_slot, 5045 ins_nr, inode_only, logged_isize); 5046 if (ret < 0) 5047 return ret; 5048 ins_nr = 0; 5049 goto next_slot; 5050 } 5051 5052 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5053 ins_nr++; 5054 goto next_slot; 5055 } else if (!ins_nr) { 5056 ins_start_slot = path->slots[0]; 5057 ins_nr = 1; 5058 goto next_slot; 5059 } 5060 5061 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5062 ins_nr, inode_only, logged_isize); 5063 if (ret < 0) 5064 return ret; 5065 ins_nr = 1; 5066 ins_start_slot = path->slots[0]; 5067 next_slot: 5068 path->slots[0]++; 5069 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5070 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5071 path->slots[0]); 5072 goto again; 5073 } 5074 if (ins_nr) { 5075 ret = copy_items(trans, inode, dst_path, path, 5076 ins_start_slot, ins_nr, inode_only, 5077 logged_isize); 5078 if (ret < 0) 5079 return ret; 5080 ins_nr = 0; 5081 } 5082 btrfs_release_path(path); 5083 next_key: 5084 if (min_key->offset < (u64)-1) { 5085 min_key->offset++; 5086 } else if (min_key->type < max_key->type) { 5087 min_key->type++; 5088 min_key->offset = 0; 5089 } else { 5090 break; 5091 } 5092 } 5093 if (ins_nr) 5094 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5095 ins_nr, inode_only, logged_isize); 5096 5097 return ret; 5098 } 5099 5100 /* log a single inode in the tree log. 5101 * At least one parent directory for this inode must exist in the tree 5102 * or be logged already. 5103 * 5104 * Any items from this inode changed by the current transaction are copied 5105 * to the log tree. An extra reference is taken on any extents in this 5106 * file, allowing us to avoid a whole pile of corner cases around logging 5107 * blocks that have been removed from the tree. 5108 * 5109 * See LOG_INODE_ALL and related defines for a description of what inode_only 5110 * does. 5111 * 5112 * This handles both files and directories. 5113 */ 5114 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5115 struct btrfs_root *root, struct btrfs_inode *inode, 5116 int inode_only, 5117 const loff_t start, 5118 const loff_t end, 5119 struct btrfs_log_ctx *ctx) 5120 { 5121 struct btrfs_fs_info *fs_info = root->fs_info; 5122 struct btrfs_path *path; 5123 struct btrfs_path *dst_path; 5124 struct btrfs_key min_key; 5125 struct btrfs_key max_key; 5126 struct btrfs_root *log = root->log_root; 5127 int err = 0; 5128 int ret; 5129 bool fast_search = false; 5130 u64 ino = btrfs_ino(inode); 5131 struct extent_map_tree *em_tree = &inode->extent_tree; 5132 u64 logged_isize = 0; 5133 bool need_log_inode_item = true; 5134 bool xattrs_logged = false; 5135 bool recursive_logging = false; 5136 5137 path = btrfs_alloc_path(); 5138 if (!path) 5139 return -ENOMEM; 5140 dst_path = btrfs_alloc_path(); 5141 if (!dst_path) { 5142 btrfs_free_path(path); 5143 return -ENOMEM; 5144 } 5145 5146 min_key.objectid = ino; 5147 min_key.type = BTRFS_INODE_ITEM_KEY; 5148 min_key.offset = 0; 5149 5150 max_key.objectid = ino; 5151 5152 5153 /* today the code can only do partial logging of directories */ 5154 if (S_ISDIR(inode->vfs_inode.i_mode) || 5155 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5156 &inode->runtime_flags) && 5157 inode_only >= LOG_INODE_EXISTS)) 5158 max_key.type = BTRFS_XATTR_ITEM_KEY; 5159 else 5160 max_key.type = (u8)-1; 5161 max_key.offset = (u64)-1; 5162 5163 /* 5164 * Only run delayed items if we are a dir or a new file. 5165 * Otherwise commit the delayed inode only, which is needed in 5166 * order for the log replay code to mark inodes for link count 5167 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 5168 */ 5169 if (S_ISDIR(inode->vfs_inode.i_mode) || 5170 inode->generation > fs_info->last_trans_committed) 5171 ret = btrfs_commit_inode_delayed_items(trans, inode); 5172 else 5173 ret = btrfs_commit_inode_delayed_inode(inode); 5174 5175 if (ret) { 5176 btrfs_free_path(path); 5177 btrfs_free_path(dst_path); 5178 return ret; 5179 } 5180 5181 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { 5182 recursive_logging = true; 5183 if (inode_only == LOG_OTHER_INODE) 5184 inode_only = LOG_INODE_EXISTS; 5185 else 5186 inode_only = LOG_INODE_ALL; 5187 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 5188 } else { 5189 mutex_lock(&inode->log_mutex); 5190 } 5191 5192 /* 5193 * a brute force approach to making sure we get the most uptodate 5194 * copies of everything. 5195 */ 5196 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5197 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 5198 5199 if (inode_only == LOG_INODE_EXISTS) 5200 max_key_type = BTRFS_XATTR_ITEM_KEY; 5201 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 5202 } else { 5203 if (inode_only == LOG_INODE_EXISTS) { 5204 /* 5205 * Make sure the new inode item we write to the log has 5206 * the same isize as the current one (if it exists). 5207 * This is necessary to prevent data loss after log 5208 * replay, and also to prevent doing a wrong expanding 5209 * truncate - for e.g. create file, write 4K into offset 5210 * 0, fsync, write 4K into offset 4096, add hard link, 5211 * fsync some other file (to sync log), power fail - if 5212 * we use the inode's current i_size, after log replay 5213 * we get a 8Kb file, with the last 4Kb extent as a hole 5214 * (zeroes), as if an expanding truncate happened, 5215 * instead of getting a file of 4Kb only. 5216 */ 5217 err = logged_inode_size(log, inode, path, &logged_isize); 5218 if (err) 5219 goto out_unlock; 5220 } 5221 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5222 &inode->runtime_flags)) { 5223 if (inode_only == LOG_INODE_EXISTS) { 5224 max_key.type = BTRFS_XATTR_ITEM_KEY; 5225 ret = drop_objectid_items(trans, log, path, ino, 5226 max_key.type); 5227 } else { 5228 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5229 &inode->runtime_flags); 5230 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5231 &inode->runtime_flags); 5232 while(1) { 5233 ret = btrfs_truncate_inode_items(trans, 5234 log, &inode->vfs_inode, 0, 0); 5235 if (ret != -EAGAIN) 5236 break; 5237 } 5238 } 5239 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5240 &inode->runtime_flags) || 5241 inode_only == LOG_INODE_EXISTS) { 5242 if (inode_only == LOG_INODE_ALL) 5243 fast_search = true; 5244 max_key.type = BTRFS_XATTR_ITEM_KEY; 5245 ret = drop_objectid_items(trans, log, path, ino, 5246 max_key.type); 5247 } else { 5248 if (inode_only == LOG_INODE_ALL) 5249 fast_search = true; 5250 goto log_extents; 5251 } 5252 5253 } 5254 if (ret) { 5255 err = ret; 5256 goto out_unlock; 5257 } 5258 5259 err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5260 path, dst_path, logged_isize, 5261 recursive_logging, inode_only, ctx, 5262 &need_log_inode_item); 5263 if (err) 5264 goto out_unlock; 5265 5266 btrfs_release_path(path); 5267 btrfs_release_path(dst_path); 5268 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5269 if (err) 5270 goto out_unlock; 5271 xattrs_logged = true; 5272 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5273 btrfs_release_path(path); 5274 btrfs_release_path(dst_path); 5275 err = btrfs_log_holes(trans, root, inode, path); 5276 if (err) 5277 goto out_unlock; 5278 } 5279 log_extents: 5280 btrfs_release_path(path); 5281 btrfs_release_path(dst_path); 5282 if (need_log_inode_item) { 5283 err = log_inode_item(trans, log, dst_path, inode); 5284 if (!err && !xattrs_logged) { 5285 err = btrfs_log_all_xattrs(trans, root, inode, path, 5286 dst_path); 5287 btrfs_release_path(path); 5288 } 5289 if (err) 5290 goto out_unlock; 5291 } 5292 if (fast_search) { 5293 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5294 ctx, start, end); 5295 if (ret) { 5296 err = ret; 5297 goto out_unlock; 5298 } 5299 } else if (inode_only == LOG_INODE_ALL) { 5300 struct extent_map *em, *n; 5301 5302 write_lock(&em_tree->lock); 5303 /* 5304 * We can't just remove every em if we're called for a ranged 5305 * fsync - that is, one that doesn't cover the whole possible 5306 * file range (0 to LLONG_MAX). This is because we can have 5307 * em's that fall outside the range we're logging and therefore 5308 * their ordered operations haven't completed yet 5309 * (btrfs_finish_ordered_io() not invoked yet). This means we 5310 * didn't get their respective file extent item in the fs/subvol 5311 * tree yet, and need to let the next fast fsync (one which 5312 * consults the list of modified extent maps) find the em so 5313 * that it logs a matching file extent item and waits for the 5314 * respective ordered operation to complete (if it's still 5315 * running). 5316 * 5317 * Removing every em outside the range we're logging would make 5318 * the next fast fsync not log their matching file extent items, 5319 * therefore making us lose data after a log replay. 5320 */ 5321 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5322 list) { 5323 const u64 mod_end = em->mod_start + em->mod_len - 1; 5324 5325 if (em->mod_start >= start && mod_end <= end) 5326 list_del_init(&em->list); 5327 } 5328 write_unlock(&em_tree->lock); 5329 } 5330 5331 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5332 ret = log_directory_changes(trans, root, inode, path, dst_path, 5333 ctx); 5334 if (ret) { 5335 err = ret; 5336 goto out_unlock; 5337 } 5338 } 5339 5340 /* 5341 * Don't update last_log_commit if we logged that an inode exists after 5342 * it was loaded to memory (full_sync bit set). 5343 * This is to prevent data loss when we do a write to the inode, then 5344 * the inode gets evicted after all delalloc was flushed, then we log 5345 * it exists (due to a rename for example) and then fsync it. This last 5346 * fsync would do nothing (not logging the extents previously written). 5347 */ 5348 spin_lock(&inode->lock); 5349 inode->logged_trans = trans->transid; 5350 if (inode_only != LOG_INODE_EXISTS || 5351 !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) 5352 inode->last_log_commit = inode->last_sub_trans; 5353 spin_unlock(&inode->lock); 5354 out_unlock: 5355 mutex_unlock(&inode->log_mutex); 5356 5357 btrfs_free_path(path); 5358 btrfs_free_path(dst_path); 5359 return err; 5360 } 5361 5362 /* 5363 * Check if we must fallback to a transaction commit when logging an inode. 5364 * This must be called after logging the inode and is used only in the context 5365 * when fsyncing an inode requires the need to log some other inode - in which 5366 * case we can't lock the i_mutex of each other inode we need to log as that 5367 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5368 * log inodes up or down in the hierarchy) or rename operations for example. So 5369 * we take the log_mutex of the inode after we have logged it and then check for 5370 * its last_unlink_trans value - this is safe because any task setting 5371 * last_unlink_trans must take the log_mutex and it must do this before it does 5372 * the actual unlink operation, so if we do this check before a concurrent task 5373 * sets last_unlink_trans it means we've logged a consistent version/state of 5374 * all the inode items, otherwise we are not sure and must do a transaction 5375 * commit (the concurrent task might have only updated last_unlink_trans before 5376 * we logged the inode or it might have also done the unlink). 5377 */ 5378 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5379 struct btrfs_inode *inode) 5380 { 5381 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5382 bool ret = false; 5383 5384 mutex_lock(&inode->log_mutex); 5385 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5386 /* 5387 * Make sure any commits to the log are forced to be full 5388 * commits. 5389 */ 5390 btrfs_set_log_full_commit(trans); 5391 ret = true; 5392 } 5393 mutex_unlock(&inode->log_mutex); 5394 5395 return ret; 5396 } 5397 5398 /* 5399 * follow the dentry parent pointers up the chain and see if any 5400 * of the directories in it require a full commit before they can 5401 * be logged. Returns zero if nothing special needs to be done or 1 if 5402 * a full commit is required. 5403 */ 5404 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5405 struct btrfs_inode *inode, 5406 struct dentry *parent, 5407 struct super_block *sb, 5408 u64 last_committed) 5409 { 5410 int ret = 0; 5411 struct dentry *old_parent = NULL; 5412 5413 /* 5414 * for regular files, if its inode is already on disk, we don't 5415 * have to worry about the parents at all. This is because 5416 * we can use the last_unlink_trans field to record renames 5417 * and other fun in this file. 5418 */ 5419 if (S_ISREG(inode->vfs_inode.i_mode) && 5420 inode->generation <= last_committed && 5421 inode->last_unlink_trans <= last_committed) 5422 goto out; 5423 5424 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5425 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5426 goto out; 5427 inode = BTRFS_I(d_inode(parent)); 5428 } 5429 5430 while (1) { 5431 if (btrfs_must_commit_transaction(trans, inode)) { 5432 ret = 1; 5433 break; 5434 } 5435 5436 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5437 break; 5438 5439 if (IS_ROOT(parent)) { 5440 inode = BTRFS_I(d_inode(parent)); 5441 if (btrfs_must_commit_transaction(trans, inode)) 5442 ret = 1; 5443 break; 5444 } 5445 5446 parent = dget_parent(parent); 5447 dput(old_parent); 5448 old_parent = parent; 5449 inode = BTRFS_I(d_inode(parent)); 5450 5451 } 5452 dput(old_parent); 5453 out: 5454 return ret; 5455 } 5456 5457 struct btrfs_dir_list { 5458 u64 ino; 5459 struct list_head list; 5460 }; 5461 5462 /* 5463 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5464 * details about the why it is needed. 5465 * This is a recursive operation - if an existing dentry corresponds to a 5466 * directory, that directory's new entries are logged too (same behaviour as 5467 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5468 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5469 * complains about the following circular lock dependency / possible deadlock: 5470 * 5471 * CPU0 CPU1 5472 * ---- ---- 5473 * lock(&type->i_mutex_dir_key#3/2); 5474 * lock(sb_internal#2); 5475 * lock(&type->i_mutex_dir_key#3/2); 5476 * lock(&sb->s_type->i_mutex_key#14); 5477 * 5478 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5479 * sb_start_intwrite() in btrfs_start_transaction(). 5480 * Not locking i_mutex of the inodes is still safe because: 5481 * 5482 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5483 * that while logging the inode new references (names) are added or removed 5484 * from the inode, leaving the logged inode item with a link count that does 5485 * not match the number of logged inode reference items. This is fine because 5486 * at log replay time we compute the real number of links and correct the 5487 * link count in the inode item (see replay_one_buffer() and 5488 * link_to_fixup_dir()); 5489 * 5490 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5491 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5492 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5493 * has a size that doesn't match the sum of the lengths of all the logged 5494 * names. This does not result in a problem because if a dir_item key is 5495 * logged but its matching dir_index key is not logged, at log replay time we 5496 * don't use it to replay the respective name (see replay_one_name()). On the 5497 * other hand if only the dir_index key ends up being logged, the respective 5498 * name is added to the fs/subvol tree with both the dir_item and dir_index 5499 * keys created (see replay_one_name()). 5500 * The directory's inode item with a wrong i_size is not a problem as well, 5501 * since we don't use it at log replay time to set the i_size in the inode 5502 * item of the fs/subvol tree (see overwrite_item()). 5503 */ 5504 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5505 struct btrfs_root *root, 5506 struct btrfs_inode *start_inode, 5507 struct btrfs_log_ctx *ctx) 5508 { 5509 struct btrfs_fs_info *fs_info = root->fs_info; 5510 struct btrfs_root *log = root->log_root; 5511 struct btrfs_path *path; 5512 LIST_HEAD(dir_list); 5513 struct btrfs_dir_list *dir_elem; 5514 int ret = 0; 5515 5516 path = btrfs_alloc_path(); 5517 if (!path) 5518 return -ENOMEM; 5519 5520 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5521 if (!dir_elem) { 5522 btrfs_free_path(path); 5523 return -ENOMEM; 5524 } 5525 dir_elem->ino = btrfs_ino(start_inode); 5526 list_add_tail(&dir_elem->list, &dir_list); 5527 5528 while (!list_empty(&dir_list)) { 5529 struct extent_buffer *leaf; 5530 struct btrfs_key min_key; 5531 int nritems; 5532 int i; 5533 5534 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5535 list); 5536 if (ret) 5537 goto next_dir_inode; 5538 5539 min_key.objectid = dir_elem->ino; 5540 min_key.type = BTRFS_DIR_ITEM_KEY; 5541 min_key.offset = 0; 5542 again: 5543 btrfs_release_path(path); 5544 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5545 if (ret < 0) { 5546 goto next_dir_inode; 5547 } else if (ret > 0) { 5548 ret = 0; 5549 goto next_dir_inode; 5550 } 5551 5552 process_leaf: 5553 leaf = path->nodes[0]; 5554 nritems = btrfs_header_nritems(leaf); 5555 for (i = path->slots[0]; i < nritems; i++) { 5556 struct btrfs_dir_item *di; 5557 struct btrfs_key di_key; 5558 struct inode *di_inode; 5559 struct btrfs_dir_list *new_dir_elem; 5560 int log_mode = LOG_INODE_EXISTS; 5561 int type; 5562 5563 btrfs_item_key_to_cpu(leaf, &min_key, i); 5564 if (min_key.objectid != dir_elem->ino || 5565 min_key.type != BTRFS_DIR_ITEM_KEY) 5566 goto next_dir_inode; 5567 5568 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5569 type = btrfs_dir_type(leaf, di); 5570 if (btrfs_dir_transid(leaf, di) < trans->transid && 5571 type != BTRFS_FT_DIR) 5572 continue; 5573 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5574 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5575 continue; 5576 5577 btrfs_release_path(path); 5578 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 5579 if (IS_ERR(di_inode)) { 5580 ret = PTR_ERR(di_inode); 5581 goto next_dir_inode; 5582 } 5583 5584 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5585 btrfs_add_delayed_iput(di_inode); 5586 break; 5587 } 5588 5589 ctx->log_new_dentries = false; 5590 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5591 log_mode = LOG_INODE_ALL; 5592 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5593 log_mode, 0, LLONG_MAX, ctx); 5594 if (!ret && 5595 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5596 ret = 1; 5597 btrfs_add_delayed_iput(di_inode); 5598 if (ret) 5599 goto next_dir_inode; 5600 if (ctx->log_new_dentries) { 5601 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5602 GFP_NOFS); 5603 if (!new_dir_elem) { 5604 ret = -ENOMEM; 5605 goto next_dir_inode; 5606 } 5607 new_dir_elem->ino = di_key.objectid; 5608 list_add_tail(&new_dir_elem->list, &dir_list); 5609 } 5610 break; 5611 } 5612 if (i == nritems) { 5613 ret = btrfs_next_leaf(log, path); 5614 if (ret < 0) { 5615 goto next_dir_inode; 5616 } else if (ret > 0) { 5617 ret = 0; 5618 goto next_dir_inode; 5619 } 5620 goto process_leaf; 5621 } 5622 if (min_key.offset < (u64)-1) { 5623 min_key.offset++; 5624 goto again; 5625 } 5626 next_dir_inode: 5627 list_del(&dir_elem->list); 5628 kfree(dir_elem); 5629 } 5630 5631 btrfs_free_path(path); 5632 return ret; 5633 } 5634 5635 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5636 struct btrfs_inode *inode, 5637 struct btrfs_log_ctx *ctx) 5638 { 5639 struct btrfs_fs_info *fs_info = trans->fs_info; 5640 int ret; 5641 struct btrfs_path *path; 5642 struct btrfs_key key; 5643 struct btrfs_root *root = inode->root; 5644 const u64 ino = btrfs_ino(inode); 5645 5646 path = btrfs_alloc_path(); 5647 if (!path) 5648 return -ENOMEM; 5649 path->skip_locking = 1; 5650 path->search_commit_root = 1; 5651 5652 key.objectid = ino; 5653 key.type = BTRFS_INODE_REF_KEY; 5654 key.offset = 0; 5655 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5656 if (ret < 0) 5657 goto out; 5658 5659 while (true) { 5660 struct extent_buffer *leaf = path->nodes[0]; 5661 int slot = path->slots[0]; 5662 u32 cur_offset = 0; 5663 u32 item_size; 5664 unsigned long ptr; 5665 5666 if (slot >= btrfs_header_nritems(leaf)) { 5667 ret = btrfs_next_leaf(root, path); 5668 if (ret < 0) 5669 goto out; 5670 else if (ret > 0) 5671 break; 5672 continue; 5673 } 5674 5675 btrfs_item_key_to_cpu(leaf, &key, slot); 5676 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5677 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5678 break; 5679 5680 item_size = btrfs_item_size_nr(leaf, slot); 5681 ptr = btrfs_item_ptr_offset(leaf, slot); 5682 while (cur_offset < item_size) { 5683 struct btrfs_key inode_key; 5684 struct inode *dir_inode; 5685 5686 inode_key.type = BTRFS_INODE_ITEM_KEY; 5687 inode_key.offset = 0; 5688 5689 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5690 struct btrfs_inode_extref *extref; 5691 5692 extref = (struct btrfs_inode_extref *) 5693 (ptr + cur_offset); 5694 inode_key.objectid = btrfs_inode_extref_parent( 5695 leaf, extref); 5696 cur_offset += sizeof(*extref); 5697 cur_offset += btrfs_inode_extref_name_len(leaf, 5698 extref); 5699 } else { 5700 inode_key.objectid = key.offset; 5701 cur_offset = item_size; 5702 } 5703 5704 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 5705 root); 5706 /* 5707 * If the parent inode was deleted, return an error to 5708 * fallback to a transaction commit. This is to prevent 5709 * getting an inode that was moved from one parent A to 5710 * a parent B, got its former parent A deleted and then 5711 * it got fsync'ed, from existing at both parents after 5712 * a log replay (and the old parent still existing). 5713 * Example: 5714 * 5715 * mkdir /mnt/A 5716 * mkdir /mnt/B 5717 * touch /mnt/B/bar 5718 * sync 5719 * mv /mnt/B/bar /mnt/A/bar 5720 * mv -T /mnt/A /mnt/B 5721 * fsync /mnt/B/bar 5722 * <power fail> 5723 * 5724 * If we ignore the old parent B which got deleted, 5725 * after a log replay we would have file bar linked 5726 * at both parents and the old parent B would still 5727 * exist. 5728 */ 5729 if (IS_ERR(dir_inode)) { 5730 ret = PTR_ERR(dir_inode); 5731 goto out; 5732 } 5733 5734 if (ctx) 5735 ctx->log_new_dentries = false; 5736 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5737 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5738 if (!ret && 5739 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5740 ret = 1; 5741 if (!ret && ctx && ctx->log_new_dentries) 5742 ret = log_new_dir_dentries(trans, root, 5743 BTRFS_I(dir_inode), ctx); 5744 btrfs_add_delayed_iput(dir_inode); 5745 if (ret) 5746 goto out; 5747 } 5748 path->slots[0]++; 5749 } 5750 ret = 0; 5751 out: 5752 btrfs_free_path(path); 5753 return ret; 5754 } 5755 5756 static int log_new_ancestors(struct btrfs_trans_handle *trans, 5757 struct btrfs_root *root, 5758 struct btrfs_path *path, 5759 struct btrfs_log_ctx *ctx) 5760 { 5761 struct btrfs_key found_key; 5762 5763 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 5764 5765 while (true) { 5766 struct btrfs_fs_info *fs_info = root->fs_info; 5767 const u64 last_committed = fs_info->last_trans_committed; 5768 struct extent_buffer *leaf = path->nodes[0]; 5769 int slot = path->slots[0]; 5770 struct btrfs_key search_key; 5771 struct inode *inode; 5772 u64 ino; 5773 int ret = 0; 5774 5775 btrfs_release_path(path); 5776 5777 ino = found_key.offset; 5778 5779 search_key.objectid = found_key.offset; 5780 search_key.type = BTRFS_INODE_ITEM_KEY; 5781 search_key.offset = 0; 5782 inode = btrfs_iget(fs_info->sb, ino, root); 5783 if (IS_ERR(inode)) 5784 return PTR_ERR(inode); 5785 5786 if (BTRFS_I(inode)->generation > last_committed) 5787 ret = btrfs_log_inode(trans, root, BTRFS_I(inode), 5788 LOG_INODE_EXISTS, 5789 0, LLONG_MAX, ctx); 5790 btrfs_add_delayed_iput(inode); 5791 if (ret) 5792 return ret; 5793 5794 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 5795 break; 5796 5797 search_key.type = BTRFS_INODE_REF_KEY; 5798 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 5799 if (ret < 0) 5800 return ret; 5801 5802 leaf = path->nodes[0]; 5803 slot = path->slots[0]; 5804 if (slot >= btrfs_header_nritems(leaf)) { 5805 ret = btrfs_next_leaf(root, path); 5806 if (ret < 0) 5807 return ret; 5808 else if (ret > 0) 5809 return -ENOENT; 5810 leaf = path->nodes[0]; 5811 slot = path->slots[0]; 5812 } 5813 5814 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5815 if (found_key.objectid != search_key.objectid || 5816 found_key.type != BTRFS_INODE_REF_KEY) 5817 return -ENOENT; 5818 } 5819 return 0; 5820 } 5821 5822 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 5823 struct btrfs_inode *inode, 5824 struct dentry *parent, 5825 struct btrfs_log_ctx *ctx) 5826 { 5827 struct btrfs_root *root = inode->root; 5828 struct btrfs_fs_info *fs_info = root->fs_info; 5829 struct dentry *old_parent = NULL; 5830 struct super_block *sb = inode->vfs_inode.i_sb; 5831 int ret = 0; 5832 5833 while (true) { 5834 if (!parent || d_really_is_negative(parent) || 5835 sb != parent->d_sb) 5836 break; 5837 5838 inode = BTRFS_I(d_inode(parent)); 5839 if (root != inode->root) 5840 break; 5841 5842 if (inode->generation > fs_info->last_trans_committed) { 5843 ret = btrfs_log_inode(trans, root, inode, 5844 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5845 if (ret) 5846 break; 5847 } 5848 if (IS_ROOT(parent)) 5849 break; 5850 5851 parent = dget_parent(parent); 5852 dput(old_parent); 5853 old_parent = parent; 5854 } 5855 dput(old_parent); 5856 5857 return ret; 5858 } 5859 5860 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 5861 struct btrfs_inode *inode, 5862 struct dentry *parent, 5863 struct btrfs_log_ctx *ctx) 5864 { 5865 struct btrfs_root *root = inode->root; 5866 const u64 ino = btrfs_ino(inode); 5867 struct btrfs_path *path; 5868 struct btrfs_key search_key; 5869 int ret; 5870 5871 /* 5872 * For a single hard link case, go through a fast path that does not 5873 * need to iterate the fs/subvolume tree. 5874 */ 5875 if (inode->vfs_inode.i_nlink < 2) 5876 return log_new_ancestors_fast(trans, inode, parent, ctx); 5877 5878 path = btrfs_alloc_path(); 5879 if (!path) 5880 return -ENOMEM; 5881 5882 search_key.objectid = ino; 5883 search_key.type = BTRFS_INODE_REF_KEY; 5884 search_key.offset = 0; 5885 again: 5886 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 5887 if (ret < 0) 5888 goto out; 5889 if (ret == 0) 5890 path->slots[0]++; 5891 5892 while (true) { 5893 struct extent_buffer *leaf = path->nodes[0]; 5894 int slot = path->slots[0]; 5895 struct btrfs_key found_key; 5896 5897 if (slot >= btrfs_header_nritems(leaf)) { 5898 ret = btrfs_next_leaf(root, path); 5899 if (ret < 0) 5900 goto out; 5901 else if (ret > 0) 5902 break; 5903 continue; 5904 } 5905 5906 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5907 if (found_key.objectid != ino || 5908 found_key.type > BTRFS_INODE_EXTREF_KEY) 5909 break; 5910 5911 /* 5912 * Don't deal with extended references because they are rare 5913 * cases and too complex to deal with (we would need to keep 5914 * track of which subitem we are processing for each item in 5915 * this loop, etc). So just return some error to fallback to 5916 * a transaction commit. 5917 */ 5918 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 5919 ret = -EMLINK; 5920 goto out; 5921 } 5922 5923 /* 5924 * Logging ancestors needs to do more searches on the fs/subvol 5925 * tree, so it releases the path as needed to avoid deadlocks. 5926 * Keep track of the last inode ref key and resume from that key 5927 * after logging all new ancestors for the current hard link. 5928 */ 5929 memcpy(&search_key, &found_key, sizeof(search_key)); 5930 5931 ret = log_new_ancestors(trans, root, path, ctx); 5932 if (ret) 5933 goto out; 5934 btrfs_release_path(path); 5935 goto again; 5936 } 5937 ret = 0; 5938 out: 5939 btrfs_free_path(path); 5940 return ret; 5941 } 5942 5943 /* 5944 * helper function around btrfs_log_inode to make sure newly created 5945 * parent directories also end up in the log. A minimal inode and backref 5946 * only logging is done of any parent directories that are older than 5947 * the last committed transaction 5948 */ 5949 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5950 struct btrfs_inode *inode, 5951 struct dentry *parent, 5952 const loff_t start, 5953 const loff_t end, 5954 int inode_only, 5955 struct btrfs_log_ctx *ctx) 5956 { 5957 struct btrfs_root *root = inode->root; 5958 struct btrfs_fs_info *fs_info = root->fs_info; 5959 struct super_block *sb; 5960 int ret = 0; 5961 u64 last_committed = fs_info->last_trans_committed; 5962 bool log_dentries = false; 5963 5964 sb = inode->vfs_inode.i_sb; 5965 5966 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5967 ret = 1; 5968 goto end_no_trans; 5969 } 5970 5971 /* 5972 * The prev transaction commit doesn't complete, we need do 5973 * full commit by ourselves. 5974 */ 5975 if (fs_info->last_trans_log_full_commit > 5976 fs_info->last_trans_committed) { 5977 ret = 1; 5978 goto end_no_trans; 5979 } 5980 5981 if (btrfs_root_refs(&root->root_item) == 0) { 5982 ret = 1; 5983 goto end_no_trans; 5984 } 5985 5986 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5987 last_committed); 5988 if (ret) 5989 goto end_no_trans; 5990 5991 /* 5992 * Skip already logged inodes or inodes corresponding to tmpfiles 5993 * (since logging them is pointless, a link count of 0 means they 5994 * will never be accessible). 5995 */ 5996 if (btrfs_inode_in_log(inode, trans->transid) || 5997 inode->vfs_inode.i_nlink == 0) { 5998 ret = BTRFS_NO_LOG_SYNC; 5999 goto end_no_trans; 6000 } 6001 6002 ret = start_log_trans(trans, root, ctx); 6003 if (ret) 6004 goto end_no_trans; 6005 6006 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 6007 if (ret) 6008 goto end_trans; 6009 6010 /* 6011 * for regular files, if its inode is already on disk, we don't 6012 * have to worry about the parents at all. This is because 6013 * we can use the last_unlink_trans field to record renames 6014 * and other fun in this file. 6015 */ 6016 if (S_ISREG(inode->vfs_inode.i_mode) && 6017 inode->generation <= last_committed && 6018 inode->last_unlink_trans <= last_committed) { 6019 ret = 0; 6020 goto end_trans; 6021 } 6022 6023 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 6024 log_dentries = true; 6025 6026 /* 6027 * On unlink we must make sure all our current and old parent directory 6028 * inodes are fully logged. This is to prevent leaving dangling 6029 * directory index entries in directories that were our parents but are 6030 * not anymore. Not doing this results in old parent directory being 6031 * impossible to delete after log replay (rmdir will always fail with 6032 * error -ENOTEMPTY). 6033 * 6034 * Example 1: 6035 * 6036 * mkdir testdir 6037 * touch testdir/foo 6038 * ln testdir/foo testdir/bar 6039 * sync 6040 * unlink testdir/bar 6041 * xfs_io -c fsync testdir/foo 6042 * <power failure> 6043 * mount fs, triggers log replay 6044 * 6045 * If we don't log the parent directory (testdir), after log replay the 6046 * directory still has an entry pointing to the file inode using the bar 6047 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 6048 * the file inode has a link count of 1. 6049 * 6050 * Example 2: 6051 * 6052 * mkdir testdir 6053 * touch foo 6054 * ln foo testdir/foo2 6055 * ln foo testdir/foo3 6056 * sync 6057 * unlink testdir/foo3 6058 * xfs_io -c fsync foo 6059 * <power failure> 6060 * mount fs, triggers log replay 6061 * 6062 * Similar as the first example, after log replay the parent directory 6063 * testdir still has an entry pointing to the inode file with name foo3 6064 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 6065 * and has a link count of 2. 6066 */ 6067 if (inode->last_unlink_trans > last_committed) { 6068 ret = btrfs_log_all_parents(trans, inode, ctx); 6069 if (ret) 6070 goto end_trans; 6071 } 6072 6073 ret = log_all_new_ancestors(trans, inode, parent, ctx); 6074 if (ret) 6075 goto end_trans; 6076 6077 if (log_dentries) 6078 ret = log_new_dir_dentries(trans, root, inode, ctx); 6079 else 6080 ret = 0; 6081 end_trans: 6082 if (ret < 0) { 6083 btrfs_set_log_full_commit(trans); 6084 ret = 1; 6085 } 6086 6087 if (ret) 6088 btrfs_remove_log_ctx(root, ctx); 6089 btrfs_end_log_trans(root); 6090 end_no_trans: 6091 return ret; 6092 } 6093 6094 /* 6095 * it is not safe to log dentry if the chunk root has added new 6096 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 6097 * If this returns 1, you must commit the transaction to safely get your 6098 * data on disk. 6099 */ 6100 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 6101 struct dentry *dentry, 6102 const loff_t start, 6103 const loff_t end, 6104 struct btrfs_log_ctx *ctx) 6105 { 6106 struct dentry *parent = dget_parent(dentry); 6107 int ret; 6108 6109 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 6110 start, end, LOG_INODE_ALL, ctx); 6111 dput(parent); 6112 6113 return ret; 6114 } 6115 6116 /* 6117 * should be called during mount to recover any replay any log trees 6118 * from the FS 6119 */ 6120 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 6121 { 6122 int ret; 6123 struct btrfs_path *path; 6124 struct btrfs_trans_handle *trans; 6125 struct btrfs_key key; 6126 struct btrfs_key found_key; 6127 struct btrfs_root *log; 6128 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 6129 struct walk_control wc = { 6130 .process_func = process_one_buffer, 6131 .stage = LOG_WALK_PIN_ONLY, 6132 }; 6133 6134 path = btrfs_alloc_path(); 6135 if (!path) 6136 return -ENOMEM; 6137 6138 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6139 6140 trans = btrfs_start_transaction(fs_info->tree_root, 0); 6141 if (IS_ERR(trans)) { 6142 ret = PTR_ERR(trans); 6143 goto error; 6144 } 6145 6146 wc.trans = trans; 6147 wc.pin = 1; 6148 6149 ret = walk_log_tree(trans, log_root_tree, &wc); 6150 if (ret) { 6151 btrfs_handle_fs_error(fs_info, ret, 6152 "Failed to pin buffers while recovering log root tree."); 6153 goto error; 6154 } 6155 6156 again: 6157 key.objectid = BTRFS_TREE_LOG_OBJECTID; 6158 key.offset = (u64)-1; 6159 key.type = BTRFS_ROOT_ITEM_KEY; 6160 6161 while (1) { 6162 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 6163 6164 if (ret < 0) { 6165 btrfs_handle_fs_error(fs_info, ret, 6166 "Couldn't find tree log root."); 6167 goto error; 6168 } 6169 if (ret > 0) { 6170 if (path->slots[0] == 0) 6171 break; 6172 path->slots[0]--; 6173 } 6174 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 6175 path->slots[0]); 6176 btrfs_release_path(path); 6177 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 6178 break; 6179 6180 log = btrfs_read_tree_root(log_root_tree, &found_key); 6181 if (IS_ERR(log)) { 6182 ret = PTR_ERR(log); 6183 btrfs_handle_fs_error(fs_info, ret, 6184 "Couldn't read tree log root."); 6185 goto error; 6186 } 6187 6188 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 6189 true); 6190 if (IS_ERR(wc.replay_dest)) { 6191 ret = PTR_ERR(wc.replay_dest); 6192 6193 /* 6194 * We didn't find the subvol, likely because it was 6195 * deleted. This is ok, simply skip this log and go to 6196 * the next one. 6197 * 6198 * We need to exclude the root because we can't have 6199 * other log replays overwriting this log as we'll read 6200 * it back in a few more times. This will keep our 6201 * block from being modified, and we'll just bail for 6202 * each subsequent pass. 6203 */ 6204 if (ret == -ENOENT) 6205 ret = btrfs_pin_extent_for_log_replay(trans, 6206 log->node->start, 6207 log->node->len); 6208 btrfs_put_root(log); 6209 6210 if (!ret) 6211 goto next; 6212 btrfs_handle_fs_error(fs_info, ret, 6213 "Couldn't read target root for tree log recovery."); 6214 goto error; 6215 } 6216 6217 wc.replay_dest->log_root = log; 6218 btrfs_record_root_in_trans(trans, wc.replay_dest); 6219 ret = walk_log_tree(trans, log, &wc); 6220 6221 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6222 ret = fixup_inode_link_counts(trans, wc.replay_dest, 6223 path); 6224 } 6225 6226 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6227 struct btrfs_root *root = wc.replay_dest; 6228 6229 btrfs_release_path(path); 6230 6231 /* 6232 * We have just replayed everything, and the highest 6233 * objectid of fs roots probably has changed in case 6234 * some inode_item's got replayed. 6235 * 6236 * root->objectid_mutex is not acquired as log replay 6237 * could only happen during mount. 6238 */ 6239 ret = btrfs_find_highest_objectid(root, 6240 &root->highest_objectid); 6241 } 6242 6243 wc.replay_dest->log_root = NULL; 6244 btrfs_put_root(wc.replay_dest); 6245 btrfs_put_root(log); 6246 6247 if (ret) 6248 goto error; 6249 next: 6250 if (found_key.offset == 0) 6251 break; 6252 key.offset = found_key.offset - 1; 6253 } 6254 btrfs_release_path(path); 6255 6256 /* step one is to pin it all, step two is to replay just inodes */ 6257 if (wc.pin) { 6258 wc.pin = 0; 6259 wc.process_func = replay_one_buffer; 6260 wc.stage = LOG_WALK_REPLAY_INODES; 6261 goto again; 6262 } 6263 /* step three is to replay everything */ 6264 if (wc.stage < LOG_WALK_REPLAY_ALL) { 6265 wc.stage++; 6266 goto again; 6267 } 6268 6269 btrfs_free_path(path); 6270 6271 /* step 4: commit the transaction, which also unpins the blocks */ 6272 ret = btrfs_commit_transaction(trans); 6273 if (ret) 6274 return ret; 6275 6276 log_root_tree->log_root = NULL; 6277 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6278 btrfs_put_root(log_root_tree); 6279 6280 return 0; 6281 error: 6282 if (wc.trans) 6283 btrfs_end_transaction(wc.trans); 6284 btrfs_free_path(path); 6285 return ret; 6286 } 6287 6288 /* 6289 * there are some corner cases where we want to force a full 6290 * commit instead of allowing a directory to be logged. 6291 * 6292 * They revolve around files there were unlinked from the directory, and 6293 * this function updates the parent directory so that a full commit is 6294 * properly done if it is fsync'd later after the unlinks are done. 6295 * 6296 * Must be called before the unlink operations (updates to the subvolume tree, 6297 * inodes, etc) are done. 6298 */ 6299 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 6300 struct btrfs_inode *dir, struct btrfs_inode *inode, 6301 int for_rename) 6302 { 6303 /* 6304 * when we're logging a file, if it hasn't been renamed 6305 * or unlinked, and its inode is fully committed on disk, 6306 * we don't have to worry about walking up the directory chain 6307 * to log its parents. 6308 * 6309 * So, we use the last_unlink_trans field to put this transid 6310 * into the file. When the file is logged we check it and 6311 * don't log the parents if the file is fully on disk. 6312 */ 6313 mutex_lock(&inode->log_mutex); 6314 inode->last_unlink_trans = trans->transid; 6315 mutex_unlock(&inode->log_mutex); 6316 6317 /* 6318 * if this directory was already logged any new 6319 * names for this file/dir will get recorded 6320 */ 6321 if (dir->logged_trans == trans->transid) 6322 return; 6323 6324 /* 6325 * if the inode we're about to unlink was logged, 6326 * the log will be properly updated for any new names 6327 */ 6328 if (inode->logged_trans == trans->transid) 6329 return; 6330 6331 /* 6332 * when renaming files across directories, if the directory 6333 * there we're unlinking from gets fsync'd later on, there's 6334 * no way to find the destination directory later and fsync it 6335 * properly. So, we have to be conservative and force commits 6336 * so the new name gets discovered. 6337 */ 6338 if (for_rename) 6339 goto record; 6340 6341 /* we can safely do the unlink without any special recording */ 6342 return; 6343 6344 record: 6345 mutex_lock(&dir->log_mutex); 6346 dir->last_unlink_trans = trans->transid; 6347 mutex_unlock(&dir->log_mutex); 6348 } 6349 6350 /* 6351 * Make sure that if someone attempts to fsync the parent directory of a deleted 6352 * snapshot, it ends up triggering a transaction commit. This is to guarantee 6353 * that after replaying the log tree of the parent directory's root we will not 6354 * see the snapshot anymore and at log replay time we will not see any log tree 6355 * corresponding to the deleted snapshot's root, which could lead to replaying 6356 * it after replaying the log tree of the parent directory (which would replay 6357 * the snapshot delete operation). 6358 * 6359 * Must be called before the actual snapshot destroy operation (updates to the 6360 * parent root and tree of tree roots trees, etc) are done. 6361 */ 6362 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 6363 struct btrfs_inode *dir) 6364 { 6365 mutex_lock(&dir->log_mutex); 6366 dir->last_unlink_trans = trans->transid; 6367 mutex_unlock(&dir->log_mutex); 6368 } 6369 6370 /* 6371 * Call this after adding a new name for a file and it will properly 6372 * update the log to reflect the new name. 6373 * 6374 * @ctx can not be NULL when @sync_log is false, and should be NULL when it's 6375 * true (because it's not used). 6376 * 6377 * Return value depends on whether @sync_log is true or false. 6378 * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be 6379 * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT 6380 * otherwise. 6381 * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to 6382 * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, 6383 * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be 6384 * committed (without attempting to sync the log). 6385 */ 6386 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 6387 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 6388 struct dentry *parent, 6389 bool sync_log, struct btrfs_log_ctx *ctx) 6390 { 6391 struct btrfs_fs_info *fs_info = trans->fs_info; 6392 int ret; 6393 6394 /* 6395 * this will force the logging code to walk the dentry chain 6396 * up for the file 6397 */ 6398 if (!S_ISDIR(inode->vfs_inode.i_mode)) 6399 inode->last_unlink_trans = trans->transid; 6400 6401 /* 6402 * if this inode hasn't been logged and directory we're renaming it 6403 * from hasn't been logged, we don't need to log it 6404 */ 6405 if (inode->logged_trans <= fs_info->last_trans_committed && 6406 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) 6407 return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : 6408 BTRFS_DONT_NEED_LOG_SYNC; 6409 6410 if (sync_log) { 6411 struct btrfs_log_ctx ctx2; 6412 6413 btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); 6414 ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, 6415 LOG_INODE_EXISTS, &ctx2); 6416 if (ret == BTRFS_NO_LOG_SYNC) 6417 return BTRFS_DONT_NEED_TRANS_COMMIT; 6418 else if (ret) 6419 return BTRFS_NEED_TRANS_COMMIT; 6420 6421 ret = btrfs_sync_log(trans, inode->root, &ctx2); 6422 if (ret) 6423 return BTRFS_NEED_TRANS_COMMIT; 6424 return BTRFS_DONT_NEED_TRANS_COMMIT; 6425 } 6426 6427 ASSERT(ctx); 6428 ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, 6429 LOG_INODE_EXISTS, ctx); 6430 if (ret == BTRFS_NO_LOG_SYNC) 6431 return BTRFS_DONT_NEED_LOG_SYNC; 6432 else if (ret) 6433 return BTRFS_NEED_TRANS_COMMIT; 6434 6435 return BTRFS_NEED_LOG_SYNC; 6436 } 6437 6438