// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
#include "block-group.h"
#include "space-info.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */
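
/*
 * An illustrative sketch of case 2a (the command sequence here is
 * hypothetical, not one of the reproducers above):
 *
 * mkdir dir1 dir2
 * touch dir1/file
 * fsync dir1                  <- dir1 is now in the fsync log
 * mv dir1/file dir2/file      <- the dir the name is removed from
 *                                was logged
 * <crash and log replay>
 *
 * Replaying dir1's log removes the old entry "file"; unless the rename
 * also logged the new name under dir2, the inode could come back with
 * no name at all.
 */
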
/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   u64 start,
			   u64 end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it
 * is using in ram, once to create all the inodes logged in the tree, and
 * once to do all the other items.
 */
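
/*
 * A rough mapping (a reader's sketch, not authoritative) of the three
 * reads described above onto the LOG_WALK_* stages declared earlier:
 *
 * LOG_WALK_PIN_ONLY          pin every extent the log tree uses
 * LOG_WALK_REPLAY_INODES     create all the inodes logged in the tree
 * LOG_WALK_REPLAY_DIR_INDEX  and
 * LOG_WALK_REPLAY_ALL        replay directory entries, links and extents
 */
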
/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count so that anyone
 * syncing the tree waits for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

static int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
					buf->start + buf->len - 1);
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}
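
/*
 * Usage sketch (hypothetical caller, for illustration only):
 * btrfs_pin_log_trans() and btrfs_end_log_trans() bracket a region
 * during which a log sync must wait for us:
 *
 *	btrfs_pin_log_trans(root);
 *	... make changes that must not race with a log commit ...
 *	btrfs_end_log_trans(root);
 */
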
/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
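
/*
 * Illustration (hedged; the field values are an assumption, not copied
 * from a caller in this file): a pin-only walk at replay time would set
 * up the control struct along the lines of
 *
 *	struct walk_control wc = {
 *		.pin = 1,
 *		.stage = LOG_WALK_PIN_ONLY,
 *		.trans = trans,
 *		.process_func = process_one_buffer,
 *	};
 *
 * while the sync path would instead set .write and then .wait to flush
 * the log tree blocks to disk.
 */
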
/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token, dst_eb);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}
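
/*
 * Note (hedged): in replay_one_extent() below the log tree key is the
 * file extent key, i.e. approximately
 *
 *	(inode objectid, BTRFS_EXTENT_DATA_KEY, file offset)
 *
 * which is why key->objectid is used to read the inode and key->offset
 * becomes the start of the replayed range.
 */
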
/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the dirty extent: here we did a shallow
		 * copy of the file extent item and skipped the normal
		 * backref update, modifying the extent tree all by
		 * ourselves.  So we need to manually record the dirty
		 * extent for qgroup, as the owner of the file extent
		 * changed from the log tree (doesn't affect qgroup) to the
		 * fs/file tree (does affect qgroup).
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher, will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans,
							fs_info->csum_root,
							sums->bytenr,
							sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}
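
/*
 * Note (hedged): inode_in_dir() only reports a match when both entries
 * agree, i.e. the index entry (dirid, BTRFS_DIR_INDEX_KEY, index) and
 * the name entry (dirid, BTRFS_DIR_ITEM_KEY, hash(name)) both exist and
 * point at the expected inode. A stale entry for either key type makes
 * the caller treat the link as missing and redo it.
 */
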
/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have already searched the root tree and checked
		 * the corresponding ref, so we do not need to check it again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
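
/*
 * Illustration (a hedged sketch of the on-disk layout the two helpers
 * above decode): reference items pack (header, name) pairs back to back
 * inside a single item:
 *
 *	INODE_REF item:    (struct btrfs_inode_ref)(name bytes)...
 *	INODE_EXTREF item: (struct btrfs_inode_extref)(name bytes)...
 *
 * so the walkers advance with
 *	ref_ptr += sizeof(struct ...) + name_len;
 * exactly as unlink_old_inode_refs() and add_inode_ref() below do.
 */
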
/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists). For any name that is not in the inode reference item from the log
 * tree, do a proper unlink of that name (that is, remove its entry from the
 * inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}

static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
				 name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out;
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				  btrfs_ino(BTRFS_I(inode)), ref_index,
				  name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = btrfs_unlink_inode(trans, root,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, root, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct btrfs_inode *inode,
			       struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode,
			    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}
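
/*
 * Note (hedged): the corrected link count is simply the number of names
 * found by the two counters above,
 *
 *	nlink = count_inode_refs() + count_inode_extrefs()
 *
 * which is what fixup_inode_link_count() below computes before comparing
 * the sum against inode->i_nlink.
 */
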
/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
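
/*
 * Note (hedged): fixup entries live in the subvolume tree under keys of
 * the form
 *
 *	(BTRFS_TREE_LOG_FIXUP_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, inode objectid)
 *
 * fixup_inode_link_counts() above walks those keys from the highest
 * offset down; link_to_fixup_dir() below is what inserts them.
 */
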
/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}
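
/*
 * Note (hedged): the 0-vs-1 return convention documented below for
 * replay_one_name() is consumed by replay_one_dir_item(), which only
 * queues a link count fixup when a name was actually replayed (ret == 1)
 * for a non-directory inode.
 */
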
/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	/*
	 * Check if the inode reference exists in the log for the given name,
	 * inode and parent inode
	 */
	found_key.objectid = log_key.objectid;
	found_key.type = BTRFS_INODE_REF_KEY;
	found_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		 */
		ret = 0;
		update_size = false;
		goto out;
	}

	found_key.objectid = log_key.objectid;
	found_key.type = BTRFS_INODE_EXTREF_KEY;
	found_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
			     name_len);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fixup the link count of
		 * the inode the entry points to. Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for
		 * this dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir. This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
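 *
 * (A hypothetical example of such a range item: a BTRFS_DIR_LOG_INDEX_KEY
 * entry with key offset 3 and dir_log_end == 7 covers index offsets
 * [3, 7] of the directory; find_dir_range() below is what turns these
 * items back into [start, end] intervals during replay.)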
2121 * 2122 * The range items tell us which parts of the key space the log 2123 * is authoritative for. During replay, if a key in the subvolume 2124 * directory is in a logged range item, but not actually in the log 2125 * that means it was deleted from the directory before the fsync 2126 * and should be removed. 2127 */ 2128 static noinline int find_dir_range(struct btrfs_root *root, 2129 struct btrfs_path *path, 2130 u64 dirid, int key_type, 2131 u64 *start_ret, u64 *end_ret) 2132 { 2133 struct btrfs_key key; 2134 u64 found_end; 2135 struct btrfs_dir_log_item *item; 2136 int ret; 2137 int nritems; 2138 2139 if (*start_ret == (u64)-1) 2140 return 1; 2141 2142 key.objectid = dirid; 2143 key.type = key_type; 2144 key.offset = *start_ret; 2145 2146 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2147 if (ret < 0) 2148 goto out; 2149 if (ret > 0) { 2150 if (path->slots[0] == 0) 2151 goto out; 2152 path->slots[0]--; 2153 } 2154 if (ret != 0) 2155 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2156 2157 if (key.type != key_type || key.objectid != dirid) { 2158 ret = 1; 2159 goto next; 2160 } 2161 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2162 struct btrfs_dir_log_item); 2163 found_end = btrfs_dir_log_end(path->nodes[0], item); 2164 2165 if (*start_ret >= key.offset && *start_ret <= found_end) { 2166 ret = 0; 2167 *start_ret = key.offset; 2168 *end_ret = found_end; 2169 goto out; 2170 } 2171 ret = 1; 2172 next: 2173 /* check the next slot in the tree to see if it is a valid item */ 2174 nritems = btrfs_header_nritems(path->nodes[0]); 2175 path->slots[0]++; 2176 if (path->slots[0] >= nritems) { 2177 ret = btrfs_next_leaf(root, path); 2178 if (ret) 2179 goto out; 2180 } 2181 2182 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2183 2184 if (key.type != key_type || key.objectid != dirid) { 2185 ret = 1; 2186 goto out; 2187 } 2188 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2189 struct btrfs_dir_log_item); 2190 found_end = btrfs_dir_log_end(path->nodes[0], item); 2191 *start_ret = key.offset; 2192 *end_ret = found_end; 2193 ret = 0; 2194 out: 2195 btrfs_release_path(path); 2196 return ret; 2197 } 2198 2199 /* 2200 * this looks for a given directory item in the log. 
If the directory 2201 * item is not in the log, the item is removed and the inode it points 2202 * to is unlinked 2203 */ 2204 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2205 struct btrfs_root *root, 2206 struct btrfs_root *log, 2207 struct btrfs_path *path, 2208 struct btrfs_path *log_path, 2209 struct inode *dir, 2210 struct btrfs_key *dir_key) 2211 { 2212 int ret; 2213 struct extent_buffer *eb; 2214 int slot; 2215 u32 item_size; 2216 struct btrfs_dir_item *di; 2217 struct btrfs_dir_item *log_di; 2218 int name_len; 2219 unsigned long ptr; 2220 unsigned long ptr_end; 2221 char *name; 2222 struct inode *inode; 2223 struct btrfs_key location; 2224 2225 again: 2226 eb = path->nodes[0]; 2227 slot = path->slots[0]; 2228 item_size = btrfs_item_size_nr(eb, slot); 2229 ptr = btrfs_item_ptr_offset(eb, slot); 2230 ptr_end = ptr + item_size; 2231 while (ptr < ptr_end) { 2232 di = (struct btrfs_dir_item *)ptr; 2233 name_len = btrfs_dir_name_len(eb, di); 2234 name = kmalloc(name_len, GFP_NOFS); 2235 if (!name) { 2236 ret = -ENOMEM; 2237 goto out; 2238 } 2239 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2240 name_len); 2241 log_di = NULL; 2242 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2243 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2244 dir_key->objectid, 2245 name, name_len, 0); 2246 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2247 log_di = btrfs_lookup_dir_index_item(trans, log, 2248 log_path, 2249 dir_key->objectid, 2250 dir_key->offset, 2251 name, name_len, 0); 2252 } 2253 if (!log_di || log_di == ERR_PTR(-ENOENT)) { 2254 btrfs_dir_item_key_to_cpu(eb, di, &location); 2255 btrfs_release_path(path); 2256 btrfs_release_path(log_path); 2257 inode = read_one_inode(root, location.objectid); 2258 if (!inode) { 2259 kfree(name); 2260 return -EIO; 2261 } 2262 2263 ret = link_to_fixup_dir(trans, root, 2264 path, location.objectid); 2265 if (ret) { 2266 kfree(name); 2267 iput(inode); 2268 goto out; 2269 } 2270 2271 inc_nlink(inode); 2272 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2273 BTRFS_I(inode), name, name_len); 2274 if (!ret) 2275 ret = btrfs_run_delayed_items(trans); 2276 kfree(name); 2277 iput(inode); 2278 if (ret) 2279 goto out; 2280 2281 /* there might still be more names under this key 2282 * check and repeat if required 2283 */ 2284 ret = btrfs_search_slot(NULL, root, dir_key, path, 2285 0, 0); 2286 if (ret == 0) 2287 goto again; 2288 ret = 0; 2289 goto out; 2290 } else if (IS_ERR(log_di)) { 2291 kfree(name); 2292 return PTR_ERR(log_di); 2293 } 2294 btrfs_release_path(log_path); 2295 kfree(name); 2296 2297 ptr = (unsigned long)(di + 1); 2298 ptr += name_len; 2299 } 2300 ret = 0; 2301 out: 2302 btrfs_release_path(path); 2303 btrfs_release_path(log_path); 2304 return ret; 2305 } 2306 2307 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2308 struct btrfs_root *root, 2309 struct btrfs_root *log, 2310 struct btrfs_path *path, 2311 const u64 ino) 2312 { 2313 struct btrfs_key search_key; 2314 struct btrfs_path *log_path; 2315 int i; 2316 int nritems; 2317 int ret; 2318 2319 log_path = btrfs_alloc_path(); 2320 if (!log_path) 2321 return -ENOMEM; 2322 2323 search_key.objectid = ino; 2324 search_key.type = BTRFS_XATTR_ITEM_KEY; 2325 search_key.offset = 0; 2326 again: 2327 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2328 if (ret < 0) 2329 goto out; 2330 process_leaf: 2331 nritems = btrfs_header_nritems(path->nodes[0]); 2332 for (i = path->slots[0]; i < nritems; i++) { 2333 struct btrfs_key key; 
		struct btrfs_dir_item *di;
		struct btrfs_dir_item *log_di;
		u32 total_size;
		u32 cur;

		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
			ret = 0;
			goto out;
		}

		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
		total_size = btrfs_item_size_nr(path->nodes[0], i);
		cur = 0;
		while (cur < total_size) {
			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
			u32 this_len = sizeof(*di) + name_len + data_len;
			char *name;

			name = kmalloc(name_len, GFP_NOFS);
			if (!name) {
				ret = -ENOMEM;
				goto out;
			}
			read_extent_buffer(path->nodes[0], name,
					   (unsigned long)(di + 1), name_len);

			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
						    name, name_len, 0);
			btrfs_release_path(log_path);
			if (!log_di) {
				/* Doesn't exist in log tree, so delete it. */
				btrfs_release_path(path);
				di = btrfs_lookup_xattr(trans, root, path, ino,
							name, name_len, -1);
				kfree(name);
				if (IS_ERR(di)) {
					ret = PTR_ERR(di);
					goto out;
				}
				ASSERT(di);
				ret = btrfs_delete_one_dir_name(trans, root,
								path, di);
				if (ret)
					goto out;
				btrfs_release_path(path);
				search_key = key;
				goto again;
			}
			kfree(name);
			if (IS_ERR(log_di)) {
				ret = PTR_ERR(log_di);
				goto out;
			}
			cur += this_len;
			di = (struct btrfs_dir_item *)((char *)di + this_len);
		}
	}
	ret = btrfs_next_leaf(root, path);
	if (ret > 0)
		ret = 0;
	else if (ret == 0)
		goto process_leaf;
out:
	btrfs_free_path(log_path);
	btrfs_release_path(path);
	return ret;
}


/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that the log is authoritative
 * for, and then scans the directory to find items in those ranges that
 * are not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
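 *
 * A rough sketch of the loop this becomes in replay_dir_deletes() below
 * (illustration only, no behaviour beyond the real code):
 *
 *	while (find_dir_range(log, path, dirid, key_type,
 *			      &range_start, &range_end) == 0) {
 *		// for each subvolume dir item of 'dirid' with an offset
 *		// inside [range_start, range_end]:
 *		check_item_in_log(...);	// unlinks it if absent from the log
 *		range_start = range_end + 1;
 *	}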
2414 */ 2415 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2416 struct btrfs_root *root, 2417 struct btrfs_root *log, 2418 struct btrfs_path *path, 2419 u64 dirid, int del_all) 2420 { 2421 u64 range_start; 2422 u64 range_end; 2423 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2424 int ret = 0; 2425 struct btrfs_key dir_key; 2426 struct btrfs_key found_key; 2427 struct btrfs_path *log_path; 2428 struct inode *dir; 2429 2430 dir_key.objectid = dirid; 2431 dir_key.type = BTRFS_DIR_ITEM_KEY; 2432 log_path = btrfs_alloc_path(); 2433 if (!log_path) 2434 return -ENOMEM; 2435 2436 dir = read_one_inode(root, dirid); 2437 /* it isn't an error if the inode isn't there, that can happen 2438 * because we replay the deletes before we copy in the inode item 2439 * from the log 2440 */ 2441 if (!dir) { 2442 btrfs_free_path(log_path); 2443 return 0; 2444 } 2445 again: 2446 range_start = 0; 2447 range_end = 0; 2448 while (1) { 2449 if (del_all) 2450 range_end = (u64)-1; 2451 else { 2452 ret = find_dir_range(log, path, dirid, key_type, 2453 &range_start, &range_end); 2454 if (ret != 0) 2455 break; 2456 } 2457 2458 dir_key.offset = range_start; 2459 while (1) { 2460 int nritems; 2461 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2462 0, 0); 2463 if (ret < 0) 2464 goto out; 2465 2466 nritems = btrfs_header_nritems(path->nodes[0]); 2467 if (path->slots[0] >= nritems) { 2468 ret = btrfs_next_leaf(root, path); 2469 if (ret == 1) 2470 break; 2471 else if (ret < 0) 2472 goto out; 2473 } 2474 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2475 path->slots[0]); 2476 if (found_key.objectid != dirid || 2477 found_key.type != dir_key.type) 2478 goto next_type; 2479 2480 if (found_key.offset > range_end) 2481 break; 2482 2483 ret = check_item_in_log(trans, root, log, path, 2484 log_path, dir, 2485 &found_key); 2486 if (ret) 2487 goto out; 2488 if (found_key.offset == (u64)-1) 2489 break; 2490 dir_key.offset = found_key.offset + 1; 2491 } 2492 btrfs_release_path(path); 2493 if (range_end == (u64)-1) 2494 break; 2495 range_start = range_end + 1; 2496 } 2497 2498 next_type: 2499 ret = 0; 2500 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2501 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2502 dir_key.type = BTRFS_DIR_INDEX_KEY; 2503 btrfs_release_path(path); 2504 goto again; 2505 } 2506 out: 2507 btrfs_release_path(path); 2508 btrfs_free_path(log_path); 2509 iput(dir); 2510 return ret; 2511 } 2512 2513 /* 2514 * the process_func used to replay items from the log tree. This 2515 * gets called in two different stages. The first stage just looks 2516 * for inodes and makes sure they are all copied into the subvolume. 2517 * 2518 * The second stage copies all the other item types from the log into 2519 * the subvolume. The two stage approach is slower, but gets rid of 2520 * lots of complexity around inodes referencing other inodes that exist 2521 * only in the log (references come from either directory items or inode 2522 * back refs). 
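 *
 * In outline, for every leaf visited, replay_one_buffer() below does the
 * following, keyed off wc->stage (a summary of the code, nothing new):
 *
 *	LOG_WALK_REPLAY_INODES:    copy inode items, replay xattr and dir
 *				   deletes, truncate to i_size, queue link
 *				   count fixups
 *	LOG_WALK_REPLAY_DIR_INDEX: replay BTRFS_DIR_INDEX_KEY entries
 *	LOG_WALK_REPLAY_ALL:       copy xattrs, inode refs, file extents
 *				   and dir items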
2523 */ 2524 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2525 struct walk_control *wc, u64 gen, int level) 2526 { 2527 int nritems; 2528 struct btrfs_path *path; 2529 struct btrfs_root *root = wc->replay_dest; 2530 struct btrfs_key key; 2531 int i; 2532 int ret; 2533 2534 ret = btrfs_read_buffer(eb, gen, level, NULL); 2535 if (ret) 2536 return ret; 2537 2538 level = btrfs_header_level(eb); 2539 2540 if (level != 0) 2541 return 0; 2542 2543 path = btrfs_alloc_path(); 2544 if (!path) 2545 return -ENOMEM; 2546 2547 nritems = btrfs_header_nritems(eb); 2548 for (i = 0; i < nritems; i++) { 2549 btrfs_item_key_to_cpu(eb, &key, i); 2550 2551 /* inode keys are done during the first stage */ 2552 if (key.type == BTRFS_INODE_ITEM_KEY && 2553 wc->stage == LOG_WALK_REPLAY_INODES) { 2554 struct btrfs_inode_item *inode_item; 2555 u32 mode; 2556 2557 inode_item = btrfs_item_ptr(eb, i, 2558 struct btrfs_inode_item); 2559 /* 2560 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2561 * and never got linked before the fsync, skip it, as 2562 * replaying it is pointless since it would be deleted 2563 * later. We skip logging tmpfiles, but it's always 2564 * possible we are replaying a log created with a kernel 2565 * that used to log tmpfiles. 2566 */ 2567 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2568 wc->ignore_cur_inode = true; 2569 continue; 2570 } else { 2571 wc->ignore_cur_inode = false; 2572 } 2573 ret = replay_xattr_deletes(wc->trans, root, log, 2574 path, key.objectid); 2575 if (ret) 2576 break; 2577 mode = btrfs_inode_mode(eb, inode_item); 2578 if (S_ISDIR(mode)) { 2579 ret = replay_dir_deletes(wc->trans, 2580 root, log, path, key.objectid, 0); 2581 if (ret) 2582 break; 2583 } 2584 ret = overwrite_item(wc->trans, root, path, 2585 eb, i, &key); 2586 if (ret) 2587 break; 2588 2589 /* 2590 * Before replaying extents, truncate the inode to its 2591 * size. We need to do it now and not after log replay 2592 * because before an fsync we can have prealloc extents 2593 * added beyond the inode's i_size. If we did it after, 2594 * through orphan cleanup for example, we would drop 2595 * those prealloc extents just after replaying them. 2596 */ 2597 if (S_ISREG(mode)) { 2598 struct inode *inode; 2599 u64 from; 2600 2601 inode = read_one_inode(root, key.objectid); 2602 if (!inode) { 2603 ret = -EIO; 2604 break; 2605 } 2606 from = ALIGN(i_size_read(inode), 2607 root->fs_info->sectorsize); 2608 ret = btrfs_drop_extents(wc->trans, root, inode, 2609 from, (u64)-1, 1); 2610 if (!ret) { 2611 /* Update the inode's nbytes. 
*/ 2612 ret = btrfs_update_inode(wc->trans, 2613 root, inode); 2614 } 2615 iput(inode); 2616 if (ret) 2617 break; 2618 } 2619 2620 ret = link_to_fixup_dir(wc->trans, root, 2621 path, key.objectid); 2622 if (ret) 2623 break; 2624 } 2625 2626 if (wc->ignore_cur_inode) 2627 continue; 2628 2629 if (key.type == BTRFS_DIR_INDEX_KEY && 2630 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2631 ret = replay_one_dir_item(wc->trans, root, path, 2632 eb, i, &key); 2633 if (ret) 2634 break; 2635 } 2636 2637 if (wc->stage < LOG_WALK_REPLAY_ALL) 2638 continue; 2639 2640 /* these keys are simply copied */ 2641 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2642 ret = overwrite_item(wc->trans, root, path, 2643 eb, i, &key); 2644 if (ret) 2645 break; 2646 } else if (key.type == BTRFS_INODE_REF_KEY || 2647 key.type == BTRFS_INODE_EXTREF_KEY) { 2648 ret = add_inode_ref(wc->trans, root, log, path, 2649 eb, i, &key); 2650 if (ret && ret != -ENOENT) 2651 break; 2652 ret = 0; 2653 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2654 ret = replay_one_extent(wc->trans, root, path, 2655 eb, i, &key); 2656 if (ret) 2657 break; 2658 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2659 ret = replay_one_dir_item(wc->trans, root, path, 2660 eb, i, &key); 2661 if (ret) 2662 break; 2663 } 2664 } 2665 btrfs_free_path(path); 2666 return ret; 2667 } 2668 2669 /* 2670 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2671 */ 2672 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2673 { 2674 struct btrfs_block_group *cache; 2675 2676 cache = btrfs_lookup_block_group(fs_info, start); 2677 if (!cache) { 2678 btrfs_err(fs_info, "unable to find block group for %llu", start); 2679 return; 2680 } 2681 2682 spin_lock(&cache->space_info->lock); 2683 spin_lock(&cache->lock); 2684 cache->reserved -= fs_info->nodesize; 2685 cache->space_info->bytes_reserved -= fs_info->nodesize; 2686 spin_unlock(&cache->lock); 2687 spin_unlock(&cache->space_info->lock); 2688 2689 btrfs_put_block_group(cache); 2690 } 2691 2692 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2693 struct btrfs_root *root, 2694 struct btrfs_path *path, int *level, 2695 struct walk_control *wc) 2696 { 2697 struct btrfs_fs_info *fs_info = root->fs_info; 2698 u64 bytenr; 2699 u64 ptr_gen; 2700 struct extent_buffer *next; 2701 struct extent_buffer *cur; 2702 u32 blocksize; 2703 int ret = 0; 2704 2705 while (*level > 0) { 2706 struct btrfs_key first_key; 2707 2708 cur = path->nodes[*level]; 2709 2710 WARN_ON(btrfs_header_level(cur) != *level); 2711 2712 if (path->slots[*level] >= 2713 btrfs_header_nritems(cur)) 2714 break; 2715 2716 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2717 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2718 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2719 blocksize = fs_info->nodesize; 2720 2721 next = btrfs_find_create_tree_block(fs_info, bytenr); 2722 if (IS_ERR(next)) 2723 return PTR_ERR(next); 2724 2725 if (*level == 1) { 2726 ret = wc->process_func(root, next, wc, ptr_gen, 2727 *level - 1); 2728 if (ret) { 2729 free_extent_buffer(next); 2730 return ret; 2731 } 2732 2733 path->slots[*level]++; 2734 if (wc->free) { 2735 ret = btrfs_read_buffer(next, ptr_gen, 2736 *level - 1, &first_key); 2737 if (ret) { 2738 free_extent_buffer(next); 2739 return ret; 2740 } 2741 2742 if (trans) { 2743 btrfs_tree_lock(next); 2744 btrfs_set_lock_blocking_write(next); 2745 btrfs_clean_tree_block(next); 2746 btrfs_wait_tree_block_writeback(next); 2747 
btrfs_tree_unlock(next); 2748 ret = btrfs_pin_reserved_extent(trans, 2749 bytenr, blocksize); 2750 if (ret) { 2751 free_extent_buffer(next); 2752 return ret; 2753 } 2754 } else { 2755 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2756 clear_extent_buffer_dirty(next); 2757 unaccount_log_buffer(fs_info, bytenr); 2758 } 2759 } 2760 free_extent_buffer(next); 2761 continue; 2762 } 2763 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2764 if (ret) { 2765 free_extent_buffer(next); 2766 return ret; 2767 } 2768 2769 if (path->nodes[*level-1]) 2770 free_extent_buffer(path->nodes[*level-1]); 2771 path->nodes[*level-1] = next; 2772 *level = btrfs_header_level(next); 2773 path->slots[*level] = 0; 2774 cond_resched(); 2775 } 2776 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2777 2778 cond_resched(); 2779 return 0; 2780 } 2781 2782 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2783 struct btrfs_root *root, 2784 struct btrfs_path *path, int *level, 2785 struct walk_control *wc) 2786 { 2787 struct btrfs_fs_info *fs_info = root->fs_info; 2788 int i; 2789 int slot; 2790 int ret; 2791 2792 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2793 slot = path->slots[i]; 2794 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2795 path->slots[i]++; 2796 *level = i; 2797 WARN_ON(*level == 0); 2798 return 0; 2799 } else { 2800 ret = wc->process_func(root, path->nodes[*level], wc, 2801 btrfs_header_generation(path->nodes[*level]), 2802 *level); 2803 if (ret) 2804 return ret; 2805 2806 if (wc->free) { 2807 struct extent_buffer *next; 2808 2809 next = path->nodes[*level]; 2810 2811 if (trans) { 2812 btrfs_tree_lock(next); 2813 btrfs_set_lock_blocking_write(next); 2814 btrfs_clean_tree_block(next); 2815 btrfs_wait_tree_block_writeback(next); 2816 btrfs_tree_unlock(next); 2817 ret = btrfs_pin_reserved_extent(trans, 2818 path->nodes[*level]->start, 2819 path->nodes[*level]->len); 2820 if (ret) 2821 return ret; 2822 } else { 2823 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2824 clear_extent_buffer_dirty(next); 2825 2826 unaccount_log_buffer(fs_info, 2827 path->nodes[*level]->start); 2828 } 2829 } 2830 free_extent_buffer(path->nodes[*level]); 2831 path->nodes[*level] = NULL; 2832 *level = i + 1; 2833 } 2834 } 2835 return 1; 2836 } 2837 2838 /* 2839 * drop the reference count on the tree rooted at 'snap'. This traverses 2840 * the tree freeing any blocks that have a ref count of zero after being 2841 * decremented. 2842 */ 2843 static int walk_log_tree(struct btrfs_trans_handle *trans, 2844 struct btrfs_root *log, struct walk_control *wc) 2845 { 2846 struct btrfs_fs_info *fs_info = log->fs_info; 2847 int ret = 0; 2848 int wret; 2849 int level; 2850 struct btrfs_path *path; 2851 int orig_level; 2852 2853 path = btrfs_alloc_path(); 2854 if (!path) 2855 return -ENOMEM; 2856 2857 level = btrfs_header_level(log->node); 2858 orig_level = level; 2859 path->nodes[level] = log->node; 2860 atomic_inc(&log->node->refs); 2861 path->slots[level] = 0; 2862 2863 while (1) { 2864 wret = walk_down_log_tree(trans, log, path, &level, wc); 2865 if (wret > 0) 2866 break; 2867 if (wret < 0) { 2868 ret = wret; 2869 goto out; 2870 } 2871 2872 wret = walk_up_log_tree(trans, log, path, &level, wc); 2873 if (wret > 0) 2874 break; 2875 if (wret < 0) { 2876 ret = wret; 2877 goto out; 2878 } 2879 } 2880 2881 /* was the root node processed? 
 if not, catch it here */
	if (path->nodes[orig_level]) {
		ret = wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]),
			 orig_level);
		if (ret)
			goto out;
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			if (trans) {
				btrfs_tree_lock(next);
				btrfs_set_lock_blocking_write(next);
				btrfs_clean_tree_block(next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);
				ret = btrfs_pin_reserved_extent(trans,
						next->start, next->len);
				if (ret)
					goto out;
			} else {
				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
					clear_extent_buffer_dirty(next);
				unaccount_log_buffer(fs_info, next->start);
			}
		}
	}

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to update the item for a given subvolume's log root
 * in the tree of log roots
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log,
			   struct btrfs_root_item *root_item)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret;

	if (log->log_transid == 1) {
		/* insert root item on the first sync */
		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
				&log->root_key, root_item);
	} else {
		ret = btrfs_update_root(trans, fs_info->log_root_tree,
				&log->root_key, root_item);
	}
	return ret;
}

static void wait_log_commit(struct btrfs_root *root, int transid)
{
	DEFINE_WAIT(wait);
	int index = transid % 2;

	/*
	 * we only allow two pending log transactions at a time,
	 * so we know that if ours is more than 2 older than the
	 * current transaction, we're done
	 */
	for (;;) {
		prepare_to_wait(&root->log_commit_wait[index],
				&wait, TASK_UNINTERRUPTIBLE);

		if (!(root->log_transid_committed < transid &&
		      atomic_read(&root->log_commit[index])))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_commit_wait[index], &wait);
}

static void wait_for_writer(struct btrfs_root *root)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&root->log_writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&root->log_writers))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_writer_wait, &wait);
}

static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
					struct btrfs_log_ctx *ctx)
{
	if (!ctx)
		return;

	mutex_lock(&root->log_mutex);
	list_del_init(&ctx->list);
	mutex_unlock(&root->log_mutex);
}

/*
 * Must be invoked while holding the log mutex, or the caller must
 * otherwise make sure no other task can access the list.
 */
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
					     int index, int error)
{
	struct btrfs_log_ctx *ctx;
	struct btrfs_log_ctx *safe;

	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
		list_del_init(&ctx->list);
		ctx->log_ret = error;
	}

	INIT_LIST_HEAD(&root->log_ctxs[index]);
}

/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it.
When this call is done, 3012 * you know that any inodes previously logged are safely on disk only 3013 * if it returns 0. 3014 * 3015 * Any other return value means you need to call btrfs_commit_transaction. 3016 * Some of the edge cases for fsyncing directories that have had unlinks 3017 * or renames done in the past mean that sometimes the only safe 3018 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3019 * that has happened. 3020 */ 3021 int btrfs_sync_log(struct btrfs_trans_handle *trans, 3022 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3023 { 3024 int index1; 3025 int index2; 3026 int mark; 3027 int ret; 3028 struct btrfs_fs_info *fs_info = root->fs_info; 3029 struct btrfs_root *log = root->log_root; 3030 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3031 struct btrfs_root_item new_root_item; 3032 int log_transid = 0; 3033 struct btrfs_log_ctx root_log_ctx; 3034 struct blk_plug plug; 3035 3036 mutex_lock(&root->log_mutex); 3037 log_transid = ctx->log_transid; 3038 if (root->log_transid_committed >= log_transid) { 3039 mutex_unlock(&root->log_mutex); 3040 return ctx->log_ret; 3041 } 3042 3043 index1 = log_transid % 2; 3044 if (atomic_read(&root->log_commit[index1])) { 3045 wait_log_commit(root, log_transid); 3046 mutex_unlock(&root->log_mutex); 3047 return ctx->log_ret; 3048 } 3049 ASSERT(log_transid == root->log_transid); 3050 atomic_set(&root->log_commit[index1], 1); 3051 3052 /* wait for previous tree log sync to complete */ 3053 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3054 wait_log_commit(root, log_transid - 1); 3055 3056 while (1) { 3057 int batch = atomic_read(&root->log_batch); 3058 /* when we're on an ssd, just kick the log commit out */ 3059 if (!btrfs_test_opt(fs_info, SSD) && 3060 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3061 mutex_unlock(&root->log_mutex); 3062 schedule_timeout_uninterruptible(1); 3063 mutex_lock(&root->log_mutex); 3064 } 3065 wait_for_writer(root); 3066 if (batch == atomic_read(&root->log_batch)) 3067 break; 3068 } 3069 3070 /* bail out if we need to do a full commit */ 3071 if (btrfs_need_log_full_commit(trans)) { 3072 ret = -EAGAIN; 3073 mutex_unlock(&root->log_mutex); 3074 goto out; 3075 } 3076 3077 if (log_transid % 2 == 0) 3078 mark = EXTENT_DIRTY; 3079 else 3080 mark = EXTENT_NEW; 3081 3082 /* we start IO on all the marked extents here, but we don't actually 3083 * wait for them until later. 3084 */ 3085 blk_start_plug(&plug); 3086 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3087 if (ret) { 3088 blk_finish_plug(&plug); 3089 btrfs_abort_transaction(trans, ret); 3090 btrfs_set_log_full_commit(trans); 3091 mutex_unlock(&root->log_mutex); 3092 goto out; 3093 } 3094 3095 /* 3096 * We _must_ update under the root->log_mutex in order to make sure we 3097 * have a consistent view of the log root we are trying to commit at 3098 * this moment. 3099 * 3100 * We _must_ copy this into a local copy, because we are not holding the 3101 * log_root_tree->log_mutex yet. This is important because when we 3102 * commit the log_root_tree we must have a consistent view of the 3103 * log_root_tree when we update the super block to point at the 3104 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3105 * with the commit and possibly point at the new block which we may not 3106 * have written out. 
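 *
 * In short, the ordering relied upon here is (a sketch of the code that
 * follows, nothing new):
 *
 *	// under root->log_mutex
 *	btrfs_set_root_node(&log->root_item, log->node);
 *	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
 *	mutex_unlock(&root->log_mutex);
 *	...
 *	// later, under log_root_tree->log_mutex
 *	update_log_root(trans, log, &new_root_item);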
3107 */ 3108 btrfs_set_root_node(&log->root_item, log->node); 3109 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3110 3111 root->log_transid++; 3112 log->log_transid = root->log_transid; 3113 root->log_start_pid = 0; 3114 /* 3115 * IO has been started, blocks of the log tree have WRITTEN flag set 3116 * in their headers. new modifications of the log will be written to 3117 * new positions. so it's safe to allow log writers to go in. 3118 */ 3119 mutex_unlock(&root->log_mutex); 3120 3121 btrfs_init_log_ctx(&root_log_ctx, NULL); 3122 3123 mutex_lock(&log_root_tree->log_mutex); 3124 atomic_inc(&log_root_tree->log_batch); 3125 atomic_inc(&log_root_tree->log_writers); 3126 3127 index2 = log_root_tree->log_transid % 2; 3128 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3129 root_log_ctx.log_transid = log_root_tree->log_transid; 3130 3131 mutex_unlock(&log_root_tree->log_mutex); 3132 3133 mutex_lock(&log_root_tree->log_mutex); 3134 3135 /* 3136 * Now we are safe to update the log_root_tree because we're under the 3137 * log_mutex, and we're a current writer so we're holding the commit 3138 * open until we drop the log_mutex. 3139 */ 3140 ret = update_log_root(trans, log, &new_root_item); 3141 3142 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 3143 /* atomic_dec_and_test implies a barrier */ 3144 cond_wake_up_nomb(&log_root_tree->log_writer_wait); 3145 } 3146 3147 if (ret) { 3148 if (!list_empty(&root_log_ctx.list)) 3149 list_del_init(&root_log_ctx.list); 3150 3151 blk_finish_plug(&plug); 3152 btrfs_set_log_full_commit(trans); 3153 3154 if (ret != -ENOSPC) { 3155 btrfs_abort_transaction(trans, ret); 3156 mutex_unlock(&log_root_tree->log_mutex); 3157 goto out; 3158 } 3159 btrfs_wait_tree_log_extents(log, mark); 3160 mutex_unlock(&log_root_tree->log_mutex); 3161 ret = -EAGAIN; 3162 goto out; 3163 } 3164 3165 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3166 blk_finish_plug(&plug); 3167 list_del_init(&root_log_ctx.list); 3168 mutex_unlock(&log_root_tree->log_mutex); 3169 ret = root_log_ctx.log_ret; 3170 goto out; 3171 } 3172 3173 index2 = root_log_ctx.log_transid % 2; 3174 if (atomic_read(&log_root_tree->log_commit[index2])) { 3175 blk_finish_plug(&plug); 3176 ret = btrfs_wait_tree_log_extents(log, mark); 3177 wait_log_commit(log_root_tree, 3178 root_log_ctx.log_transid); 3179 mutex_unlock(&log_root_tree->log_mutex); 3180 if (!ret) 3181 ret = root_log_ctx.log_ret; 3182 goto out; 3183 } 3184 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3185 atomic_set(&log_root_tree->log_commit[index2], 1); 3186 3187 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3188 wait_log_commit(log_root_tree, 3189 root_log_ctx.log_transid - 1); 3190 } 3191 3192 wait_for_writer(log_root_tree); 3193 3194 /* 3195 * now that we've moved on to the tree of log tree roots, 3196 * check the full commit flag again 3197 */ 3198 if (btrfs_need_log_full_commit(trans)) { 3199 blk_finish_plug(&plug); 3200 btrfs_wait_tree_log_extents(log, mark); 3201 mutex_unlock(&log_root_tree->log_mutex); 3202 ret = -EAGAIN; 3203 goto out_wake_log_root; 3204 } 3205 3206 ret = btrfs_write_marked_extents(fs_info, 3207 &log_root_tree->dirty_log_pages, 3208 EXTENT_DIRTY | EXTENT_NEW); 3209 blk_finish_plug(&plug); 3210 if (ret) { 3211 btrfs_set_log_full_commit(trans); 3212 btrfs_abort_transaction(trans, ret); 3213 mutex_unlock(&log_root_tree->log_mutex); 3214 goto out_wake_log_root; 3215 } 3216 ret = btrfs_wait_tree_log_extents(log, mark); 3217 if 
(!ret) 3218 ret = btrfs_wait_tree_log_extents(log_root_tree, 3219 EXTENT_NEW | EXTENT_DIRTY); 3220 if (ret) { 3221 btrfs_set_log_full_commit(trans); 3222 mutex_unlock(&log_root_tree->log_mutex); 3223 goto out_wake_log_root; 3224 } 3225 3226 btrfs_set_super_log_root(fs_info->super_for_commit, 3227 log_root_tree->node->start); 3228 btrfs_set_super_log_root_level(fs_info->super_for_commit, 3229 btrfs_header_level(log_root_tree->node)); 3230 3231 log_root_tree->log_transid++; 3232 mutex_unlock(&log_root_tree->log_mutex); 3233 3234 /* 3235 * Nobody else is going to jump in and write the ctree 3236 * super here because the log_commit atomic below is protecting 3237 * us. We must be called with a transaction handle pinning 3238 * the running transaction open, so a full commit can't hop 3239 * in and cause problems either. 3240 */ 3241 ret = write_all_supers(fs_info, 1); 3242 if (ret) { 3243 btrfs_set_log_full_commit(trans); 3244 btrfs_abort_transaction(trans, ret); 3245 goto out_wake_log_root; 3246 } 3247 3248 mutex_lock(&root->log_mutex); 3249 if (root->last_log_commit < log_transid) 3250 root->last_log_commit = log_transid; 3251 mutex_unlock(&root->log_mutex); 3252 3253 out_wake_log_root: 3254 mutex_lock(&log_root_tree->log_mutex); 3255 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3256 3257 log_root_tree->log_transid_committed++; 3258 atomic_set(&log_root_tree->log_commit[index2], 0); 3259 mutex_unlock(&log_root_tree->log_mutex); 3260 3261 /* 3262 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3263 * all the updates above are seen by the woken threads. It might not be 3264 * necessary, but proving that seems to be hard. 3265 */ 3266 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3267 out: 3268 mutex_lock(&root->log_mutex); 3269 btrfs_remove_all_log_ctxs(root, index1, ret); 3270 root->log_transid_committed++; 3271 atomic_set(&root->log_commit[index1], 0); 3272 mutex_unlock(&root->log_mutex); 3273 3274 /* 3275 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3276 * all the updates above are seen by the woken threads. It might not be 3277 * necessary, but proving that seems to be hard. 3278 */ 3279 cond_wake_up(&root->log_commit_wait[index1]); 3280 return ret; 3281 } 3282 3283 static void free_log_tree(struct btrfs_trans_handle *trans, 3284 struct btrfs_root *log) 3285 { 3286 int ret; 3287 struct walk_control wc = { 3288 .free = 1, 3289 .process_func = process_one_buffer 3290 }; 3291 3292 ret = walk_log_tree(trans, log, &wc); 3293 if (ret) { 3294 if (trans) 3295 btrfs_abort_transaction(trans, ret); 3296 else 3297 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3298 } 3299 3300 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3301 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3302 btrfs_put_root(log); 3303 } 3304 3305 /* 3306 * free all the extents used by the tree log. This should be called 3307 * at commit time of the full transaction 3308 */ 3309 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3310 { 3311 if (root->log_root) { 3312 free_log_tree(trans, root->log_root); 3313 root->log_root = NULL; 3314 } 3315 return 0; 3316 } 3317 3318 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3319 struct btrfs_fs_info *fs_info) 3320 { 3321 if (fs_info->log_root_tree) { 3322 free_log_tree(trans, fs_info->log_root_tree); 3323 fs_info->log_root_tree = NULL; 3324 } 3325 return 0; 3326 } 3327 3328 /* 3329 * Check if an inode was logged in the current transaction. 
 We can't always rely
 * on an inode's logged_trans value, because it's an in-memory only field and
 * therefore not persisted. This means that its value is lost if the inode gets
 * evicted and loaded again from disk (in which case it has a value of 0, and
 * certainly it is smaller than any possible transaction ID), when that happens
 * the full_sync flag is set in the inode's runtime flags, so in that case we
 * assume eviction happened and ignore the logged_trans value, assuming the
 * worst case, that the inode was logged before in the current transaction.
 */
static bool inode_logged(struct btrfs_trans_handle *trans,
			 struct btrfs_inode *inode)
{
	if (inode->logged_trans == trans->transid)
		return true;

	if (inode->last_trans == trans->transid &&
	    test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
		return true;

	return false;
}

/*
 * If both a file and directory are logged, and unlinks or renames are
 * mixed in, we have a few interesting corners:
 *
 * create file X in dir Y
 * link file X to X.link in dir Y
 * fsync file X
 * unlink file X but leave X.link
 * fsync dir Y
 *
 * After a crash we would expect only X.link to exist.  But file X
 * didn't get fsync'd again so the log has back refs for X and X.link.
 *
 * We solve this by removing directory entries and inode backrefs from the
 * log when a file that was logged in the current transaction is
 * unlinked.  Any later fsync will include the updated log entries, and
 * we'll be able to reconstruct the proper directory items from backrefs.
 *
 * This optimization allows us to avoid relogging the entire inode
 * or the entire directory.
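 *
 * For the example above this means the unlink of X only touches the log
 * tree: btrfs_del_dir_entries_in_log() below drops the dir item and dir
 * index entries for the name "X" in Y, and btrfs_del_inode_ref_in_log()
 * drops the inode back reference, while the subvolume tree is left alone.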
3372 */ 3373 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3374 struct btrfs_root *root, 3375 const char *name, int name_len, 3376 struct btrfs_inode *dir, u64 index) 3377 { 3378 struct btrfs_root *log; 3379 struct btrfs_dir_item *di; 3380 struct btrfs_path *path; 3381 int ret; 3382 int err = 0; 3383 int bytes_del = 0; 3384 u64 dir_ino = btrfs_ino(dir); 3385 3386 if (!inode_logged(trans, dir)) 3387 return 0; 3388 3389 ret = join_running_log_trans(root); 3390 if (ret) 3391 return 0; 3392 3393 mutex_lock(&dir->log_mutex); 3394 3395 log = root->log_root; 3396 path = btrfs_alloc_path(); 3397 if (!path) { 3398 err = -ENOMEM; 3399 goto out_unlock; 3400 } 3401 3402 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3403 name, name_len, -1); 3404 if (IS_ERR(di)) { 3405 err = PTR_ERR(di); 3406 goto fail; 3407 } 3408 if (di) { 3409 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3410 bytes_del += name_len; 3411 if (ret) { 3412 err = ret; 3413 goto fail; 3414 } 3415 } 3416 btrfs_release_path(path); 3417 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3418 index, name, name_len, -1); 3419 if (IS_ERR(di)) { 3420 err = PTR_ERR(di); 3421 goto fail; 3422 } 3423 if (di) { 3424 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3425 bytes_del += name_len; 3426 if (ret) { 3427 err = ret; 3428 goto fail; 3429 } 3430 } 3431 3432 /* update the directory size in the log to reflect the names 3433 * we have removed 3434 */ 3435 if (bytes_del) { 3436 struct btrfs_key key; 3437 3438 key.objectid = dir_ino; 3439 key.offset = 0; 3440 key.type = BTRFS_INODE_ITEM_KEY; 3441 btrfs_release_path(path); 3442 3443 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3444 if (ret < 0) { 3445 err = ret; 3446 goto fail; 3447 } 3448 if (ret == 0) { 3449 struct btrfs_inode_item *item; 3450 u64 i_size; 3451 3452 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3453 struct btrfs_inode_item); 3454 i_size = btrfs_inode_size(path->nodes[0], item); 3455 if (i_size > bytes_del) 3456 i_size -= bytes_del; 3457 else 3458 i_size = 0; 3459 btrfs_set_inode_size(path->nodes[0], item, i_size); 3460 btrfs_mark_buffer_dirty(path->nodes[0]); 3461 } else 3462 ret = 0; 3463 btrfs_release_path(path); 3464 } 3465 fail: 3466 btrfs_free_path(path); 3467 out_unlock: 3468 mutex_unlock(&dir->log_mutex); 3469 if (ret == -ENOSPC) { 3470 btrfs_set_log_full_commit(trans); 3471 ret = 0; 3472 } else if (ret < 0) 3473 btrfs_abort_transaction(trans, ret); 3474 3475 btrfs_end_log_trans(root); 3476 3477 return err; 3478 } 3479 3480 /* see comments for btrfs_del_dir_entries_in_log */ 3481 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3482 struct btrfs_root *root, 3483 const char *name, int name_len, 3484 struct btrfs_inode *inode, u64 dirid) 3485 { 3486 struct btrfs_root *log; 3487 u64 index; 3488 int ret; 3489 3490 if (!inode_logged(trans, inode)) 3491 return 0; 3492 3493 ret = join_running_log_trans(root); 3494 if (ret) 3495 return 0; 3496 log = root->log_root; 3497 mutex_lock(&inode->log_mutex); 3498 3499 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3500 dirid, &index); 3501 mutex_unlock(&inode->log_mutex); 3502 if (ret == -ENOSPC) { 3503 btrfs_set_log_full_commit(trans); 3504 ret = 0; 3505 } else if (ret < 0 && ret != -ENOENT) 3506 btrfs_abort_transaction(trans, ret); 3507 btrfs_end_log_trans(root); 3508 3509 return ret; 3510 } 3511 3512 /* 3513 * creates a range item in the log for 'dirid'. 
first_offset and 3514 * last_offset tell us which parts of the key space the log should 3515 * be considered authoritative for. 3516 */ 3517 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3518 struct btrfs_root *log, 3519 struct btrfs_path *path, 3520 int key_type, u64 dirid, 3521 u64 first_offset, u64 last_offset) 3522 { 3523 int ret; 3524 struct btrfs_key key; 3525 struct btrfs_dir_log_item *item; 3526 3527 key.objectid = dirid; 3528 key.offset = first_offset; 3529 if (key_type == BTRFS_DIR_ITEM_KEY) 3530 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3531 else 3532 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3533 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3534 if (ret) 3535 return ret; 3536 3537 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3538 struct btrfs_dir_log_item); 3539 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3540 btrfs_mark_buffer_dirty(path->nodes[0]); 3541 btrfs_release_path(path); 3542 return 0; 3543 } 3544 3545 /* 3546 * log all the items included in the current transaction for a given 3547 * directory. This also creates the range items in the log tree required 3548 * to replay anything deleted before the fsync 3549 */ 3550 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3551 struct btrfs_root *root, struct btrfs_inode *inode, 3552 struct btrfs_path *path, 3553 struct btrfs_path *dst_path, int key_type, 3554 struct btrfs_log_ctx *ctx, 3555 u64 min_offset, u64 *last_offset_ret) 3556 { 3557 struct btrfs_key min_key; 3558 struct btrfs_root *log = root->log_root; 3559 struct extent_buffer *src; 3560 int err = 0; 3561 int ret; 3562 int i; 3563 int nritems; 3564 u64 first_offset = min_offset; 3565 u64 last_offset = (u64)-1; 3566 u64 ino = btrfs_ino(inode); 3567 3568 log = root->log_root; 3569 3570 min_key.objectid = ino; 3571 min_key.type = key_type; 3572 min_key.offset = min_offset; 3573 3574 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3575 3576 /* 3577 * we didn't find anything from this transaction, see if there 3578 * is anything at all 3579 */ 3580 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3581 min_key.objectid = ino; 3582 min_key.type = key_type; 3583 min_key.offset = (u64)-1; 3584 btrfs_release_path(path); 3585 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3586 if (ret < 0) { 3587 btrfs_release_path(path); 3588 return ret; 3589 } 3590 ret = btrfs_previous_item(root, path, ino, key_type); 3591 3592 /* if ret == 0 there are items for this type, 3593 * create a range to tell us the last key of this type. 3594 * otherwise, there are no items in this directory after 3595 * *min_offset, and we create a range to indicate that. 3596 */ 3597 if (ret == 0) { 3598 struct btrfs_key tmp; 3599 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3600 path->slots[0]); 3601 if (key_type == tmp.type) 3602 first_offset = max(min_offset, tmp.offset) + 1; 3603 } 3604 goto done; 3605 } 3606 3607 /* go backward to find any previous key */ 3608 ret = btrfs_previous_item(root, path, ino, key_type); 3609 if (ret == 0) { 3610 struct btrfs_key tmp; 3611 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3612 if (key_type == tmp.type) { 3613 first_offset = tmp.offset; 3614 ret = overwrite_item(trans, log, dst_path, 3615 path->nodes[0], path->slots[0], 3616 &tmp); 3617 if (ret) { 3618 err = ret; 3619 goto done; 3620 } 3621 } 3622 } 3623 btrfs_release_path(path); 3624 3625 /* 3626 * Find the first key from this transaction again. 
 See the note for
	 * log_new_dir_dentries, if we're logging a directory recursively we
	 * won't be holding its i_mutex, which means we can modify the
	 * directory while we're logging it. If we remove an entry between
	 * our first search and this search we'll not find the key again and
	 * can just bail.
	 */
	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
	if (ret != 0)
		goto done;

	/*
	 * we have a block from this transaction, log every item in it
	 * from our directory
	 */
	while (1) {
		struct btrfs_key tmp;
		src = path->nodes[0];
		nritems = btrfs_header_nritems(src);
		for (i = path->slots[0]; i < nritems; i++) {
			struct btrfs_dir_item *di;

			btrfs_item_key_to_cpu(src, &min_key, i);

			if (min_key.objectid != ino || min_key.type != key_type)
				goto done;
			ret = overwrite_item(trans, log, dst_path, src, i,
					     &min_key);
			if (ret) {
				err = ret;
				goto done;
			}

			/*
			 * We must make sure that when we log a directory entry,
			 * the corresponding inode, after log replay, has a
			 * matching link count. For example:
			 *
			 * touch foo
			 * mkdir mydir
			 * sync
			 * ln foo mydir/bar
			 * xfs_io -c "fsync" mydir
			 * <crash>
			 * <mount fs and log replay>
			 *
			 * Would result in a fsync log that when replayed, our
			 * file inode would have a link count of 1, but we get
			 * two directory entries pointing to the same inode.
			 * After removing one of the names, it would not be
			 * possible to remove the other name, which always
			 * resulted in stale file handle errors, and would not
			 * be possible to rmdir the parent directory, since
			 * its i_size could never decrement to the value
			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
			 */
			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
			btrfs_dir_item_key_to_cpu(src, di, &tmp);
			if (ctx &&
			    (btrfs_dir_transid(src, di) == trans->transid ||
			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
			    tmp.type != BTRFS_ROOT_ITEM_KEY)
				ctx->log_new_dentries = true;
		}
		path->slots[0] = nritems;

		/*
		 * look ahead to the next item and see if it is also
		 * from this directory and from this transaction
		 */
		ret = btrfs_next_leaf(root, path);
		if (ret) {
			if (ret == 1)
				last_offset = (u64)-1;
			else
				err = ret;
			goto done;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (tmp.objectid != ino || tmp.type != key_type) {
			last_offset = (u64)-1;
			goto done;
		}
		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret)
				err = ret;
			else
				last_offset = tmp.offset;
			goto done;
		}
	}
done:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);

	if (err == 0) {
		*last_offset_ret = last_offset;
		/*
		 * insert the log range keys to indicate where the log
		 * is valid
		 */
		ret = insert_dir_log_key(trans, log, path, key_type,
					 ino, first_offset, last_offset);
		if (ret)
			err = ret;
	}
	return err;
}

/*
 * logging directories is very similar to logging inodes.  We find all
 * the items from the current transaction and write them to the log.
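 *
 * (Hypothetical illustration: if this transaction logged dir index keys
 * 10..14, log_dir_items() above emits a BTRFS_DIR_LOG_INDEX_KEY range
 * item whose interval starts at or below 10 and ends at or above 14,
 * possibly at (u64)-1 when no further entries exist.)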
3741 * 3742 * The recovery code scans the directory in the subvolume, and if it finds a 3743 * key in the range logged that is not present in the log tree, then it means 3744 * that dir entry was unlinked during the transaction. 3745 * 3746 * In order for that scan to work, we must include one key smaller than 3747 * the smallest logged by this transaction and one key larger than the largest 3748 * key logged by this transaction. 3749 */ 3750 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3751 struct btrfs_root *root, struct btrfs_inode *inode, 3752 struct btrfs_path *path, 3753 struct btrfs_path *dst_path, 3754 struct btrfs_log_ctx *ctx) 3755 { 3756 u64 min_key; 3757 u64 max_key; 3758 int ret; 3759 int key_type = BTRFS_DIR_ITEM_KEY; 3760 3761 again: 3762 min_key = 0; 3763 max_key = 0; 3764 while (1) { 3765 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3766 ctx, min_key, &max_key); 3767 if (ret) 3768 return ret; 3769 if (max_key == (u64)-1) 3770 break; 3771 min_key = max_key + 1; 3772 } 3773 3774 if (key_type == BTRFS_DIR_ITEM_KEY) { 3775 key_type = BTRFS_DIR_INDEX_KEY; 3776 goto again; 3777 } 3778 return 0; 3779 } 3780 3781 /* 3782 * a helper function to drop items from the log before we relog an 3783 * inode. max_key_type indicates the highest item type to remove. 3784 * This cannot be run for file data extents because it does not 3785 * free the extents they point to. 3786 */ 3787 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3788 struct btrfs_root *log, 3789 struct btrfs_path *path, 3790 u64 objectid, int max_key_type) 3791 { 3792 int ret; 3793 struct btrfs_key key; 3794 struct btrfs_key found_key; 3795 int start_slot; 3796 3797 key.objectid = objectid; 3798 key.type = max_key_type; 3799 key.offset = (u64)-1; 3800 3801 while (1) { 3802 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3803 BUG_ON(ret == 0); /* Logic error */ 3804 if (ret < 0) 3805 break; 3806 3807 if (path->slots[0] == 0) 3808 break; 3809 3810 path->slots[0]--; 3811 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3812 path->slots[0]); 3813 3814 if (found_key.objectid != objectid) 3815 break; 3816 3817 found_key.offset = 0; 3818 found_key.type = 0; 3819 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3820 &start_slot); 3821 if (ret < 0) 3822 break; 3823 3824 ret = btrfs_del_items(trans, log, path, start_slot, 3825 path->slots[0] - start_slot + 1); 3826 /* 3827 * If start slot isn't 0 then we don't need to re-search, we've 3828 * found the last guy with the objectid in this tree. 
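		 *
		 * (When start_slot is 0, items with this objectid may continue
		 * on a previous leaf, so we release the path and search again
		 * from (objectid, max_key_type, (u64)-1).)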
		 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	if (ret > 0)
		ret = 0;
	return ret;
}

static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only,
			    u64 logged_isize)
{
	struct btrfs_map_token token;

	btrfs_init_map_token(&token, leaf);

	if (log_inode_only) {
		/* set the generation to zero so the recovery code
		 * can tell the difference between a log entry made
		 * just to say 'this inode exists' and a log entry
		 * made to say 'update this inode with these values'
		 */
		btrfs_set_token_inode_generation(leaf, item, 0, &token);
		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
	} else {
		btrfs_set_token_inode_generation(leaf, item,
						 BTRFS_I(inode)->generation,
						 &token);
		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
	}

	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);

	btrfs_set_token_timespec_sec(leaf, &item->atime,
				     inode->i_atime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->atime,
				      inode->i_atime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->mtime,
				     inode->i_mtime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
				      inode->i_mtime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->ctime,
				     inode->i_ctime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
				      inode->i_ctime.tv_nsec, &token);

	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
				     &token);

	btrfs_set_token_inode_sequence(leaf, item,
				       inode_peek_iversion(inode), &token);
	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
}

static int log_inode_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log, struct btrfs_path *path,
			  struct btrfs_inode *inode)
{
	struct btrfs_inode_item *inode_item;
	int ret;

	ret = btrfs_insert_empty_item(trans, log, path,
				      &inode->location, sizeof(*inode_item));
	if (ret && ret != -EEXIST)
		return ret;
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
			0, 0);
	btrfs_release_path(path);
	return 0;
}

static int log_csums(struct btrfs_trans_handle *trans,
		     struct btrfs_root *log_root,
		     struct btrfs_ordered_sum *sums)
{
	int ret;

	/*
	 * Due to extent cloning, we might have logged a csum item that covers a
	 * subrange of a cloned extent, and later we can end up logging a csum
	 * item for a larger subrange of the same extent or the entire range.
3925 * This would leave csum items in the log tree that cover the same range 3926 * and break the searches for checksums in the log tree, resulting in 3927 * some checksums missing in the fs/subvolume tree. So just delete (or 3928 * trim and adjust) any existing csum items in the log for this range. 3929 */ 3930 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 3931 if (ret) 3932 return ret; 3933 3934 return btrfs_csum_file_blocks(trans, log_root, sums); 3935 } 3936 3937 static noinline int copy_items(struct btrfs_trans_handle *trans, 3938 struct btrfs_inode *inode, 3939 struct btrfs_path *dst_path, 3940 struct btrfs_path *src_path, 3941 int start_slot, int nr, int inode_only, 3942 u64 logged_isize) 3943 { 3944 struct btrfs_fs_info *fs_info = trans->fs_info; 3945 unsigned long src_offset; 3946 unsigned long dst_offset; 3947 struct btrfs_root *log = inode->root->log_root; 3948 struct btrfs_file_extent_item *extent; 3949 struct btrfs_inode_item *inode_item; 3950 struct extent_buffer *src = src_path->nodes[0]; 3951 int ret; 3952 struct btrfs_key *ins_keys; 3953 u32 *ins_sizes; 3954 char *ins_data; 3955 int i; 3956 struct list_head ordered_sums; 3957 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3958 3959 INIT_LIST_HEAD(&ordered_sums); 3960 3961 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3962 nr * sizeof(u32), GFP_NOFS); 3963 if (!ins_data) 3964 return -ENOMEM; 3965 3966 ins_sizes = (u32 *)ins_data; 3967 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3968 3969 for (i = 0; i < nr; i++) { 3970 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3971 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3972 } 3973 ret = btrfs_insert_empty_items(trans, log, dst_path, 3974 ins_keys, ins_sizes, nr); 3975 if (ret) { 3976 kfree(ins_data); 3977 return ret; 3978 } 3979 3980 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3981 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3982 dst_path->slots[0]); 3983 3984 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3985 3986 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3987 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3988 dst_path->slots[0], 3989 struct btrfs_inode_item); 3990 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3991 &inode->vfs_inode, 3992 inode_only == LOG_INODE_EXISTS, 3993 logged_isize); 3994 } else { 3995 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3996 src_offset, ins_sizes[i]); 3997 } 3998 3999 /* take a reference on file data extents so that truncates 4000 * or deletes of this inode don't have to relog the inode 4001 * again 4002 */ 4003 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 4004 !skip_csum) { 4005 int found_type; 4006 extent = btrfs_item_ptr(src, start_slot + i, 4007 struct btrfs_file_extent_item); 4008 4009 if (btrfs_file_extent_generation(src, extent) < trans->transid) 4010 continue; 4011 4012 found_type = btrfs_file_extent_type(src, extent); 4013 if (found_type == BTRFS_FILE_EXTENT_REG) { 4014 u64 ds, dl, cs, cl; 4015 ds = btrfs_file_extent_disk_bytenr(src, 4016 extent); 4017 /* ds == 0 is a hole */ 4018 if (ds == 0) 4019 continue; 4020 4021 dl = btrfs_file_extent_disk_num_bytes(src, 4022 extent); 4023 cs = btrfs_file_extent_offset(src, extent); 4024 cl = btrfs_file_extent_num_bytes(src, 4025 extent); 4026 if (btrfs_file_extent_compression(src, 4027 extent)) { 4028 cs = 0; 4029 cl = dl; 4030 } 4031 4032 ret = btrfs_lookup_csums_range( 4033 fs_info->csum_root, 4034 ds + cs, ds + cs + cl - 1, 4035 &ordered_sums, 0); 4036 if (ret) { 
4037 btrfs_release_path(dst_path); 4038 kfree(ins_data); 4039 return ret; 4040 } 4041 } 4042 } 4043 } 4044 4045 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4046 btrfs_release_path(dst_path); 4047 kfree(ins_data); 4048 4049 /* 4050 * we have to do this after the loop above to avoid changing the 4051 * log tree while trying to change the log tree. 4052 */ 4053 ret = 0; 4054 while (!list_empty(&ordered_sums)) { 4055 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4056 struct btrfs_ordered_sum, 4057 list); 4058 if (!ret) 4059 ret = log_csums(trans, log, sums); 4060 list_del(&sums->list); 4061 kfree(sums); 4062 } 4063 4064 return ret; 4065 } 4066 4067 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 4068 { 4069 struct extent_map *em1, *em2; 4070 4071 em1 = list_entry(a, struct extent_map, list); 4072 em2 = list_entry(b, struct extent_map, list); 4073 4074 if (em1->start < em2->start) 4075 return -1; 4076 else if (em1->start > em2->start) 4077 return 1; 4078 return 0; 4079 } 4080 4081 static int log_extent_csums(struct btrfs_trans_handle *trans, 4082 struct btrfs_inode *inode, 4083 struct btrfs_root *log_root, 4084 const struct extent_map *em) 4085 { 4086 u64 csum_offset; 4087 u64 csum_len; 4088 LIST_HEAD(ordered_sums); 4089 int ret = 0; 4090 4091 if (inode->flags & BTRFS_INODE_NODATASUM || 4092 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4093 em->block_start == EXTENT_MAP_HOLE) 4094 return 0; 4095 4096 /* If we're compressed we have to save the entire range of csums. */ 4097 if (em->compress_type) { 4098 csum_offset = 0; 4099 csum_len = max(em->block_len, em->orig_block_len); 4100 } else { 4101 csum_offset = em->mod_start - em->start; 4102 csum_len = em->mod_len; 4103 } 4104 4105 /* block start is already adjusted for the file extent offset. 
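 * (That is, em->block_start already accounts for the extent item's offset
 * field, so block_start + csum_offset is the start of the on-disk byte
 * range whose checksums we need to look up.)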
*/ 4106 ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, 4107 em->block_start + csum_offset, 4108 em->block_start + csum_offset + 4109 csum_len - 1, &ordered_sums, 0); 4110 if (ret) 4111 return ret; 4112 4113 while (!list_empty(&ordered_sums)) { 4114 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4115 struct btrfs_ordered_sum, 4116 list); 4117 if (!ret) 4118 ret = log_csums(trans, log_root, sums); 4119 list_del(&sums->list); 4120 kfree(sums); 4121 } 4122 4123 return ret; 4124 } 4125 4126 static int log_one_extent(struct btrfs_trans_handle *trans, 4127 struct btrfs_inode *inode, struct btrfs_root *root, 4128 const struct extent_map *em, 4129 struct btrfs_path *path, 4130 struct btrfs_log_ctx *ctx) 4131 { 4132 struct btrfs_root *log = root->log_root; 4133 struct btrfs_file_extent_item *fi; 4134 struct extent_buffer *leaf; 4135 struct btrfs_map_token token; 4136 struct btrfs_key key; 4137 u64 extent_offset = em->start - em->orig_start; 4138 u64 block_len; 4139 int ret; 4140 int extent_inserted = 0; 4141 4142 ret = log_extent_csums(trans, inode, log, em); 4143 if (ret) 4144 return ret; 4145 4146 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4147 em->start + em->len, NULL, 0, 1, 4148 sizeof(*fi), &extent_inserted); 4149 if (ret) 4150 return ret; 4151 4152 if (!extent_inserted) { 4153 key.objectid = btrfs_ino(inode); 4154 key.type = BTRFS_EXTENT_DATA_KEY; 4155 key.offset = em->start; 4156 4157 ret = btrfs_insert_empty_item(trans, log, path, &key, 4158 sizeof(*fi)); 4159 if (ret) 4160 return ret; 4161 } 4162 leaf = path->nodes[0]; 4163 btrfs_init_map_token(&token, leaf); 4164 fi = btrfs_item_ptr(leaf, path->slots[0], 4165 struct btrfs_file_extent_item); 4166 4167 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4168 &token); 4169 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4170 btrfs_set_token_file_extent_type(leaf, fi, 4171 BTRFS_FILE_EXTENT_PREALLOC, 4172 &token); 4173 else 4174 btrfs_set_token_file_extent_type(leaf, fi, 4175 BTRFS_FILE_EXTENT_REG, 4176 &token); 4177 4178 block_len = max(em->block_len, em->orig_block_len); 4179 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4180 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4181 em->block_start, 4182 &token); 4183 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4184 &token); 4185 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4186 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4187 em->block_start - 4188 extent_offset, &token); 4189 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4190 &token); 4191 } else { 4192 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4193 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4194 &token); 4195 } 4196 4197 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4198 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4199 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4200 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4201 &token); 4202 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4203 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4204 btrfs_mark_buffer_dirty(leaf); 4205 4206 btrfs_release_path(path); 4207 4208 return ret; 4209 } 4210 4211 /* 4212 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4213 * lose them after doing a fast fsync and replaying the log. 
We scan the 4214 * subvolume's root instead of iterating the inode's extent map tree because 4215 * otherwise we can log incorrect extent items based on extent map conversion. 4216 * That can happen due to the fact that extent maps are merged when they 4217 * are not in the extent map tree's list of modified extents. 4218 */ 4219 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4220 struct btrfs_inode *inode, 4221 struct btrfs_path *path) 4222 { 4223 struct btrfs_root *root = inode->root; 4224 struct btrfs_key key; 4225 const u64 i_size = i_size_read(&inode->vfs_inode); 4226 const u64 ino = btrfs_ino(inode); 4227 struct btrfs_path *dst_path = NULL; 4228 bool dropped_extents = false; 4229 int ins_nr = 0; 4230 int start_slot; 4231 int ret; 4232 4233 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4234 return 0; 4235 4236 key.objectid = ino; 4237 key.type = BTRFS_EXTENT_DATA_KEY; 4238 key.offset = i_size; 4239 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4240 if (ret < 0) 4241 goto out; 4242 4243 while (true) { 4244 struct extent_buffer *leaf = path->nodes[0]; 4245 int slot = path->slots[0]; 4246 4247 if (slot >= btrfs_header_nritems(leaf)) { 4248 if (ins_nr > 0) { 4249 ret = copy_items(trans, inode, dst_path, path, 4250 start_slot, ins_nr, 1, 0); 4251 if (ret < 0) 4252 goto out; 4253 ins_nr = 0; 4254 } 4255 ret = btrfs_next_leaf(root, path); 4256 if (ret < 0) 4257 goto out; 4258 if (ret > 0) { 4259 ret = 0; 4260 break; 4261 } 4262 continue; 4263 } 4264 4265 btrfs_item_key_to_cpu(leaf, &key, slot); 4266 if (key.objectid > ino) 4267 break; 4268 if (WARN_ON_ONCE(key.objectid < ino) || 4269 key.type < BTRFS_EXTENT_DATA_KEY || 4270 key.offset < i_size) { 4271 path->slots[0]++; 4272 continue; 4273 } 4274 if (!dropped_extents) { 4275 /* 4276 * Avoid logging extent items logged in past fsync calls 4277 * and leading to duplicate keys in the log tree. 4278 */ 4279 do { 4280 ret = btrfs_truncate_inode_items(trans, 4281 root->log_root, 4282 &inode->vfs_inode, 4283 i_size, 4284 BTRFS_EXTENT_DATA_KEY); 4285 } while (ret == -EAGAIN); 4286 if (ret) 4287 goto out; 4288 dropped_extents = true; 4289 } 4290 if (ins_nr == 0) 4291 start_slot = slot; 4292 ins_nr++; 4293 path->slots[0]++; 4294 if (!dst_path) { 4295 dst_path = btrfs_alloc_path(); 4296 if (!dst_path) { 4297 ret = -ENOMEM; 4298 goto out; 4299 } 4300 } 4301 } 4302 if (ins_nr > 0) { 4303 ret = copy_items(trans, inode, dst_path, path, 4304 start_slot, ins_nr, 1, 0); 4305 if (ret > 0) 4306 ret = 0; 4307 } 4308 out: 4309 btrfs_release_path(path); 4310 btrfs_free_path(dst_path); 4311 return ret; 4312 } 4313 4314 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4315 struct btrfs_root *root, 4316 struct btrfs_inode *inode, 4317 struct btrfs_path *path, 4318 struct btrfs_log_ctx *ctx, 4319 const u64 start, 4320 const u64 end) 4321 { 4322 struct extent_map *em, *n; 4323 struct list_head extents; 4324 struct extent_map_tree *tree = &inode->extent_tree; 4325 u64 test_gen; 4326 int ret = 0; 4327 int num = 0; 4328 4329 INIT_LIST_HEAD(&extents); 4330 4331 write_lock(&tree->lock); 4332 test_gen = root->fs_info->last_trans_committed; 4333 4334 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4335 /* 4336 * Skip extents outside our logging range. 
It's important to do
4337 		 * it for correctness because if we don't ignore them, we may
4338 		 * log them before their ordered extent completes, and therefore
4339 		 * we could log them without logging their respective checksums
4340 		 * (the checksum items are added to the csum tree at the very
4341 		 * end of btrfs_finish_ordered_io()). Also leave such extents
4342 		 * outside of our range in the list, since we may have another
4343 		 * ranged fsync in the near future that needs them. If an extent
4344 		 * outside our range corresponds to a hole, log it to avoid
4345 		 * leaving gaps between extents (fsck will complain when we are
4346 		 * not using the NO_HOLES feature).
4347 		 */
4348 		if ((em->start > end || em->start + em->len <= start) &&
4349 		    em->block_start != EXTENT_MAP_HOLE)
4350 			continue;
4351 
4352 		list_del_init(&em->list);
4353 		/*
4354 		 * Just an arbitrary number; this can get really CPU intensive
4355 		 * once we start getting a lot of extents, and once we have a
4356 		 * bunch of extents we just want to commit, since that will be
4357 		 * faster.
4358 		 */
4359 		if (num > 32768) {
4360 			list_del_init(&tree->modified_extents);
4361 			ret = -EFBIG;
4362 			goto process;
4363 		}
4364 
4365 		if (em->generation <= test_gen)
4366 			continue;
4367 
4368 		/* We log prealloc extents beyond eof later. */
4369 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4370 		    em->start >= i_size_read(&inode->vfs_inode))
4371 			continue;
4372 
4373 		/* Need a ref to keep it from getting evicted from cache */
4374 		refcount_inc(&em->refs);
4375 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4376 		list_add_tail(&em->list, &extents);
4377 		num++;
4378 	}
4379 
4380 	list_sort(NULL, &extents, extent_cmp);
4381 process:
4382 	while (!list_empty(&extents)) {
4383 		em = list_entry(extents.next, struct extent_map, list);
4384 
4385 		list_del_init(&em->list);
4386 
4387 		/*
4388 		 * If we had an error we just need to delete everybody from our
4389 		 * private list.
4390 		 */
4391 		if (ret) {
4392 			clear_em_logging(tree, em);
4393 			free_extent_map(em);
4394 			continue;
4395 		}
4396 
4397 		write_unlock(&tree->lock);
4398 
4399 		ret = log_one_extent(trans, inode, root, em, path, ctx);
4400 		write_lock(&tree->lock);
4401 		clear_em_logging(tree, em);
4402 		free_extent_map(em);
4403 	}
4404 	WARN_ON(!list_empty(&extents));
4405 	write_unlock(&tree->lock);
4406 
4407 	btrfs_release_path(path);
4408 	if (!ret)
4409 		ret = btrfs_log_prealloc_extents(trans, inode, path);
4410 
4411 	return ret;
4412 }
4413 
4414 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4415 			     struct btrfs_path *path, u64 *size_ret)
4416 {
4417 	struct btrfs_key key;
4418 	int ret;
4419 
4420 	key.objectid = btrfs_ino(inode);
4421 	key.type = BTRFS_INODE_ITEM_KEY;
4422 	key.offset = 0;
4423 
4424 	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4425 	if (ret < 0) {
4426 		return ret;
4427 	} else if (ret > 0) {
4428 		*size_ret = 0;
4429 	} else {
4430 		struct btrfs_inode_item *item;
4431 
4432 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4433 				      struct btrfs_inode_item);
4434 		*size_ret = btrfs_inode_size(path->nodes[0], item);
4435 		/*
4436 		 * If the in-memory inode's i_size is smaller than the inode
4437 		 * size stored in the btree, return the inode's i_size, so
4438 		 * that we get a correct inode size after replaying the log
4439 		 * when before a power failure we had a shrinking truncate
4440 		 * followed by addition of a new name (rename / new hard link).
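 * (Illustrative sequence: write 8K to a file and fsync it, truncate the
 * file to 4K, add a hard link, fsync again and then power fail - after
 * replaying the log we want an i_size of 4K, not the stale 8K that the
 * log's inode item still records.)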
4441 		 * Otherwise return the inode size from the btree, to avoid
4442 		 * data loss when replaying a log due to previously doing a
4443 		 * write that expands the inode's size and logging a new name
4444 		 * immediately after.
4445 		 */
4446 		if (*size_ret > inode->vfs_inode.i_size)
4447 			*size_ret = inode->vfs_inode.i_size;
4448 	}
4449 
4450 	btrfs_release_path(path);
4451 	return 0;
4452 }
4453 
4454 /*
4455  * At the moment we always log all xattrs. This is to figure out at log replay
4456  * time which xattrs must have their deletion replayed. If an xattr is missing
4457  * in the log tree and exists in the fs/subvol tree, we delete it. This is
4458  * because if an xattr is deleted, the inode is fsynced and then a power
4459  * failure happens, causing the log to be replayed the next time the fs is
4460  * mounted, we want the xattr to no longer exist (the same behaviour as other
4461  * filesystems with a journal: ext3/4, xfs, f2fs, etc).
4462  */
4463 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4464 				struct btrfs_root *root,
4465 				struct btrfs_inode *inode,
4466 				struct btrfs_path *path,
4467 				struct btrfs_path *dst_path)
4468 {
4469 	int ret;
4470 	struct btrfs_key key;
4471 	const u64 ino = btrfs_ino(inode);
4472 	int ins_nr = 0;
4473 	int start_slot = 0;
4474 
4475 	key.objectid = ino;
4476 	key.type = BTRFS_XATTR_ITEM_KEY;
4477 	key.offset = 0;
4478 
4479 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4480 	if (ret < 0)
4481 		return ret;
4482 
4483 	while (true) {
4484 		int slot = path->slots[0];
4485 		struct extent_buffer *leaf = path->nodes[0];
4486 		int nritems = btrfs_header_nritems(leaf);
4487 
4488 		if (slot >= nritems) {
4489 			if (ins_nr > 0) {
4490 				ret = copy_items(trans, inode, dst_path, path,
4491 						 start_slot, ins_nr, 1, 0);
4492 				if (ret < 0)
4493 					return ret;
4494 				ins_nr = 0;
4495 			}
4496 			ret = btrfs_next_leaf(root, path);
4497 			if (ret < 0)
4498 				return ret;
4499 			else if (ret > 0)
4500 				break;
4501 			continue;
4502 		}
4503 
4504 		btrfs_item_key_to_cpu(leaf, &key, slot);
4505 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4506 			break;
4507 
4508 		if (ins_nr == 0)
4509 			start_slot = slot;
4510 		ins_nr++;
4511 		path->slots[0]++;
4512 		cond_resched();
4513 	}
4514 	if (ins_nr > 0) {
4515 		ret = copy_items(trans, inode, dst_path, path,
4516 				 start_slot, ins_nr, 1, 0);
4517 		if (ret < 0)
4518 			return ret;
4519 	}
4520 
4521 	return 0;
4522 }
4523 
4524 /*
4525  * When using the NO_HOLES feature, if we punched a hole that causes the
4526  * deletion of entire leafs or all the extent items of the first leaf (the one
4527  * that contains the inode item and references) we may end up not processing
4528  * any extents, because there are no leafs with a generation matching the
4529  * current transaction that have extent items for our inode. So we need to find
4530  * out if any holes exist and then log them. We also need to log holes after
4531  * any truncate operation that changes the inode's size.
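 *
 * Illustrative reproducer (hypothetical path, NO_HOLES enabled):
 *
 *   xfs_io -f -c "pwrite 0 256K" /mnt/foo
 *   sync
 *   xfs_io -c "fpunch 0 256K" -c fsync /mnt/foo
 *   <power fail>
 *
 * The punch deletes all of the inode's extent items, so the fast fsync
 * finds no extent items from the current transaction to copy; unless we
 * log the hole explicitly, log replay would leave the old extents in
 * place.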
4532 */ 4533 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 4534 struct btrfs_root *root, 4535 struct btrfs_inode *inode, 4536 struct btrfs_path *path, 4537 const u64 start, 4538 const u64 end) 4539 { 4540 struct btrfs_fs_info *fs_info = root->fs_info; 4541 struct btrfs_key key; 4542 const u64 ino = btrfs_ino(inode); 4543 const u64 i_size = i_size_read(&inode->vfs_inode); 4544 u64 prev_extent_end = start; 4545 int ret; 4546 4547 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 4548 return 0; 4549 4550 key.objectid = ino; 4551 key.type = BTRFS_EXTENT_DATA_KEY; 4552 key.offset = start; 4553 4554 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4555 if (ret < 0) 4556 return ret; 4557 4558 if (ret > 0 && path->slots[0] > 0) { 4559 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); 4560 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) 4561 path->slots[0]--; 4562 } 4563 4564 while (true) { 4565 struct extent_buffer *leaf = path->nodes[0]; 4566 u64 extent_end; 4567 4568 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 4569 ret = btrfs_next_leaf(root, path); 4570 if (ret < 0) 4571 return ret; 4572 if (ret > 0) { 4573 ret = 0; 4574 break; 4575 } 4576 leaf = path->nodes[0]; 4577 } 4578 4579 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4580 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 4581 break; 4582 4583 extent_end = btrfs_file_extent_end(path); 4584 if (extent_end <= start) 4585 goto next_slot; 4586 4587 /* We have a hole, log it. */ 4588 if (prev_extent_end < key.offset) { 4589 u64 hole_len; 4590 4591 if (key.offset >= end) 4592 hole_len = end - prev_extent_end; 4593 else 4594 hole_len = key.offset - prev_extent_end; 4595 4596 /* 4597 * Release the path to avoid deadlocks with other code 4598 * paths that search the root while holding locks on 4599 * leafs from the log root. 4600 */ 4601 btrfs_release_path(path); 4602 ret = btrfs_insert_file_extent(trans, root->log_root, 4603 ino, prev_extent_end, 0, 4604 0, hole_len, 0, hole_len, 4605 0, 0, 0); 4606 if (ret < 0) 4607 return ret; 4608 4609 /* 4610 * Search for the same key again in the root. Since it's 4611 * an extent item and we are holding the inode lock, the 4612 * key must still exist. If it doesn't just emit warning 4613 * and return an error to fall back to a transaction 4614 * commit. 4615 */ 4616 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4617 if (ret < 0) 4618 return ret; 4619 if (WARN_ON(ret > 0)) 4620 return -ENOENT; 4621 leaf = path->nodes[0]; 4622 } 4623 4624 prev_extent_end = min(extent_end, end); 4625 if (extent_end >= end) 4626 break; 4627 next_slot: 4628 path->slots[0]++; 4629 cond_resched(); 4630 } 4631 4632 if (prev_extent_end < end && prev_extent_end < i_size) { 4633 u64 hole_len; 4634 4635 btrfs_release_path(path); 4636 hole_len = min(ALIGN(i_size, fs_info->sectorsize), end); 4637 hole_len -= prev_extent_end; 4638 ret = btrfs_insert_file_extent(trans, root->log_root, 4639 ino, prev_extent_end, 0, 0, 4640 hole_len, 0, hole_len, 4641 0, 0, 0); 4642 if (ret < 0) 4643 return ret; 4644 } 4645 4646 return 0; 4647 } 4648 4649 /* 4650 * When we are logging a new inode X, check if it doesn't have a reference that 4651 * matches the reference from some other inode Y created in a past transaction 4652 * and that was renamed in the current transaction. 
If we don't do this, then at 4653 * log replay time we can lose inode Y (and all its files if it's a directory): 4654 * 4655 * mkdir /mnt/x 4656 * echo "hello world" > /mnt/x/foobar 4657 * sync 4658 * mv /mnt/x /mnt/y 4659 * mkdir /mnt/x # or touch /mnt/x 4660 * xfs_io -c fsync /mnt/x 4661 * <power fail> 4662 * mount fs, trigger log replay 4663 * 4664 * After the log replay procedure, we would lose the first directory and all its 4665 * files (file foobar). 4666 * For the case where inode Y is not a directory we simply end up losing it: 4667 * 4668 * echo "123" > /mnt/foo 4669 * sync 4670 * mv /mnt/foo /mnt/bar 4671 * echo "abc" > /mnt/foo 4672 * xfs_io -c fsync /mnt/foo 4673 * <power fail> 4674 * 4675 * We also need this for cases where a snapshot entry is replaced by some other 4676 * entry (file or directory) otherwise we end up with an unreplayable log due to 4677 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4678 * if it were a regular entry: 4679 * 4680 * mkdir /mnt/x 4681 * btrfs subvolume snapshot /mnt /mnt/x/snap 4682 * btrfs subvolume delete /mnt/x/snap 4683 * rmdir /mnt/x 4684 * mkdir /mnt/x 4685 * fsync /mnt/x or fsync some new file inside it 4686 * <power fail> 4687 * 4688 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4689 * the same transaction. 4690 */ 4691 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4692 const int slot, 4693 const struct btrfs_key *key, 4694 struct btrfs_inode *inode, 4695 u64 *other_ino, u64 *other_parent) 4696 { 4697 int ret; 4698 struct btrfs_path *search_path; 4699 char *name = NULL; 4700 u32 name_len = 0; 4701 u32 item_size = btrfs_item_size_nr(eb, slot); 4702 u32 cur_offset = 0; 4703 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4704 4705 search_path = btrfs_alloc_path(); 4706 if (!search_path) 4707 return -ENOMEM; 4708 search_path->search_commit_root = 1; 4709 search_path->skip_locking = 1; 4710 4711 while (cur_offset < item_size) { 4712 u64 parent; 4713 u32 this_name_len; 4714 u32 this_len; 4715 unsigned long name_ptr; 4716 struct btrfs_dir_item *di; 4717 4718 if (key->type == BTRFS_INODE_REF_KEY) { 4719 struct btrfs_inode_ref *iref; 4720 4721 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4722 parent = key->offset; 4723 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4724 name_ptr = (unsigned long)(iref + 1); 4725 this_len = sizeof(*iref) + this_name_len; 4726 } else { 4727 struct btrfs_inode_extref *extref; 4728 4729 extref = (struct btrfs_inode_extref *)(ptr + 4730 cur_offset); 4731 parent = btrfs_inode_extref_parent(eb, extref); 4732 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4733 name_ptr = (unsigned long)&extref->name; 4734 this_len = sizeof(*extref) + this_name_len; 4735 } 4736 4737 if (this_name_len > name_len) { 4738 char *new_name; 4739 4740 new_name = krealloc(name, this_name_len, GFP_NOFS); 4741 if (!new_name) { 4742 ret = -ENOMEM; 4743 goto out; 4744 } 4745 name_len = this_name_len; 4746 name = new_name; 4747 } 4748 4749 read_extent_buffer(eb, name, name_ptr, this_name_len); 4750 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4751 parent, name, this_name_len, 0); 4752 if (di && !IS_ERR(di)) { 4753 struct btrfs_key di_key; 4754 4755 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4756 di, &di_key); 4757 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4758 if (di_key.objectid != key->objectid) { 4759 ret = 1; 4760 *other_ino = di_key.objectid; 4761 *other_parent = parent; 4762 } else { 4763 ret = 0; 4764 } 4765 
		} else {
4766 				ret = -EAGAIN;
4767 			}
4768 			goto out;
4769 		} else if (IS_ERR(di)) {
4770 			ret = PTR_ERR(di);
4771 			goto out;
4772 		}
4773 		btrfs_release_path(search_path);
4774 
4775 		cur_offset += this_len;
4776 	}
4777 	ret = 0;
4778 out:
4779 	btrfs_free_path(search_path);
4780 	kfree(name);
4781 	return ret;
4782 }
4783 
4784 struct btrfs_ino_list {
4785 	u64 ino;
4786 	u64 parent;
4787 	struct list_head list;
4788 };
4789 
4790 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
4791 				  struct btrfs_root *root,
4792 				  struct btrfs_path *path,
4793 				  struct btrfs_log_ctx *ctx,
4794 				  u64 ino, u64 parent)
4795 {
4796 	struct btrfs_ino_list *ino_elem;
4797 	LIST_HEAD(inode_list);
4798 	int ret = 0;
4799 
4800 	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4801 	if (!ino_elem)
4802 		return -ENOMEM;
4803 	ino_elem->ino = ino;
4804 	ino_elem->parent = parent;
4805 	list_add_tail(&ino_elem->list, &inode_list);
4806 
4807 	while (!list_empty(&inode_list)) {
4808 		struct btrfs_fs_info *fs_info = root->fs_info;
4809 		struct btrfs_key key;
4810 		struct inode *inode;
4811 
4812 		ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
4813 					    list);
4814 		ino = ino_elem->ino;
4815 		parent = ino_elem->parent;
4816 		list_del(&ino_elem->list);
4817 		kfree(ino_elem);
4818 		if (ret)
4819 			continue;
4820 
4821 		btrfs_release_path(path);
4822 
4823 		key.objectid = ino;
4824 		key.type = BTRFS_INODE_ITEM_KEY;
4825 		key.offset = 0;
4826 		inode = btrfs_iget(fs_info->sb, &key, root);
4827 		/*
4828 		 * If the other inode that had a conflicting dir entry was
4829 		 * deleted in the current transaction, we need to log its parent
4830 		 * directory.
4831 		 */
4832 		if (IS_ERR(inode)) {
4833 			ret = PTR_ERR(inode);
4834 			if (ret == -ENOENT) {
4835 				key.objectid = parent;
4836 				inode = btrfs_iget(fs_info->sb, &key, root);
4837 				if (IS_ERR(inode)) {
4838 					ret = PTR_ERR(inode);
4839 				} else {
4840 					ret = btrfs_log_inode(trans, root,
4841 						      BTRFS_I(inode),
4842 						      LOG_OTHER_INODE_ALL,
4843 						      0, LLONG_MAX, ctx);
4844 					btrfs_add_delayed_iput(inode);
4845 				}
4846 			}
4847 			continue;
4848 		}
4849 		/*
4850 		 * If the inode was already logged skip it - otherwise we can
4851 		 * hit an infinite loop. Example:
4852 		 *
4853 		 * From the commit root (previous transaction) we have the
4854 		 * following inodes:
4855 		 *
4856 		 * inode 257 a directory
4857 		 * inode 258 with references "zz" and "zz_link" on inode 257
4858 		 * inode 259 with reference "a" on inode 257
4859 		 *
4860 		 * And in the current (uncommitted) transaction we have:
4861 		 *
4862 		 * inode 257 a directory, unchanged
4863 		 * inode 258 with references "a" and "a2" on inode 257
4864 		 * inode 259 with reference "zz_link" on inode 257
4865 		 * inode 261 with reference "zz" on inode 257
4866 		 *
4867 		 * When logging inode 261 the following infinite loop could
4868 		 * happen if we don't skip already logged inodes:
4869 		 *
4870 		 * - we detect inode 258 as a conflicting inode, with inode 261
4871 		 *   on reference "zz", and log it;
4872 		 *
4873 		 * - we detect inode 259 as a conflicting inode, with inode 258
4874 		 *   on reference "a", and log it;
4875 		 *
4876 		 * - we detect inode 258 as a conflicting inode, with inode 259
4877 		 *   on reference "zz_link", and log it - again! After this we
4878 		 *   repeat the above steps forever.
4879 		 */
4880 		spin_lock(&BTRFS_I(inode)->lock);
4881 		/*
4882 		 * Check the inode's logged_trans only instead of
4883 		 * btrfs_inode_in_log(). This is because the last_log_commit of
4884 		 * the inode is not updated when we only log that it exists
4885 		 * and it has the full sync bit set (see btrfs_log_inode()).
4886 		 */
4887 		if (BTRFS_I(inode)->logged_trans == trans->transid) {
4888 			spin_unlock(&BTRFS_I(inode)->lock);
4889 			btrfs_add_delayed_iput(inode);
4890 			continue;
4891 		}
4892 		spin_unlock(&BTRFS_I(inode)->lock);
4893 		/*
4894 		 * We are safe logging the other inode without acquiring its
4895 		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
4896 		 * are safe against concurrent renames of the other inode as
4897 		 * well because during a rename we pin the log and update the
4898 		 * log with the new name before we unpin it.
4899 		 */
4900 		ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
4901 				      LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
4902 		if (ret) {
4903 			btrfs_add_delayed_iput(inode);
4904 			continue;
4905 		}
4906 
4907 		key.objectid = ino;
4908 		key.type = BTRFS_INODE_REF_KEY;
4909 		key.offset = 0;
4910 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4911 		if (ret < 0) {
4912 			btrfs_add_delayed_iput(inode);
4913 			continue;
4914 		}
4915 
4916 		while (true) {
4917 			struct extent_buffer *leaf = path->nodes[0];
4918 			int slot = path->slots[0];
4919 			u64 other_ino = 0;
4920 			u64 other_parent = 0;
4921 
4922 			if (slot >= btrfs_header_nritems(leaf)) {
4923 				ret = btrfs_next_leaf(root, path);
4924 				if (ret < 0) {
4925 					break;
4926 				} else if (ret > 0) {
4927 					ret = 0;
4928 					break;
4929 				}
4930 				continue;
4931 			}
4932 
4933 			btrfs_item_key_to_cpu(leaf, &key, slot);
4934 			if (key.objectid != ino ||
4935 			    (key.type != BTRFS_INODE_REF_KEY &&
4936 			     key.type != BTRFS_INODE_EXTREF_KEY)) {
4937 				ret = 0;
4938 				break;
4939 			}
4940 
4941 			ret = btrfs_check_ref_name_override(leaf, slot, &key,
4942 					BTRFS_I(inode), &other_ino,
4943 					&other_parent);
4944 			if (ret < 0)
4945 				break;
4946 			if (ret > 0) {
4947 				ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4948 				if (!ino_elem) {
4949 					ret = -ENOMEM;
4950 					break;
4951 				}
4952 				ino_elem->ino = other_ino;
4953 				ino_elem->parent = other_parent;
4954 				list_add_tail(&ino_elem->list, &inode_list);
4955 				ret = 0;
4956 			}
4957 			path->slots[0]++;
4958 		}
4959 		btrfs_add_delayed_iput(inode);
4960 	}
4961 
4962 	return ret;
4963 }
4964 
4965 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
4966 				   struct btrfs_inode *inode,
4967 				   struct btrfs_key *min_key,
4968 				   const struct btrfs_key *max_key,
4969 				   struct btrfs_path *path,
4970 				   struct btrfs_path *dst_path,
4971 				   const u64 logged_isize,
4972 				   const bool recursive_logging,
4973 				   const int inode_only,
4974 				   const u64 start,
4975 				   const u64 end,
4976 				   struct btrfs_log_ctx *ctx,
4977 				   bool *need_log_inode_item)
4978 {
4979 	struct btrfs_root *root = inode->root;
4980 	int ins_start_slot = 0;
4981 	int ins_nr = 0;
4982 	int ret;
4983 
4984 	/*
4985 	 * We must make sure we don't copy extent items that are entirely out of
4986 	 * the range [start, end - 1]. This is not just an optimization to avoid
4987 	 * copying but also needed to avoid a corruption where we end up with
4988 	 * file extent items in the log tree that have overlapping ranges - this
4989 	 * can happen if we race with ordered extent completion for ranges that
4990 	 * are outside our target range. For example we copy an extent item and
4991 	 * when we move to the next leaf, that extent was trimmed and a new one
4992 	 * covering a subrange of it, but with a higher key, was inserted - we
4993 	 * would then copy this other extent too, resulting in a log tree with
4994 	 * 2 extent items that represent overlapping ranges.
4995 	 *
4996 	 * We can copy the entire extents at the range boundaries however, even
4997 	 * if they cover an area outside the target range. That's ok.
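 *
 * Illustrative scenario (made-up offsets): say our range is [0, 256K)
 * and a leaf ends with an extent item for [1M, 2M), which we copy. While
 * we move to the next leaf, ordered extent completion trims that extent
 * down to [1M, 1M + 128K) and inserts a new item for [1M + 128K, 2M);
 * copying the new item too would make it overlap the stale [1M, 2M)
 * item we already copied into the log.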
4998 */ 4999 while (1) { 5000 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5001 if (ret < 0) 5002 return ret; 5003 if (ret > 0) { 5004 ret = 0; 5005 break; 5006 } 5007 again: 5008 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5009 if (min_key->objectid != max_key->objectid) 5010 break; 5011 if (min_key->type > max_key->type) 5012 break; 5013 5014 if (min_key->type == BTRFS_INODE_ITEM_KEY) 5015 *need_log_inode_item = false; 5016 5017 if ((min_key->type == BTRFS_INODE_REF_KEY || 5018 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5019 inode->generation == trans->transid && 5020 !recursive_logging) { 5021 u64 other_ino = 0; 5022 u64 other_parent = 0; 5023 5024 ret = btrfs_check_ref_name_override(path->nodes[0], 5025 path->slots[0], min_key, inode, 5026 &other_ino, &other_parent); 5027 if (ret < 0) { 5028 return ret; 5029 } else if (ret > 0 && ctx && 5030 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5031 if (ins_nr > 0) { 5032 ins_nr++; 5033 } else { 5034 ins_nr = 1; 5035 ins_start_slot = path->slots[0]; 5036 } 5037 ret = copy_items(trans, inode, dst_path, path, 5038 ins_start_slot, ins_nr, 5039 inode_only, logged_isize); 5040 if (ret < 0) 5041 return ret; 5042 ins_nr = 0; 5043 5044 ret = log_conflicting_inodes(trans, root, path, 5045 ctx, other_ino, other_parent); 5046 if (ret) 5047 return ret; 5048 btrfs_release_path(path); 5049 goto next_key; 5050 } 5051 } 5052 5053 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 5054 if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5055 if (ins_nr == 0) 5056 goto next_slot; 5057 ret = copy_items(trans, inode, dst_path, path, 5058 ins_start_slot, 5059 ins_nr, inode_only, logged_isize); 5060 if (ret < 0) 5061 return ret; 5062 ins_nr = 0; 5063 goto next_slot; 5064 } 5065 5066 if (min_key->type == BTRFS_EXTENT_DATA_KEY) { 5067 const u64 extent_end = btrfs_file_extent_end(path); 5068 5069 if (extent_end <= start) { 5070 if (ins_nr > 0) { 5071 ret = copy_items(trans, inode, dst_path, 5072 path, ins_start_slot, 5073 ins_nr, inode_only, 5074 logged_isize); 5075 if (ret < 0) 5076 return ret; 5077 ins_nr = 0; 5078 } 5079 goto next_slot; 5080 } 5081 if (extent_end >= end) { 5082 ins_nr++; 5083 if (ins_nr == 1) 5084 ins_start_slot = path->slots[0]; 5085 break; 5086 } 5087 } 5088 5089 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5090 ins_nr++; 5091 goto next_slot; 5092 } else if (!ins_nr) { 5093 ins_start_slot = path->slots[0]; 5094 ins_nr = 1; 5095 goto next_slot; 5096 } 5097 5098 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5099 ins_nr, inode_only, logged_isize); 5100 if (ret < 0) 5101 return ret; 5102 ins_nr = 1; 5103 ins_start_slot = path->slots[0]; 5104 next_slot: 5105 path->slots[0]++; 5106 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5107 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5108 path->slots[0]); 5109 goto again; 5110 } 5111 if (ins_nr) { 5112 ret = copy_items(trans, inode, dst_path, path, 5113 ins_start_slot, ins_nr, inode_only, 5114 logged_isize); 5115 if (ret < 0) 5116 return ret; 5117 ins_nr = 0; 5118 } 5119 btrfs_release_path(path); 5120 next_key: 5121 if (min_key->offset < (u64)-1) { 5122 min_key->offset++; 5123 } else if (min_key->type < max_key->type) { 5124 min_key->type++; 5125 min_key->offset = 0; 5126 } else { 5127 break; 5128 } 5129 } 5130 if (ins_nr) 5131 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5132 ins_nr, inode_only, logged_isize); 5133 5134 return ret; 5135 } 5136 5137 /* log a single inode in the tree log. 
5138 * At least one parent directory for this inode must exist in the tree 5139 * or be logged already. 5140 * 5141 * Any items from this inode changed by the current transaction are copied 5142 * to the log tree. An extra reference is taken on any extents in this 5143 * file, allowing us to avoid a whole pile of corner cases around logging 5144 * blocks that have been removed from the tree. 5145 * 5146 * See LOG_INODE_ALL and related defines for a description of what inode_only 5147 * does. 5148 * 5149 * This handles both files and directories. 5150 */ 5151 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5152 struct btrfs_root *root, struct btrfs_inode *inode, 5153 int inode_only, 5154 u64 start, 5155 u64 end, 5156 struct btrfs_log_ctx *ctx) 5157 { 5158 struct btrfs_fs_info *fs_info = root->fs_info; 5159 struct btrfs_path *path; 5160 struct btrfs_path *dst_path; 5161 struct btrfs_key min_key; 5162 struct btrfs_key max_key; 5163 struct btrfs_root *log = root->log_root; 5164 int err = 0; 5165 int ret; 5166 bool fast_search = false; 5167 u64 ino = btrfs_ino(inode); 5168 struct extent_map_tree *em_tree = &inode->extent_tree; 5169 u64 logged_isize = 0; 5170 bool need_log_inode_item = true; 5171 bool xattrs_logged = false; 5172 bool recursive_logging = false; 5173 5174 path = btrfs_alloc_path(); 5175 if (!path) 5176 return -ENOMEM; 5177 dst_path = btrfs_alloc_path(); 5178 if (!dst_path) { 5179 btrfs_free_path(path); 5180 return -ENOMEM; 5181 } 5182 5183 start = ALIGN_DOWN(start, fs_info->sectorsize); 5184 end = ALIGN(end, fs_info->sectorsize); 5185 5186 min_key.objectid = ino; 5187 min_key.type = BTRFS_INODE_ITEM_KEY; 5188 min_key.offset = 0; 5189 5190 max_key.objectid = ino; 5191 5192 5193 /* today the code can only do partial logging of directories */ 5194 if (S_ISDIR(inode->vfs_inode.i_mode) || 5195 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5196 &inode->runtime_flags) && 5197 inode_only >= LOG_INODE_EXISTS)) 5198 max_key.type = BTRFS_XATTR_ITEM_KEY; 5199 else 5200 max_key.type = (u8)-1; 5201 max_key.offset = (u64)-1; 5202 5203 /* 5204 * Only run delayed items if we are a dir or a new file. 5205 * Otherwise commit the delayed inode only, which is needed in 5206 * order for the log replay code to mark inodes for link count 5207 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 5208 */ 5209 if (S_ISDIR(inode->vfs_inode.i_mode) || 5210 inode->generation > fs_info->last_trans_committed) 5211 ret = btrfs_commit_inode_delayed_items(trans, inode); 5212 else 5213 ret = btrfs_commit_inode_delayed_inode(inode); 5214 5215 if (ret) { 5216 btrfs_free_path(path); 5217 btrfs_free_path(dst_path); 5218 return ret; 5219 } 5220 5221 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { 5222 recursive_logging = true; 5223 if (inode_only == LOG_OTHER_INODE) 5224 inode_only = LOG_INODE_EXISTS; 5225 else 5226 inode_only = LOG_INODE_ALL; 5227 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 5228 } else { 5229 mutex_lock(&inode->log_mutex); 5230 } 5231 5232 /* 5233 * a brute force approach to making sure we get the most uptodate 5234 * copies of everything. 
5235 */ 5236 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5237 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 5238 5239 if (inode_only == LOG_INODE_EXISTS) 5240 max_key_type = BTRFS_XATTR_ITEM_KEY; 5241 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 5242 } else { 5243 if (inode_only == LOG_INODE_EXISTS) { 5244 /* 5245 * Make sure the new inode item we write to the log has 5246 * the same isize as the current one (if it exists). 5247 * This is necessary to prevent data loss after log 5248 * replay, and also to prevent doing a wrong expanding 5249 * truncate - for e.g. create file, write 4K into offset 5250 * 0, fsync, write 4K into offset 4096, add hard link, 5251 * fsync some other file (to sync log), power fail - if 5252 * we use the inode's current i_size, after log replay 5253 * we get a 8Kb file, with the last 4Kb extent as a hole 5254 * (zeroes), as if an expanding truncate happened, 5255 * instead of getting a file of 4Kb only. 5256 */ 5257 err = logged_inode_size(log, inode, path, &logged_isize); 5258 if (err) 5259 goto out_unlock; 5260 } 5261 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5262 &inode->runtime_flags)) { 5263 if (inode_only == LOG_INODE_EXISTS) { 5264 max_key.type = BTRFS_XATTR_ITEM_KEY; 5265 ret = drop_objectid_items(trans, log, path, ino, 5266 max_key.type); 5267 } else { 5268 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5269 &inode->runtime_flags); 5270 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5271 &inode->runtime_flags); 5272 while(1) { 5273 ret = btrfs_truncate_inode_items(trans, 5274 log, &inode->vfs_inode, 0, 0); 5275 if (ret != -EAGAIN) 5276 break; 5277 } 5278 } 5279 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5280 &inode->runtime_flags) || 5281 inode_only == LOG_INODE_EXISTS) { 5282 if (inode_only == LOG_INODE_ALL) 5283 fast_search = true; 5284 max_key.type = BTRFS_XATTR_ITEM_KEY; 5285 ret = drop_objectid_items(trans, log, path, ino, 5286 max_key.type); 5287 } else { 5288 if (inode_only == LOG_INODE_ALL) 5289 fast_search = true; 5290 goto log_extents; 5291 } 5292 5293 } 5294 if (ret) { 5295 err = ret; 5296 goto out_unlock; 5297 } 5298 5299 err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5300 path, dst_path, logged_isize, 5301 recursive_logging, inode_only, 5302 start, end, ctx, &need_log_inode_item); 5303 if (err) 5304 goto out_unlock; 5305 5306 btrfs_release_path(path); 5307 btrfs_release_path(dst_path); 5308 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5309 if (err) 5310 goto out_unlock; 5311 xattrs_logged = true; 5312 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5313 btrfs_release_path(path); 5314 btrfs_release_path(dst_path); 5315 err = btrfs_log_holes(trans, root, inode, path, start, end); 5316 if (err) 5317 goto out_unlock; 5318 } 5319 log_extents: 5320 btrfs_release_path(path); 5321 btrfs_release_path(dst_path); 5322 if (need_log_inode_item) { 5323 err = log_inode_item(trans, log, dst_path, inode); 5324 if (!err && !xattrs_logged) { 5325 err = btrfs_log_all_xattrs(trans, root, inode, path, 5326 dst_path); 5327 btrfs_release_path(path); 5328 } 5329 if (err) 5330 goto out_unlock; 5331 } 5332 if (fast_search) { 5333 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5334 ctx, start, end); 5335 if (ret) { 5336 err = ret; 5337 goto out_unlock; 5338 } 5339 } else if (inode_only == LOG_INODE_ALL) { 5340 struct extent_map *em, *n; 5341 5342 write_lock(&em_tree->lock); 5343 /* 5344 * We can't just remove every em if we're called for a ranged 5345 * fsync - that is, one that doesn't cover 
the whole possible 5346 * file range (0 to LLONG_MAX). This is because we can have 5347 * em's that fall outside the range we're logging and therefore 5348 * their ordered operations haven't completed yet 5349 * (btrfs_finish_ordered_io() not invoked yet). This means we 5350 * didn't get their respective file extent item in the fs/subvol 5351 * tree yet, and need to let the next fast fsync (one which 5352 * consults the list of modified extent maps) find the em so 5353 * that it logs a matching file extent item and waits for the 5354 * respective ordered operation to complete (if it's still 5355 * running). 5356 * 5357 * Removing every em outside the range we're logging would make 5358 * the next fast fsync not log their matching file extent items, 5359 * therefore making us lose data after a log replay. 5360 */ 5361 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5362 list) { 5363 const u64 mod_end = em->mod_start + em->mod_len - 1; 5364 5365 if (em->mod_start >= start && mod_end <= end) 5366 list_del_init(&em->list); 5367 } 5368 write_unlock(&em_tree->lock); 5369 } 5370 5371 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5372 ret = log_directory_changes(trans, root, inode, path, dst_path, 5373 ctx); 5374 if (ret) { 5375 err = ret; 5376 goto out_unlock; 5377 } 5378 } 5379 5380 /* 5381 * Don't update last_log_commit if we logged that an inode exists after 5382 * it was loaded to memory (full_sync bit set). 5383 * This is to prevent data loss when we do a write to the inode, then 5384 * the inode gets evicted after all delalloc was flushed, then we log 5385 * it exists (due to a rename for example) and then fsync it. This last 5386 * fsync would do nothing (not logging the extents previously written). 5387 */ 5388 spin_lock(&inode->lock); 5389 inode->logged_trans = trans->transid; 5390 if (inode_only != LOG_INODE_EXISTS || 5391 !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) 5392 inode->last_log_commit = inode->last_sub_trans; 5393 spin_unlock(&inode->lock); 5394 out_unlock: 5395 mutex_unlock(&inode->log_mutex); 5396 5397 btrfs_free_path(path); 5398 btrfs_free_path(dst_path); 5399 return err; 5400 } 5401 5402 /* 5403 * Check if we must fallback to a transaction commit when logging an inode. 5404 * This must be called after logging the inode and is used only in the context 5405 * when fsyncing an inode requires the need to log some other inode - in which 5406 * case we can't lock the i_mutex of each other inode we need to log as that 5407 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5408 * log inodes up or down in the hierarchy) or rename operations for example. So 5409 * we take the log_mutex of the inode after we have logged it and then check for 5410 * its last_unlink_trans value - this is safe because any task setting 5411 * last_unlink_trans must take the log_mutex and it must do this before it does 5412 * the actual unlink operation, so if we do this check before a concurrent task 5413 * sets last_unlink_trans it means we've logged a consistent version/state of 5414 * all the inode items, otherwise we are not sure and must do a transaction 5415 * commit (the concurrent task might have only updated last_unlink_trans before 5416 * we logged the inode or it might have also done the unlink). 
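 *
 * Illustrative interleaving (tasks A and B in the same transaction):
 *
 *   A: logs inode X while fsyncing some other inode that references X
 *   B: takes X's log_mutex, sets X->last_unlink_trans and then does the
 *      unlink
 *   A: takes X's log_mutex, sees last_unlink_trans above the last
 *      committed transaction and falls back to a transaction commit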
5417  */
5418 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5419 					  struct btrfs_inode *inode)
5420 {
5421 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
5422 	bool ret = false;
5423 
5424 	mutex_lock(&inode->log_mutex);
5425 	if (inode->last_unlink_trans > fs_info->last_trans_committed) {
5426 		/*
5427 		 * Make sure any commits to the log are forced to be full
5428 		 * commits.
5429 		 */
5430 		btrfs_set_log_full_commit(trans);
5431 		ret = true;
5432 	}
5433 	mutex_unlock(&inode->log_mutex);
5434 
5435 	return ret;
5436 }
5437 
5438 /*
5439  * follow the dentry parent pointers up the chain and see if any
5440  * of the directories in it require a full commit before they can
5441  * be logged. Returns zero if nothing special needs to be done or 1 if
5442  * a full commit is required.
5443  */
5444 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
5445 					       struct btrfs_inode *inode,
5446 					       struct dentry *parent,
5447 					       struct super_block *sb,
5448 					       u64 last_committed)
5449 {
5450 	int ret = 0;
5451 	struct dentry *old_parent = NULL;
5452 
5453 	/*
5454 	 * for regular files, if its inode is already on disk, we don't
5455 	 * have to worry about the parents at all. This is because
5456 	 * we can use the last_unlink_trans field to record renames
5457 	 * and other fun in this file.
5458 	 */
5459 	if (S_ISREG(inode->vfs_inode.i_mode) &&
5460 	    inode->generation <= last_committed &&
5461 	    inode->last_unlink_trans <= last_committed)
5462 		goto out;
5463 
5464 	if (!S_ISDIR(inode->vfs_inode.i_mode)) {
5465 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5466 			goto out;
5467 		inode = BTRFS_I(d_inode(parent));
5468 	}
5469 
5470 	while (1) {
5471 		if (btrfs_must_commit_transaction(trans, inode)) {
5472 			ret = 1;
5473 			break;
5474 		}
5475 
5476 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5477 			break;
5478 
5479 		if (IS_ROOT(parent)) {
5480 			inode = BTRFS_I(d_inode(parent));
5481 			if (btrfs_must_commit_transaction(trans, inode))
5482 				ret = 1;
5483 			break;
5484 		}
5485 
5486 		parent = dget_parent(parent);
5487 		dput(old_parent);
5488 		old_parent = parent;
5489 		inode = BTRFS_I(d_inode(parent));
5490 
5491 	}
5492 	dput(old_parent);
5493 out:
5494 	return ret;
5495 }
5496 
5497 struct btrfs_dir_list {
5498 	u64 ino;
5499 	struct list_head list;
5500 };
5501 
5502 /*
5503  * Log the inodes of the new dentries of a directory. See log_dir_items() for
5504  * details about why it is needed.
5505  * This is a recursive operation - if an existing dentry corresponds to a
5506  * directory, that directory's new entries are logged too (same behaviour as
5507  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5508  * the dentries point to we do not lock their i_mutex, otherwise lockdep
5509  * complains about the following circular lock dependency / possible deadlock:
5510  *
5511  *           CPU0                                        CPU1
5512  *           ----                                        ----
5513  * lock(&type->i_mutex_dir_key#3/2);
5514  *                                             lock(sb_internal#2);
5515  *                                             lock(&type->i_mutex_dir_key#3/2);
5516  * lock(&sb->s_type->i_mutex_key#14);
5517  *
5518  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5519  * sb_start_intwrite() in btrfs_start_transaction().
5520  * Not locking i_mutex of the inodes is still safe because:
5521  *
5522  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5523  *    that while logging the inode new references (names) are added or removed
5524  *    from the inode, leaving the logged inode item with a link count that does
5525  *    not match the number of logged inode reference items.
This is fine because 5526 * at log replay time we compute the real number of links and correct the 5527 * link count in the inode item (see replay_one_buffer() and 5528 * link_to_fixup_dir()); 5529 * 5530 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5531 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5532 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5533 * has a size that doesn't match the sum of the lengths of all the logged 5534 * names. This does not result in a problem because if a dir_item key is 5535 * logged but its matching dir_index key is not logged, at log replay time we 5536 * don't use it to replay the respective name (see replay_one_name()). On the 5537 * other hand if only the dir_index key ends up being logged, the respective 5538 * name is added to the fs/subvol tree with both the dir_item and dir_index 5539 * keys created (see replay_one_name()). 5540 * The directory's inode item with a wrong i_size is not a problem as well, 5541 * since we don't use it at log replay time to set the i_size in the inode 5542 * item of the fs/subvol tree (see overwrite_item()). 5543 */ 5544 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5545 struct btrfs_root *root, 5546 struct btrfs_inode *start_inode, 5547 struct btrfs_log_ctx *ctx) 5548 { 5549 struct btrfs_fs_info *fs_info = root->fs_info; 5550 struct btrfs_root *log = root->log_root; 5551 struct btrfs_path *path; 5552 LIST_HEAD(dir_list); 5553 struct btrfs_dir_list *dir_elem; 5554 int ret = 0; 5555 5556 path = btrfs_alloc_path(); 5557 if (!path) 5558 return -ENOMEM; 5559 5560 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5561 if (!dir_elem) { 5562 btrfs_free_path(path); 5563 return -ENOMEM; 5564 } 5565 dir_elem->ino = btrfs_ino(start_inode); 5566 list_add_tail(&dir_elem->list, &dir_list); 5567 5568 while (!list_empty(&dir_list)) { 5569 struct extent_buffer *leaf; 5570 struct btrfs_key min_key; 5571 int nritems; 5572 int i; 5573 5574 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5575 list); 5576 if (ret) 5577 goto next_dir_inode; 5578 5579 min_key.objectid = dir_elem->ino; 5580 min_key.type = BTRFS_DIR_ITEM_KEY; 5581 min_key.offset = 0; 5582 again: 5583 btrfs_release_path(path); 5584 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5585 if (ret < 0) { 5586 goto next_dir_inode; 5587 } else if (ret > 0) { 5588 ret = 0; 5589 goto next_dir_inode; 5590 } 5591 5592 process_leaf: 5593 leaf = path->nodes[0]; 5594 nritems = btrfs_header_nritems(leaf); 5595 for (i = path->slots[0]; i < nritems; i++) { 5596 struct btrfs_dir_item *di; 5597 struct btrfs_key di_key; 5598 struct inode *di_inode; 5599 struct btrfs_dir_list *new_dir_elem; 5600 int log_mode = LOG_INODE_EXISTS; 5601 int type; 5602 5603 btrfs_item_key_to_cpu(leaf, &min_key, i); 5604 if (min_key.objectid != dir_elem->ino || 5605 min_key.type != BTRFS_DIR_ITEM_KEY) 5606 goto next_dir_inode; 5607 5608 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5609 type = btrfs_dir_type(leaf, di); 5610 if (btrfs_dir_transid(leaf, di) < trans->transid && 5611 type != BTRFS_FT_DIR) 5612 continue; 5613 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5614 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5615 continue; 5616 5617 btrfs_release_path(path); 5618 di_inode = btrfs_iget(fs_info->sb, &di_key, root); 5619 if (IS_ERR(di_inode)) { 5620 ret = PTR_ERR(di_inode); 5621 goto next_dir_inode; 5622 } 5623 5624 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 
5625 btrfs_add_delayed_iput(di_inode); 5626 break; 5627 } 5628 5629 ctx->log_new_dentries = false; 5630 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5631 log_mode = LOG_INODE_ALL; 5632 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5633 log_mode, 0, LLONG_MAX, ctx); 5634 if (!ret && 5635 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5636 ret = 1; 5637 btrfs_add_delayed_iput(di_inode); 5638 if (ret) 5639 goto next_dir_inode; 5640 if (ctx->log_new_dentries) { 5641 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5642 GFP_NOFS); 5643 if (!new_dir_elem) { 5644 ret = -ENOMEM; 5645 goto next_dir_inode; 5646 } 5647 new_dir_elem->ino = di_key.objectid; 5648 list_add_tail(&new_dir_elem->list, &dir_list); 5649 } 5650 break; 5651 } 5652 if (i == nritems) { 5653 ret = btrfs_next_leaf(log, path); 5654 if (ret < 0) { 5655 goto next_dir_inode; 5656 } else if (ret > 0) { 5657 ret = 0; 5658 goto next_dir_inode; 5659 } 5660 goto process_leaf; 5661 } 5662 if (min_key.offset < (u64)-1) { 5663 min_key.offset++; 5664 goto again; 5665 } 5666 next_dir_inode: 5667 list_del(&dir_elem->list); 5668 kfree(dir_elem); 5669 } 5670 5671 btrfs_free_path(path); 5672 return ret; 5673 } 5674 5675 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5676 struct btrfs_inode *inode, 5677 struct btrfs_log_ctx *ctx) 5678 { 5679 struct btrfs_fs_info *fs_info = trans->fs_info; 5680 int ret; 5681 struct btrfs_path *path; 5682 struct btrfs_key key; 5683 struct btrfs_root *root = inode->root; 5684 const u64 ino = btrfs_ino(inode); 5685 5686 path = btrfs_alloc_path(); 5687 if (!path) 5688 return -ENOMEM; 5689 path->skip_locking = 1; 5690 path->search_commit_root = 1; 5691 5692 key.objectid = ino; 5693 key.type = BTRFS_INODE_REF_KEY; 5694 key.offset = 0; 5695 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5696 if (ret < 0) 5697 goto out; 5698 5699 while (true) { 5700 struct extent_buffer *leaf = path->nodes[0]; 5701 int slot = path->slots[0]; 5702 u32 cur_offset = 0; 5703 u32 item_size; 5704 unsigned long ptr; 5705 5706 if (slot >= btrfs_header_nritems(leaf)) { 5707 ret = btrfs_next_leaf(root, path); 5708 if (ret < 0) 5709 goto out; 5710 else if (ret > 0) 5711 break; 5712 continue; 5713 } 5714 5715 btrfs_item_key_to_cpu(leaf, &key, slot); 5716 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5717 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5718 break; 5719 5720 item_size = btrfs_item_size_nr(leaf, slot); 5721 ptr = btrfs_item_ptr_offset(leaf, slot); 5722 while (cur_offset < item_size) { 5723 struct btrfs_key inode_key; 5724 struct inode *dir_inode; 5725 5726 inode_key.type = BTRFS_INODE_ITEM_KEY; 5727 inode_key.offset = 0; 5728 5729 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5730 struct btrfs_inode_extref *extref; 5731 5732 extref = (struct btrfs_inode_extref *) 5733 (ptr + cur_offset); 5734 inode_key.objectid = btrfs_inode_extref_parent( 5735 leaf, extref); 5736 cur_offset += sizeof(*extref); 5737 cur_offset += btrfs_inode_extref_name_len(leaf, 5738 extref); 5739 } else { 5740 inode_key.objectid = key.offset; 5741 cur_offset = item_size; 5742 } 5743 5744 dir_inode = btrfs_iget(fs_info->sb, &inode_key, root); 5745 /* 5746 * If the parent inode was deleted, return an error to 5747 * fallback to a transaction commit. This is to prevent 5748 * getting an inode that was moved from one parent A to 5749 * a parent B, got its former parent A deleted and then 5750 * it got fsync'ed, from existing at both parents after 5751 * a log replay (and the old parent still existing). 
5752 * Example: 5753 * 5754 * mkdir /mnt/A 5755 * mkdir /mnt/B 5756 * touch /mnt/B/bar 5757 * sync 5758 * mv /mnt/B/bar /mnt/A/bar 5759 * mv -T /mnt/A /mnt/B 5760 * fsync /mnt/B/bar 5761 * <power fail> 5762 * 5763 * If we ignore the old parent B which got deleted, 5764 * after a log replay we would have file bar linked 5765 * at both parents and the old parent B would still 5766 * exist. 5767 */ 5768 if (IS_ERR(dir_inode)) { 5769 ret = PTR_ERR(dir_inode); 5770 goto out; 5771 } 5772 5773 if (ctx) 5774 ctx->log_new_dentries = false; 5775 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5776 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5777 if (!ret && 5778 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5779 ret = 1; 5780 if (!ret && ctx && ctx->log_new_dentries) 5781 ret = log_new_dir_dentries(trans, root, 5782 BTRFS_I(dir_inode), ctx); 5783 btrfs_add_delayed_iput(dir_inode); 5784 if (ret) 5785 goto out; 5786 } 5787 path->slots[0]++; 5788 } 5789 ret = 0; 5790 out: 5791 btrfs_free_path(path); 5792 return ret; 5793 } 5794 5795 static int log_new_ancestors(struct btrfs_trans_handle *trans, 5796 struct btrfs_root *root, 5797 struct btrfs_path *path, 5798 struct btrfs_log_ctx *ctx) 5799 { 5800 struct btrfs_key found_key; 5801 5802 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 5803 5804 while (true) { 5805 struct btrfs_fs_info *fs_info = root->fs_info; 5806 const u64 last_committed = fs_info->last_trans_committed; 5807 struct extent_buffer *leaf = path->nodes[0]; 5808 int slot = path->slots[0]; 5809 struct btrfs_key search_key; 5810 struct inode *inode; 5811 int ret = 0; 5812 5813 btrfs_release_path(path); 5814 5815 search_key.objectid = found_key.offset; 5816 search_key.type = BTRFS_INODE_ITEM_KEY; 5817 search_key.offset = 0; 5818 inode = btrfs_iget(fs_info->sb, &search_key, root); 5819 if (IS_ERR(inode)) 5820 return PTR_ERR(inode); 5821 5822 if (BTRFS_I(inode)->generation > last_committed) 5823 ret = btrfs_log_inode(trans, root, BTRFS_I(inode), 5824 LOG_INODE_EXISTS, 5825 0, LLONG_MAX, ctx); 5826 btrfs_add_delayed_iput(inode); 5827 if (ret) 5828 return ret; 5829 5830 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 5831 break; 5832 5833 search_key.type = BTRFS_INODE_REF_KEY; 5834 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 5835 if (ret < 0) 5836 return ret; 5837 5838 leaf = path->nodes[0]; 5839 slot = path->slots[0]; 5840 if (slot >= btrfs_header_nritems(leaf)) { 5841 ret = btrfs_next_leaf(root, path); 5842 if (ret < 0) 5843 return ret; 5844 else if (ret > 0) 5845 return -ENOENT; 5846 leaf = path->nodes[0]; 5847 slot = path->slots[0]; 5848 } 5849 5850 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5851 if (found_key.objectid != search_key.objectid || 5852 found_key.type != BTRFS_INODE_REF_KEY) 5853 return -ENOENT; 5854 } 5855 return 0; 5856 } 5857 5858 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 5859 struct btrfs_inode *inode, 5860 struct dentry *parent, 5861 struct btrfs_log_ctx *ctx) 5862 { 5863 struct btrfs_root *root = inode->root; 5864 struct btrfs_fs_info *fs_info = root->fs_info; 5865 struct dentry *old_parent = NULL; 5866 struct super_block *sb = inode->vfs_inode.i_sb; 5867 int ret = 0; 5868 5869 while (true) { 5870 if (!parent || d_really_is_negative(parent) || 5871 sb != parent->d_sb) 5872 break; 5873 5874 inode = BTRFS_I(d_inode(parent)); 5875 if (root != inode->root) 5876 break; 5877 5878 if (inode->generation > fs_info->last_trans_committed) { 5879 ret = btrfs_log_inode(trans, root, inode, 5880 
LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5881 if (ret) 5882 break; 5883 } 5884 if (IS_ROOT(parent)) 5885 break; 5886 5887 parent = dget_parent(parent); 5888 dput(old_parent); 5889 old_parent = parent; 5890 } 5891 dput(old_parent); 5892 5893 return ret; 5894 } 5895 5896 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 5897 struct btrfs_inode *inode, 5898 struct dentry *parent, 5899 struct btrfs_log_ctx *ctx) 5900 { 5901 struct btrfs_root *root = inode->root; 5902 const u64 ino = btrfs_ino(inode); 5903 struct btrfs_path *path; 5904 struct btrfs_key search_key; 5905 int ret; 5906 5907 /* 5908 * For a single hard link case, go through a fast path that does not 5909 * need to iterate the fs/subvolume tree. 5910 */ 5911 if (inode->vfs_inode.i_nlink < 2) 5912 return log_new_ancestors_fast(trans, inode, parent, ctx); 5913 5914 path = btrfs_alloc_path(); 5915 if (!path) 5916 return -ENOMEM; 5917 5918 search_key.objectid = ino; 5919 search_key.type = BTRFS_INODE_REF_KEY; 5920 search_key.offset = 0; 5921 again: 5922 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 5923 if (ret < 0) 5924 goto out; 5925 if (ret == 0) 5926 path->slots[0]++; 5927 5928 while (true) { 5929 struct extent_buffer *leaf = path->nodes[0]; 5930 int slot = path->slots[0]; 5931 struct btrfs_key found_key; 5932 5933 if (slot >= btrfs_header_nritems(leaf)) { 5934 ret = btrfs_next_leaf(root, path); 5935 if (ret < 0) 5936 goto out; 5937 else if (ret > 0) 5938 break; 5939 continue; 5940 } 5941 5942 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5943 if (found_key.objectid != ino || 5944 found_key.type > BTRFS_INODE_EXTREF_KEY) 5945 break; 5946 5947 /* 5948 * Don't deal with extended references because they are rare 5949 * cases and too complex to deal with (we would need to keep 5950 * track of which subitem we are processing for each item in 5951 * this loop, etc). So just return some error to fallback to 5952 * a transaction commit. 5953 */ 5954 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 5955 ret = -EMLINK; 5956 goto out; 5957 } 5958 5959 /* 5960 * Logging ancestors needs to do more searches on the fs/subvol 5961 * tree, so it releases the path as needed to avoid deadlocks. 5962 * Keep track of the last inode ref key and resume from that key 5963 * after logging all new ancestors for the current hard link. 5964 */ 5965 memcpy(&search_key, &found_key, sizeof(search_key)); 5966 5967 ret = log_new_ancestors(trans, root, path, ctx); 5968 if (ret) 5969 goto out; 5970 btrfs_release_path(path); 5971 goto again; 5972 } 5973 ret = 0; 5974 out: 5975 btrfs_free_path(path); 5976 return ret; 5977 } 5978 5979 /* 5980 * helper function around btrfs_log_inode to make sure newly created 5981 * parent directories also end up in the log. 
static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct dentry *parent,
				 struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	const u64 ino = btrfs_ino(inode);
	struct btrfs_path *path;
	struct btrfs_key search_key;
	int ret;

	/*
	 * For the single hard link case, go through the fast path that does
	 * not need to iterate the fs/subvolume tree.
	 */
	if (inode->vfs_inode.i_nlink < 2)
		return log_new_ancestors_fast(trans, inode, parent, ctx);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = 0;
again:
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 0)
		path->slots[0]++;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key found_key;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != ino ||
		    found_key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		/*
		 * Don't deal with extended references because they are rare
		 * cases and too complex to deal with (we would need to keep
		 * track of which subitem we are processing for each item in
		 * this loop, etc). So just return some error to fallback to
		 * a transaction commit.
		 */
		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = -EMLINK;
			goto out;
		}

		/*
		 * Logging ancestors needs to do more searches on the fs/subvol
		 * tree, so it releases the path as needed to avoid deadlocks.
		 * Keep track of the last inode ref key and resume from that key
		 * after logging all new ancestors for the current hard link.
		 */
		memcpy(&search_key, &found_key, sizeof(search_key));

		ret = log_new_ancestors(trans, root, path, ctx);
		if (ret)
			goto out;
		btrfs_release_path(path);
		goto again;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log. A minimal inode and backref
 * only logging is done of any parent directories that are newer than
 * the last committed transaction
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  const loff_t start,
				  const loff_t end,
				  int inode_only,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct super_block *sb;
	int ret = 0;
	u64 last_committed = fs_info->last_trans_committed;
	bool log_dentries = false;

	sb = inode->vfs_inode.i_sb;

	if (btrfs_test_opt(fs_info, NOTREELOG)) {
		ret = 1;
		goto end_no_trans;
	}

	/*
	 * The previous transaction commit did not complete, so we have to do
	 * a full commit ourselves.
	 */
	if (fs_info->last_trans_log_full_commit >
	    fs_info->last_trans_committed) {
		ret = 1;
		goto end_no_trans;
	}

	if (btrfs_root_refs(&root->root_item) == 0) {
		ret = 1;
		goto end_no_trans;
	}

	ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
					 last_committed);
	if (ret)
		goto end_no_trans;

	/*
	 * Skip already logged inodes or inodes corresponding to tmpfiles
	 * (since logging them is pointless, a link count of 0 means they
	 * will never be accessible).
	 */
	if (btrfs_inode_in_log(inode, trans->transid) ||
	    inode->vfs_inode.i_nlink == 0) {
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
	if (ret)
		goto end_trans;

	/*
	 * For regular files, if the inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->vfs_inode.i_mode) &&
	    inode->generation <= last_committed &&
	    inode->last_unlink_trans <= last_committed) {
		ret = 0;
		goto end_trans;
	}

	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
		log_dentries = true;

	/*
	 * On unlink we must make sure all our current and old parent directory
	 * inodes are fully logged. This is to prevent leaving dangling
	 * directory index entries in directories that were our parents but are
	 * not anymore. Not doing this results in the old parent directory
	 * being impossible to delete after log replay (rmdir will always fail
	 * with error -ENOTEMPTY).
	 *
	 * Example 1:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * ln testdir/foo testdir/bar
	 * sync
	 * unlink testdir/bar
	 * xfs_io -c fsync testdir/foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * If we don't log the parent directory (testdir), after log replay the
	 * directory still has an entry pointing to the file inode using the
	 * bar name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist
	 * and the file inode has a link count of 1.
	 *
	 * Example 2:
	 *
	 * mkdir testdir
	 * touch foo
	 * ln foo testdir/foo2
	 * ln foo testdir/foo3
	 * sync
	 * unlink testdir/foo3
	 * xfs_io -c fsync foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * Similar to the first example, after log replay the parent directory
	 * testdir still has an entry pointing to the file inode using the foo3
	 * name, but the file inode does not have a matching
	 * BTRFS_INODE_REF_KEY item and has a link count of 2.
	 */
	if (inode->last_unlink_trans > last_committed) {
		ret = btrfs_log_all_parents(trans, inode, ctx);
		if (ret)
			goto end_trans;
	}

	ret = log_all_new_ancestors(trans, inode, parent, ctx);
	if (ret)
		goto end_trans;

	if (log_dentries)
		ret = log_new_dir_dentries(trans, root, inode, ctx);
	else
		ret = 0;
end_trans:
	if (ret < 0) {
		btrfs_set_log_full_commit(trans);
		ret = 1;
	}

	if (ret)
		btrfs_remove_log_ctx(root, ctx);
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}

/*
 * it is not safe to log dentry if the chunk root has added new
 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk. It can also return BTRFS_NO_LOG_SYNC, propagated from
 * btrfs_log_inode_parent(), when no log sync is needed at all.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct dentry *dentry,
			  const loff_t start,
			  const loff_t end,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
				     start, end, LOG_INODE_ALL, ctx);
	dput(parent);

	return ret;
}

/*
 * should be called during mount to recover and replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key tmp_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = LOG_WALK_PIN_ONLY,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
			"Failed to pin buffers while recovering log root tree.");
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

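		/*
		 * The key's offset is the id of the subvolume root this log
		 * tree belongs to: read the log tree here and look up that
		 * subvolume below as the destination to replay into.
		 */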
		log = btrfs_read_tree_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read tree log root.");
			goto error;
		}

		tmp_key.objectid = found_key.offset;
		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
		tmp_key.offset = (u64)-1;

		wc.replay_dest = btrfs_get_fs_root(fs_info, &tmp_key, true);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);

			/*
			 * We didn't find the subvol, likely because it was
			 * deleted. This is ok, simply skip this log and go to
			 * the next one.
			 *
			 * We need to exclude the root because we can't have
			 * other log replays overwriting this log as we'll read
			 * it back in a few more times. This will keep our
			 * block from being modified, and we'll just bail for
			 * each subsequent pass.
			 */
			if (ret == -ENOENT)
				ret = btrfs_pin_extent_for_log_replay(trans,
							log->node->start,
							log->node->len);
			btrfs_put_root(log);

			if (!ret)
				goto next;
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read target root for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			struct btrfs_root *root = wc.replay_dest;

			btrfs_release_path(path);

			/*
			 * We have just replayed everything, and the highest
			 * objectid of fs roots probably has changed in case
			 * some inode_item's got replayed.
			 *
			 * root->objectid_mutex is not acquired as log replay
			 * could only happen during mount.
			 */
			ret = btrfs_find_highest_objectid(root,
						&root->highest_objectid);
		}

		wc.replay_dest->log_root = NULL;
		btrfs_put_root(wc.replay_dest);
		btrfs_put_root(log);

		if (ret)
			goto error;
next:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	log_root_tree->log_root = NULL;
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_put_root(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans);
	btrfs_free_path(path);
	return ret;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
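 *
 * @dir:        the directory the inode is being unlinked from
 * @inode:      the inode being unlinked
 * @for_rename: non-zero when the unlink is part of a rename, in which case
 *              the directory's last_unlink_trans is updated as well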
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	mutex_lock(&inode->log_mutex);
	inode->last_unlink_trans = trans->transid;
	mutex_unlock(&inode->log_mutex);

	/*
	 * if this directory was already logged, any new
	 * names for this file/dir will get recorded
	 */
	if (dir->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (inode->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Make sure that if someone attempts to fsync the parent directory of a
 * deleted snapshot, it ends up triggering a transaction commit. This is to
 * guarantee that after replaying the log tree of the parent directory's root
 * we will not see the snapshot anymore, and at log replay time we will not
 * see any log tree corresponding to the deleted snapshot's root, which could
 * lead to replaying it after replaying the log tree of the parent directory
 * (which would replay the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and tree of tree roots trees, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 *
 * @ctx cannot be NULL when @sync_log is false, and should be NULL when it's
 * true (because it's not used).
 *
 * Return value depends on whether @sync_log is true or false.
 * When true:  returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
 *             committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
 *             otherwise.
 * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need
 *             to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the
 *             log, or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
 *             committed (without attempting to sync the log).
 */
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		       struct btrfs_inode *inode, struct btrfs_inode *old_dir,
		       struct dentry *parent,
		       bool sync_log, struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (!S_ISDIR(inode->vfs_inode.i_mode))
		inode->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming
	 * it from hasn't been logged, we don't need to log it
	 */
	if (inode->logged_trans <= fs_info->last_trans_committed &&
	    (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
		return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
			BTRFS_DONT_NEED_LOG_SYNC;

	if (sync_log) {
		struct btrfs_log_ctx ctx2;

		btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
		ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
					     LOG_INODE_EXISTS, &ctx2);
		if (ret == BTRFS_NO_LOG_SYNC)
			return BTRFS_DONT_NEED_TRANS_COMMIT;
		else if (ret)
			return BTRFS_NEED_TRANS_COMMIT;

		ret = btrfs_sync_log(trans, inode->root, &ctx2);
		if (ret)
			return BTRFS_NEED_TRANS_COMMIT;
		return BTRFS_DONT_NEED_TRANS_COMMIT;
	}

	ASSERT(ctx);
	ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
				     LOG_INODE_EXISTS, ctx);
	if (ret == BTRFS_NO_LOG_SYNC)
		return BTRFS_DONT_NEED_LOG_SYNC;
	else if (ret)
		return BTRFS_NEED_TRANS_COMMIT;

	return BTRFS_NEED_LOG_SYNC;
}
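
/*
 * Example of how a caller might drive btrfs_log_new_name() in the
 * sync_log == true mode. This is only a sketch: the surrounding rename
 * variable names are illustrative, not the exact fs/btrfs/inode.c code.
 *
 *	ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
 *				 new_dentry->d_parent, true, NULL);
 *	if (ret == BTRFS_NEED_TRANS_COMMIT)
 *		commit_transaction = true;
 */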