1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include <linux/slab.h> 21 #include <linux/blkdev.h> 22 #include <linux/list_sort.h> 23 #include "tree-log.h" 24 #include "disk-io.h" 25 #include "locking.h" 26 #include "print-tree.h" 27 #include "backref.h" 28 #include "hash.h" 29 #include "compression.h" 30 31 /* magic values for the inode_only field in btrfs_log_inode: 32 * 33 * LOG_INODE_ALL means to log everything 34 * LOG_INODE_EXISTS means to log just enough to recreate the inode 35 * during log replay 36 */ 37 #define LOG_INODE_ALL 0 38 #define LOG_INODE_EXISTS 1 39 40 /* 41 * directory trouble cases 42 * 43 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 44 * log, we must force a full commit before doing an fsync of the directory 45 * where the unlink was done. 46 * ---> record transid of last unlink/rename per directory 47 * 48 * mkdir foo/some_dir 49 * normal commit 50 * rename foo/some_dir foo2/some_dir 51 * mkdir foo/some_dir 52 * fsync foo/some_dir/some_file 53 * 54 * The fsync above will unlink the original some_dir without recording 55 * it in its new location (foo2). After a crash, some_dir will be gone 56 * unless the fsync of some_file forces a full commit 57 * 58 * 2) we must log any new names for any file or dir that is in the fsync 59 * log. ---> check inode while renaming/linking. 60 * 61 * 2a) we must log any new names for any file or dir during rename 62 * when the directory they are being removed from was logged. 63 * ---> check inode and old parent dir during rename 64 * 65 * 2a is actually the more important variant. With the extra logging 66 * a crash might unlink the old name without recreating the new one 67 * 68 * 3) after a crash, we must go through any directories with a link count 69 * of zero and redo the rm -rf 70 * 71 * mkdir f1/foo 72 * normal commit 73 * rm -rf f1/foo 74 * fsync(f1) 75 * 76 * The directory f1 was fully removed from the FS, but fsync was never 77 * called on f1, only its parent dir. After a crash the rm -rf must 78 * be replayed. This must be able to recurse down the entire 79 * directory tree. The inode link count fixup code takes care of the 80 * ugly details. 81 */ 82 83 /* 84 * stages for the tree walking. The first 85 * stage (0) is to only pin down the blocks we find 86 * the second stage (1) is to make sure that all the inodes 87 * we find in the log are created in the subvolume. 88 * 89 * The last stage is to deal with directories and links and extents 90 * and all the other fun semantics 91 */ 92 #define LOG_WALK_PIN_ONLY 0 93 #define LOG_WALK_REPLAY_INODES 1 94 #define LOG_WALK_REPLAY_DIR_INDEX 2 95 #define LOG_WALK_REPLAY_ALL 3 96 97 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 98 struct btrfs_root *root, struct inode *inode, 99 int inode_only, 100 const loff_t start, 101 const loff_t end, 102 struct btrfs_log_ctx *ctx); 103 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 104 struct btrfs_root *root, 105 struct btrfs_path *path, u64 objectid); 106 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 107 struct btrfs_root *root, 108 struct btrfs_root *log, 109 struct btrfs_path *path, 110 u64 dirid, int del_all); 111 112 /* 113 * tree logging is a special write ahead log used to make sure that 114 * fsyncs and O_SYNCs can happen without doing full tree commits. 115 * 116 * Full tree commits are expensive because they require commonly 117 * modified blocks to be recowed, creating many dirty pages in the 118 * extent tree an 4x-6x higher write load than ext3. 119 * 120 * Instead of doing a tree commit on every fsync, we use the 121 * key ranges and transaction ids to find items for a given file or directory 122 * that have changed in this transaction. Those items are copied into 123 * a special tree (one per subvolume root), that tree is written to disk 124 * and then the fsync is considered complete. 125 * 126 * After a crash, items are copied out of the log-tree back into the 127 * subvolume tree. Any file data extents found are recorded in the extent 128 * allocation tree, and the log-tree freed. 129 * 130 * The log tree is read three times, once to pin down all the extents it is 131 * using in ram and once, once to create all the inodes logged in the tree 132 * and once to do all the other items. 133 */ 134 135 /* 136 * start a sub transaction and setup the log tree 137 * this increments the log tree writer count to make the people 138 * syncing the tree wait for us to finish 139 */ 140 static int start_log_trans(struct btrfs_trans_handle *trans, 141 struct btrfs_root *root, 142 struct btrfs_log_ctx *ctx) 143 { 144 int ret = 0; 145 146 mutex_lock(&root->log_mutex); 147 148 if (root->log_root) { 149 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 150 ret = -EAGAIN; 151 goto out; 152 } 153 154 if (!root->log_start_pid) { 155 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 156 root->log_start_pid = current->pid; 157 } else if (root->log_start_pid != current->pid) { 158 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 159 } 160 } else { 161 mutex_lock(&root->fs_info->tree_log_mutex); 162 if (!root->fs_info->log_root_tree) 163 ret = btrfs_init_log_root_tree(trans, root->fs_info); 164 mutex_unlock(&root->fs_info->tree_log_mutex); 165 if (ret) 166 goto out; 167 168 ret = btrfs_add_log_tree(trans, root); 169 if (ret) 170 goto out; 171 172 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 173 root->log_start_pid = current->pid; 174 } 175 176 atomic_inc(&root->log_batch); 177 atomic_inc(&root->log_writers); 178 if (ctx) { 179 int index = root->log_transid % 2; 180 list_add_tail(&ctx->list, &root->log_ctxs[index]); 181 ctx->log_transid = root->log_transid; 182 } 183 184 out: 185 mutex_unlock(&root->log_mutex); 186 return ret; 187 } 188 189 /* 190 * returns 0 if there was a log transaction running and we were able 191 * to join, or returns -ENOENT if there were not transactions 192 * in progress 193 */ 194 static int join_running_log_trans(struct btrfs_root *root) 195 { 196 int ret = -ENOENT; 197 198 smp_mb(); 199 if (!root->log_root) 200 return -ENOENT; 201 202 mutex_lock(&root->log_mutex); 203 if (root->log_root) { 204 ret = 0; 205 atomic_inc(&root->log_writers); 206 } 207 mutex_unlock(&root->log_mutex); 208 return ret; 209 } 210 211 /* 212 * This either makes the current running log transaction wait 213 * until you call btrfs_end_log_trans() or it makes any future 214 * log transactions wait until you call btrfs_end_log_trans() 215 */ 216 int btrfs_pin_log_trans(struct btrfs_root *root) 217 { 218 int ret = -ENOENT; 219 220 mutex_lock(&root->log_mutex); 221 atomic_inc(&root->log_writers); 222 mutex_unlock(&root->log_mutex); 223 return ret; 224 } 225 226 /* 227 * indicate we're done making changes to the log tree 228 * and wake up anyone waiting to do a sync 229 */ 230 void btrfs_end_log_trans(struct btrfs_root *root) 231 { 232 if (atomic_dec_and_test(&root->log_writers)) { 233 /* 234 * Implicit memory barrier after atomic_dec_and_test 235 */ 236 if (waitqueue_active(&root->log_writer_wait)) 237 wake_up(&root->log_writer_wait); 238 } 239 } 240 241 242 /* 243 * the walk control struct is used to pass state down the chain when 244 * processing the log tree. The stage field tells us which part 245 * of the log tree processing we are currently doing. The others 246 * are state fields used for that specific part 247 */ 248 struct walk_control { 249 /* should we free the extent on disk when done? This is used 250 * at transaction commit time while freeing a log tree 251 */ 252 int free; 253 254 /* should we write out the extent buffer? This is used 255 * while flushing the log tree to disk during a sync 256 */ 257 int write; 258 259 /* should we wait for the extent buffer io to finish? Also used 260 * while flushing the log tree to disk for a sync 261 */ 262 int wait; 263 264 /* pin only walk, we record which extents on disk belong to the 265 * log trees 266 */ 267 int pin; 268 269 /* what stage of the replay code we're currently in */ 270 int stage; 271 272 /* the root we are currently replaying */ 273 struct btrfs_root *replay_dest; 274 275 /* the trans handle for the current replay */ 276 struct btrfs_trans_handle *trans; 277 278 /* the function that gets used to process blocks we find in the 279 * tree. Note the extent_buffer might not be up to date when it is 280 * passed in, and it must be checked or read if you need the data 281 * inside it 282 */ 283 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 284 struct walk_control *wc, u64 gen); 285 }; 286 287 /* 288 * process_func used to pin down extents, write them or wait on them 289 */ 290 static int process_one_buffer(struct btrfs_root *log, 291 struct extent_buffer *eb, 292 struct walk_control *wc, u64 gen) 293 { 294 int ret = 0; 295 296 /* 297 * If this fs is mixed then we need to be able to process the leaves to 298 * pin down any logged extents, so we have to read the block. 299 */ 300 if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) { 301 ret = btrfs_read_buffer(eb, gen); 302 if (ret) 303 return ret; 304 } 305 306 if (wc->pin) 307 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, 308 eb->start, eb->len); 309 310 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 311 if (wc->pin && btrfs_header_level(eb) == 0) 312 ret = btrfs_exclude_logged_extents(log, eb); 313 if (wc->write) 314 btrfs_write_tree_block(eb); 315 if (wc->wait) 316 btrfs_wait_tree_block_writeback(eb); 317 } 318 return ret; 319 } 320 321 /* 322 * Item overwrite used by replay and tree logging. eb, slot and key all refer 323 * to the src data we are copying out. 324 * 325 * root is the tree we are copying into, and path is a scratch 326 * path for use in this function (it should be released on entry and 327 * will be released on exit). 328 * 329 * If the key is already in the destination tree the existing item is 330 * overwritten. If the existing item isn't big enough, it is extended. 331 * If it is too large, it is truncated. 332 * 333 * If the key isn't in the destination yet, a new item is inserted. 334 */ 335 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 336 struct btrfs_root *root, 337 struct btrfs_path *path, 338 struct extent_buffer *eb, int slot, 339 struct btrfs_key *key) 340 { 341 int ret; 342 u32 item_size; 343 u64 saved_i_size = 0; 344 int save_old_i_size = 0; 345 unsigned long src_ptr; 346 unsigned long dst_ptr; 347 int overwrite_root = 0; 348 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 349 350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 351 overwrite_root = 1; 352 353 item_size = btrfs_item_size_nr(eb, slot); 354 src_ptr = btrfs_item_ptr_offset(eb, slot); 355 356 /* look for the key in the destination tree */ 357 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 358 if (ret < 0) 359 return ret; 360 361 if (ret == 0) { 362 char *src_copy; 363 char *dst_copy; 364 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 365 path->slots[0]); 366 if (dst_size != item_size) 367 goto insert; 368 369 if (item_size == 0) { 370 btrfs_release_path(path); 371 return 0; 372 } 373 dst_copy = kmalloc(item_size, GFP_NOFS); 374 src_copy = kmalloc(item_size, GFP_NOFS); 375 if (!dst_copy || !src_copy) { 376 btrfs_release_path(path); 377 kfree(dst_copy); 378 kfree(src_copy); 379 return -ENOMEM; 380 } 381 382 read_extent_buffer(eb, src_copy, src_ptr, item_size); 383 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 385 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 386 item_size); 387 ret = memcmp(dst_copy, src_copy, item_size); 388 389 kfree(dst_copy); 390 kfree(src_copy); 391 /* 392 * they have the same contents, just return, this saves 393 * us from cowing blocks in the destination tree and doing 394 * extra writes that may not have been done by a previous 395 * sync 396 */ 397 if (ret == 0) { 398 btrfs_release_path(path); 399 return 0; 400 } 401 402 /* 403 * We need to load the old nbytes into the inode so when we 404 * replay the extents we've logged we get the right nbytes. 405 */ 406 if (inode_item) { 407 struct btrfs_inode_item *item; 408 u64 nbytes; 409 u32 mode; 410 411 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 412 struct btrfs_inode_item); 413 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 414 item = btrfs_item_ptr(eb, slot, 415 struct btrfs_inode_item); 416 btrfs_set_inode_nbytes(eb, item, nbytes); 417 418 /* 419 * If this is a directory we need to reset the i_size to 420 * 0 so that we can set it up properly when replaying 421 * the rest of the items in this log. 422 */ 423 mode = btrfs_inode_mode(eb, item); 424 if (S_ISDIR(mode)) 425 btrfs_set_inode_size(eb, item, 0); 426 } 427 } else if (inode_item) { 428 struct btrfs_inode_item *item; 429 u32 mode; 430 431 /* 432 * New inode, set nbytes to 0 so that the nbytes comes out 433 * properly when we replay the extents. 434 */ 435 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 436 btrfs_set_inode_nbytes(eb, item, 0); 437 438 /* 439 * If this is a directory we need to reset the i_size to 0 so 440 * that we can set it up properly when replaying the rest of 441 * the items in this log. 442 */ 443 mode = btrfs_inode_mode(eb, item); 444 if (S_ISDIR(mode)) 445 btrfs_set_inode_size(eb, item, 0); 446 } 447 insert: 448 btrfs_release_path(path); 449 /* try to insert the key into the destination tree */ 450 path->skip_release_on_error = 1; 451 ret = btrfs_insert_empty_item(trans, root, path, 452 key, item_size); 453 path->skip_release_on_error = 0; 454 455 /* make sure any existing item is the correct size */ 456 if (ret == -EEXIST || ret == -EOVERFLOW) { 457 u32 found_size; 458 found_size = btrfs_item_size_nr(path->nodes[0], 459 path->slots[0]); 460 if (found_size > item_size) 461 btrfs_truncate_item(root, path, item_size, 1); 462 else if (found_size < item_size) 463 btrfs_extend_item(root, path, 464 item_size - found_size); 465 } else if (ret) { 466 return ret; 467 } 468 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 469 path->slots[0]); 470 471 /* don't overwrite an existing inode if the generation number 472 * was logged as zero. This is done when the tree logging code 473 * is just logging an inode to make sure it exists after recovery. 474 * 475 * Also, don't overwrite i_size on directories during replay. 476 * log replay inserts and removes directory items based on the 477 * state of the tree found in the subvolume, and i_size is modified 478 * as it goes 479 */ 480 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 481 struct btrfs_inode_item *src_item; 482 struct btrfs_inode_item *dst_item; 483 484 src_item = (struct btrfs_inode_item *)src_ptr; 485 dst_item = (struct btrfs_inode_item *)dst_ptr; 486 487 if (btrfs_inode_generation(eb, src_item) == 0) { 488 struct extent_buffer *dst_eb = path->nodes[0]; 489 const u64 ino_size = btrfs_inode_size(eb, src_item); 490 491 /* 492 * For regular files an ino_size == 0 is used only when 493 * logging that an inode exists, as part of a directory 494 * fsync, and the inode wasn't fsynced before. In this 495 * case don't set the size of the inode in the fs/subvol 496 * tree, otherwise we would be throwing valid data away. 497 */ 498 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 499 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 500 ino_size != 0) { 501 struct btrfs_map_token token; 502 503 btrfs_init_map_token(&token); 504 btrfs_set_token_inode_size(dst_eb, dst_item, 505 ino_size, &token); 506 } 507 goto no_copy; 508 } 509 510 if (overwrite_root && 511 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 512 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 513 save_old_i_size = 1; 514 saved_i_size = btrfs_inode_size(path->nodes[0], 515 dst_item); 516 } 517 } 518 519 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 520 src_ptr, item_size); 521 522 if (save_old_i_size) { 523 struct btrfs_inode_item *dst_item; 524 dst_item = (struct btrfs_inode_item *)dst_ptr; 525 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 526 } 527 528 /* make sure the generation is filled in */ 529 if (key->type == BTRFS_INODE_ITEM_KEY) { 530 struct btrfs_inode_item *dst_item; 531 dst_item = (struct btrfs_inode_item *)dst_ptr; 532 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 533 btrfs_set_inode_generation(path->nodes[0], dst_item, 534 trans->transid); 535 } 536 } 537 no_copy: 538 btrfs_mark_buffer_dirty(path->nodes[0]); 539 btrfs_release_path(path); 540 return 0; 541 } 542 543 /* 544 * simple helper to read an inode off the disk from a given root 545 * This can only be called for subvolume roots and not for the log 546 */ 547 static noinline struct inode *read_one_inode(struct btrfs_root *root, 548 u64 objectid) 549 { 550 struct btrfs_key key; 551 struct inode *inode; 552 553 key.objectid = objectid; 554 key.type = BTRFS_INODE_ITEM_KEY; 555 key.offset = 0; 556 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); 557 if (IS_ERR(inode)) { 558 inode = NULL; 559 } else if (is_bad_inode(inode)) { 560 iput(inode); 561 inode = NULL; 562 } 563 return inode; 564 } 565 566 /* replays a single extent in 'eb' at 'slot' with 'key' into the 567 * subvolume 'root'. path is released on entry and should be released 568 * on exit. 569 * 570 * extents in the log tree have not been allocated out of the extent 571 * tree yet. So, this completes the allocation, taking a reference 572 * as required if the extent already exists or creating a new extent 573 * if it isn't in the extent allocation tree yet. 574 * 575 * The extent is inserted into the file, dropping any existing extents 576 * from the file that overlap the new one. 577 */ 578 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 579 struct btrfs_root *root, 580 struct btrfs_path *path, 581 struct extent_buffer *eb, int slot, 582 struct btrfs_key *key) 583 { 584 int found_type; 585 u64 extent_end; 586 u64 start = key->offset; 587 u64 nbytes = 0; 588 struct btrfs_file_extent_item *item; 589 struct inode *inode = NULL; 590 unsigned long size; 591 int ret = 0; 592 593 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 594 found_type = btrfs_file_extent_type(eb, item); 595 596 if (found_type == BTRFS_FILE_EXTENT_REG || 597 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 598 nbytes = btrfs_file_extent_num_bytes(eb, item); 599 extent_end = start + nbytes; 600 601 /* 602 * We don't add to the inodes nbytes if we are prealloc or a 603 * hole. 604 */ 605 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 606 nbytes = 0; 607 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 608 size = btrfs_file_extent_inline_len(eb, slot, item); 609 nbytes = btrfs_file_extent_ram_bytes(eb, item); 610 extent_end = ALIGN(start + size, root->sectorsize); 611 } else { 612 ret = 0; 613 goto out; 614 } 615 616 inode = read_one_inode(root, key->objectid); 617 if (!inode) { 618 ret = -EIO; 619 goto out; 620 } 621 622 /* 623 * first check to see if we already have this extent in the 624 * file. This must be done before the btrfs_drop_extents run 625 * so we don't try to drop this extent. 626 */ 627 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), 628 start, 0); 629 630 if (ret == 0 && 631 (found_type == BTRFS_FILE_EXTENT_REG || 632 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 633 struct btrfs_file_extent_item cmp1; 634 struct btrfs_file_extent_item cmp2; 635 struct btrfs_file_extent_item *existing; 636 struct extent_buffer *leaf; 637 638 leaf = path->nodes[0]; 639 existing = btrfs_item_ptr(leaf, path->slots[0], 640 struct btrfs_file_extent_item); 641 642 read_extent_buffer(eb, &cmp1, (unsigned long)item, 643 sizeof(cmp1)); 644 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 645 sizeof(cmp2)); 646 647 /* 648 * we already have a pointer to this exact extent, 649 * we don't have to do anything 650 */ 651 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 652 btrfs_release_path(path); 653 goto out; 654 } 655 } 656 btrfs_release_path(path); 657 658 /* drop any overlapping extents */ 659 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); 660 if (ret) 661 goto out; 662 663 if (found_type == BTRFS_FILE_EXTENT_REG || 664 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 665 u64 offset; 666 unsigned long dest_offset; 667 struct btrfs_key ins; 668 669 ret = btrfs_insert_empty_item(trans, root, path, key, 670 sizeof(*item)); 671 if (ret) 672 goto out; 673 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 674 path->slots[0]); 675 copy_extent_buffer(path->nodes[0], eb, dest_offset, 676 (unsigned long)item, sizeof(*item)); 677 678 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 679 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 680 ins.type = BTRFS_EXTENT_ITEM_KEY; 681 offset = key->offset - btrfs_file_extent_offset(eb, item); 682 683 if (ins.objectid > 0) { 684 u64 csum_start; 685 u64 csum_end; 686 LIST_HEAD(ordered_sums); 687 /* 688 * is this extent already allocated in the extent 689 * allocation tree? If so, just add a reference 690 */ 691 ret = btrfs_lookup_data_extent(root, ins.objectid, 692 ins.offset); 693 if (ret == 0) { 694 ret = btrfs_inc_extent_ref(trans, root, 695 ins.objectid, ins.offset, 696 0, root->root_key.objectid, 697 key->objectid, offset); 698 if (ret) 699 goto out; 700 } else { 701 /* 702 * insert the extent pointer in the extent 703 * allocation tree 704 */ 705 ret = btrfs_alloc_logged_file_extent(trans, 706 root, root->root_key.objectid, 707 key->objectid, offset, &ins); 708 if (ret) 709 goto out; 710 } 711 btrfs_release_path(path); 712 713 if (btrfs_file_extent_compression(eb, item)) { 714 csum_start = ins.objectid; 715 csum_end = csum_start + ins.offset; 716 } else { 717 csum_start = ins.objectid + 718 btrfs_file_extent_offset(eb, item); 719 csum_end = csum_start + 720 btrfs_file_extent_num_bytes(eb, item); 721 } 722 723 ret = btrfs_lookup_csums_range(root->log_root, 724 csum_start, csum_end - 1, 725 &ordered_sums, 0); 726 if (ret) 727 goto out; 728 /* 729 * Now delete all existing cums in the csum root that 730 * cover our range. We do this because we can have an 731 * extent that is completely referenced by one file 732 * extent item and partially referenced by another 733 * file extent item (like after using the clone or 734 * extent_same ioctls). In this case if we end up doing 735 * the replay of the one that partially references the 736 * extent first, and we do not do the csum deletion 737 * below, we can get 2 csum items in the csum tree that 738 * overlap each other. For example, imagine our log has 739 * the two following file extent items: 740 * 741 * key (257 EXTENT_DATA 409600) 742 * extent data disk byte 12845056 nr 102400 743 * extent data offset 20480 nr 20480 ram 102400 744 * 745 * key (257 EXTENT_DATA 819200) 746 * extent data disk byte 12845056 nr 102400 747 * extent data offset 0 nr 102400 ram 102400 748 * 749 * Where the second one fully references the 100K extent 750 * that starts at disk byte 12845056, and the log tree 751 * has a single csum item that covers the entire range 752 * of the extent: 753 * 754 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 755 * 756 * After the first file extent item is replayed, the 757 * csum tree gets the following csum item: 758 * 759 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 760 * 761 * Which covers the 20K sub-range starting at offset 20K 762 * of our extent. Now when we replay the second file 763 * extent item, if we do not delete existing csum items 764 * that cover any of its blocks, we end up getting two 765 * csum items in our csum tree that overlap each other: 766 * 767 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 768 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 769 * 770 * Which is a problem, because after this anyone trying 771 * to lookup up for the checksum of any block of our 772 * extent starting at an offset of 40K or higher, will 773 * end up looking at the second csum item only, which 774 * does not contain the checksum for any block starting 775 * at offset 40K or higher of our extent. 776 */ 777 while (!list_empty(&ordered_sums)) { 778 struct btrfs_ordered_sum *sums; 779 sums = list_entry(ordered_sums.next, 780 struct btrfs_ordered_sum, 781 list); 782 if (!ret) 783 ret = btrfs_del_csums(trans, 784 root->fs_info->csum_root, 785 sums->bytenr, 786 sums->len); 787 if (!ret) 788 ret = btrfs_csum_file_blocks(trans, 789 root->fs_info->csum_root, 790 sums); 791 list_del(&sums->list); 792 kfree(sums); 793 } 794 if (ret) 795 goto out; 796 } else { 797 btrfs_release_path(path); 798 } 799 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 800 /* inline extents are easy, we just overwrite them */ 801 ret = overwrite_item(trans, root, path, eb, slot, key); 802 if (ret) 803 goto out; 804 } 805 806 inode_add_bytes(inode, nbytes); 807 ret = btrfs_update_inode(trans, root, inode); 808 out: 809 if (inode) 810 iput(inode); 811 return ret; 812 } 813 814 /* 815 * when cleaning up conflicts between the directory names in the 816 * subvolume, directory names in the log and directory names in the 817 * inode back references, we may have to unlink inodes from directories. 818 * 819 * This is a helper function to do the unlink of a specific directory 820 * item 821 */ 822 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 823 struct btrfs_root *root, 824 struct btrfs_path *path, 825 struct inode *dir, 826 struct btrfs_dir_item *di) 827 { 828 struct inode *inode; 829 char *name; 830 int name_len; 831 struct extent_buffer *leaf; 832 struct btrfs_key location; 833 int ret; 834 835 leaf = path->nodes[0]; 836 837 btrfs_dir_item_key_to_cpu(leaf, di, &location); 838 name_len = btrfs_dir_name_len(leaf, di); 839 name = kmalloc(name_len, GFP_NOFS); 840 if (!name) 841 return -ENOMEM; 842 843 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 844 btrfs_release_path(path); 845 846 inode = read_one_inode(root, location.objectid); 847 if (!inode) { 848 ret = -EIO; 849 goto out; 850 } 851 852 ret = link_to_fixup_dir(trans, root, path, location.objectid); 853 if (ret) 854 goto out; 855 856 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 857 if (ret) 858 goto out; 859 else 860 ret = btrfs_run_delayed_items(trans, root); 861 out: 862 kfree(name); 863 iput(inode); 864 return ret; 865 } 866 867 /* 868 * helper function to see if a given name and sequence number found 869 * in an inode back reference are already in a directory and correctly 870 * point to this inode 871 */ 872 static noinline int inode_in_dir(struct btrfs_root *root, 873 struct btrfs_path *path, 874 u64 dirid, u64 objectid, u64 index, 875 const char *name, int name_len) 876 { 877 struct btrfs_dir_item *di; 878 struct btrfs_key location; 879 int match = 0; 880 881 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 882 index, name, name_len, 0); 883 if (di && !IS_ERR(di)) { 884 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 885 if (location.objectid != objectid) 886 goto out; 887 } else 888 goto out; 889 btrfs_release_path(path); 890 891 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 892 if (di && !IS_ERR(di)) { 893 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 894 if (location.objectid != objectid) 895 goto out; 896 } else 897 goto out; 898 match = 1; 899 out: 900 btrfs_release_path(path); 901 return match; 902 } 903 904 /* 905 * helper function to check a log tree for a named back reference in 906 * an inode. This is used to decide if a back reference that is 907 * found in the subvolume conflicts with what we find in the log. 908 * 909 * inode backreferences may have multiple refs in a single item, 910 * during replay we process one reference at a time, and we don't 911 * want to delete valid links to a file from the subvolume if that 912 * link is also in the log. 913 */ 914 static noinline int backref_in_log(struct btrfs_root *log, 915 struct btrfs_key *key, 916 u64 ref_objectid, 917 const char *name, int namelen) 918 { 919 struct btrfs_path *path; 920 struct btrfs_inode_ref *ref; 921 unsigned long ptr; 922 unsigned long ptr_end; 923 unsigned long name_ptr; 924 int found_name_len; 925 int item_size; 926 int ret; 927 int match = 0; 928 929 path = btrfs_alloc_path(); 930 if (!path) 931 return -ENOMEM; 932 933 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 934 if (ret != 0) 935 goto out; 936 937 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 938 939 if (key->type == BTRFS_INODE_EXTREF_KEY) { 940 if (btrfs_find_name_in_ext_backref(path, ref_objectid, 941 name, namelen, NULL)) 942 match = 1; 943 944 goto out; 945 } 946 947 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 948 ptr_end = ptr + item_size; 949 while (ptr < ptr_end) { 950 ref = (struct btrfs_inode_ref *)ptr; 951 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 952 if (found_name_len == namelen) { 953 name_ptr = (unsigned long)(ref + 1); 954 ret = memcmp_extent_buffer(path->nodes[0], name, 955 name_ptr, namelen); 956 if (ret == 0) { 957 match = 1; 958 goto out; 959 } 960 } 961 ptr = (unsigned long)(ref + 1) + found_name_len; 962 } 963 out: 964 btrfs_free_path(path); 965 return match; 966 } 967 968 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 969 struct btrfs_root *root, 970 struct btrfs_path *path, 971 struct btrfs_root *log_root, 972 struct inode *dir, struct inode *inode, 973 struct extent_buffer *eb, 974 u64 inode_objectid, u64 parent_objectid, 975 u64 ref_index, char *name, int namelen, 976 int *search_done) 977 { 978 int ret; 979 char *victim_name; 980 int victim_name_len; 981 struct extent_buffer *leaf; 982 struct btrfs_dir_item *di; 983 struct btrfs_key search_key; 984 struct btrfs_inode_extref *extref; 985 986 again: 987 /* Search old style refs */ 988 search_key.objectid = inode_objectid; 989 search_key.type = BTRFS_INODE_REF_KEY; 990 search_key.offset = parent_objectid; 991 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 992 if (ret == 0) { 993 struct btrfs_inode_ref *victim_ref; 994 unsigned long ptr; 995 unsigned long ptr_end; 996 997 leaf = path->nodes[0]; 998 999 /* are we trying to overwrite a back ref for the root directory 1000 * if so, just jump out, we're done 1001 */ 1002 if (search_key.objectid == search_key.offset) 1003 return 1; 1004 1005 /* check all the names in this back reference to see 1006 * if they are in the log. if so, we allow them to stay 1007 * otherwise they must be unlinked as a conflict 1008 */ 1009 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1010 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 1011 while (ptr < ptr_end) { 1012 victim_ref = (struct btrfs_inode_ref *)ptr; 1013 victim_name_len = btrfs_inode_ref_name_len(leaf, 1014 victim_ref); 1015 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1016 if (!victim_name) 1017 return -ENOMEM; 1018 1019 read_extent_buffer(leaf, victim_name, 1020 (unsigned long)(victim_ref + 1), 1021 victim_name_len); 1022 1023 if (!backref_in_log(log_root, &search_key, 1024 parent_objectid, 1025 victim_name, 1026 victim_name_len)) { 1027 inc_nlink(inode); 1028 btrfs_release_path(path); 1029 1030 ret = btrfs_unlink_inode(trans, root, dir, 1031 inode, victim_name, 1032 victim_name_len); 1033 kfree(victim_name); 1034 if (ret) 1035 return ret; 1036 ret = btrfs_run_delayed_items(trans, root); 1037 if (ret) 1038 return ret; 1039 *search_done = 1; 1040 goto again; 1041 } 1042 kfree(victim_name); 1043 1044 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 1045 } 1046 1047 /* 1048 * NOTE: we have searched root tree and checked the 1049 * corresponding ref, it does not need to check again. 1050 */ 1051 *search_done = 1; 1052 } 1053 btrfs_release_path(path); 1054 1055 /* Same search but for extended refs */ 1056 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 1057 inode_objectid, parent_objectid, 0, 1058 0); 1059 if (!IS_ERR_OR_NULL(extref)) { 1060 u32 item_size; 1061 u32 cur_offset = 0; 1062 unsigned long base; 1063 struct inode *victim_parent; 1064 1065 leaf = path->nodes[0]; 1066 1067 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1068 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1069 1070 while (cur_offset < item_size) { 1071 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1072 1073 victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 1074 1075 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1076 goto next; 1077 1078 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1079 if (!victim_name) 1080 return -ENOMEM; 1081 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 1082 victim_name_len); 1083 1084 search_key.objectid = inode_objectid; 1085 search_key.type = BTRFS_INODE_EXTREF_KEY; 1086 search_key.offset = btrfs_extref_hash(parent_objectid, 1087 victim_name, 1088 victim_name_len); 1089 ret = 0; 1090 if (!backref_in_log(log_root, &search_key, 1091 parent_objectid, victim_name, 1092 victim_name_len)) { 1093 ret = -ENOENT; 1094 victim_parent = read_one_inode(root, 1095 parent_objectid); 1096 if (victim_parent) { 1097 inc_nlink(inode); 1098 btrfs_release_path(path); 1099 1100 ret = btrfs_unlink_inode(trans, root, 1101 victim_parent, 1102 inode, 1103 victim_name, 1104 victim_name_len); 1105 if (!ret) 1106 ret = btrfs_run_delayed_items( 1107 trans, root); 1108 } 1109 iput(victim_parent); 1110 kfree(victim_name); 1111 if (ret) 1112 return ret; 1113 *search_done = 1; 1114 goto again; 1115 } 1116 kfree(victim_name); 1117 if (ret) 1118 return ret; 1119 next: 1120 cur_offset += victim_name_len + sizeof(*extref); 1121 } 1122 *search_done = 1; 1123 } 1124 btrfs_release_path(path); 1125 1126 /* look for a conflicting sequence number */ 1127 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1128 ref_index, name, namelen, 0); 1129 if (di && !IS_ERR(di)) { 1130 ret = drop_one_dir_item(trans, root, path, dir, di); 1131 if (ret) 1132 return ret; 1133 } 1134 btrfs_release_path(path); 1135 1136 /* look for a conflicing name */ 1137 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), 1138 name, namelen, 0); 1139 if (di && !IS_ERR(di)) { 1140 ret = drop_one_dir_item(trans, root, path, dir, di); 1141 if (ret) 1142 return ret; 1143 } 1144 btrfs_release_path(path); 1145 1146 return 0; 1147 } 1148 1149 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1150 u32 *namelen, char **name, u64 *index, 1151 u64 *parent_objectid) 1152 { 1153 struct btrfs_inode_extref *extref; 1154 1155 extref = (struct btrfs_inode_extref *)ref_ptr; 1156 1157 *namelen = btrfs_inode_extref_name_len(eb, extref); 1158 *name = kmalloc(*namelen, GFP_NOFS); 1159 if (*name == NULL) 1160 return -ENOMEM; 1161 1162 read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1163 *namelen); 1164 1165 *index = btrfs_inode_extref_index(eb, extref); 1166 if (parent_objectid) 1167 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1168 1169 return 0; 1170 } 1171 1172 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1173 u32 *namelen, char **name, u64 *index) 1174 { 1175 struct btrfs_inode_ref *ref; 1176 1177 ref = (struct btrfs_inode_ref *)ref_ptr; 1178 1179 *namelen = btrfs_inode_ref_name_len(eb, ref); 1180 *name = kmalloc(*namelen, GFP_NOFS); 1181 if (*name == NULL) 1182 return -ENOMEM; 1183 1184 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1185 1186 *index = btrfs_inode_ref_index(eb, ref); 1187 1188 return 0; 1189 } 1190 1191 /* 1192 * replay one inode back reference item found in the log tree. 1193 * eb, slot and key refer to the buffer and key found in the log tree. 1194 * root is the destination we are replaying into, and path is for temp 1195 * use by this function. (it should be released on return). 1196 */ 1197 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1198 struct btrfs_root *root, 1199 struct btrfs_root *log, 1200 struct btrfs_path *path, 1201 struct extent_buffer *eb, int slot, 1202 struct btrfs_key *key) 1203 { 1204 struct inode *dir = NULL; 1205 struct inode *inode = NULL; 1206 unsigned long ref_ptr; 1207 unsigned long ref_end; 1208 char *name = NULL; 1209 int namelen; 1210 int ret; 1211 int search_done = 0; 1212 int log_ref_ver = 0; 1213 u64 parent_objectid; 1214 u64 inode_objectid; 1215 u64 ref_index = 0; 1216 int ref_struct_size; 1217 1218 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1219 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 1220 1221 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1222 struct btrfs_inode_extref *r; 1223 1224 ref_struct_size = sizeof(struct btrfs_inode_extref); 1225 log_ref_ver = 1; 1226 r = (struct btrfs_inode_extref *)ref_ptr; 1227 parent_objectid = btrfs_inode_extref_parent(eb, r); 1228 } else { 1229 ref_struct_size = sizeof(struct btrfs_inode_ref); 1230 parent_objectid = key->offset; 1231 } 1232 inode_objectid = key->objectid; 1233 1234 /* 1235 * it is possible that we didn't log all the parent directories 1236 * for a given inode. If we don't find the dir, just don't 1237 * copy the back ref in. The link count fixup code will take 1238 * care of the rest 1239 */ 1240 dir = read_one_inode(root, parent_objectid); 1241 if (!dir) { 1242 ret = -ENOENT; 1243 goto out; 1244 } 1245 1246 inode = read_one_inode(root, inode_objectid); 1247 if (!inode) { 1248 ret = -EIO; 1249 goto out; 1250 } 1251 1252 while (ref_ptr < ref_end) { 1253 if (log_ref_ver) { 1254 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1255 &ref_index, &parent_objectid); 1256 /* 1257 * parent object can change from one array 1258 * item to another. 1259 */ 1260 if (!dir) 1261 dir = read_one_inode(root, parent_objectid); 1262 if (!dir) { 1263 ret = -ENOENT; 1264 goto out; 1265 } 1266 } else { 1267 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1268 &ref_index); 1269 } 1270 if (ret) 1271 goto out; 1272 1273 /* if we already have a perfect match, we're done */ 1274 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), 1275 ref_index, name, namelen)) { 1276 /* 1277 * look for a conflicting back reference in the 1278 * metadata. if we find one we have to unlink that name 1279 * of the file before we add our new link. Later on, we 1280 * overwrite any existing back reference, and we don't 1281 * want to create dangling pointers in the directory. 1282 */ 1283 1284 if (!search_done) { 1285 ret = __add_inode_ref(trans, root, path, log, 1286 dir, inode, eb, 1287 inode_objectid, 1288 parent_objectid, 1289 ref_index, name, namelen, 1290 &search_done); 1291 if (ret) { 1292 if (ret == 1) 1293 ret = 0; 1294 goto out; 1295 } 1296 } 1297 1298 /* insert our name */ 1299 ret = btrfs_add_link(trans, dir, inode, name, namelen, 1300 0, ref_index); 1301 if (ret) 1302 goto out; 1303 1304 btrfs_update_inode(trans, root, inode); 1305 } 1306 1307 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1308 kfree(name); 1309 name = NULL; 1310 if (log_ref_ver) { 1311 iput(dir); 1312 dir = NULL; 1313 } 1314 } 1315 1316 /* finally write the back reference in the inode */ 1317 ret = overwrite_item(trans, root, path, eb, slot, key); 1318 out: 1319 btrfs_release_path(path); 1320 kfree(name); 1321 iput(dir); 1322 iput(inode); 1323 return ret; 1324 } 1325 1326 static int insert_orphan_item(struct btrfs_trans_handle *trans, 1327 struct btrfs_root *root, u64 ino) 1328 { 1329 int ret; 1330 1331 ret = btrfs_insert_orphan_item(trans, root, ino); 1332 if (ret == -EEXIST) 1333 ret = 0; 1334 1335 return ret; 1336 } 1337 1338 static int count_inode_extrefs(struct btrfs_root *root, 1339 struct inode *inode, struct btrfs_path *path) 1340 { 1341 int ret = 0; 1342 int name_len; 1343 unsigned int nlink = 0; 1344 u32 item_size; 1345 u32 cur_offset = 0; 1346 u64 inode_objectid = btrfs_ino(inode); 1347 u64 offset = 0; 1348 unsigned long ptr; 1349 struct btrfs_inode_extref *extref; 1350 struct extent_buffer *leaf; 1351 1352 while (1) { 1353 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1354 &extref, &offset); 1355 if (ret) 1356 break; 1357 1358 leaf = path->nodes[0]; 1359 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1360 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1361 cur_offset = 0; 1362 1363 while (cur_offset < item_size) { 1364 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1365 name_len = btrfs_inode_extref_name_len(leaf, extref); 1366 1367 nlink++; 1368 1369 cur_offset += name_len + sizeof(*extref); 1370 } 1371 1372 offset++; 1373 btrfs_release_path(path); 1374 } 1375 btrfs_release_path(path); 1376 1377 if (ret < 0 && ret != -ENOENT) 1378 return ret; 1379 return nlink; 1380 } 1381 1382 static int count_inode_refs(struct btrfs_root *root, 1383 struct inode *inode, struct btrfs_path *path) 1384 { 1385 int ret; 1386 struct btrfs_key key; 1387 unsigned int nlink = 0; 1388 unsigned long ptr; 1389 unsigned long ptr_end; 1390 int name_len; 1391 u64 ino = btrfs_ino(inode); 1392 1393 key.objectid = ino; 1394 key.type = BTRFS_INODE_REF_KEY; 1395 key.offset = (u64)-1; 1396 1397 while (1) { 1398 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1399 if (ret < 0) 1400 break; 1401 if (ret > 0) { 1402 if (path->slots[0] == 0) 1403 break; 1404 path->slots[0]--; 1405 } 1406 process_slot: 1407 btrfs_item_key_to_cpu(path->nodes[0], &key, 1408 path->slots[0]); 1409 if (key.objectid != ino || 1410 key.type != BTRFS_INODE_REF_KEY) 1411 break; 1412 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1413 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 1414 path->slots[0]); 1415 while (ptr < ptr_end) { 1416 struct btrfs_inode_ref *ref; 1417 1418 ref = (struct btrfs_inode_ref *)ptr; 1419 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1420 ref); 1421 ptr = (unsigned long)(ref + 1) + name_len; 1422 nlink++; 1423 } 1424 1425 if (key.offset == 0) 1426 break; 1427 if (path->slots[0] > 0) { 1428 path->slots[0]--; 1429 goto process_slot; 1430 } 1431 key.offset--; 1432 btrfs_release_path(path); 1433 } 1434 btrfs_release_path(path); 1435 1436 return nlink; 1437 } 1438 1439 /* 1440 * There are a few corners where the link count of the file can't 1441 * be properly maintained during replay. So, instead of adding 1442 * lots of complexity to the log code, we just scan the backrefs 1443 * for any file that has been through replay. 1444 * 1445 * The scan will update the link count on the inode to reflect the 1446 * number of back refs found. If it goes down to zero, the iput 1447 * will free the inode. 1448 */ 1449 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1450 struct btrfs_root *root, 1451 struct inode *inode) 1452 { 1453 struct btrfs_path *path; 1454 int ret; 1455 u64 nlink = 0; 1456 u64 ino = btrfs_ino(inode); 1457 1458 path = btrfs_alloc_path(); 1459 if (!path) 1460 return -ENOMEM; 1461 1462 ret = count_inode_refs(root, inode, path); 1463 if (ret < 0) 1464 goto out; 1465 1466 nlink = ret; 1467 1468 ret = count_inode_extrefs(root, inode, path); 1469 if (ret < 0) 1470 goto out; 1471 1472 nlink += ret; 1473 1474 ret = 0; 1475 1476 if (nlink != inode->i_nlink) { 1477 set_nlink(inode, nlink); 1478 btrfs_update_inode(trans, root, inode); 1479 } 1480 BTRFS_I(inode)->index_cnt = (u64)-1; 1481 1482 if (inode->i_nlink == 0) { 1483 if (S_ISDIR(inode->i_mode)) { 1484 ret = replay_dir_deletes(trans, root, NULL, path, 1485 ino, 1); 1486 if (ret) 1487 goto out; 1488 } 1489 ret = insert_orphan_item(trans, root, ino); 1490 } 1491 1492 out: 1493 btrfs_free_path(path); 1494 return ret; 1495 } 1496 1497 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1498 struct btrfs_root *root, 1499 struct btrfs_path *path) 1500 { 1501 int ret; 1502 struct btrfs_key key; 1503 struct inode *inode; 1504 1505 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1506 key.type = BTRFS_ORPHAN_ITEM_KEY; 1507 key.offset = (u64)-1; 1508 while (1) { 1509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1510 if (ret < 0) 1511 break; 1512 1513 if (ret == 1) { 1514 if (path->slots[0] == 0) 1515 break; 1516 path->slots[0]--; 1517 } 1518 1519 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1520 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1521 key.type != BTRFS_ORPHAN_ITEM_KEY) 1522 break; 1523 1524 ret = btrfs_del_item(trans, root, path); 1525 if (ret) 1526 goto out; 1527 1528 btrfs_release_path(path); 1529 inode = read_one_inode(root, key.offset); 1530 if (!inode) 1531 return -EIO; 1532 1533 ret = fixup_inode_link_count(trans, root, inode); 1534 iput(inode); 1535 if (ret) 1536 goto out; 1537 1538 /* 1539 * fixup on a directory may create new entries, 1540 * make sure we always look for the highset possible 1541 * offset 1542 */ 1543 key.offset = (u64)-1; 1544 } 1545 ret = 0; 1546 out: 1547 btrfs_release_path(path); 1548 return ret; 1549 } 1550 1551 1552 /* 1553 * record a given inode in the fixup dir so we can check its link 1554 * count when replay is done. The link count is incremented here 1555 * so the inode won't go away until we check it 1556 */ 1557 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1558 struct btrfs_root *root, 1559 struct btrfs_path *path, 1560 u64 objectid) 1561 { 1562 struct btrfs_key key; 1563 int ret = 0; 1564 struct inode *inode; 1565 1566 inode = read_one_inode(root, objectid); 1567 if (!inode) 1568 return -EIO; 1569 1570 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1571 key.type = BTRFS_ORPHAN_ITEM_KEY; 1572 key.offset = objectid; 1573 1574 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1575 1576 btrfs_release_path(path); 1577 if (ret == 0) { 1578 if (!inode->i_nlink) 1579 set_nlink(inode, 1); 1580 else 1581 inc_nlink(inode); 1582 ret = btrfs_update_inode(trans, root, inode); 1583 } else if (ret == -EEXIST) { 1584 ret = 0; 1585 } else { 1586 BUG(); /* Logic Error */ 1587 } 1588 iput(inode); 1589 1590 return ret; 1591 } 1592 1593 /* 1594 * when replaying the log for a directory, we only insert names 1595 * for inodes that actually exist. This means an fsync on a directory 1596 * does not implicitly fsync all the new files in it 1597 */ 1598 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1599 struct btrfs_root *root, 1600 u64 dirid, u64 index, 1601 char *name, int name_len, 1602 struct btrfs_key *location) 1603 { 1604 struct inode *inode; 1605 struct inode *dir; 1606 int ret; 1607 1608 inode = read_one_inode(root, location->objectid); 1609 if (!inode) 1610 return -ENOENT; 1611 1612 dir = read_one_inode(root, dirid); 1613 if (!dir) { 1614 iput(inode); 1615 return -EIO; 1616 } 1617 1618 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1619 1620 /* FIXME, put inode into FIXUP list */ 1621 1622 iput(inode); 1623 iput(dir); 1624 return ret; 1625 } 1626 1627 /* 1628 * Return true if an inode reference exists in the log for the given name, 1629 * inode and parent inode. 1630 */ 1631 static bool name_in_log_ref(struct btrfs_root *log_root, 1632 const char *name, const int name_len, 1633 const u64 dirid, const u64 ino) 1634 { 1635 struct btrfs_key search_key; 1636 1637 search_key.objectid = ino; 1638 search_key.type = BTRFS_INODE_REF_KEY; 1639 search_key.offset = dirid; 1640 if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1641 return true; 1642 1643 search_key.type = BTRFS_INODE_EXTREF_KEY; 1644 search_key.offset = btrfs_extref_hash(dirid, name, name_len); 1645 if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1646 return true; 1647 1648 return false; 1649 } 1650 1651 /* 1652 * take a single entry in a log directory item and replay it into 1653 * the subvolume. 1654 * 1655 * if a conflicting item exists in the subdirectory already, 1656 * the inode it points to is unlinked and put into the link count 1657 * fix up tree. 1658 * 1659 * If a name from the log points to a file or directory that does 1660 * not exist in the FS, it is skipped. fsyncs on directories 1661 * do not force down inodes inside that directory, just changes to the 1662 * names or unlinks in a directory. 1663 * 1664 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1665 * non-existing inode) and 1 if the name was replayed. 1666 */ 1667 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1668 struct btrfs_root *root, 1669 struct btrfs_path *path, 1670 struct extent_buffer *eb, 1671 struct btrfs_dir_item *di, 1672 struct btrfs_key *key) 1673 { 1674 char *name; 1675 int name_len; 1676 struct btrfs_dir_item *dst_di; 1677 struct btrfs_key found_key; 1678 struct btrfs_key log_key; 1679 struct inode *dir; 1680 u8 log_type; 1681 int exists; 1682 int ret = 0; 1683 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); 1684 bool name_added = false; 1685 1686 dir = read_one_inode(root, key->objectid); 1687 if (!dir) 1688 return -EIO; 1689 1690 name_len = btrfs_dir_name_len(eb, di); 1691 name = kmalloc(name_len, GFP_NOFS); 1692 if (!name) { 1693 ret = -ENOMEM; 1694 goto out; 1695 } 1696 1697 log_type = btrfs_dir_type(eb, di); 1698 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1699 name_len); 1700 1701 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1702 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1703 if (exists == 0) 1704 exists = 1; 1705 else 1706 exists = 0; 1707 btrfs_release_path(path); 1708 1709 if (key->type == BTRFS_DIR_ITEM_KEY) { 1710 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1711 name, name_len, 1); 1712 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1713 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1714 key->objectid, 1715 key->offset, name, 1716 name_len, 1); 1717 } else { 1718 /* Corruption */ 1719 ret = -EINVAL; 1720 goto out; 1721 } 1722 if (IS_ERR_OR_NULL(dst_di)) { 1723 /* we need a sequence number to insert, so we only 1724 * do inserts for the BTRFS_DIR_INDEX_KEY types 1725 */ 1726 if (key->type != BTRFS_DIR_INDEX_KEY) 1727 goto out; 1728 goto insert; 1729 } 1730 1731 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1732 /* the existing item matches the logged item */ 1733 if (found_key.objectid == log_key.objectid && 1734 found_key.type == log_key.type && 1735 found_key.offset == log_key.offset && 1736 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1737 update_size = false; 1738 goto out; 1739 } 1740 1741 /* 1742 * don't drop the conflicting directory entry if the inode 1743 * for the new entry doesn't exist 1744 */ 1745 if (!exists) 1746 goto out; 1747 1748 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1749 if (ret) 1750 goto out; 1751 1752 if (key->type == BTRFS_DIR_INDEX_KEY) 1753 goto insert; 1754 out: 1755 btrfs_release_path(path); 1756 if (!ret && update_size) { 1757 btrfs_i_size_write(dir, dir->i_size + name_len * 2); 1758 ret = btrfs_update_inode(trans, root, dir); 1759 } 1760 kfree(name); 1761 iput(dir); 1762 if (!ret && name_added) 1763 ret = 1; 1764 return ret; 1765 1766 insert: 1767 if (name_in_log_ref(root->log_root, name, name_len, 1768 key->objectid, log_key.objectid)) { 1769 /* The dentry will be added later. */ 1770 ret = 0; 1771 update_size = false; 1772 goto out; 1773 } 1774 btrfs_release_path(path); 1775 ret = insert_one_name(trans, root, key->objectid, key->offset, 1776 name, name_len, &log_key); 1777 if (ret && ret != -ENOENT && ret != -EEXIST) 1778 goto out; 1779 if (!ret) 1780 name_added = true; 1781 update_size = false; 1782 ret = 0; 1783 goto out; 1784 } 1785 1786 /* 1787 * find all the names in a directory item and reconcile them into 1788 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1789 * one name in a directory item, but the same code gets used for 1790 * both directory index types 1791 */ 1792 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1793 struct btrfs_root *root, 1794 struct btrfs_path *path, 1795 struct extent_buffer *eb, int slot, 1796 struct btrfs_key *key) 1797 { 1798 int ret = 0; 1799 u32 item_size = btrfs_item_size_nr(eb, slot); 1800 struct btrfs_dir_item *di; 1801 int name_len; 1802 unsigned long ptr; 1803 unsigned long ptr_end; 1804 struct btrfs_path *fixup_path = NULL; 1805 1806 ptr = btrfs_item_ptr_offset(eb, slot); 1807 ptr_end = ptr + item_size; 1808 while (ptr < ptr_end) { 1809 di = (struct btrfs_dir_item *)ptr; 1810 if (verify_dir_item(root, eb, di)) 1811 return -EIO; 1812 name_len = btrfs_dir_name_len(eb, di); 1813 ret = replay_one_name(trans, root, path, eb, di, key); 1814 if (ret < 0) 1815 break; 1816 ptr = (unsigned long)(di + 1); 1817 ptr += name_len; 1818 1819 /* 1820 * If this entry refers to a non-directory (directories can not 1821 * have a link count > 1) and it was added in the transaction 1822 * that was not committed, make sure we fixup the link count of 1823 * the inode it the entry points to. Otherwise something like 1824 * the following would result in a directory pointing to an 1825 * inode with a wrong link that does not account for this dir 1826 * entry: 1827 * 1828 * mkdir testdir 1829 * touch testdir/foo 1830 * touch testdir/bar 1831 * sync 1832 * 1833 * ln testdir/bar testdir/bar_link 1834 * ln testdir/foo testdir/foo_link 1835 * xfs_io -c "fsync" testdir/bar 1836 * 1837 * <power failure> 1838 * 1839 * mount fs, log replay happens 1840 * 1841 * File foo would remain with a link count of 1 when it has two 1842 * entries pointing to it in the directory testdir. This would 1843 * make it impossible to ever delete the parent directory has 1844 * it would result in stale dentries that can never be deleted. 1845 */ 1846 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 1847 struct btrfs_key di_key; 1848 1849 if (!fixup_path) { 1850 fixup_path = btrfs_alloc_path(); 1851 if (!fixup_path) { 1852 ret = -ENOMEM; 1853 break; 1854 } 1855 } 1856 1857 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 1858 ret = link_to_fixup_dir(trans, root, fixup_path, 1859 di_key.objectid); 1860 if (ret) 1861 break; 1862 } 1863 ret = 0; 1864 } 1865 btrfs_free_path(fixup_path); 1866 return ret; 1867 } 1868 1869 /* 1870 * directory replay has two parts. There are the standard directory 1871 * items in the log copied from the subvolume, and range items 1872 * created in the log while the subvolume was logged. 1873 * 1874 * The range items tell us which parts of the key space the log 1875 * is authoritative for. During replay, if a key in the subvolume 1876 * directory is in a logged range item, but not actually in the log 1877 * that means it was deleted from the directory before the fsync 1878 * and should be removed. 1879 */ 1880 static noinline int find_dir_range(struct btrfs_root *root, 1881 struct btrfs_path *path, 1882 u64 dirid, int key_type, 1883 u64 *start_ret, u64 *end_ret) 1884 { 1885 struct btrfs_key key; 1886 u64 found_end; 1887 struct btrfs_dir_log_item *item; 1888 int ret; 1889 int nritems; 1890 1891 if (*start_ret == (u64)-1) 1892 return 1; 1893 1894 key.objectid = dirid; 1895 key.type = key_type; 1896 key.offset = *start_ret; 1897 1898 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1899 if (ret < 0) 1900 goto out; 1901 if (ret > 0) { 1902 if (path->slots[0] == 0) 1903 goto out; 1904 path->slots[0]--; 1905 } 1906 if (ret != 0) 1907 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1908 1909 if (key.type != key_type || key.objectid != dirid) { 1910 ret = 1; 1911 goto next; 1912 } 1913 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1914 struct btrfs_dir_log_item); 1915 found_end = btrfs_dir_log_end(path->nodes[0], item); 1916 1917 if (*start_ret >= key.offset && *start_ret <= found_end) { 1918 ret = 0; 1919 *start_ret = key.offset; 1920 *end_ret = found_end; 1921 goto out; 1922 } 1923 ret = 1; 1924 next: 1925 /* check the next slot in the tree to see if it is a valid item */ 1926 nritems = btrfs_header_nritems(path->nodes[0]); 1927 if (path->slots[0] >= nritems) { 1928 ret = btrfs_next_leaf(root, path); 1929 if (ret) 1930 goto out; 1931 } else { 1932 path->slots[0]++; 1933 } 1934 1935 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1936 1937 if (key.type != key_type || key.objectid != dirid) { 1938 ret = 1; 1939 goto out; 1940 } 1941 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1942 struct btrfs_dir_log_item); 1943 found_end = btrfs_dir_log_end(path->nodes[0], item); 1944 *start_ret = key.offset; 1945 *end_ret = found_end; 1946 ret = 0; 1947 out: 1948 btrfs_release_path(path); 1949 return ret; 1950 } 1951 1952 /* 1953 * this looks for a given directory item in the log. If the directory 1954 * item is not in the log, the item is removed and the inode it points 1955 * to is unlinked 1956 */ 1957 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1958 struct btrfs_root *root, 1959 struct btrfs_root *log, 1960 struct btrfs_path *path, 1961 struct btrfs_path *log_path, 1962 struct inode *dir, 1963 struct btrfs_key *dir_key) 1964 { 1965 int ret; 1966 struct extent_buffer *eb; 1967 int slot; 1968 u32 item_size; 1969 struct btrfs_dir_item *di; 1970 struct btrfs_dir_item *log_di; 1971 int name_len; 1972 unsigned long ptr; 1973 unsigned long ptr_end; 1974 char *name; 1975 struct inode *inode; 1976 struct btrfs_key location; 1977 1978 again: 1979 eb = path->nodes[0]; 1980 slot = path->slots[0]; 1981 item_size = btrfs_item_size_nr(eb, slot); 1982 ptr = btrfs_item_ptr_offset(eb, slot); 1983 ptr_end = ptr + item_size; 1984 while (ptr < ptr_end) { 1985 di = (struct btrfs_dir_item *)ptr; 1986 if (verify_dir_item(root, eb, di)) { 1987 ret = -EIO; 1988 goto out; 1989 } 1990 1991 name_len = btrfs_dir_name_len(eb, di); 1992 name = kmalloc(name_len, GFP_NOFS); 1993 if (!name) { 1994 ret = -ENOMEM; 1995 goto out; 1996 } 1997 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1998 name_len); 1999 log_di = NULL; 2000 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2001 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2002 dir_key->objectid, 2003 name, name_len, 0); 2004 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2005 log_di = btrfs_lookup_dir_index_item(trans, log, 2006 log_path, 2007 dir_key->objectid, 2008 dir_key->offset, 2009 name, name_len, 0); 2010 } 2011 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2012 btrfs_dir_item_key_to_cpu(eb, di, &location); 2013 btrfs_release_path(path); 2014 btrfs_release_path(log_path); 2015 inode = read_one_inode(root, location.objectid); 2016 if (!inode) { 2017 kfree(name); 2018 return -EIO; 2019 } 2020 2021 ret = link_to_fixup_dir(trans, root, 2022 path, location.objectid); 2023 if (ret) { 2024 kfree(name); 2025 iput(inode); 2026 goto out; 2027 } 2028 2029 inc_nlink(inode); 2030 ret = btrfs_unlink_inode(trans, root, dir, inode, 2031 name, name_len); 2032 if (!ret) 2033 ret = btrfs_run_delayed_items(trans, root); 2034 kfree(name); 2035 iput(inode); 2036 if (ret) 2037 goto out; 2038 2039 /* there might still be more names under this key 2040 * check and repeat if required 2041 */ 2042 ret = btrfs_search_slot(NULL, root, dir_key, path, 2043 0, 0); 2044 if (ret == 0) 2045 goto again; 2046 ret = 0; 2047 goto out; 2048 } else if (IS_ERR(log_di)) { 2049 kfree(name); 2050 return PTR_ERR(log_di); 2051 } 2052 btrfs_release_path(log_path); 2053 kfree(name); 2054 2055 ptr = (unsigned long)(di + 1); 2056 ptr += name_len; 2057 } 2058 ret = 0; 2059 out: 2060 btrfs_release_path(path); 2061 btrfs_release_path(log_path); 2062 return ret; 2063 } 2064 2065 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2066 struct btrfs_root *root, 2067 struct btrfs_root *log, 2068 struct btrfs_path *path, 2069 const u64 ino) 2070 { 2071 struct btrfs_key search_key; 2072 struct btrfs_path *log_path; 2073 int i; 2074 int nritems; 2075 int ret; 2076 2077 log_path = btrfs_alloc_path(); 2078 if (!log_path) 2079 return -ENOMEM; 2080 2081 search_key.objectid = ino; 2082 search_key.type = BTRFS_XATTR_ITEM_KEY; 2083 search_key.offset = 0; 2084 again: 2085 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2086 if (ret < 0) 2087 goto out; 2088 process_leaf: 2089 nritems = btrfs_header_nritems(path->nodes[0]); 2090 for (i = path->slots[0]; i < nritems; i++) { 2091 struct btrfs_key key; 2092 struct btrfs_dir_item *di; 2093 struct btrfs_dir_item *log_di; 2094 u32 total_size; 2095 u32 cur; 2096 2097 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2098 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2099 ret = 0; 2100 goto out; 2101 } 2102 2103 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2104 total_size = btrfs_item_size_nr(path->nodes[0], i); 2105 cur = 0; 2106 while (cur < total_size) { 2107 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2108 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2109 u32 this_len = sizeof(*di) + name_len + data_len; 2110 char *name; 2111 2112 name = kmalloc(name_len, GFP_NOFS); 2113 if (!name) { 2114 ret = -ENOMEM; 2115 goto out; 2116 } 2117 read_extent_buffer(path->nodes[0], name, 2118 (unsigned long)(di + 1), name_len); 2119 2120 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2121 name, name_len, 0); 2122 btrfs_release_path(log_path); 2123 if (!log_di) { 2124 /* Doesn't exist in log tree, so delete it. */ 2125 btrfs_release_path(path); 2126 di = btrfs_lookup_xattr(trans, root, path, ino, 2127 name, name_len, -1); 2128 kfree(name); 2129 if (IS_ERR(di)) { 2130 ret = PTR_ERR(di); 2131 goto out; 2132 } 2133 ASSERT(di); 2134 ret = btrfs_delete_one_dir_name(trans, root, 2135 path, di); 2136 if (ret) 2137 goto out; 2138 btrfs_release_path(path); 2139 search_key = key; 2140 goto again; 2141 } 2142 kfree(name); 2143 if (IS_ERR(log_di)) { 2144 ret = PTR_ERR(log_di); 2145 goto out; 2146 } 2147 cur += this_len; 2148 di = (struct btrfs_dir_item *)((char *)di + this_len); 2149 } 2150 } 2151 ret = btrfs_next_leaf(root, path); 2152 if (ret > 0) 2153 ret = 0; 2154 else if (ret == 0) 2155 goto process_leaf; 2156 out: 2157 btrfs_free_path(log_path); 2158 btrfs_release_path(path); 2159 return ret; 2160 } 2161 2162 2163 /* 2164 * deletion replay happens before we copy any new directory items 2165 * out of the log or out of backreferences from inodes. It 2166 * scans the log to find ranges of keys that log is authoritative for, 2167 * and then scans the directory to find items in those ranges that are 2168 * not present in the log. 2169 * 2170 * Anything we don't find in the log is unlinked and removed from the 2171 * directory. 2172 */ 2173 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2174 struct btrfs_root *root, 2175 struct btrfs_root *log, 2176 struct btrfs_path *path, 2177 u64 dirid, int del_all) 2178 { 2179 u64 range_start; 2180 u64 range_end; 2181 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2182 int ret = 0; 2183 struct btrfs_key dir_key; 2184 struct btrfs_key found_key; 2185 struct btrfs_path *log_path; 2186 struct inode *dir; 2187 2188 dir_key.objectid = dirid; 2189 dir_key.type = BTRFS_DIR_ITEM_KEY; 2190 log_path = btrfs_alloc_path(); 2191 if (!log_path) 2192 return -ENOMEM; 2193 2194 dir = read_one_inode(root, dirid); 2195 /* it isn't an error if the inode isn't there, that can happen 2196 * because we replay the deletes before we copy in the inode item 2197 * from the log 2198 */ 2199 if (!dir) { 2200 btrfs_free_path(log_path); 2201 return 0; 2202 } 2203 again: 2204 range_start = 0; 2205 range_end = 0; 2206 while (1) { 2207 if (del_all) 2208 range_end = (u64)-1; 2209 else { 2210 ret = find_dir_range(log, path, dirid, key_type, 2211 &range_start, &range_end); 2212 if (ret != 0) 2213 break; 2214 } 2215 2216 dir_key.offset = range_start; 2217 while (1) { 2218 int nritems; 2219 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2220 0, 0); 2221 if (ret < 0) 2222 goto out; 2223 2224 nritems = btrfs_header_nritems(path->nodes[0]); 2225 if (path->slots[0] >= nritems) { 2226 ret = btrfs_next_leaf(root, path); 2227 if (ret) 2228 break; 2229 } 2230 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2231 path->slots[0]); 2232 if (found_key.objectid != dirid || 2233 found_key.type != dir_key.type) 2234 goto next_type; 2235 2236 if (found_key.offset > range_end) 2237 break; 2238 2239 ret = check_item_in_log(trans, root, log, path, 2240 log_path, dir, 2241 &found_key); 2242 if (ret) 2243 goto out; 2244 if (found_key.offset == (u64)-1) 2245 break; 2246 dir_key.offset = found_key.offset + 1; 2247 } 2248 btrfs_release_path(path); 2249 if (range_end == (u64)-1) 2250 break; 2251 range_start = range_end + 1; 2252 } 2253 2254 next_type: 2255 ret = 0; 2256 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2257 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2258 dir_key.type = BTRFS_DIR_INDEX_KEY; 2259 btrfs_release_path(path); 2260 goto again; 2261 } 2262 out: 2263 btrfs_release_path(path); 2264 btrfs_free_path(log_path); 2265 iput(dir); 2266 return ret; 2267 } 2268 2269 /* 2270 * the process_func used to replay items from the log tree. This 2271 * gets called in two different stages. The first stage just looks 2272 * for inodes and makes sure they are all copied into the subvolume. 2273 * 2274 * The second stage copies all the other item types from the log into 2275 * the subvolume. The two stage approach is slower, but gets rid of 2276 * lots of complexity around inodes referencing other inodes that exist 2277 * only in the log (references come from either directory items or inode 2278 * back refs). 2279 */ 2280 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2281 struct walk_control *wc, u64 gen) 2282 { 2283 int nritems; 2284 struct btrfs_path *path; 2285 struct btrfs_root *root = wc->replay_dest; 2286 struct btrfs_key key; 2287 int level; 2288 int i; 2289 int ret; 2290 2291 ret = btrfs_read_buffer(eb, gen); 2292 if (ret) 2293 return ret; 2294 2295 level = btrfs_header_level(eb); 2296 2297 if (level != 0) 2298 return 0; 2299 2300 path = btrfs_alloc_path(); 2301 if (!path) 2302 return -ENOMEM; 2303 2304 nritems = btrfs_header_nritems(eb); 2305 for (i = 0; i < nritems; i++) { 2306 btrfs_item_key_to_cpu(eb, &key, i); 2307 2308 /* inode keys are done during the first stage */ 2309 if (key.type == BTRFS_INODE_ITEM_KEY && 2310 wc->stage == LOG_WALK_REPLAY_INODES) { 2311 struct btrfs_inode_item *inode_item; 2312 u32 mode; 2313 2314 inode_item = btrfs_item_ptr(eb, i, 2315 struct btrfs_inode_item); 2316 ret = replay_xattr_deletes(wc->trans, root, log, 2317 path, key.objectid); 2318 if (ret) 2319 break; 2320 mode = btrfs_inode_mode(eb, inode_item); 2321 if (S_ISDIR(mode)) { 2322 ret = replay_dir_deletes(wc->trans, 2323 root, log, path, key.objectid, 0); 2324 if (ret) 2325 break; 2326 } 2327 ret = overwrite_item(wc->trans, root, path, 2328 eb, i, &key); 2329 if (ret) 2330 break; 2331 2332 /* for regular files, make sure corresponding 2333 * orphan item exist. extents past the new EOF 2334 * will be truncated later by orphan cleanup. 2335 */ 2336 if (S_ISREG(mode)) { 2337 ret = insert_orphan_item(wc->trans, root, 2338 key.objectid); 2339 if (ret) 2340 break; 2341 } 2342 2343 ret = link_to_fixup_dir(wc->trans, root, 2344 path, key.objectid); 2345 if (ret) 2346 break; 2347 } 2348 2349 if (key.type == BTRFS_DIR_INDEX_KEY && 2350 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2351 ret = replay_one_dir_item(wc->trans, root, path, 2352 eb, i, &key); 2353 if (ret) 2354 break; 2355 } 2356 2357 if (wc->stage < LOG_WALK_REPLAY_ALL) 2358 continue; 2359 2360 /* these keys are simply copied */ 2361 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2362 ret = overwrite_item(wc->trans, root, path, 2363 eb, i, &key); 2364 if (ret) 2365 break; 2366 } else if (key.type == BTRFS_INODE_REF_KEY || 2367 key.type == BTRFS_INODE_EXTREF_KEY) { 2368 ret = add_inode_ref(wc->trans, root, log, path, 2369 eb, i, &key); 2370 if (ret && ret != -ENOENT) 2371 break; 2372 ret = 0; 2373 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2374 ret = replay_one_extent(wc->trans, root, path, 2375 eb, i, &key); 2376 if (ret) 2377 break; 2378 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2379 ret = replay_one_dir_item(wc->trans, root, path, 2380 eb, i, &key); 2381 if (ret) 2382 break; 2383 } 2384 } 2385 btrfs_free_path(path); 2386 return ret; 2387 } 2388 2389 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2390 struct btrfs_root *root, 2391 struct btrfs_path *path, int *level, 2392 struct walk_control *wc) 2393 { 2394 u64 root_owner; 2395 u64 bytenr; 2396 u64 ptr_gen; 2397 struct extent_buffer *next; 2398 struct extent_buffer *cur; 2399 struct extent_buffer *parent; 2400 u32 blocksize; 2401 int ret = 0; 2402 2403 WARN_ON(*level < 0); 2404 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2405 2406 while (*level > 0) { 2407 WARN_ON(*level < 0); 2408 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2409 cur = path->nodes[*level]; 2410 2411 WARN_ON(btrfs_header_level(cur) != *level); 2412 2413 if (path->slots[*level] >= 2414 btrfs_header_nritems(cur)) 2415 break; 2416 2417 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2418 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2419 blocksize = root->nodesize; 2420 2421 parent = path->nodes[*level]; 2422 root_owner = btrfs_header_owner(parent); 2423 2424 next = btrfs_find_create_tree_block(root, bytenr); 2425 if (!next) 2426 return -ENOMEM; 2427 2428 if (*level == 1) { 2429 ret = wc->process_func(root, next, wc, ptr_gen); 2430 if (ret) { 2431 free_extent_buffer(next); 2432 return ret; 2433 } 2434 2435 path->slots[*level]++; 2436 if (wc->free) { 2437 ret = btrfs_read_buffer(next, ptr_gen); 2438 if (ret) { 2439 free_extent_buffer(next); 2440 return ret; 2441 } 2442 2443 if (trans) { 2444 btrfs_tree_lock(next); 2445 btrfs_set_lock_blocking(next); 2446 clean_tree_block(trans, root->fs_info, 2447 next); 2448 btrfs_wait_tree_block_writeback(next); 2449 btrfs_tree_unlock(next); 2450 } 2451 2452 WARN_ON(root_owner != 2453 BTRFS_TREE_LOG_OBJECTID); 2454 ret = btrfs_free_and_pin_reserved_extent(root, 2455 bytenr, blocksize); 2456 if (ret) { 2457 free_extent_buffer(next); 2458 return ret; 2459 } 2460 } 2461 free_extent_buffer(next); 2462 continue; 2463 } 2464 ret = btrfs_read_buffer(next, ptr_gen); 2465 if (ret) { 2466 free_extent_buffer(next); 2467 return ret; 2468 } 2469 2470 WARN_ON(*level <= 0); 2471 if (path->nodes[*level-1]) 2472 free_extent_buffer(path->nodes[*level-1]); 2473 path->nodes[*level-1] = next; 2474 *level = btrfs_header_level(next); 2475 path->slots[*level] = 0; 2476 cond_resched(); 2477 } 2478 WARN_ON(*level < 0); 2479 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2480 2481 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2482 2483 cond_resched(); 2484 return 0; 2485 } 2486 2487 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2488 struct btrfs_root *root, 2489 struct btrfs_path *path, int *level, 2490 struct walk_control *wc) 2491 { 2492 u64 root_owner; 2493 int i; 2494 int slot; 2495 int ret; 2496 2497 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2498 slot = path->slots[i]; 2499 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2500 path->slots[i]++; 2501 *level = i; 2502 WARN_ON(*level == 0); 2503 return 0; 2504 } else { 2505 struct extent_buffer *parent; 2506 if (path->nodes[*level] == root->node) 2507 parent = path->nodes[*level]; 2508 else 2509 parent = path->nodes[*level + 1]; 2510 2511 root_owner = btrfs_header_owner(parent); 2512 ret = wc->process_func(root, path->nodes[*level], wc, 2513 btrfs_header_generation(path->nodes[*level])); 2514 if (ret) 2515 return ret; 2516 2517 if (wc->free) { 2518 struct extent_buffer *next; 2519 2520 next = path->nodes[*level]; 2521 2522 if (trans) { 2523 btrfs_tree_lock(next); 2524 btrfs_set_lock_blocking(next); 2525 clean_tree_block(trans, root->fs_info, 2526 next); 2527 btrfs_wait_tree_block_writeback(next); 2528 btrfs_tree_unlock(next); 2529 } 2530 2531 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2532 ret = btrfs_free_and_pin_reserved_extent(root, 2533 path->nodes[*level]->start, 2534 path->nodes[*level]->len); 2535 if (ret) 2536 return ret; 2537 } 2538 free_extent_buffer(path->nodes[*level]); 2539 path->nodes[*level] = NULL; 2540 *level = i + 1; 2541 } 2542 } 2543 return 1; 2544 } 2545 2546 /* 2547 * drop the reference count on the tree rooted at 'snap'. This traverses 2548 * the tree freeing any blocks that have a ref count of zero after being 2549 * decremented. 2550 */ 2551 static int walk_log_tree(struct btrfs_trans_handle *trans, 2552 struct btrfs_root *log, struct walk_control *wc) 2553 { 2554 int ret = 0; 2555 int wret; 2556 int level; 2557 struct btrfs_path *path; 2558 int orig_level; 2559 2560 path = btrfs_alloc_path(); 2561 if (!path) 2562 return -ENOMEM; 2563 2564 level = btrfs_header_level(log->node); 2565 orig_level = level; 2566 path->nodes[level] = log->node; 2567 extent_buffer_get(log->node); 2568 path->slots[level] = 0; 2569 2570 while (1) { 2571 wret = walk_down_log_tree(trans, log, path, &level, wc); 2572 if (wret > 0) 2573 break; 2574 if (wret < 0) { 2575 ret = wret; 2576 goto out; 2577 } 2578 2579 wret = walk_up_log_tree(trans, log, path, &level, wc); 2580 if (wret > 0) 2581 break; 2582 if (wret < 0) { 2583 ret = wret; 2584 goto out; 2585 } 2586 } 2587 2588 /* was the root node processed? if not, catch it here */ 2589 if (path->nodes[orig_level]) { 2590 ret = wc->process_func(log, path->nodes[orig_level], wc, 2591 btrfs_header_generation(path->nodes[orig_level])); 2592 if (ret) 2593 goto out; 2594 if (wc->free) { 2595 struct extent_buffer *next; 2596 2597 next = path->nodes[orig_level]; 2598 2599 if (trans) { 2600 btrfs_tree_lock(next); 2601 btrfs_set_lock_blocking(next); 2602 clean_tree_block(trans, log->fs_info, next); 2603 btrfs_wait_tree_block_writeback(next); 2604 btrfs_tree_unlock(next); 2605 } 2606 2607 WARN_ON(log->root_key.objectid != 2608 BTRFS_TREE_LOG_OBJECTID); 2609 ret = btrfs_free_and_pin_reserved_extent(log, next->start, 2610 next->len); 2611 if (ret) 2612 goto out; 2613 } 2614 } 2615 2616 out: 2617 btrfs_free_path(path); 2618 return ret; 2619 } 2620 2621 /* 2622 * helper function to update the item for a given subvolumes log root 2623 * in the tree of log roots 2624 */ 2625 static int update_log_root(struct btrfs_trans_handle *trans, 2626 struct btrfs_root *log) 2627 { 2628 int ret; 2629 2630 if (log->log_transid == 1) { 2631 /* insert root item on the first sync */ 2632 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 2633 &log->root_key, &log->root_item); 2634 } else { 2635 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2636 &log->root_key, &log->root_item); 2637 } 2638 return ret; 2639 } 2640 2641 static void wait_log_commit(struct btrfs_root *root, int transid) 2642 { 2643 DEFINE_WAIT(wait); 2644 int index = transid % 2; 2645 2646 /* 2647 * we only allow two pending log transactions at a time, 2648 * so we know that if ours is more than 2 older than the 2649 * current transaction, we're done 2650 */ 2651 do { 2652 prepare_to_wait(&root->log_commit_wait[index], 2653 &wait, TASK_UNINTERRUPTIBLE); 2654 mutex_unlock(&root->log_mutex); 2655 2656 if (root->log_transid_committed < transid && 2657 atomic_read(&root->log_commit[index])) 2658 schedule(); 2659 2660 finish_wait(&root->log_commit_wait[index], &wait); 2661 mutex_lock(&root->log_mutex); 2662 } while (root->log_transid_committed < transid && 2663 atomic_read(&root->log_commit[index])); 2664 } 2665 2666 static void wait_for_writer(struct btrfs_root *root) 2667 { 2668 DEFINE_WAIT(wait); 2669 2670 while (atomic_read(&root->log_writers)) { 2671 prepare_to_wait(&root->log_writer_wait, 2672 &wait, TASK_UNINTERRUPTIBLE); 2673 mutex_unlock(&root->log_mutex); 2674 if (atomic_read(&root->log_writers)) 2675 schedule(); 2676 finish_wait(&root->log_writer_wait, &wait); 2677 mutex_lock(&root->log_mutex); 2678 } 2679 } 2680 2681 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2682 struct btrfs_log_ctx *ctx) 2683 { 2684 if (!ctx) 2685 return; 2686 2687 mutex_lock(&root->log_mutex); 2688 list_del_init(&ctx->list); 2689 mutex_unlock(&root->log_mutex); 2690 } 2691 2692 /* 2693 * Invoked in log mutex context, or be sure there is no other task which 2694 * can access the list. 2695 */ 2696 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2697 int index, int error) 2698 { 2699 struct btrfs_log_ctx *ctx; 2700 2701 if (!error) { 2702 INIT_LIST_HEAD(&root->log_ctxs[index]); 2703 return; 2704 } 2705 2706 list_for_each_entry(ctx, &root->log_ctxs[index], list) 2707 ctx->log_ret = error; 2708 2709 INIT_LIST_HEAD(&root->log_ctxs[index]); 2710 } 2711 2712 /* 2713 * btrfs_sync_log does sends a given tree log down to the disk and 2714 * updates the super blocks to record it. When this call is done, 2715 * you know that any inodes previously logged are safely on disk only 2716 * if it returns 0. 2717 * 2718 * Any other return value means you need to call btrfs_commit_transaction. 2719 * Some of the edge cases for fsyncing directories that have had unlinks 2720 * or renames done in the past mean that sometimes the only safe 2721 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2722 * that has happened. 2723 */ 2724 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2725 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2726 { 2727 int index1; 2728 int index2; 2729 int mark; 2730 int ret; 2731 struct btrfs_root *log = root->log_root; 2732 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2733 int log_transid = 0; 2734 struct btrfs_log_ctx root_log_ctx; 2735 struct blk_plug plug; 2736 2737 mutex_lock(&root->log_mutex); 2738 log_transid = ctx->log_transid; 2739 if (root->log_transid_committed >= log_transid) { 2740 mutex_unlock(&root->log_mutex); 2741 return ctx->log_ret; 2742 } 2743 2744 index1 = log_transid % 2; 2745 if (atomic_read(&root->log_commit[index1])) { 2746 wait_log_commit(root, log_transid); 2747 mutex_unlock(&root->log_mutex); 2748 return ctx->log_ret; 2749 } 2750 ASSERT(log_transid == root->log_transid); 2751 atomic_set(&root->log_commit[index1], 1); 2752 2753 /* wait for previous tree log sync to complete */ 2754 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2755 wait_log_commit(root, log_transid - 1); 2756 2757 while (1) { 2758 int batch = atomic_read(&root->log_batch); 2759 /* when we're on an ssd, just kick the log commit out */ 2760 if (!btrfs_test_opt(root, SSD) && 2761 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2762 mutex_unlock(&root->log_mutex); 2763 schedule_timeout_uninterruptible(1); 2764 mutex_lock(&root->log_mutex); 2765 } 2766 wait_for_writer(root); 2767 if (batch == atomic_read(&root->log_batch)) 2768 break; 2769 } 2770 2771 /* bail out if we need to do a full commit */ 2772 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2773 ret = -EAGAIN; 2774 btrfs_free_logged_extents(log, log_transid); 2775 mutex_unlock(&root->log_mutex); 2776 goto out; 2777 } 2778 2779 if (log_transid % 2 == 0) 2780 mark = EXTENT_DIRTY; 2781 else 2782 mark = EXTENT_NEW; 2783 2784 /* we start IO on all the marked extents here, but we don't actually 2785 * wait for them until later. 2786 */ 2787 blk_start_plug(&plug); 2788 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2789 if (ret) { 2790 blk_finish_plug(&plug); 2791 btrfs_abort_transaction(trans, root, ret); 2792 btrfs_free_logged_extents(log, log_transid); 2793 btrfs_set_log_full_commit(root->fs_info, trans); 2794 mutex_unlock(&root->log_mutex); 2795 goto out; 2796 } 2797 2798 btrfs_set_root_node(&log->root_item, log->node); 2799 2800 root->log_transid++; 2801 log->log_transid = root->log_transid; 2802 root->log_start_pid = 0; 2803 /* 2804 * IO has been started, blocks of the log tree have WRITTEN flag set 2805 * in their headers. new modifications of the log will be written to 2806 * new positions. so it's safe to allow log writers to go in. 2807 */ 2808 mutex_unlock(&root->log_mutex); 2809 2810 btrfs_init_log_ctx(&root_log_ctx); 2811 2812 mutex_lock(&log_root_tree->log_mutex); 2813 atomic_inc(&log_root_tree->log_batch); 2814 atomic_inc(&log_root_tree->log_writers); 2815 2816 index2 = log_root_tree->log_transid % 2; 2817 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2818 root_log_ctx.log_transid = log_root_tree->log_transid; 2819 2820 mutex_unlock(&log_root_tree->log_mutex); 2821 2822 ret = update_log_root(trans, log); 2823 2824 mutex_lock(&log_root_tree->log_mutex); 2825 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2826 /* 2827 * Implicit memory barrier after atomic_dec_and_test 2828 */ 2829 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2830 wake_up(&log_root_tree->log_writer_wait); 2831 } 2832 2833 if (ret) { 2834 if (!list_empty(&root_log_ctx.list)) 2835 list_del_init(&root_log_ctx.list); 2836 2837 blk_finish_plug(&plug); 2838 btrfs_set_log_full_commit(root->fs_info, trans); 2839 2840 if (ret != -ENOSPC) { 2841 btrfs_abort_transaction(trans, root, ret); 2842 mutex_unlock(&log_root_tree->log_mutex); 2843 goto out; 2844 } 2845 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2846 btrfs_free_logged_extents(log, log_transid); 2847 mutex_unlock(&log_root_tree->log_mutex); 2848 ret = -EAGAIN; 2849 goto out; 2850 } 2851 2852 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2853 blk_finish_plug(&plug); 2854 mutex_unlock(&log_root_tree->log_mutex); 2855 ret = root_log_ctx.log_ret; 2856 goto out; 2857 } 2858 2859 index2 = root_log_ctx.log_transid % 2; 2860 if (atomic_read(&log_root_tree->log_commit[index2])) { 2861 blk_finish_plug(&plug); 2862 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, 2863 mark); 2864 btrfs_wait_logged_extents(trans, log, log_transid); 2865 wait_log_commit(log_root_tree, 2866 root_log_ctx.log_transid); 2867 mutex_unlock(&log_root_tree->log_mutex); 2868 if (!ret) 2869 ret = root_log_ctx.log_ret; 2870 goto out; 2871 } 2872 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2873 atomic_set(&log_root_tree->log_commit[index2], 1); 2874 2875 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2876 wait_log_commit(log_root_tree, 2877 root_log_ctx.log_transid - 1); 2878 } 2879 2880 wait_for_writer(log_root_tree); 2881 2882 /* 2883 * now that we've moved on to the tree of log tree roots, 2884 * check the full commit flag again 2885 */ 2886 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2887 blk_finish_plug(&plug); 2888 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2889 btrfs_free_logged_extents(log, log_transid); 2890 mutex_unlock(&log_root_tree->log_mutex); 2891 ret = -EAGAIN; 2892 goto out_wake_log_root; 2893 } 2894 2895 ret = btrfs_write_marked_extents(log_root_tree, 2896 &log_root_tree->dirty_log_pages, 2897 EXTENT_DIRTY | EXTENT_NEW); 2898 blk_finish_plug(&plug); 2899 if (ret) { 2900 btrfs_set_log_full_commit(root->fs_info, trans); 2901 btrfs_abort_transaction(trans, root, ret); 2902 btrfs_free_logged_extents(log, log_transid); 2903 mutex_unlock(&log_root_tree->log_mutex); 2904 goto out_wake_log_root; 2905 } 2906 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2907 if (!ret) 2908 ret = btrfs_wait_marked_extents(log_root_tree, 2909 &log_root_tree->dirty_log_pages, 2910 EXTENT_NEW | EXTENT_DIRTY); 2911 if (ret) { 2912 btrfs_set_log_full_commit(root->fs_info, trans); 2913 btrfs_free_logged_extents(log, log_transid); 2914 mutex_unlock(&log_root_tree->log_mutex); 2915 goto out_wake_log_root; 2916 } 2917 btrfs_wait_logged_extents(trans, log, log_transid); 2918 2919 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2920 log_root_tree->node->start); 2921 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2922 btrfs_header_level(log_root_tree->node)); 2923 2924 log_root_tree->log_transid++; 2925 mutex_unlock(&log_root_tree->log_mutex); 2926 2927 /* 2928 * nobody else is going to jump in and write the the ctree 2929 * super here because the log_commit atomic below is protecting 2930 * us. We must be called with a transaction handle pinning 2931 * the running transaction open, so a full commit can't hop 2932 * in and cause problems either. 2933 */ 2934 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2935 if (ret) { 2936 btrfs_set_log_full_commit(root->fs_info, trans); 2937 btrfs_abort_transaction(trans, root, ret); 2938 goto out_wake_log_root; 2939 } 2940 2941 mutex_lock(&root->log_mutex); 2942 if (root->last_log_commit < log_transid) 2943 root->last_log_commit = log_transid; 2944 mutex_unlock(&root->log_mutex); 2945 2946 out_wake_log_root: 2947 /* 2948 * We needn't get log_mutex here because we are sure all 2949 * the other tasks are blocked. 2950 */ 2951 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 2952 2953 mutex_lock(&log_root_tree->log_mutex); 2954 log_root_tree->log_transid_committed++; 2955 atomic_set(&log_root_tree->log_commit[index2], 0); 2956 mutex_unlock(&log_root_tree->log_mutex); 2957 2958 /* 2959 * The barrier before waitqueue_active is implied by mutex_unlock 2960 */ 2961 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2962 wake_up(&log_root_tree->log_commit_wait[index2]); 2963 out: 2964 /* See above. */ 2965 btrfs_remove_all_log_ctxs(root, index1, ret); 2966 2967 mutex_lock(&root->log_mutex); 2968 root->log_transid_committed++; 2969 atomic_set(&root->log_commit[index1], 0); 2970 mutex_unlock(&root->log_mutex); 2971 2972 /* 2973 * The barrier before waitqueue_active is implied by mutex_unlock 2974 */ 2975 if (waitqueue_active(&root->log_commit_wait[index1])) 2976 wake_up(&root->log_commit_wait[index1]); 2977 return ret; 2978 } 2979 2980 static void free_log_tree(struct btrfs_trans_handle *trans, 2981 struct btrfs_root *log) 2982 { 2983 int ret; 2984 u64 start; 2985 u64 end; 2986 struct walk_control wc = { 2987 .free = 1, 2988 .process_func = process_one_buffer 2989 }; 2990 2991 ret = walk_log_tree(trans, log, &wc); 2992 /* I don't think this can happen but just in case */ 2993 if (ret) 2994 btrfs_abort_transaction(trans, log, ret); 2995 2996 while (1) { 2997 ret = find_first_extent_bit(&log->dirty_log_pages, 2998 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 2999 NULL); 3000 if (ret) 3001 break; 3002 3003 clear_extent_bits(&log->dirty_log_pages, start, end, 3004 EXTENT_DIRTY | EXTENT_NEW); 3005 } 3006 3007 /* 3008 * We may have short-circuited the log tree with the full commit logic 3009 * and left ordered extents on our list, so clear these out to keep us 3010 * from leaking inodes and memory. 3011 */ 3012 btrfs_free_logged_extents(log, 0); 3013 btrfs_free_logged_extents(log, 1); 3014 3015 free_extent_buffer(log->node); 3016 kfree(log); 3017 } 3018 3019 /* 3020 * free all the extents used by the tree log. This should be called 3021 * at commit time of the full transaction 3022 */ 3023 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3024 { 3025 if (root->log_root) { 3026 free_log_tree(trans, root->log_root); 3027 root->log_root = NULL; 3028 } 3029 return 0; 3030 } 3031 3032 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3033 struct btrfs_fs_info *fs_info) 3034 { 3035 if (fs_info->log_root_tree) { 3036 free_log_tree(trans, fs_info->log_root_tree); 3037 fs_info->log_root_tree = NULL; 3038 } 3039 return 0; 3040 } 3041 3042 /* 3043 * If both a file and directory are logged, and unlinks or renames are 3044 * mixed in, we have a few interesting corners: 3045 * 3046 * create file X in dir Y 3047 * link file X to X.link in dir Y 3048 * fsync file X 3049 * unlink file X but leave X.link 3050 * fsync dir Y 3051 * 3052 * After a crash we would expect only X.link to exist. But file X 3053 * didn't get fsync'd again so the log has back refs for X and X.link. 3054 * 3055 * We solve this by removing directory entries and inode backrefs from the 3056 * log when a file that was logged in the current transaction is 3057 * unlinked. Any later fsync will include the updated log entries, and 3058 * we'll be able to reconstruct the proper directory items from backrefs. 3059 * 3060 * This optimizations allows us to avoid relogging the entire inode 3061 * or the entire directory. 3062 */ 3063 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3064 struct btrfs_root *root, 3065 const char *name, int name_len, 3066 struct inode *dir, u64 index) 3067 { 3068 struct btrfs_root *log; 3069 struct btrfs_dir_item *di; 3070 struct btrfs_path *path; 3071 int ret; 3072 int err = 0; 3073 int bytes_del = 0; 3074 u64 dir_ino = btrfs_ino(dir); 3075 3076 if (BTRFS_I(dir)->logged_trans < trans->transid) 3077 return 0; 3078 3079 ret = join_running_log_trans(root); 3080 if (ret) 3081 return 0; 3082 3083 mutex_lock(&BTRFS_I(dir)->log_mutex); 3084 3085 log = root->log_root; 3086 path = btrfs_alloc_path(); 3087 if (!path) { 3088 err = -ENOMEM; 3089 goto out_unlock; 3090 } 3091 3092 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3093 name, name_len, -1); 3094 if (IS_ERR(di)) { 3095 err = PTR_ERR(di); 3096 goto fail; 3097 } 3098 if (di) { 3099 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3100 bytes_del += name_len; 3101 if (ret) { 3102 err = ret; 3103 goto fail; 3104 } 3105 } 3106 btrfs_release_path(path); 3107 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3108 index, name, name_len, -1); 3109 if (IS_ERR(di)) { 3110 err = PTR_ERR(di); 3111 goto fail; 3112 } 3113 if (di) { 3114 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3115 bytes_del += name_len; 3116 if (ret) { 3117 err = ret; 3118 goto fail; 3119 } 3120 } 3121 3122 /* update the directory size in the log to reflect the names 3123 * we have removed 3124 */ 3125 if (bytes_del) { 3126 struct btrfs_key key; 3127 3128 key.objectid = dir_ino; 3129 key.offset = 0; 3130 key.type = BTRFS_INODE_ITEM_KEY; 3131 btrfs_release_path(path); 3132 3133 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3134 if (ret < 0) { 3135 err = ret; 3136 goto fail; 3137 } 3138 if (ret == 0) { 3139 struct btrfs_inode_item *item; 3140 u64 i_size; 3141 3142 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3143 struct btrfs_inode_item); 3144 i_size = btrfs_inode_size(path->nodes[0], item); 3145 if (i_size > bytes_del) 3146 i_size -= bytes_del; 3147 else 3148 i_size = 0; 3149 btrfs_set_inode_size(path->nodes[0], item, i_size); 3150 btrfs_mark_buffer_dirty(path->nodes[0]); 3151 } else 3152 ret = 0; 3153 btrfs_release_path(path); 3154 } 3155 fail: 3156 btrfs_free_path(path); 3157 out_unlock: 3158 mutex_unlock(&BTRFS_I(dir)->log_mutex); 3159 if (ret == -ENOSPC) { 3160 btrfs_set_log_full_commit(root->fs_info, trans); 3161 ret = 0; 3162 } else if (ret < 0) 3163 btrfs_abort_transaction(trans, root, ret); 3164 3165 btrfs_end_log_trans(root); 3166 3167 return err; 3168 } 3169 3170 /* see comments for btrfs_del_dir_entries_in_log */ 3171 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3172 struct btrfs_root *root, 3173 const char *name, int name_len, 3174 struct inode *inode, u64 dirid) 3175 { 3176 struct btrfs_root *log; 3177 u64 index; 3178 int ret; 3179 3180 if (BTRFS_I(inode)->logged_trans < trans->transid) 3181 return 0; 3182 3183 ret = join_running_log_trans(root); 3184 if (ret) 3185 return 0; 3186 log = root->log_root; 3187 mutex_lock(&BTRFS_I(inode)->log_mutex); 3188 3189 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3190 dirid, &index); 3191 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3192 if (ret == -ENOSPC) { 3193 btrfs_set_log_full_commit(root->fs_info, trans); 3194 ret = 0; 3195 } else if (ret < 0 && ret != -ENOENT) 3196 btrfs_abort_transaction(trans, root, ret); 3197 btrfs_end_log_trans(root); 3198 3199 return ret; 3200 } 3201 3202 /* 3203 * creates a range item in the log for 'dirid'. first_offset and 3204 * last_offset tell us which parts of the key space the log should 3205 * be considered authoritative for. 3206 */ 3207 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3208 struct btrfs_root *log, 3209 struct btrfs_path *path, 3210 int key_type, u64 dirid, 3211 u64 first_offset, u64 last_offset) 3212 { 3213 int ret; 3214 struct btrfs_key key; 3215 struct btrfs_dir_log_item *item; 3216 3217 key.objectid = dirid; 3218 key.offset = first_offset; 3219 if (key_type == BTRFS_DIR_ITEM_KEY) 3220 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3221 else 3222 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3223 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3224 if (ret) 3225 return ret; 3226 3227 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3228 struct btrfs_dir_log_item); 3229 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3230 btrfs_mark_buffer_dirty(path->nodes[0]); 3231 btrfs_release_path(path); 3232 return 0; 3233 } 3234 3235 /* 3236 * log all the items included in the current transaction for a given 3237 * directory. This also creates the range items in the log tree required 3238 * to replay anything deleted before the fsync 3239 */ 3240 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3241 struct btrfs_root *root, struct inode *inode, 3242 struct btrfs_path *path, 3243 struct btrfs_path *dst_path, int key_type, 3244 struct btrfs_log_ctx *ctx, 3245 u64 min_offset, u64 *last_offset_ret) 3246 { 3247 struct btrfs_key min_key; 3248 struct btrfs_root *log = root->log_root; 3249 struct extent_buffer *src; 3250 int err = 0; 3251 int ret; 3252 int i; 3253 int nritems; 3254 u64 first_offset = min_offset; 3255 u64 last_offset = (u64)-1; 3256 u64 ino = btrfs_ino(inode); 3257 3258 log = root->log_root; 3259 3260 min_key.objectid = ino; 3261 min_key.type = key_type; 3262 min_key.offset = min_offset; 3263 3264 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3265 3266 /* 3267 * we didn't find anything from this transaction, see if there 3268 * is anything at all 3269 */ 3270 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3271 min_key.objectid = ino; 3272 min_key.type = key_type; 3273 min_key.offset = (u64)-1; 3274 btrfs_release_path(path); 3275 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3276 if (ret < 0) { 3277 btrfs_release_path(path); 3278 return ret; 3279 } 3280 ret = btrfs_previous_item(root, path, ino, key_type); 3281 3282 /* if ret == 0 there are items for this type, 3283 * create a range to tell us the last key of this type. 3284 * otherwise, there are no items in this directory after 3285 * *min_offset, and we create a range to indicate that. 3286 */ 3287 if (ret == 0) { 3288 struct btrfs_key tmp; 3289 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3290 path->slots[0]); 3291 if (key_type == tmp.type) 3292 first_offset = max(min_offset, tmp.offset) + 1; 3293 } 3294 goto done; 3295 } 3296 3297 /* go backward to find any previous key */ 3298 ret = btrfs_previous_item(root, path, ino, key_type); 3299 if (ret == 0) { 3300 struct btrfs_key tmp; 3301 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3302 if (key_type == tmp.type) { 3303 first_offset = tmp.offset; 3304 ret = overwrite_item(trans, log, dst_path, 3305 path->nodes[0], path->slots[0], 3306 &tmp); 3307 if (ret) { 3308 err = ret; 3309 goto done; 3310 } 3311 } 3312 } 3313 btrfs_release_path(path); 3314 3315 /* find the first key from this transaction again */ 3316 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3317 if (WARN_ON(ret != 0)) 3318 goto done; 3319 3320 /* 3321 * we have a block from this transaction, log every item in it 3322 * from our directory 3323 */ 3324 while (1) { 3325 struct btrfs_key tmp; 3326 src = path->nodes[0]; 3327 nritems = btrfs_header_nritems(src); 3328 for (i = path->slots[0]; i < nritems; i++) { 3329 struct btrfs_dir_item *di; 3330 3331 btrfs_item_key_to_cpu(src, &min_key, i); 3332 3333 if (min_key.objectid != ino || min_key.type != key_type) 3334 goto done; 3335 ret = overwrite_item(trans, log, dst_path, src, i, 3336 &min_key); 3337 if (ret) { 3338 err = ret; 3339 goto done; 3340 } 3341 3342 /* 3343 * We must make sure that when we log a directory entry, 3344 * the corresponding inode, after log replay, has a 3345 * matching link count. For example: 3346 * 3347 * touch foo 3348 * mkdir mydir 3349 * sync 3350 * ln foo mydir/bar 3351 * xfs_io -c "fsync" mydir 3352 * <crash> 3353 * <mount fs and log replay> 3354 * 3355 * Would result in a fsync log that when replayed, our 3356 * file inode would have a link count of 1, but we get 3357 * two directory entries pointing to the same inode. 3358 * After removing one of the names, it would not be 3359 * possible to remove the other name, which resulted 3360 * always in stale file handle errors, and would not 3361 * be possible to rmdir the parent directory, since 3362 * its i_size could never decrement to the value 3363 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 3364 */ 3365 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3366 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3367 if (ctx && 3368 (btrfs_dir_transid(src, di) == trans->transid || 3369 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3370 tmp.type != BTRFS_ROOT_ITEM_KEY) 3371 ctx->log_new_dentries = true; 3372 } 3373 path->slots[0] = nritems; 3374 3375 /* 3376 * look ahead to the next item and see if it is also 3377 * from this directory and from this transaction 3378 */ 3379 ret = btrfs_next_leaf(root, path); 3380 if (ret == 1) { 3381 last_offset = (u64)-1; 3382 goto done; 3383 } 3384 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3385 if (tmp.objectid != ino || tmp.type != key_type) { 3386 last_offset = (u64)-1; 3387 goto done; 3388 } 3389 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3390 ret = overwrite_item(trans, log, dst_path, 3391 path->nodes[0], path->slots[0], 3392 &tmp); 3393 if (ret) 3394 err = ret; 3395 else 3396 last_offset = tmp.offset; 3397 goto done; 3398 } 3399 } 3400 done: 3401 btrfs_release_path(path); 3402 btrfs_release_path(dst_path); 3403 3404 if (err == 0) { 3405 *last_offset_ret = last_offset; 3406 /* 3407 * insert the log range keys to indicate where the log 3408 * is valid 3409 */ 3410 ret = insert_dir_log_key(trans, log, path, key_type, 3411 ino, first_offset, last_offset); 3412 if (ret) 3413 err = ret; 3414 } 3415 return err; 3416 } 3417 3418 /* 3419 * logging directories is very similar to logging inodes, We find all the items 3420 * from the current transaction and write them to the log. 3421 * 3422 * The recovery code scans the directory in the subvolume, and if it finds a 3423 * key in the range logged that is not present in the log tree, then it means 3424 * that dir entry was unlinked during the transaction. 3425 * 3426 * In order for that scan to work, we must include one key smaller than 3427 * the smallest logged by this transaction and one key larger than the largest 3428 * key logged by this transaction. 3429 */ 3430 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3431 struct btrfs_root *root, struct inode *inode, 3432 struct btrfs_path *path, 3433 struct btrfs_path *dst_path, 3434 struct btrfs_log_ctx *ctx) 3435 { 3436 u64 min_key; 3437 u64 max_key; 3438 int ret; 3439 int key_type = BTRFS_DIR_ITEM_KEY; 3440 3441 again: 3442 min_key = 0; 3443 max_key = 0; 3444 while (1) { 3445 ret = log_dir_items(trans, root, inode, path, 3446 dst_path, key_type, ctx, min_key, 3447 &max_key); 3448 if (ret) 3449 return ret; 3450 if (max_key == (u64)-1) 3451 break; 3452 min_key = max_key + 1; 3453 } 3454 3455 if (key_type == BTRFS_DIR_ITEM_KEY) { 3456 key_type = BTRFS_DIR_INDEX_KEY; 3457 goto again; 3458 } 3459 return 0; 3460 } 3461 3462 /* 3463 * a helper function to drop items from the log before we relog an 3464 * inode. max_key_type indicates the highest item type to remove. 3465 * This cannot be run for file data extents because it does not 3466 * free the extents they point to. 3467 */ 3468 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3469 struct btrfs_root *log, 3470 struct btrfs_path *path, 3471 u64 objectid, int max_key_type) 3472 { 3473 int ret; 3474 struct btrfs_key key; 3475 struct btrfs_key found_key; 3476 int start_slot; 3477 3478 key.objectid = objectid; 3479 key.type = max_key_type; 3480 key.offset = (u64)-1; 3481 3482 while (1) { 3483 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3484 BUG_ON(ret == 0); /* Logic error */ 3485 if (ret < 0) 3486 break; 3487 3488 if (path->slots[0] == 0) 3489 break; 3490 3491 path->slots[0]--; 3492 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3493 path->slots[0]); 3494 3495 if (found_key.objectid != objectid) 3496 break; 3497 3498 found_key.offset = 0; 3499 found_key.type = 0; 3500 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3501 &start_slot); 3502 3503 ret = btrfs_del_items(trans, log, path, start_slot, 3504 path->slots[0] - start_slot + 1); 3505 /* 3506 * If start slot isn't 0 then we don't need to re-search, we've 3507 * found the last guy with the objectid in this tree. 3508 */ 3509 if (ret || start_slot != 0) 3510 break; 3511 btrfs_release_path(path); 3512 } 3513 btrfs_release_path(path); 3514 if (ret > 0) 3515 ret = 0; 3516 return ret; 3517 } 3518 3519 static void fill_inode_item(struct btrfs_trans_handle *trans, 3520 struct extent_buffer *leaf, 3521 struct btrfs_inode_item *item, 3522 struct inode *inode, int log_inode_only, 3523 u64 logged_isize) 3524 { 3525 struct btrfs_map_token token; 3526 3527 btrfs_init_map_token(&token); 3528 3529 if (log_inode_only) { 3530 /* set the generation to zero so the recover code 3531 * can tell the difference between an logging 3532 * just to say 'this inode exists' and a logging 3533 * to say 'update this inode with these values' 3534 */ 3535 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3536 btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 3537 } else { 3538 btrfs_set_token_inode_generation(leaf, item, 3539 BTRFS_I(inode)->generation, 3540 &token); 3541 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3542 } 3543 3544 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3545 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3546 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3547 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3548 3549 btrfs_set_token_timespec_sec(leaf, &item->atime, 3550 inode->i_atime.tv_sec, &token); 3551 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3552 inode->i_atime.tv_nsec, &token); 3553 3554 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3555 inode->i_mtime.tv_sec, &token); 3556 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3557 inode->i_mtime.tv_nsec, &token); 3558 3559 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3560 inode->i_ctime.tv_sec, &token); 3561 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3562 inode->i_ctime.tv_nsec, &token); 3563 3564 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3565 &token); 3566 3567 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3568 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3569 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3570 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3571 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3572 } 3573 3574 static int log_inode_item(struct btrfs_trans_handle *trans, 3575 struct btrfs_root *log, struct btrfs_path *path, 3576 struct inode *inode) 3577 { 3578 struct btrfs_inode_item *inode_item; 3579 int ret; 3580 3581 ret = btrfs_insert_empty_item(trans, log, path, 3582 &BTRFS_I(inode)->location, 3583 sizeof(*inode_item)); 3584 if (ret && ret != -EEXIST) 3585 return ret; 3586 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3587 struct btrfs_inode_item); 3588 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); 3589 btrfs_release_path(path); 3590 return 0; 3591 } 3592 3593 static noinline int copy_items(struct btrfs_trans_handle *trans, 3594 struct inode *inode, 3595 struct btrfs_path *dst_path, 3596 struct btrfs_path *src_path, u64 *last_extent, 3597 int start_slot, int nr, int inode_only, 3598 u64 logged_isize) 3599 { 3600 unsigned long src_offset; 3601 unsigned long dst_offset; 3602 struct btrfs_root *log = BTRFS_I(inode)->root->log_root; 3603 struct btrfs_file_extent_item *extent; 3604 struct btrfs_inode_item *inode_item; 3605 struct extent_buffer *src = src_path->nodes[0]; 3606 struct btrfs_key first_key, last_key, key; 3607 int ret; 3608 struct btrfs_key *ins_keys; 3609 u32 *ins_sizes; 3610 char *ins_data; 3611 int i; 3612 struct list_head ordered_sums; 3613 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3614 bool has_extents = false; 3615 bool need_find_last_extent = true; 3616 bool done = false; 3617 3618 INIT_LIST_HEAD(&ordered_sums); 3619 3620 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3621 nr * sizeof(u32), GFP_NOFS); 3622 if (!ins_data) 3623 return -ENOMEM; 3624 3625 first_key.objectid = (u64)-1; 3626 3627 ins_sizes = (u32 *)ins_data; 3628 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3629 3630 for (i = 0; i < nr; i++) { 3631 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3632 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3633 } 3634 ret = btrfs_insert_empty_items(trans, log, dst_path, 3635 ins_keys, ins_sizes, nr); 3636 if (ret) { 3637 kfree(ins_data); 3638 return ret; 3639 } 3640 3641 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3642 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3643 dst_path->slots[0]); 3644 3645 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3646 3647 if ((i == (nr - 1))) 3648 last_key = ins_keys[i]; 3649 3650 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3651 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3652 dst_path->slots[0], 3653 struct btrfs_inode_item); 3654 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3655 inode, inode_only == LOG_INODE_EXISTS, 3656 logged_isize); 3657 } else { 3658 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3659 src_offset, ins_sizes[i]); 3660 } 3661 3662 /* 3663 * We set need_find_last_extent here in case we know we were 3664 * processing other items and then walk into the first extent in 3665 * the inode. If we don't hit an extent then nothing changes, 3666 * we'll do the last search the next time around. 3667 */ 3668 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3669 has_extents = true; 3670 if (first_key.objectid == (u64)-1) 3671 first_key = ins_keys[i]; 3672 } else { 3673 need_find_last_extent = false; 3674 } 3675 3676 /* take a reference on file data extents so that truncates 3677 * or deletes of this inode don't have to relog the inode 3678 * again 3679 */ 3680 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3681 !skip_csum) { 3682 int found_type; 3683 extent = btrfs_item_ptr(src, start_slot + i, 3684 struct btrfs_file_extent_item); 3685 3686 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3687 continue; 3688 3689 found_type = btrfs_file_extent_type(src, extent); 3690 if (found_type == BTRFS_FILE_EXTENT_REG) { 3691 u64 ds, dl, cs, cl; 3692 ds = btrfs_file_extent_disk_bytenr(src, 3693 extent); 3694 /* ds == 0 is a hole */ 3695 if (ds == 0) 3696 continue; 3697 3698 dl = btrfs_file_extent_disk_num_bytes(src, 3699 extent); 3700 cs = btrfs_file_extent_offset(src, extent); 3701 cl = btrfs_file_extent_num_bytes(src, 3702 extent); 3703 if (btrfs_file_extent_compression(src, 3704 extent)) { 3705 cs = 0; 3706 cl = dl; 3707 } 3708 3709 ret = btrfs_lookup_csums_range( 3710 log->fs_info->csum_root, 3711 ds + cs, ds + cs + cl - 1, 3712 &ordered_sums, 0); 3713 if (ret) { 3714 btrfs_release_path(dst_path); 3715 kfree(ins_data); 3716 return ret; 3717 } 3718 } 3719 } 3720 } 3721 3722 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3723 btrfs_release_path(dst_path); 3724 kfree(ins_data); 3725 3726 /* 3727 * we have to do this after the loop above to avoid changing the 3728 * log tree while trying to change the log tree. 3729 */ 3730 ret = 0; 3731 while (!list_empty(&ordered_sums)) { 3732 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3733 struct btrfs_ordered_sum, 3734 list); 3735 if (!ret) 3736 ret = btrfs_csum_file_blocks(trans, log, sums); 3737 list_del(&sums->list); 3738 kfree(sums); 3739 } 3740 3741 if (!has_extents) 3742 return ret; 3743 3744 if (need_find_last_extent && *last_extent == first_key.offset) { 3745 /* 3746 * We don't have any leafs between our current one and the one 3747 * we processed before that can have file extent items for our 3748 * inode (and have a generation number smaller than our current 3749 * transaction id). 3750 */ 3751 need_find_last_extent = false; 3752 } 3753 3754 /* 3755 * Because we use btrfs_search_forward we could skip leaves that were 3756 * not modified and then assume *last_extent is valid when it really 3757 * isn't. So back up to the previous leaf and read the end of the last 3758 * extent before we go and fill in holes. 3759 */ 3760 if (need_find_last_extent) { 3761 u64 len; 3762 3763 ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); 3764 if (ret < 0) 3765 return ret; 3766 if (ret) 3767 goto fill_holes; 3768 if (src_path->slots[0]) 3769 src_path->slots[0]--; 3770 src = src_path->nodes[0]; 3771 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3772 if (key.objectid != btrfs_ino(inode) || 3773 key.type != BTRFS_EXTENT_DATA_KEY) 3774 goto fill_holes; 3775 extent = btrfs_item_ptr(src, src_path->slots[0], 3776 struct btrfs_file_extent_item); 3777 if (btrfs_file_extent_type(src, extent) == 3778 BTRFS_FILE_EXTENT_INLINE) { 3779 len = btrfs_file_extent_inline_len(src, 3780 src_path->slots[0], 3781 extent); 3782 *last_extent = ALIGN(key.offset + len, 3783 log->sectorsize); 3784 } else { 3785 len = btrfs_file_extent_num_bytes(src, extent); 3786 *last_extent = key.offset + len; 3787 } 3788 } 3789 fill_holes: 3790 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3791 * things could have happened 3792 * 3793 * 1) A merge could have happened, so we could currently be on a leaf 3794 * that holds what we were copying in the first place. 3795 * 2) A split could have happened, and now not all of the items we want 3796 * are on the same leaf. 3797 * 3798 * So we need to adjust how we search for holes, we need to drop the 3799 * path and re-search for the first extent key we found, and then walk 3800 * forward until we hit the last one we copied. 3801 */ 3802 if (need_find_last_extent) { 3803 /* btrfs_prev_leaf could return 1 without releasing the path */ 3804 btrfs_release_path(src_path); 3805 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, 3806 src_path, 0, 0); 3807 if (ret < 0) 3808 return ret; 3809 ASSERT(ret == 0); 3810 src = src_path->nodes[0]; 3811 i = src_path->slots[0]; 3812 } else { 3813 i = start_slot; 3814 } 3815 3816 /* 3817 * Ok so here we need to go through and fill in any holes we may have 3818 * to make sure that holes are punched for those areas in case they had 3819 * extents previously. 3820 */ 3821 while (!done) { 3822 u64 offset, len; 3823 u64 extent_end; 3824 3825 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3826 ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); 3827 if (ret < 0) 3828 return ret; 3829 ASSERT(ret == 0); 3830 src = src_path->nodes[0]; 3831 i = 0; 3832 } 3833 3834 btrfs_item_key_to_cpu(src, &key, i); 3835 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3836 done = true; 3837 if (key.objectid != btrfs_ino(inode) || 3838 key.type != BTRFS_EXTENT_DATA_KEY) { 3839 i++; 3840 continue; 3841 } 3842 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3843 if (btrfs_file_extent_type(src, extent) == 3844 BTRFS_FILE_EXTENT_INLINE) { 3845 len = btrfs_file_extent_inline_len(src, i, extent); 3846 extent_end = ALIGN(key.offset + len, log->sectorsize); 3847 } else { 3848 len = btrfs_file_extent_num_bytes(src, extent); 3849 extent_end = key.offset + len; 3850 } 3851 i++; 3852 3853 if (*last_extent == key.offset) { 3854 *last_extent = extent_end; 3855 continue; 3856 } 3857 offset = *last_extent; 3858 len = key.offset - *last_extent; 3859 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3860 offset, 0, 0, len, 0, len, 0, 3861 0, 0); 3862 if (ret) 3863 break; 3864 *last_extent = extent_end; 3865 } 3866 /* 3867 * Need to let the callers know we dropped the path so they should 3868 * re-search. 3869 */ 3870 if (!ret && need_find_last_extent) 3871 ret = 1; 3872 return ret; 3873 } 3874 3875 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3876 { 3877 struct extent_map *em1, *em2; 3878 3879 em1 = list_entry(a, struct extent_map, list); 3880 em2 = list_entry(b, struct extent_map, list); 3881 3882 if (em1->start < em2->start) 3883 return -1; 3884 else if (em1->start > em2->start) 3885 return 1; 3886 return 0; 3887 } 3888 3889 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 3890 struct inode *inode, 3891 struct btrfs_root *root, 3892 const struct extent_map *em, 3893 const struct list_head *logged_list, 3894 bool *ordered_io_error) 3895 { 3896 struct btrfs_ordered_extent *ordered; 3897 struct btrfs_root *log = root->log_root; 3898 u64 mod_start = em->mod_start; 3899 u64 mod_len = em->mod_len; 3900 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3901 u64 csum_offset; 3902 u64 csum_len; 3903 LIST_HEAD(ordered_sums); 3904 int ret = 0; 3905 3906 *ordered_io_error = false; 3907 3908 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 3909 em->block_start == EXTENT_MAP_HOLE) 3910 return 0; 3911 3912 /* 3913 * Wait far any ordered extent that covers our extent map. If it 3914 * finishes without an error, first check and see if our csums are on 3915 * our outstanding ordered extents. 3916 */ 3917 list_for_each_entry(ordered, logged_list, log_list) { 3918 struct btrfs_ordered_sum *sum; 3919 3920 if (!mod_len) 3921 break; 3922 3923 if (ordered->file_offset + ordered->len <= mod_start || 3924 mod_start + mod_len <= ordered->file_offset) 3925 continue; 3926 3927 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 3928 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 3929 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 3930 const u64 start = ordered->file_offset; 3931 const u64 end = ordered->file_offset + ordered->len - 1; 3932 3933 WARN_ON(ordered->inode != inode); 3934 filemap_fdatawrite_range(inode->i_mapping, start, end); 3935 } 3936 3937 wait_event(ordered->wait, 3938 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 3939 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3940 3941 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3942 /* 3943 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 3944 * i_mapping flags, so that the next fsync won't get 3945 * an outdated io error too. 3946 */ 3947 btrfs_inode_check_errors(inode); 3948 *ordered_io_error = true; 3949 break; 3950 } 3951 /* 3952 * We are going to copy all the csums on this ordered extent, so 3953 * go ahead and adjust mod_start and mod_len in case this 3954 * ordered extent has already been logged. 3955 */ 3956 if (ordered->file_offset > mod_start) { 3957 if (ordered->file_offset + ordered->len >= 3958 mod_start + mod_len) 3959 mod_len = ordered->file_offset - mod_start; 3960 /* 3961 * If we have this case 3962 * 3963 * |--------- logged extent ---------| 3964 * |----- ordered extent ----| 3965 * 3966 * Just don't mess with mod_start and mod_len, we'll 3967 * just end up logging more csums than we need and it 3968 * will be ok. 3969 */ 3970 } else { 3971 if (ordered->file_offset + ordered->len < 3972 mod_start + mod_len) { 3973 mod_len = (mod_start + mod_len) - 3974 (ordered->file_offset + ordered->len); 3975 mod_start = ordered->file_offset + 3976 ordered->len; 3977 } else { 3978 mod_len = 0; 3979 } 3980 } 3981 3982 if (skip_csum) 3983 continue; 3984 3985 /* 3986 * To keep us from looping for the above case of an ordered 3987 * extent that falls inside of the logged extent. 3988 */ 3989 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3990 &ordered->flags)) 3991 continue; 3992 3993 list_for_each_entry(sum, &ordered->list, list) { 3994 ret = btrfs_csum_file_blocks(trans, log, sum); 3995 if (ret) 3996 break; 3997 } 3998 } 3999 4000 if (*ordered_io_error || !mod_len || ret || skip_csum) 4001 return ret; 4002 4003 if (em->compress_type) { 4004 csum_offset = 0; 4005 csum_len = max(em->block_len, em->orig_block_len); 4006 } else { 4007 csum_offset = mod_start - em->start; 4008 csum_len = mod_len; 4009 } 4010 4011 /* block start is already adjusted for the file extent offset. */ 4012 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 4013 em->block_start + csum_offset, 4014 em->block_start + csum_offset + 4015 csum_len - 1, &ordered_sums, 0); 4016 if (ret) 4017 return ret; 4018 4019 while (!list_empty(&ordered_sums)) { 4020 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4021 struct btrfs_ordered_sum, 4022 list); 4023 if (!ret) 4024 ret = btrfs_csum_file_blocks(trans, log, sums); 4025 list_del(&sums->list); 4026 kfree(sums); 4027 } 4028 4029 return ret; 4030 } 4031 4032 static int log_one_extent(struct btrfs_trans_handle *trans, 4033 struct inode *inode, struct btrfs_root *root, 4034 const struct extent_map *em, 4035 struct btrfs_path *path, 4036 const struct list_head *logged_list, 4037 struct btrfs_log_ctx *ctx) 4038 { 4039 struct btrfs_root *log = root->log_root; 4040 struct btrfs_file_extent_item *fi; 4041 struct extent_buffer *leaf; 4042 struct btrfs_map_token token; 4043 struct btrfs_key key; 4044 u64 extent_offset = em->start - em->orig_start; 4045 u64 block_len; 4046 int ret; 4047 int extent_inserted = 0; 4048 bool ordered_io_err = false; 4049 4050 ret = wait_ordered_extents(trans, inode, root, em, logged_list, 4051 &ordered_io_err); 4052 if (ret) 4053 return ret; 4054 4055 if (ordered_io_err) { 4056 ctx->io_err = -EIO; 4057 return 0; 4058 } 4059 4060 btrfs_init_map_token(&token); 4061 4062 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 4063 em->start + em->len, NULL, 0, 1, 4064 sizeof(*fi), &extent_inserted); 4065 if (ret) 4066 return ret; 4067 4068 if (!extent_inserted) { 4069 key.objectid = btrfs_ino(inode); 4070 key.type = BTRFS_EXTENT_DATA_KEY; 4071 key.offset = em->start; 4072 4073 ret = btrfs_insert_empty_item(trans, log, path, &key, 4074 sizeof(*fi)); 4075 if (ret) 4076 return ret; 4077 } 4078 leaf = path->nodes[0]; 4079 fi = btrfs_item_ptr(leaf, path->slots[0], 4080 struct btrfs_file_extent_item); 4081 4082 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4083 &token); 4084 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4085 btrfs_set_token_file_extent_type(leaf, fi, 4086 BTRFS_FILE_EXTENT_PREALLOC, 4087 &token); 4088 else 4089 btrfs_set_token_file_extent_type(leaf, fi, 4090 BTRFS_FILE_EXTENT_REG, 4091 &token); 4092 4093 block_len = max(em->block_len, em->orig_block_len); 4094 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4095 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4096 em->block_start, 4097 &token); 4098 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4099 &token); 4100 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4101 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4102 em->block_start - 4103 extent_offset, &token); 4104 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4105 &token); 4106 } else { 4107 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4108 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4109 &token); 4110 } 4111 4112 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4113 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4114 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4115 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4116 &token); 4117 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4118 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4119 btrfs_mark_buffer_dirty(leaf); 4120 4121 btrfs_release_path(path); 4122 4123 return ret; 4124 } 4125 4126 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4127 struct btrfs_root *root, 4128 struct inode *inode, 4129 struct btrfs_path *path, 4130 struct list_head *logged_list, 4131 struct btrfs_log_ctx *ctx, 4132 const u64 start, 4133 const u64 end) 4134 { 4135 struct extent_map *em, *n; 4136 struct list_head extents; 4137 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 4138 u64 test_gen; 4139 int ret = 0; 4140 int num = 0; 4141 4142 INIT_LIST_HEAD(&extents); 4143 4144 down_write(&BTRFS_I(inode)->dio_sem); 4145 write_lock(&tree->lock); 4146 test_gen = root->fs_info->last_trans_committed; 4147 4148 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4149 list_del_init(&em->list); 4150 4151 /* 4152 * Just an arbitrary number, this can be really CPU intensive 4153 * once we start getting a lot of extents, and really once we 4154 * have a bunch of extents we just want to commit since it will 4155 * be faster. 4156 */ 4157 if (++num > 32768) { 4158 list_del_init(&tree->modified_extents); 4159 ret = -EFBIG; 4160 goto process; 4161 } 4162 4163 if (em->generation <= test_gen) 4164 continue; 4165 /* Need a ref to keep it from getting evicted from cache */ 4166 atomic_inc(&em->refs); 4167 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4168 list_add_tail(&em->list, &extents); 4169 num++; 4170 } 4171 4172 list_sort(NULL, &extents, extent_cmp); 4173 btrfs_get_logged_extents(inode, logged_list, start, end); 4174 /* 4175 * Some ordered extents started by fsync might have completed 4176 * before we could collect them into the list logged_list, which 4177 * means they're gone, not in our logged_list nor in the inode's 4178 * ordered tree. We want the application/user space to know an 4179 * error happened while attempting to persist file data so that 4180 * it can take proper action. If such error happened, we leave 4181 * without writing to the log tree and the fsync must report the 4182 * file data write error and not commit the current transaction. 4183 */ 4184 ret = btrfs_inode_check_errors(inode); 4185 if (ret) 4186 ctx->io_err = ret; 4187 process: 4188 while (!list_empty(&extents)) { 4189 em = list_entry(extents.next, struct extent_map, list); 4190 4191 list_del_init(&em->list); 4192 4193 /* 4194 * If we had an error we just need to delete everybody from our 4195 * private list. 4196 */ 4197 if (ret) { 4198 clear_em_logging(tree, em); 4199 free_extent_map(em); 4200 continue; 4201 } 4202 4203 write_unlock(&tree->lock); 4204 4205 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4206 ctx); 4207 write_lock(&tree->lock); 4208 clear_em_logging(tree, em); 4209 free_extent_map(em); 4210 } 4211 WARN_ON(!list_empty(&extents)); 4212 write_unlock(&tree->lock); 4213 up_write(&BTRFS_I(inode)->dio_sem); 4214 4215 btrfs_release_path(path); 4216 return ret; 4217 } 4218 4219 static int logged_inode_size(struct btrfs_root *log, struct inode *inode, 4220 struct btrfs_path *path, u64 *size_ret) 4221 { 4222 struct btrfs_key key; 4223 int ret; 4224 4225 key.objectid = btrfs_ino(inode); 4226 key.type = BTRFS_INODE_ITEM_KEY; 4227 key.offset = 0; 4228 4229 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4230 if (ret < 0) { 4231 return ret; 4232 } else if (ret > 0) { 4233 *size_ret = 0; 4234 } else { 4235 struct btrfs_inode_item *item; 4236 4237 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4238 struct btrfs_inode_item); 4239 *size_ret = btrfs_inode_size(path->nodes[0], item); 4240 } 4241 4242 btrfs_release_path(path); 4243 return 0; 4244 } 4245 4246 /* 4247 * At the moment we always log all xattrs. This is to figure out at log replay 4248 * time which xattrs must have their deletion replayed. If a xattr is missing 4249 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4250 * because if a xattr is deleted, the inode is fsynced and a power failure 4251 * happens, causing the log to be replayed the next time the fs is mounted, 4252 * we want the xattr to not exist anymore (same behaviour as other filesystems 4253 * with a journal, ext3/4, xfs, f2fs, etc). 4254 */ 4255 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4256 struct btrfs_root *root, 4257 struct inode *inode, 4258 struct btrfs_path *path, 4259 struct btrfs_path *dst_path) 4260 { 4261 int ret; 4262 struct btrfs_key key; 4263 const u64 ino = btrfs_ino(inode); 4264 int ins_nr = 0; 4265 int start_slot = 0; 4266 4267 key.objectid = ino; 4268 key.type = BTRFS_XATTR_ITEM_KEY; 4269 key.offset = 0; 4270 4271 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4272 if (ret < 0) 4273 return ret; 4274 4275 while (true) { 4276 int slot = path->slots[0]; 4277 struct extent_buffer *leaf = path->nodes[0]; 4278 int nritems = btrfs_header_nritems(leaf); 4279 4280 if (slot >= nritems) { 4281 if (ins_nr > 0) { 4282 u64 last_extent = 0; 4283 4284 ret = copy_items(trans, inode, dst_path, path, 4285 &last_extent, start_slot, 4286 ins_nr, 1, 0); 4287 /* can't be 1, extent items aren't processed */ 4288 ASSERT(ret <= 0); 4289 if (ret < 0) 4290 return ret; 4291 ins_nr = 0; 4292 } 4293 ret = btrfs_next_leaf(root, path); 4294 if (ret < 0) 4295 return ret; 4296 else if (ret > 0) 4297 break; 4298 continue; 4299 } 4300 4301 btrfs_item_key_to_cpu(leaf, &key, slot); 4302 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4303 break; 4304 4305 if (ins_nr == 0) 4306 start_slot = slot; 4307 ins_nr++; 4308 path->slots[0]++; 4309 cond_resched(); 4310 } 4311 if (ins_nr > 0) { 4312 u64 last_extent = 0; 4313 4314 ret = copy_items(trans, inode, dst_path, path, 4315 &last_extent, start_slot, 4316 ins_nr, 1, 0); 4317 /* can't be 1, extent items aren't processed */ 4318 ASSERT(ret <= 0); 4319 if (ret < 0) 4320 return ret; 4321 } 4322 4323 return 0; 4324 } 4325 4326 /* 4327 * If the no holes feature is enabled we need to make sure any hole between the 4328 * last extent and the i_size of our inode is explicitly marked in the log. This 4329 * is to make sure that doing something like: 4330 * 4331 * 1) create file with 128Kb of data 4332 * 2) truncate file to 64Kb 4333 * 3) truncate file to 256Kb 4334 * 4) fsync file 4335 * 5) <crash/power failure> 4336 * 6) mount fs and trigger log replay 4337 * 4338 * Will give us a file with a size of 256Kb, the first 64Kb of data match what 4339 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4340 * file correspond to a hole. The presence of explicit holes in a log tree is 4341 * what guarantees that log replay will remove/adjust file extent items in the 4342 * fs/subvol tree. 4343 * 4344 * Here we do not need to care about holes between extents, that is already done 4345 * by copy_items(). We also only need to do this in the full sync path, where we 4346 * lookup for extents from the fs/subvol tree only. In the fast path case, we 4347 * lookup the list of modified extent maps and if any represents a hole, we 4348 * insert a corresponding extent representing a hole in the log tree. 4349 */ 4350 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4351 struct btrfs_root *root, 4352 struct inode *inode, 4353 struct btrfs_path *path) 4354 { 4355 int ret; 4356 struct btrfs_key key; 4357 u64 hole_start; 4358 u64 hole_size; 4359 struct extent_buffer *leaf; 4360 struct btrfs_root *log = root->log_root; 4361 const u64 ino = btrfs_ino(inode); 4362 const u64 i_size = i_size_read(inode); 4363 4364 if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) 4365 return 0; 4366 4367 key.objectid = ino; 4368 key.type = BTRFS_EXTENT_DATA_KEY; 4369 key.offset = (u64)-1; 4370 4371 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4372 ASSERT(ret != 0); 4373 if (ret < 0) 4374 return ret; 4375 4376 ASSERT(path->slots[0] > 0); 4377 path->slots[0]--; 4378 leaf = path->nodes[0]; 4379 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4380 4381 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4382 /* inode does not have any extents */ 4383 hole_start = 0; 4384 hole_size = i_size; 4385 } else { 4386 struct btrfs_file_extent_item *extent; 4387 u64 len; 4388 4389 /* 4390 * If there's an extent beyond i_size, an explicit hole was 4391 * already inserted by copy_items(). 4392 */ 4393 if (key.offset >= i_size) 4394 return 0; 4395 4396 extent = btrfs_item_ptr(leaf, path->slots[0], 4397 struct btrfs_file_extent_item); 4398 4399 if (btrfs_file_extent_type(leaf, extent) == 4400 BTRFS_FILE_EXTENT_INLINE) { 4401 len = btrfs_file_extent_inline_len(leaf, 4402 path->slots[0], 4403 extent); 4404 ASSERT(len == i_size); 4405 return 0; 4406 } 4407 4408 len = btrfs_file_extent_num_bytes(leaf, extent); 4409 /* Last extent goes beyond i_size, no need to log a hole. */ 4410 if (key.offset + len > i_size) 4411 return 0; 4412 hole_start = key.offset + len; 4413 hole_size = i_size - hole_start; 4414 } 4415 btrfs_release_path(path); 4416 4417 /* Last extent ends at i_size. */ 4418 if (hole_size == 0) 4419 return 0; 4420 4421 hole_size = ALIGN(hole_size, root->sectorsize); 4422 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4423 hole_size, 0, hole_size, 0, 0, 0); 4424 return ret; 4425 } 4426 4427 /* 4428 * When we are logging a new inode X, check if it doesn't have a reference that 4429 * matches the reference from some other inode Y created in a past transaction 4430 * and that was renamed in the current transaction. If we don't do this, then at 4431 * log replay time we can lose inode Y (and all its files if it's a directory): 4432 * 4433 * mkdir /mnt/x 4434 * echo "hello world" > /mnt/x/foobar 4435 * sync 4436 * mv /mnt/x /mnt/y 4437 * mkdir /mnt/x # or touch /mnt/x 4438 * xfs_io -c fsync /mnt/x 4439 * <power fail> 4440 * mount fs, trigger log replay 4441 * 4442 * After the log replay procedure, we would lose the first directory and all its 4443 * files (file foobar). 4444 * For the case where inode Y is not a directory we simply end up losing it: 4445 * 4446 * echo "123" > /mnt/foo 4447 * sync 4448 * mv /mnt/foo /mnt/bar 4449 * echo "abc" > /mnt/foo 4450 * xfs_io -c fsync /mnt/foo 4451 * <power fail> 4452 * 4453 * We also need this for cases where a snapshot entry is replaced by some other 4454 * entry (file or directory) otherwise we end up with an unreplayable log due to 4455 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4456 * if it were a regular entry: 4457 * 4458 * mkdir /mnt/x 4459 * btrfs subvolume snapshot /mnt /mnt/x/snap 4460 * btrfs subvolume delete /mnt/x/snap 4461 * rmdir /mnt/x 4462 * mkdir /mnt/x 4463 * fsync /mnt/x or fsync some new file inside it 4464 * <power fail> 4465 * 4466 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4467 * the same transaction. 4468 */ 4469 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4470 const int slot, 4471 const struct btrfs_key *key, 4472 struct inode *inode) 4473 { 4474 int ret; 4475 struct btrfs_path *search_path; 4476 char *name = NULL; 4477 u32 name_len = 0; 4478 u32 item_size = btrfs_item_size_nr(eb, slot); 4479 u32 cur_offset = 0; 4480 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4481 4482 search_path = btrfs_alloc_path(); 4483 if (!search_path) 4484 return -ENOMEM; 4485 search_path->search_commit_root = 1; 4486 search_path->skip_locking = 1; 4487 4488 while (cur_offset < item_size) { 4489 u64 parent; 4490 u32 this_name_len; 4491 u32 this_len; 4492 unsigned long name_ptr; 4493 struct btrfs_dir_item *di; 4494 4495 if (key->type == BTRFS_INODE_REF_KEY) { 4496 struct btrfs_inode_ref *iref; 4497 4498 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4499 parent = key->offset; 4500 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4501 name_ptr = (unsigned long)(iref + 1); 4502 this_len = sizeof(*iref) + this_name_len; 4503 } else { 4504 struct btrfs_inode_extref *extref; 4505 4506 extref = (struct btrfs_inode_extref *)(ptr + 4507 cur_offset); 4508 parent = btrfs_inode_extref_parent(eb, extref); 4509 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4510 name_ptr = (unsigned long)&extref->name; 4511 this_len = sizeof(*extref) + this_name_len; 4512 } 4513 4514 if (this_name_len > name_len) { 4515 char *new_name; 4516 4517 new_name = krealloc(name, this_name_len, GFP_NOFS); 4518 if (!new_name) { 4519 ret = -ENOMEM; 4520 goto out; 4521 } 4522 name_len = this_name_len; 4523 name = new_name; 4524 } 4525 4526 read_extent_buffer(eb, name, name_ptr, this_name_len); 4527 di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root, 4528 search_path, parent, 4529 name, this_name_len, 0); 4530 if (di && !IS_ERR(di)) { 4531 ret = 1; 4532 goto out; 4533 } else if (IS_ERR(di)) { 4534 ret = PTR_ERR(di); 4535 goto out; 4536 } 4537 btrfs_release_path(search_path); 4538 4539 cur_offset += this_len; 4540 } 4541 ret = 0; 4542 out: 4543 btrfs_free_path(search_path); 4544 kfree(name); 4545 return ret; 4546 } 4547 4548 /* log a single inode in the tree log. 4549 * At least one parent directory for this inode must exist in the tree 4550 * or be logged already. 4551 * 4552 * Any items from this inode changed by the current transaction are copied 4553 * to the log tree. An extra reference is taken on any extents in this 4554 * file, allowing us to avoid a whole pile of corner cases around logging 4555 * blocks that have been removed from the tree. 4556 * 4557 * See LOG_INODE_ALL and related defines for a description of what inode_only 4558 * does. 4559 * 4560 * This handles both files and directories. 4561 */ 4562 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4563 struct btrfs_root *root, struct inode *inode, 4564 int inode_only, 4565 const loff_t start, 4566 const loff_t end, 4567 struct btrfs_log_ctx *ctx) 4568 { 4569 struct btrfs_path *path; 4570 struct btrfs_path *dst_path; 4571 struct btrfs_key min_key; 4572 struct btrfs_key max_key; 4573 struct btrfs_root *log = root->log_root; 4574 struct extent_buffer *src = NULL; 4575 LIST_HEAD(logged_list); 4576 u64 last_extent = 0; 4577 int err = 0; 4578 int ret; 4579 int nritems; 4580 int ins_start_slot = 0; 4581 int ins_nr; 4582 bool fast_search = false; 4583 u64 ino = btrfs_ino(inode); 4584 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4585 u64 logged_isize = 0; 4586 bool need_log_inode_item = true; 4587 4588 path = btrfs_alloc_path(); 4589 if (!path) 4590 return -ENOMEM; 4591 dst_path = btrfs_alloc_path(); 4592 if (!dst_path) { 4593 btrfs_free_path(path); 4594 return -ENOMEM; 4595 } 4596 4597 min_key.objectid = ino; 4598 min_key.type = BTRFS_INODE_ITEM_KEY; 4599 min_key.offset = 0; 4600 4601 max_key.objectid = ino; 4602 4603 4604 /* today the code can only do partial logging of directories */ 4605 if (S_ISDIR(inode->i_mode) || 4606 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4607 &BTRFS_I(inode)->runtime_flags) && 4608 inode_only == LOG_INODE_EXISTS)) 4609 max_key.type = BTRFS_XATTR_ITEM_KEY; 4610 else 4611 max_key.type = (u8)-1; 4612 max_key.offset = (u64)-1; 4613 4614 /* 4615 * Only run delayed items if we are a dir or a new file. 4616 * Otherwise commit the delayed inode only, which is needed in 4617 * order for the log replay code to mark inodes for link count 4618 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4619 */ 4620 if (S_ISDIR(inode->i_mode) || 4621 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) 4622 ret = btrfs_commit_inode_delayed_items(trans, inode); 4623 else 4624 ret = btrfs_commit_inode_delayed_inode(inode); 4625 4626 if (ret) { 4627 btrfs_free_path(path); 4628 btrfs_free_path(dst_path); 4629 return ret; 4630 } 4631 4632 mutex_lock(&BTRFS_I(inode)->log_mutex); 4633 4634 /* 4635 * a brute force approach to making sure we get the most uptodate 4636 * copies of everything. 4637 */ 4638 if (S_ISDIR(inode->i_mode)) { 4639 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4640 4641 if (inode_only == LOG_INODE_EXISTS) 4642 max_key_type = BTRFS_XATTR_ITEM_KEY; 4643 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4644 } else { 4645 if (inode_only == LOG_INODE_EXISTS) { 4646 /* 4647 * Make sure the new inode item we write to the log has 4648 * the same isize as the current one (if it exists). 4649 * This is necessary to prevent data loss after log 4650 * replay, and also to prevent doing a wrong expanding 4651 * truncate - for e.g. create file, write 4K into offset 4652 * 0, fsync, write 4K into offset 4096, add hard link, 4653 * fsync some other file (to sync log), power fail - if 4654 * we use the inode's current i_size, after log replay 4655 * we get a 8Kb file, with the last 4Kb extent as a hole 4656 * (zeroes), as if an expanding truncate happened, 4657 * instead of getting a file of 4Kb only. 4658 */ 4659 err = logged_inode_size(log, inode, path, 4660 &logged_isize); 4661 if (err) 4662 goto out_unlock; 4663 } 4664 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4665 &BTRFS_I(inode)->runtime_flags)) { 4666 if (inode_only == LOG_INODE_EXISTS) { 4667 max_key.type = BTRFS_XATTR_ITEM_KEY; 4668 ret = drop_objectid_items(trans, log, path, ino, 4669 max_key.type); 4670 } else { 4671 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4672 &BTRFS_I(inode)->runtime_flags); 4673 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4674 &BTRFS_I(inode)->runtime_flags); 4675 while(1) { 4676 ret = btrfs_truncate_inode_items(trans, 4677 log, inode, 0, 0); 4678 if (ret != -EAGAIN) 4679 break; 4680 } 4681 } 4682 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4683 &BTRFS_I(inode)->runtime_flags) || 4684 inode_only == LOG_INODE_EXISTS) { 4685 if (inode_only == LOG_INODE_ALL) 4686 fast_search = true; 4687 max_key.type = BTRFS_XATTR_ITEM_KEY; 4688 ret = drop_objectid_items(trans, log, path, ino, 4689 max_key.type); 4690 } else { 4691 if (inode_only == LOG_INODE_ALL) 4692 fast_search = true; 4693 goto log_extents; 4694 } 4695 4696 } 4697 if (ret) { 4698 err = ret; 4699 goto out_unlock; 4700 } 4701 4702 while (1) { 4703 ins_nr = 0; 4704 ret = btrfs_search_forward(root, &min_key, 4705 path, trans->transid); 4706 if (ret != 0) 4707 break; 4708 again: 4709 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4710 if (min_key.objectid != ino) 4711 break; 4712 if (min_key.type > max_key.type) 4713 break; 4714 4715 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4716 need_log_inode_item = false; 4717 4718 if ((min_key.type == BTRFS_INODE_REF_KEY || 4719 min_key.type == BTRFS_INODE_EXTREF_KEY) && 4720 BTRFS_I(inode)->generation == trans->transid) { 4721 ret = btrfs_check_ref_name_override(path->nodes[0], 4722 path->slots[0], 4723 &min_key, inode); 4724 if (ret < 0) { 4725 err = ret; 4726 goto out_unlock; 4727 } else if (ret > 0) { 4728 err = 1; 4729 btrfs_set_log_full_commit(root->fs_info, trans); 4730 goto out_unlock; 4731 } 4732 } 4733 4734 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4735 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4736 if (ins_nr == 0) 4737 goto next_slot; 4738 ret = copy_items(trans, inode, dst_path, path, 4739 &last_extent, ins_start_slot, 4740 ins_nr, inode_only, logged_isize); 4741 if (ret < 0) { 4742 err = ret; 4743 goto out_unlock; 4744 } 4745 ins_nr = 0; 4746 if (ret) { 4747 btrfs_release_path(path); 4748 continue; 4749 } 4750 goto next_slot; 4751 } 4752 4753 src = path->nodes[0]; 4754 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4755 ins_nr++; 4756 goto next_slot; 4757 } else if (!ins_nr) { 4758 ins_start_slot = path->slots[0]; 4759 ins_nr = 1; 4760 goto next_slot; 4761 } 4762 4763 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4764 ins_start_slot, ins_nr, inode_only, 4765 logged_isize); 4766 if (ret < 0) { 4767 err = ret; 4768 goto out_unlock; 4769 } 4770 if (ret) { 4771 ins_nr = 0; 4772 btrfs_release_path(path); 4773 continue; 4774 } 4775 ins_nr = 1; 4776 ins_start_slot = path->slots[0]; 4777 next_slot: 4778 4779 nritems = btrfs_header_nritems(path->nodes[0]); 4780 path->slots[0]++; 4781 if (path->slots[0] < nritems) { 4782 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 4783 path->slots[0]); 4784 goto again; 4785 } 4786 if (ins_nr) { 4787 ret = copy_items(trans, inode, dst_path, path, 4788 &last_extent, ins_start_slot, 4789 ins_nr, inode_only, logged_isize); 4790 if (ret < 0) { 4791 err = ret; 4792 goto out_unlock; 4793 } 4794 ret = 0; 4795 ins_nr = 0; 4796 } 4797 btrfs_release_path(path); 4798 4799 if (min_key.offset < (u64)-1) { 4800 min_key.offset++; 4801 } else if (min_key.type < max_key.type) { 4802 min_key.type++; 4803 min_key.offset = 0; 4804 } else { 4805 break; 4806 } 4807 } 4808 if (ins_nr) { 4809 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4810 ins_start_slot, ins_nr, inode_only, 4811 logged_isize); 4812 if (ret < 0) { 4813 err = ret; 4814 goto out_unlock; 4815 } 4816 ret = 0; 4817 ins_nr = 0; 4818 } 4819 4820 btrfs_release_path(path); 4821 btrfs_release_path(dst_path); 4822 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 4823 if (err) 4824 goto out_unlock; 4825 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 4826 btrfs_release_path(path); 4827 btrfs_release_path(dst_path); 4828 err = btrfs_log_trailing_hole(trans, root, inode, path); 4829 if (err) 4830 goto out_unlock; 4831 } 4832 log_extents: 4833 btrfs_release_path(path); 4834 btrfs_release_path(dst_path); 4835 if (need_log_inode_item) { 4836 err = log_inode_item(trans, log, dst_path, inode); 4837 if (err) 4838 goto out_unlock; 4839 } 4840 if (fast_search) { 4841 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4842 &logged_list, ctx, start, end); 4843 if (ret) { 4844 err = ret; 4845 goto out_unlock; 4846 } 4847 } else if (inode_only == LOG_INODE_ALL) { 4848 struct extent_map *em, *n; 4849 4850 write_lock(&em_tree->lock); 4851 /* 4852 * We can't just remove every em if we're called for a ranged 4853 * fsync - that is, one that doesn't cover the whole possible 4854 * file range (0 to LLONG_MAX). This is because we can have 4855 * em's that fall outside the range we're logging and therefore 4856 * their ordered operations haven't completed yet 4857 * (btrfs_finish_ordered_io() not invoked yet). This means we 4858 * didn't get their respective file extent item in the fs/subvol 4859 * tree yet, and need to let the next fast fsync (one which 4860 * consults the list of modified extent maps) find the em so 4861 * that it logs a matching file extent item and waits for the 4862 * respective ordered operation to complete (if it's still 4863 * running). 4864 * 4865 * Removing every em outside the range we're logging would make 4866 * the next fast fsync not log their matching file extent items, 4867 * therefore making us lose data after a log replay. 4868 */ 4869 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 4870 list) { 4871 const u64 mod_end = em->mod_start + em->mod_len - 1; 4872 4873 if (em->mod_start >= start && mod_end <= end) 4874 list_del_init(&em->list); 4875 } 4876 write_unlock(&em_tree->lock); 4877 } 4878 4879 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 4880 ret = log_directory_changes(trans, root, inode, path, dst_path, 4881 ctx); 4882 if (ret) { 4883 err = ret; 4884 goto out_unlock; 4885 } 4886 } 4887 4888 spin_lock(&BTRFS_I(inode)->lock); 4889 BTRFS_I(inode)->logged_trans = trans->transid; 4890 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4891 spin_unlock(&BTRFS_I(inode)->lock); 4892 out_unlock: 4893 if (unlikely(err)) 4894 btrfs_put_logged_extents(&logged_list); 4895 else 4896 btrfs_submit_logged_extents(&logged_list, log); 4897 mutex_unlock(&BTRFS_I(inode)->log_mutex); 4898 4899 btrfs_free_path(path); 4900 btrfs_free_path(dst_path); 4901 return err; 4902 } 4903 4904 /* 4905 * Check if we must fallback to a transaction commit when logging an inode. 4906 * This must be called after logging the inode and is used only in the context 4907 * when fsyncing an inode requires the need to log some other inode - in which 4908 * case we can't lock the i_mutex of each other inode we need to log as that 4909 * can lead to deadlocks with concurrent fsync against other inodes (as we can 4910 * log inodes up or down in the hierarchy) or rename operations for example. So 4911 * we take the log_mutex of the inode after we have logged it and then check for 4912 * its last_unlink_trans value - this is safe because any task setting 4913 * last_unlink_trans must take the log_mutex and it must do this before it does 4914 * the actual unlink operation, so if we do this check before a concurrent task 4915 * sets last_unlink_trans it means we've logged a consistent version/state of 4916 * all the inode items, otherwise we are not sure and must do a transaction 4917 * commit (the concurrent task might have only updated last_unlink_trans before 4918 * we logged the inode or it might have also done the unlink). 4919 */ 4920 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 4921 struct inode *inode) 4922 { 4923 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 4924 bool ret = false; 4925 4926 mutex_lock(&BTRFS_I(inode)->log_mutex); 4927 if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) { 4928 /* 4929 * Make sure any commits to the log are forced to be full 4930 * commits. 4931 */ 4932 btrfs_set_log_full_commit(fs_info, trans); 4933 ret = true; 4934 } 4935 mutex_unlock(&BTRFS_I(inode)->log_mutex); 4936 4937 return ret; 4938 } 4939 4940 /* 4941 * follow the dentry parent pointers up the chain and see if any 4942 * of the directories in it require a full commit before they can 4943 * be logged. Returns zero if nothing special needs to be done or 1 if 4944 * a full commit is required. 4945 */ 4946 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 4947 struct inode *inode, 4948 struct dentry *parent, 4949 struct super_block *sb, 4950 u64 last_committed) 4951 { 4952 int ret = 0; 4953 struct dentry *old_parent = NULL; 4954 struct inode *orig_inode = inode; 4955 4956 /* 4957 * for regular files, if its inode is already on disk, we don't 4958 * have to worry about the parents at all. This is because 4959 * we can use the last_unlink_trans field to record renames 4960 * and other fun in this file. 4961 */ 4962 if (S_ISREG(inode->i_mode) && 4963 BTRFS_I(inode)->generation <= last_committed && 4964 BTRFS_I(inode)->last_unlink_trans <= last_committed) 4965 goto out; 4966 4967 if (!S_ISDIR(inode->i_mode)) { 4968 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 4969 goto out; 4970 inode = d_inode(parent); 4971 } 4972 4973 while (1) { 4974 /* 4975 * If we are logging a directory then we start with our inode, 4976 * not our parent's inode, so we need to skip setting the 4977 * logged_trans so that further down in the log code we don't 4978 * think this inode has already been logged. 4979 */ 4980 if (inode != orig_inode) 4981 BTRFS_I(inode)->logged_trans = trans->transid; 4982 smp_mb(); 4983 4984 if (btrfs_must_commit_transaction(trans, inode)) { 4985 ret = 1; 4986 break; 4987 } 4988 4989 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 4990 break; 4991 4992 if (IS_ROOT(parent)) 4993 break; 4994 4995 parent = dget_parent(parent); 4996 dput(old_parent); 4997 old_parent = parent; 4998 inode = d_inode(parent); 4999 5000 } 5001 dput(old_parent); 5002 out: 5003 return ret; 5004 } 5005 5006 struct btrfs_dir_list { 5007 u64 ino; 5008 struct list_head list; 5009 }; 5010 5011 /* 5012 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5013 * details about the why it is needed. 5014 * This is a recursive operation - if an existing dentry corresponds to a 5015 * directory, that directory's new entries are logged too (same behaviour as 5016 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5017 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5018 * complains about the following circular lock dependency / possible deadlock: 5019 * 5020 * CPU0 CPU1 5021 * ---- ---- 5022 * lock(&type->i_mutex_dir_key#3/2); 5023 * lock(sb_internal#2); 5024 * lock(&type->i_mutex_dir_key#3/2); 5025 * lock(&sb->s_type->i_mutex_key#14); 5026 * 5027 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5028 * sb_start_intwrite() in btrfs_start_transaction(). 5029 * Not locking i_mutex of the inodes is still safe because: 5030 * 5031 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5032 * that while logging the inode new references (names) are added or removed 5033 * from the inode, leaving the logged inode item with a link count that does 5034 * not match the number of logged inode reference items. This is fine because 5035 * at log replay time we compute the real number of links and correct the 5036 * link count in the inode item (see replay_one_buffer() and 5037 * link_to_fixup_dir()); 5038 * 5039 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5040 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5041 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5042 * has a size that doesn't match the sum of the lengths of all the logged 5043 * names. This does not result in a problem because if a dir_item key is 5044 * logged but its matching dir_index key is not logged, at log replay time we 5045 * don't use it to replay the respective name (see replay_one_name()). On the 5046 * other hand if only the dir_index key ends up being logged, the respective 5047 * name is added to the fs/subvol tree with both the dir_item and dir_index 5048 * keys created (see replay_one_name()). 5049 * The directory's inode item with a wrong i_size is not a problem as well, 5050 * since we don't use it at log replay time to set the i_size in the inode 5051 * item of the fs/subvol tree (see overwrite_item()). 5052 */ 5053 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5054 struct btrfs_root *root, 5055 struct inode *start_inode, 5056 struct btrfs_log_ctx *ctx) 5057 { 5058 struct btrfs_root *log = root->log_root; 5059 struct btrfs_path *path; 5060 LIST_HEAD(dir_list); 5061 struct btrfs_dir_list *dir_elem; 5062 int ret = 0; 5063 5064 path = btrfs_alloc_path(); 5065 if (!path) 5066 return -ENOMEM; 5067 5068 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5069 if (!dir_elem) { 5070 btrfs_free_path(path); 5071 return -ENOMEM; 5072 } 5073 dir_elem->ino = btrfs_ino(start_inode); 5074 list_add_tail(&dir_elem->list, &dir_list); 5075 5076 while (!list_empty(&dir_list)) { 5077 struct extent_buffer *leaf; 5078 struct btrfs_key min_key; 5079 int nritems; 5080 int i; 5081 5082 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5083 list); 5084 if (ret) 5085 goto next_dir_inode; 5086 5087 min_key.objectid = dir_elem->ino; 5088 min_key.type = BTRFS_DIR_ITEM_KEY; 5089 min_key.offset = 0; 5090 again: 5091 btrfs_release_path(path); 5092 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5093 if (ret < 0) { 5094 goto next_dir_inode; 5095 } else if (ret > 0) { 5096 ret = 0; 5097 goto next_dir_inode; 5098 } 5099 5100 process_leaf: 5101 leaf = path->nodes[0]; 5102 nritems = btrfs_header_nritems(leaf); 5103 for (i = path->slots[0]; i < nritems; i++) { 5104 struct btrfs_dir_item *di; 5105 struct btrfs_key di_key; 5106 struct inode *di_inode; 5107 struct btrfs_dir_list *new_dir_elem; 5108 int log_mode = LOG_INODE_EXISTS; 5109 int type; 5110 5111 btrfs_item_key_to_cpu(leaf, &min_key, i); 5112 if (min_key.objectid != dir_elem->ino || 5113 min_key.type != BTRFS_DIR_ITEM_KEY) 5114 goto next_dir_inode; 5115 5116 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5117 type = btrfs_dir_type(leaf, di); 5118 if (btrfs_dir_transid(leaf, di) < trans->transid && 5119 type != BTRFS_FT_DIR) 5120 continue; 5121 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5122 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5123 continue; 5124 5125 di_inode = btrfs_iget(root->fs_info->sb, &di_key, 5126 root, NULL); 5127 if (IS_ERR(di_inode)) { 5128 ret = PTR_ERR(di_inode); 5129 goto next_dir_inode; 5130 } 5131 5132 if (btrfs_inode_in_log(di_inode, trans->transid)) { 5133 iput(di_inode); 5134 continue; 5135 } 5136 5137 ctx->log_new_dentries = false; 5138 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5139 log_mode = LOG_INODE_ALL; 5140 btrfs_release_path(path); 5141 ret = btrfs_log_inode(trans, root, di_inode, 5142 log_mode, 0, LLONG_MAX, ctx); 5143 if (!ret && 5144 btrfs_must_commit_transaction(trans, di_inode)) 5145 ret = 1; 5146 iput(di_inode); 5147 if (ret) 5148 goto next_dir_inode; 5149 if (ctx->log_new_dentries) { 5150 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5151 GFP_NOFS); 5152 if (!new_dir_elem) { 5153 ret = -ENOMEM; 5154 goto next_dir_inode; 5155 } 5156 new_dir_elem->ino = di_key.objectid; 5157 list_add_tail(&new_dir_elem->list, &dir_list); 5158 } 5159 break; 5160 } 5161 if (i == nritems) { 5162 ret = btrfs_next_leaf(log, path); 5163 if (ret < 0) { 5164 goto next_dir_inode; 5165 } else if (ret > 0) { 5166 ret = 0; 5167 goto next_dir_inode; 5168 } 5169 goto process_leaf; 5170 } 5171 if (min_key.offset < (u64)-1) { 5172 min_key.offset++; 5173 goto again; 5174 } 5175 next_dir_inode: 5176 list_del(&dir_elem->list); 5177 kfree(dir_elem); 5178 } 5179 5180 btrfs_free_path(path); 5181 return ret; 5182 } 5183 5184 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5185 struct inode *inode, 5186 struct btrfs_log_ctx *ctx) 5187 { 5188 int ret; 5189 struct btrfs_path *path; 5190 struct btrfs_key key; 5191 struct btrfs_root *root = BTRFS_I(inode)->root; 5192 const u64 ino = btrfs_ino(inode); 5193 5194 path = btrfs_alloc_path(); 5195 if (!path) 5196 return -ENOMEM; 5197 path->skip_locking = 1; 5198 path->search_commit_root = 1; 5199 5200 key.objectid = ino; 5201 key.type = BTRFS_INODE_REF_KEY; 5202 key.offset = 0; 5203 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5204 if (ret < 0) 5205 goto out; 5206 5207 while (true) { 5208 struct extent_buffer *leaf = path->nodes[0]; 5209 int slot = path->slots[0]; 5210 u32 cur_offset = 0; 5211 u32 item_size; 5212 unsigned long ptr; 5213 5214 if (slot >= btrfs_header_nritems(leaf)) { 5215 ret = btrfs_next_leaf(root, path); 5216 if (ret < 0) 5217 goto out; 5218 else if (ret > 0) 5219 break; 5220 continue; 5221 } 5222 5223 btrfs_item_key_to_cpu(leaf, &key, slot); 5224 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5225 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5226 break; 5227 5228 item_size = btrfs_item_size_nr(leaf, slot); 5229 ptr = btrfs_item_ptr_offset(leaf, slot); 5230 while (cur_offset < item_size) { 5231 struct btrfs_key inode_key; 5232 struct inode *dir_inode; 5233 5234 inode_key.type = BTRFS_INODE_ITEM_KEY; 5235 inode_key.offset = 0; 5236 5237 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5238 struct btrfs_inode_extref *extref; 5239 5240 extref = (struct btrfs_inode_extref *) 5241 (ptr + cur_offset); 5242 inode_key.objectid = btrfs_inode_extref_parent( 5243 leaf, extref); 5244 cur_offset += sizeof(*extref); 5245 cur_offset += btrfs_inode_extref_name_len(leaf, 5246 extref); 5247 } else { 5248 inode_key.objectid = key.offset; 5249 cur_offset = item_size; 5250 } 5251 5252 dir_inode = btrfs_iget(root->fs_info->sb, &inode_key, 5253 root, NULL); 5254 /* If parent inode was deleted, skip it. */ 5255 if (IS_ERR(dir_inode)) 5256 continue; 5257 5258 if (ctx) 5259 ctx->log_new_dentries = false; 5260 ret = btrfs_log_inode(trans, root, dir_inode, 5261 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5262 if (!ret && 5263 btrfs_must_commit_transaction(trans, dir_inode)) 5264 ret = 1; 5265 if (!ret && ctx && ctx->log_new_dentries) 5266 ret = log_new_dir_dentries(trans, root, 5267 dir_inode, ctx); 5268 iput(dir_inode); 5269 if (ret) 5270 goto out; 5271 } 5272 path->slots[0]++; 5273 } 5274 ret = 0; 5275 out: 5276 btrfs_free_path(path); 5277 return ret; 5278 } 5279 5280 /* 5281 * helper function around btrfs_log_inode to make sure newly created 5282 * parent directories also end up in the log. A minimal inode and backref 5283 * only logging is done of any parent directories that are older than 5284 * the last committed transaction 5285 */ 5286 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5287 struct btrfs_root *root, struct inode *inode, 5288 struct dentry *parent, 5289 const loff_t start, 5290 const loff_t end, 5291 int exists_only, 5292 struct btrfs_log_ctx *ctx) 5293 { 5294 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 5295 struct super_block *sb; 5296 struct dentry *old_parent = NULL; 5297 int ret = 0; 5298 u64 last_committed = root->fs_info->last_trans_committed; 5299 bool log_dentries = false; 5300 struct inode *orig_inode = inode; 5301 5302 sb = inode->i_sb; 5303 5304 if (btrfs_test_opt(root, NOTREELOG)) { 5305 ret = 1; 5306 goto end_no_trans; 5307 } 5308 5309 /* 5310 * The prev transaction commit doesn't complete, we need do 5311 * full commit by ourselves. 5312 */ 5313 if (root->fs_info->last_trans_log_full_commit > 5314 root->fs_info->last_trans_committed) { 5315 ret = 1; 5316 goto end_no_trans; 5317 } 5318 5319 if (root != BTRFS_I(inode)->root || 5320 btrfs_root_refs(&root->root_item) == 0) { 5321 ret = 1; 5322 goto end_no_trans; 5323 } 5324 5325 ret = check_parent_dirs_for_sync(trans, inode, parent, 5326 sb, last_committed); 5327 if (ret) 5328 goto end_no_trans; 5329 5330 if (btrfs_inode_in_log(inode, trans->transid)) { 5331 ret = BTRFS_NO_LOG_SYNC; 5332 goto end_no_trans; 5333 } 5334 5335 ret = start_log_trans(trans, root, ctx); 5336 if (ret) 5337 goto end_no_trans; 5338 5339 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5340 if (ret) 5341 goto end_trans; 5342 5343 /* 5344 * for regular files, if its inode is already on disk, we don't 5345 * have to worry about the parents at all. This is because 5346 * we can use the last_unlink_trans field to record renames 5347 * and other fun in this file. 5348 */ 5349 if (S_ISREG(inode->i_mode) && 5350 BTRFS_I(inode)->generation <= last_committed && 5351 BTRFS_I(inode)->last_unlink_trans <= last_committed) { 5352 ret = 0; 5353 goto end_trans; 5354 } 5355 5356 if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) 5357 log_dentries = true; 5358 5359 /* 5360 * On unlink we must make sure all our current and old parent directory 5361 * inodes are fully logged. This is to prevent leaving dangling 5362 * directory index entries in directories that were our parents but are 5363 * not anymore. Not doing this results in old parent directory being 5364 * impossible to delete after log replay (rmdir will always fail with 5365 * error -ENOTEMPTY). 5366 * 5367 * Example 1: 5368 * 5369 * mkdir testdir 5370 * touch testdir/foo 5371 * ln testdir/foo testdir/bar 5372 * sync 5373 * unlink testdir/bar 5374 * xfs_io -c fsync testdir/foo 5375 * <power failure> 5376 * mount fs, triggers log replay 5377 * 5378 * If we don't log the parent directory (testdir), after log replay the 5379 * directory still has an entry pointing to the file inode using the bar 5380 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5381 * the file inode has a link count of 1. 5382 * 5383 * Example 2: 5384 * 5385 * mkdir testdir 5386 * touch foo 5387 * ln foo testdir/foo2 5388 * ln foo testdir/foo3 5389 * sync 5390 * unlink testdir/foo3 5391 * xfs_io -c fsync foo 5392 * <power failure> 5393 * mount fs, triggers log replay 5394 * 5395 * Similar as the first example, after log replay the parent directory 5396 * testdir still has an entry pointing to the inode file with name foo3 5397 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5398 * and has a link count of 2. 5399 */ 5400 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 5401 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5402 if (ret) 5403 goto end_trans; 5404 } 5405 5406 while (1) { 5407 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5408 break; 5409 5410 inode = d_inode(parent); 5411 if (root != BTRFS_I(inode)->root) 5412 break; 5413 5414 if (BTRFS_I(inode)->generation > last_committed) { 5415 ret = btrfs_log_inode(trans, root, inode, 5416 LOG_INODE_EXISTS, 5417 0, LLONG_MAX, ctx); 5418 if (ret) 5419 goto end_trans; 5420 } 5421 if (IS_ROOT(parent)) 5422 break; 5423 5424 parent = dget_parent(parent); 5425 dput(old_parent); 5426 old_parent = parent; 5427 } 5428 if (log_dentries) 5429 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5430 else 5431 ret = 0; 5432 end_trans: 5433 dput(old_parent); 5434 if (ret < 0) { 5435 btrfs_set_log_full_commit(root->fs_info, trans); 5436 ret = 1; 5437 } 5438 5439 if (ret) 5440 btrfs_remove_log_ctx(root, ctx); 5441 btrfs_end_log_trans(root); 5442 end_no_trans: 5443 return ret; 5444 } 5445 5446 /* 5447 * it is not safe to log dentry if the chunk root has added new 5448 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5449 * If this returns 1, you must commit the transaction to safely get your 5450 * data on disk. 5451 */ 5452 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5453 struct btrfs_root *root, struct dentry *dentry, 5454 const loff_t start, 5455 const loff_t end, 5456 struct btrfs_log_ctx *ctx) 5457 { 5458 struct dentry *parent = dget_parent(dentry); 5459 int ret; 5460 5461 ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, 5462 start, end, 0, ctx); 5463 dput(parent); 5464 5465 return ret; 5466 } 5467 5468 /* 5469 * should be called during mount to recover any replay any log trees 5470 * from the FS 5471 */ 5472 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5473 { 5474 int ret; 5475 struct btrfs_path *path; 5476 struct btrfs_trans_handle *trans; 5477 struct btrfs_key key; 5478 struct btrfs_key found_key; 5479 struct btrfs_key tmp_key; 5480 struct btrfs_root *log; 5481 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5482 struct walk_control wc = { 5483 .process_func = process_one_buffer, 5484 .stage = 0, 5485 }; 5486 5487 path = btrfs_alloc_path(); 5488 if (!path) 5489 return -ENOMEM; 5490 5491 fs_info->log_root_recovering = 1; 5492 5493 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5494 if (IS_ERR(trans)) { 5495 ret = PTR_ERR(trans); 5496 goto error; 5497 } 5498 5499 wc.trans = trans; 5500 wc.pin = 1; 5501 5502 ret = walk_log_tree(trans, log_root_tree, &wc); 5503 if (ret) { 5504 btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while " 5505 "recovering log root tree."); 5506 goto error; 5507 } 5508 5509 again: 5510 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5511 key.offset = (u64)-1; 5512 key.type = BTRFS_ROOT_ITEM_KEY; 5513 5514 while (1) { 5515 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5516 5517 if (ret < 0) { 5518 btrfs_handle_fs_error(fs_info, ret, 5519 "Couldn't find tree log root."); 5520 goto error; 5521 } 5522 if (ret > 0) { 5523 if (path->slots[0] == 0) 5524 break; 5525 path->slots[0]--; 5526 } 5527 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5528 path->slots[0]); 5529 btrfs_release_path(path); 5530 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5531 break; 5532 5533 log = btrfs_read_fs_root(log_root_tree, &found_key); 5534 if (IS_ERR(log)) { 5535 ret = PTR_ERR(log); 5536 btrfs_handle_fs_error(fs_info, ret, 5537 "Couldn't read tree log root."); 5538 goto error; 5539 } 5540 5541 tmp_key.objectid = found_key.offset; 5542 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5543 tmp_key.offset = (u64)-1; 5544 5545 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5546 if (IS_ERR(wc.replay_dest)) { 5547 ret = PTR_ERR(wc.replay_dest); 5548 free_extent_buffer(log->node); 5549 free_extent_buffer(log->commit_root); 5550 kfree(log); 5551 btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root " 5552 "for tree log recovery."); 5553 goto error; 5554 } 5555 5556 wc.replay_dest->log_root = log; 5557 btrfs_record_root_in_trans(trans, wc.replay_dest); 5558 ret = walk_log_tree(trans, log, &wc); 5559 5560 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5561 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5562 path); 5563 } 5564 5565 key.offset = found_key.offset - 1; 5566 wc.replay_dest->log_root = NULL; 5567 free_extent_buffer(log->node); 5568 free_extent_buffer(log->commit_root); 5569 kfree(log); 5570 5571 if (ret) 5572 goto error; 5573 5574 if (found_key.offset == 0) 5575 break; 5576 } 5577 btrfs_release_path(path); 5578 5579 /* step one is to pin it all, step two is to replay just inodes */ 5580 if (wc.pin) { 5581 wc.pin = 0; 5582 wc.process_func = replay_one_buffer; 5583 wc.stage = LOG_WALK_REPLAY_INODES; 5584 goto again; 5585 } 5586 /* step three is to replay everything */ 5587 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5588 wc.stage++; 5589 goto again; 5590 } 5591 5592 btrfs_free_path(path); 5593 5594 /* step 4: commit the transaction, which also unpins the blocks */ 5595 ret = btrfs_commit_transaction(trans, fs_info->tree_root); 5596 if (ret) 5597 return ret; 5598 5599 free_extent_buffer(log_root_tree->node); 5600 log_root_tree->log_root = NULL; 5601 fs_info->log_root_recovering = 0; 5602 kfree(log_root_tree); 5603 5604 return 0; 5605 error: 5606 if (wc.trans) 5607 btrfs_end_transaction(wc.trans, fs_info->tree_root); 5608 btrfs_free_path(path); 5609 return ret; 5610 } 5611 5612 /* 5613 * there are some corner cases where we want to force a full 5614 * commit instead of allowing a directory to be logged. 5615 * 5616 * They revolve around files there were unlinked from the directory, and 5617 * this function updates the parent directory so that a full commit is 5618 * properly done if it is fsync'd later after the unlinks are done. 5619 * 5620 * Must be called before the unlink operations (updates to the subvolume tree, 5621 * inodes, etc) are done. 5622 */ 5623 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5624 struct inode *dir, struct inode *inode, 5625 int for_rename) 5626 { 5627 /* 5628 * when we're logging a file, if it hasn't been renamed 5629 * or unlinked, and its inode is fully committed on disk, 5630 * we don't have to worry about walking up the directory chain 5631 * to log its parents. 5632 * 5633 * So, we use the last_unlink_trans field to put this transid 5634 * into the file. When the file is logged we check it and 5635 * don't log the parents if the file is fully on disk. 5636 */ 5637 mutex_lock(&BTRFS_I(inode)->log_mutex); 5638 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5639 mutex_unlock(&BTRFS_I(inode)->log_mutex); 5640 5641 /* 5642 * if this directory was already logged any new 5643 * names for this file/dir will get recorded 5644 */ 5645 smp_mb(); 5646 if (BTRFS_I(dir)->logged_trans == trans->transid) 5647 return; 5648 5649 /* 5650 * if the inode we're about to unlink was logged, 5651 * the log will be properly updated for any new names 5652 */ 5653 if (BTRFS_I(inode)->logged_trans == trans->transid) 5654 return; 5655 5656 /* 5657 * when renaming files across directories, if the directory 5658 * there we're unlinking from gets fsync'd later on, there's 5659 * no way to find the destination directory later and fsync it 5660 * properly. So, we have to be conservative and force commits 5661 * so the new name gets discovered. 5662 */ 5663 if (for_rename) 5664 goto record; 5665 5666 /* we can safely do the unlink without any special recording */ 5667 return; 5668 5669 record: 5670 mutex_lock(&BTRFS_I(dir)->log_mutex); 5671 BTRFS_I(dir)->last_unlink_trans = trans->transid; 5672 mutex_unlock(&BTRFS_I(dir)->log_mutex); 5673 } 5674 5675 /* 5676 * Make sure that if someone attempts to fsync the parent directory of a deleted 5677 * snapshot, it ends up triggering a transaction commit. This is to guarantee 5678 * that after replaying the log tree of the parent directory's root we will not 5679 * see the snapshot anymore and at log replay time we will not see any log tree 5680 * corresponding to the deleted snapshot's root, which could lead to replaying 5681 * it after replaying the log tree of the parent directory (which would replay 5682 * the snapshot delete operation). 5683 * 5684 * Must be called before the actual snapshot destroy operation (updates to the 5685 * parent root and tree of tree roots trees, etc) are done. 5686 */ 5687 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 5688 struct inode *dir) 5689 { 5690 mutex_lock(&BTRFS_I(dir)->log_mutex); 5691 BTRFS_I(dir)->last_unlink_trans = trans->transid; 5692 mutex_unlock(&BTRFS_I(dir)->log_mutex); 5693 } 5694 5695 /* 5696 * Call this after adding a new name for a file and it will properly 5697 * update the log to reflect the new name. 5698 * 5699 * It will return zero if all goes well, and it will return 1 if a 5700 * full transaction commit is required. 5701 */ 5702 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5703 struct inode *inode, struct inode *old_dir, 5704 struct dentry *parent) 5705 { 5706 struct btrfs_root * root = BTRFS_I(inode)->root; 5707 5708 /* 5709 * this will force the logging code to walk the dentry chain 5710 * up for the file 5711 */ 5712 if (S_ISREG(inode->i_mode)) 5713 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5714 5715 /* 5716 * if this inode hasn't been logged and directory we're renaming it 5717 * from hasn't been logged, we don't need to log it 5718 */ 5719 if (BTRFS_I(inode)->logged_trans <= 5720 root->fs_info->last_trans_committed && 5721 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 5722 root->fs_info->last_trans_committed)) 5723 return 0; 5724 5725 return btrfs_log_inode_parent(trans, root, inode, parent, 0, 5726 LLONG_MAX, 1, NULL); 5727 } 5728 5729