1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/blkdev.h> 9 #include <linux/list_sort.h> 10 #include <linux/iversion.h> 11 #include "ctree.h" 12 #include "tree-log.h" 13 #include "disk-io.h" 14 #include "locking.h" 15 #include "print-tree.h" 16 #include "backref.h" 17 #include "compression.h" 18 #include "qgroup.h" 19 #include "inode-map.h" 20 21 /* magic values for the inode_only field in btrfs_log_inode: 22 * 23 * LOG_INODE_ALL means to log everything 24 * LOG_INODE_EXISTS means to log just enough to recreate the inode 25 * during log replay 26 */ 27 #define LOG_INODE_ALL 0 28 #define LOG_INODE_EXISTS 1 29 #define LOG_OTHER_INODE 2 30 31 /* 32 * directory trouble cases 33 * 34 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 35 * log, we must force a full commit before doing an fsync of the directory 36 * where the unlink was done. 37 * ---> record transid of last unlink/rename per directory 38 * 39 * mkdir foo/some_dir 40 * normal commit 41 * rename foo/some_dir foo2/some_dir 42 * mkdir foo/some_dir 43 * fsync foo/some_dir/some_file 44 * 45 * The fsync above will unlink the original some_dir without recording 46 * it in its new location (foo2). After a crash, some_dir will be gone 47 * unless the fsync of some_file forces a full commit 48 * 49 * 2) we must log any new names for any file or dir that is in the fsync 50 * log. ---> check inode while renaming/linking. 51 * 52 * 2a) we must log any new names for any file or dir during rename 53 * when the directory they are being removed from was logged. 54 * ---> check inode and old parent dir during rename 55 * 56 * 2a is actually the more important variant. With the extra logging 57 * a crash might unlink the old name without recreating the new one 58 * 59 * 3) after a crash, we must go through any directories with a link count 60 * of zero and redo the rm -rf 61 * 62 * mkdir f1/foo 63 * normal commit 64 * rm -rf f1/foo 65 * fsync(f1) 66 * 67 * The directory f1 was fully removed from the FS, but fsync was never 68 * called on f1, only its parent dir. After a crash the rm -rf must 69 * be replayed. This must be able to recurse down the entire 70 * directory tree. The inode link count fixup code takes care of the 71 * ugly details. 72 */ 73 74 /* 75 * stages for the tree walking. The first 76 * stage (0) is to only pin down the blocks we find 77 * the second stage (1) is to make sure that all the inodes 78 * we find in the log are created in the subvolume. 79 * 80 * The last stage is to deal with directories and links and extents 81 * and all the other fun semantics 82 */ 83 #define LOG_WALK_PIN_ONLY 0 84 #define LOG_WALK_REPLAY_INODES 1 85 #define LOG_WALK_REPLAY_DIR_INDEX 2 86 #define LOG_WALK_REPLAY_ALL 3 87 88 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 89 struct btrfs_root *root, struct btrfs_inode *inode, 90 int inode_only, 91 const loff_t start, 92 const loff_t end, 93 struct btrfs_log_ctx *ctx); 94 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 95 struct btrfs_root *root, 96 struct btrfs_path *path, u64 objectid); 97 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 98 struct btrfs_root *root, 99 struct btrfs_root *log, 100 struct btrfs_path *path, 101 u64 dirid, int del_all); 102 103 /* 104 * tree logging is a special write ahead log used to make sure that 105 * fsyncs and O_SYNCs can happen without doing full tree commits. 106 * 107 * Full tree commits are expensive because they require commonly 108 * modified blocks to be recowed, creating many dirty pages in the 109 * extent tree an 4x-6x higher write load than ext3. 110 * 111 * Instead of doing a tree commit on every fsync, we use the 112 * key ranges and transaction ids to find items for a given file or directory 113 * that have changed in this transaction. Those items are copied into 114 * a special tree (one per subvolume root), that tree is written to disk 115 * and then the fsync is considered complete. 116 * 117 * After a crash, items are copied out of the log-tree back into the 118 * subvolume tree. Any file data extents found are recorded in the extent 119 * allocation tree, and the log-tree freed. 120 * 121 * The log tree is read three times, once to pin down all the extents it is 122 * using in ram and once, once to create all the inodes logged in the tree 123 * and once to do all the other items. 124 */ 125 126 /* 127 * start a sub transaction and setup the log tree 128 * this increments the log tree writer count to make the people 129 * syncing the tree wait for us to finish 130 */ 131 static int start_log_trans(struct btrfs_trans_handle *trans, 132 struct btrfs_root *root, 133 struct btrfs_log_ctx *ctx) 134 { 135 struct btrfs_fs_info *fs_info = root->fs_info; 136 int ret = 0; 137 138 mutex_lock(&root->log_mutex); 139 140 if (root->log_root) { 141 if (btrfs_need_log_full_commit(fs_info, trans)) { 142 ret = -EAGAIN; 143 goto out; 144 } 145 146 if (!root->log_start_pid) { 147 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 148 root->log_start_pid = current->pid; 149 } else if (root->log_start_pid != current->pid) { 150 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 151 } 152 } else { 153 mutex_lock(&fs_info->tree_log_mutex); 154 if (!fs_info->log_root_tree) 155 ret = btrfs_init_log_root_tree(trans, fs_info); 156 mutex_unlock(&fs_info->tree_log_mutex); 157 if (ret) 158 goto out; 159 160 ret = btrfs_add_log_tree(trans, root); 161 if (ret) 162 goto out; 163 164 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 165 root->log_start_pid = current->pid; 166 } 167 168 atomic_inc(&root->log_batch); 169 atomic_inc(&root->log_writers); 170 if (ctx) { 171 int index = root->log_transid % 2; 172 list_add_tail(&ctx->list, &root->log_ctxs[index]); 173 ctx->log_transid = root->log_transid; 174 } 175 176 out: 177 mutex_unlock(&root->log_mutex); 178 return ret; 179 } 180 181 /* 182 * returns 0 if there was a log transaction running and we were able 183 * to join, or returns -ENOENT if there were not transactions 184 * in progress 185 */ 186 static int join_running_log_trans(struct btrfs_root *root) 187 { 188 int ret = -ENOENT; 189 190 smp_mb(); 191 if (!root->log_root) 192 return -ENOENT; 193 194 mutex_lock(&root->log_mutex); 195 if (root->log_root) { 196 ret = 0; 197 atomic_inc(&root->log_writers); 198 } 199 mutex_unlock(&root->log_mutex); 200 return ret; 201 } 202 203 /* 204 * This either makes the current running log transaction wait 205 * until you call btrfs_end_log_trans() or it makes any future 206 * log transactions wait until you call btrfs_end_log_trans() 207 */ 208 int btrfs_pin_log_trans(struct btrfs_root *root) 209 { 210 int ret = -ENOENT; 211 212 mutex_lock(&root->log_mutex); 213 atomic_inc(&root->log_writers); 214 mutex_unlock(&root->log_mutex); 215 return ret; 216 } 217 218 /* 219 * indicate we're done making changes to the log tree 220 * and wake up anyone waiting to do a sync 221 */ 222 void btrfs_end_log_trans(struct btrfs_root *root) 223 { 224 if (atomic_dec_and_test(&root->log_writers)) { 225 /* 226 * Implicit memory barrier after atomic_dec_and_test 227 */ 228 if (waitqueue_active(&root->log_writer_wait)) 229 wake_up(&root->log_writer_wait); 230 } 231 } 232 233 234 /* 235 * the walk control struct is used to pass state down the chain when 236 * processing the log tree. The stage field tells us which part 237 * of the log tree processing we are currently doing. The others 238 * are state fields used for that specific part 239 */ 240 struct walk_control { 241 /* should we free the extent on disk when done? This is used 242 * at transaction commit time while freeing a log tree 243 */ 244 int free; 245 246 /* should we write out the extent buffer? This is used 247 * while flushing the log tree to disk during a sync 248 */ 249 int write; 250 251 /* should we wait for the extent buffer io to finish? Also used 252 * while flushing the log tree to disk for a sync 253 */ 254 int wait; 255 256 /* pin only walk, we record which extents on disk belong to the 257 * log trees 258 */ 259 int pin; 260 261 /* what stage of the replay code we're currently in */ 262 int stage; 263 264 /* the root we are currently replaying */ 265 struct btrfs_root *replay_dest; 266 267 /* the trans handle for the current replay */ 268 struct btrfs_trans_handle *trans; 269 270 /* the function that gets used to process blocks we find in the 271 * tree. Note the extent_buffer might not be up to date when it is 272 * passed in, and it must be checked or read if you need the data 273 * inside it 274 */ 275 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 276 struct walk_control *wc, u64 gen, int level); 277 }; 278 279 /* 280 * process_func used to pin down extents, write them or wait on them 281 */ 282 static int process_one_buffer(struct btrfs_root *log, 283 struct extent_buffer *eb, 284 struct walk_control *wc, u64 gen, int level) 285 { 286 struct btrfs_fs_info *fs_info = log->fs_info; 287 int ret = 0; 288 289 /* 290 * If this fs is mixed then we need to be able to process the leaves to 291 * pin down any logged extents, so we have to read the block. 292 */ 293 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 294 ret = btrfs_read_buffer(eb, gen, level, NULL); 295 if (ret) 296 return ret; 297 } 298 299 if (wc->pin) 300 ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start, 301 eb->len); 302 303 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 304 if (wc->pin && btrfs_header_level(eb) == 0) 305 ret = btrfs_exclude_logged_extents(fs_info, eb); 306 if (wc->write) 307 btrfs_write_tree_block(eb); 308 if (wc->wait) 309 btrfs_wait_tree_block_writeback(eb); 310 } 311 return ret; 312 } 313 314 /* 315 * Item overwrite used by replay and tree logging. eb, slot and key all refer 316 * to the src data we are copying out. 317 * 318 * root is the tree we are copying into, and path is a scratch 319 * path for use in this function (it should be released on entry and 320 * will be released on exit). 321 * 322 * If the key is already in the destination tree the existing item is 323 * overwritten. If the existing item isn't big enough, it is extended. 324 * If it is too large, it is truncated. 325 * 326 * If the key isn't in the destination yet, a new item is inserted. 327 */ 328 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 329 struct btrfs_root *root, 330 struct btrfs_path *path, 331 struct extent_buffer *eb, int slot, 332 struct btrfs_key *key) 333 { 334 struct btrfs_fs_info *fs_info = root->fs_info; 335 int ret; 336 u32 item_size; 337 u64 saved_i_size = 0; 338 int save_old_i_size = 0; 339 unsigned long src_ptr; 340 unsigned long dst_ptr; 341 int overwrite_root = 0; 342 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 343 344 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 345 overwrite_root = 1; 346 347 item_size = btrfs_item_size_nr(eb, slot); 348 src_ptr = btrfs_item_ptr_offset(eb, slot); 349 350 /* look for the key in the destination tree */ 351 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 352 if (ret < 0) 353 return ret; 354 355 if (ret == 0) { 356 char *src_copy; 357 char *dst_copy; 358 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 359 path->slots[0]); 360 if (dst_size != item_size) 361 goto insert; 362 363 if (item_size == 0) { 364 btrfs_release_path(path); 365 return 0; 366 } 367 dst_copy = kmalloc(item_size, GFP_NOFS); 368 src_copy = kmalloc(item_size, GFP_NOFS); 369 if (!dst_copy || !src_copy) { 370 btrfs_release_path(path); 371 kfree(dst_copy); 372 kfree(src_copy); 373 return -ENOMEM; 374 } 375 376 read_extent_buffer(eb, src_copy, src_ptr, item_size); 377 378 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 379 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 380 item_size); 381 ret = memcmp(dst_copy, src_copy, item_size); 382 383 kfree(dst_copy); 384 kfree(src_copy); 385 /* 386 * they have the same contents, just return, this saves 387 * us from cowing blocks in the destination tree and doing 388 * extra writes that may not have been done by a previous 389 * sync 390 */ 391 if (ret == 0) { 392 btrfs_release_path(path); 393 return 0; 394 } 395 396 /* 397 * We need to load the old nbytes into the inode so when we 398 * replay the extents we've logged we get the right nbytes. 399 */ 400 if (inode_item) { 401 struct btrfs_inode_item *item; 402 u64 nbytes; 403 u32 mode; 404 405 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 406 struct btrfs_inode_item); 407 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 408 item = btrfs_item_ptr(eb, slot, 409 struct btrfs_inode_item); 410 btrfs_set_inode_nbytes(eb, item, nbytes); 411 412 /* 413 * If this is a directory we need to reset the i_size to 414 * 0 so that we can set it up properly when replaying 415 * the rest of the items in this log. 416 */ 417 mode = btrfs_inode_mode(eb, item); 418 if (S_ISDIR(mode)) 419 btrfs_set_inode_size(eb, item, 0); 420 } 421 } else if (inode_item) { 422 struct btrfs_inode_item *item; 423 u32 mode; 424 425 /* 426 * New inode, set nbytes to 0 so that the nbytes comes out 427 * properly when we replay the extents. 428 */ 429 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 430 btrfs_set_inode_nbytes(eb, item, 0); 431 432 /* 433 * If this is a directory we need to reset the i_size to 0 so 434 * that we can set it up properly when replaying the rest of 435 * the items in this log. 436 */ 437 mode = btrfs_inode_mode(eb, item); 438 if (S_ISDIR(mode)) 439 btrfs_set_inode_size(eb, item, 0); 440 } 441 insert: 442 btrfs_release_path(path); 443 /* try to insert the key into the destination tree */ 444 path->skip_release_on_error = 1; 445 ret = btrfs_insert_empty_item(trans, root, path, 446 key, item_size); 447 path->skip_release_on_error = 0; 448 449 /* make sure any existing item is the correct size */ 450 if (ret == -EEXIST || ret == -EOVERFLOW) { 451 u32 found_size; 452 found_size = btrfs_item_size_nr(path->nodes[0], 453 path->slots[0]); 454 if (found_size > item_size) 455 btrfs_truncate_item(fs_info, path, item_size, 1); 456 else if (found_size < item_size) 457 btrfs_extend_item(fs_info, path, 458 item_size - found_size); 459 } else if (ret) { 460 return ret; 461 } 462 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 463 path->slots[0]); 464 465 /* don't overwrite an existing inode if the generation number 466 * was logged as zero. This is done when the tree logging code 467 * is just logging an inode to make sure it exists after recovery. 468 * 469 * Also, don't overwrite i_size on directories during replay. 470 * log replay inserts and removes directory items based on the 471 * state of the tree found in the subvolume, and i_size is modified 472 * as it goes 473 */ 474 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 475 struct btrfs_inode_item *src_item; 476 struct btrfs_inode_item *dst_item; 477 478 src_item = (struct btrfs_inode_item *)src_ptr; 479 dst_item = (struct btrfs_inode_item *)dst_ptr; 480 481 if (btrfs_inode_generation(eb, src_item) == 0) { 482 struct extent_buffer *dst_eb = path->nodes[0]; 483 const u64 ino_size = btrfs_inode_size(eb, src_item); 484 485 /* 486 * For regular files an ino_size == 0 is used only when 487 * logging that an inode exists, as part of a directory 488 * fsync, and the inode wasn't fsynced before. In this 489 * case don't set the size of the inode in the fs/subvol 490 * tree, otherwise we would be throwing valid data away. 491 */ 492 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 493 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 494 ino_size != 0) { 495 struct btrfs_map_token token; 496 497 btrfs_init_map_token(&token); 498 btrfs_set_token_inode_size(dst_eb, dst_item, 499 ino_size, &token); 500 } 501 goto no_copy; 502 } 503 504 if (overwrite_root && 505 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 506 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 507 save_old_i_size = 1; 508 saved_i_size = btrfs_inode_size(path->nodes[0], 509 dst_item); 510 } 511 } 512 513 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 514 src_ptr, item_size); 515 516 if (save_old_i_size) { 517 struct btrfs_inode_item *dst_item; 518 dst_item = (struct btrfs_inode_item *)dst_ptr; 519 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 520 } 521 522 /* make sure the generation is filled in */ 523 if (key->type == BTRFS_INODE_ITEM_KEY) { 524 struct btrfs_inode_item *dst_item; 525 dst_item = (struct btrfs_inode_item *)dst_ptr; 526 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 527 btrfs_set_inode_generation(path->nodes[0], dst_item, 528 trans->transid); 529 } 530 } 531 no_copy: 532 btrfs_mark_buffer_dirty(path->nodes[0]); 533 btrfs_release_path(path); 534 return 0; 535 } 536 537 /* 538 * simple helper to read an inode off the disk from a given root 539 * This can only be called for subvolume roots and not for the log 540 */ 541 static noinline struct inode *read_one_inode(struct btrfs_root *root, 542 u64 objectid) 543 { 544 struct btrfs_key key; 545 struct inode *inode; 546 547 key.objectid = objectid; 548 key.type = BTRFS_INODE_ITEM_KEY; 549 key.offset = 0; 550 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); 551 if (IS_ERR(inode)) { 552 inode = NULL; 553 } else if (is_bad_inode(inode)) { 554 iput(inode); 555 inode = NULL; 556 } 557 return inode; 558 } 559 560 /* replays a single extent in 'eb' at 'slot' with 'key' into the 561 * subvolume 'root'. path is released on entry and should be released 562 * on exit. 563 * 564 * extents in the log tree have not been allocated out of the extent 565 * tree yet. So, this completes the allocation, taking a reference 566 * as required if the extent already exists or creating a new extent 567 * if it isn't in the extent allocation tree yet. 568 * 569 * The extent is inserted into the file, dropping any existing extents 570 * from the file that overlap the new one. 571 */ 572 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 573 struct btrfs_root *root, 574 struct btrfs_path *path, 575 struct extent_buffer *eb, int slot, 576 struct btrfs_key *key) 577 { 578 struct btrfs_fs_info *fs_info = root->fs_info; 579 int found_type; 580 u64 extent_end; 581 u64 start = key->offset; 582 u64 nbytes = 0; 583 struct btrfs_file_extent_item *item; 584 struct inode *inode = NULL; 585 unsigned long size; 586 int ret = 0; 587 588 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 589 found_type = btrfs_file_extent_type(eb, item); 590 591 if (found_type == BTRFS_FILE_EXTENT_REG || 592 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 593 nbytes = btrfs_file_extent_num_bytes(eb, item); 594 extent_end = start + nbytes; 595 596 /* 597 * We don't add to the inodes nbytes if we are prealloc or a 598 * hole. 599 */ 600 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 601 nbytes = 0; 602 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 603 size = btrfs_file_extent_inline_len(eb, slot, item); 604 nbytes = btrfs_file_extent_ram_bytes(eb, item); 605 extent_end = ALIGN(start + size, 606 fs_info->sectorsize); 607 } else { 608 ret = 0; 609 goto out; 610 } 611 612 inode = read_one_inode(root, key->objectid); 613 if (!inode) { 614 ret = -EIO; 615 goto out; 616 } 617 618 /* 619 * first check to see if we already have this extent in the 620 * file. This must be done before the btrfs_drop_extents run 621 * so we don't try to drop this extent. 622 */ 623 ret = btrfs_lookup_file_extent(trans, root, path, 624 btrfs_ino(BTRFS_I(inode)), start, 0); 625 626 if (ret == 0 && 627 (found_type == BTRFS_FILE_EXTENT_REG || 628 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 629 struct btrfs_file_extent_item cmp1; 630 struct btrfs_file_extent_item cmp2; 631 struct btrfs_file_extent_item *existing; 632 struct extent_buffer *leaf; 633 634 leaf = path->nodes[0]; 635 existing = btrfs_item_ptr(leaf, path->slots[0], 636 struct btrfs_file_extent_item); 637 638 read_extent_buffer(eb, &cmp1, (unsigned long)item, 639 sizeof(cmp1)); 640 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 641 sizeof(cmp2)); 642 643 /* 644 * we already have a pointer to this exact extent, 645 * we don't have to do anything 646 */ 647 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 648 btrfs_release_path(path); 649 goto out; 650 } 651 } 652 btrfs_release_path(path); 653 654 /* drop any overlapping extents */ 655 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); 656 if (ret) 657 goto out; 658 659 if (found_type == BTRFS_FILE_EXTENT_REG || 660 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 661 u64 offset; 662 unsigned long dest_offset; 663 struct btrfs_key ins; 664 665 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && 666 btrfs_fs_incompat(fs_info, NO_HOLES)) 667 goto update_inode; 668 669 ret = btrfs_insert_empty_item(trans, root, path, key, 670 sizeof(*item)); 671 if (ret) 672 goto out; 673 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 674 path->slots[0]); 675 copy_extent_buffer(path->nodes[0], eb, dest_offset, 676 (unsigned long)item, sizeof(*item)); 677 678 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 679 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 680 ins.type = BTRFS_EXTENT_ITEM_KEY; 681 offset = key->offset - btrfs_file_extent_offset(eb, item); 682 683 /* 684 * Manually record dirty extent, as here we did a shallow 685 * file extent item copy and skip normal backref update, 686 * but modifying extent tree all by ourselves. 687 * So need to manually record dirty extent for qgroup, 688 * as the owner of the file extent changed from log tree 689 * (doesn't affect qgroup) to fs/file tree(affects qgroup) 690 */ 691 ret = btrfs_qgroup_trace_extent(trans, fs_info, 692 btrfs_file_extent_disk_bytenr(eb, item), 693 btrfs_file_extent_disk_num_bytes(eb, item), 694 GFP_NOFS); 695 if (ret < 0) 696 goto out; 697 698 if (ins.objectid > 0) { 699 u64 csum_start; 700 u64 csum_end; 701 LIST_HEAD(ordered_sums); 702 /* 703 * is this extent already allocated in the extent 704 * allocation tree? If so, just add a reference 705 */ 706 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, 707 ins.offset); 708 if (ret == 0) { 709 ret = btrfs_inc_extent_ref(trans, root, 710 ins.objectid, ins.offset, 711 0, root->root_key.objectid, 712 key->objectid, offset); 713 if (ret) 714 goto out; 715 } else { 716 /* 717 * insert the extent pointer in the extent 718 * allocation tree 719 */ 720 ret = btrfs_alloc_logged_file_extent(trans, 721 fs_info, 722 root->root_key.objectid, 723 key->objectid, offset, &ins); 724 if (ret) 725 goto out; 726 } 727 btrfs_release_path(path); 728 729 if (btrfs_file_extent_compression(eb, item)) { 730 csum_start = ins.objectid; 731 csum_end = csum_start + ins.offset; 732 } else { 733 csum_start = ins.objectid + 734 btrfs_file_extent_offset(eb, item); 735 csum_end = csum_start + 736 btrfs_file_extent_num_bytes(eb, item); 737 } 738 739 ret = btrfs_lookup_csums_range(root->log_root, 740 csum_start, csum_end - 1, 741 &ordered_sums, 0); 742 if (ret) 743 goto out; 744 /* 745 * Now delete all existing cums in the csum root that 746 * cover our range. We do this because we can have an 747 * extent that is completely referenced by one file 748 * extent item and partially referenced by another 749 * file extent item (like after using the clone or 750 * extent_same ioctls). In this case if we end up doing 751 * the replay of the one that partially references the 752 * extent first, and we do not do the csum deletion 753 * below, we can get 2 csum items in the csum tree that 754 * overlap each other. For example, imagine our log has 755 * the two following file extent items: 756 * 757 * key (257 EXTENT_DATA 409600) 758 * extent data disk byte 12845056 nr 102400 759 * extent data offset 20480 nr 20480 ram 102400 760 * 761 * key (257 EXTENT_DATA 819200) 762 * extent data disk byte 12845056 nr 102400 763 * extent data offset 0 nr 102400 ram 102400 764 * 765 * Where the second one fully references the 100K extent 766 * that starts at disk byte 12845056, and the log tree 767 * has a single csum item that covers the entire range 768 * of the extent: 769 * 770 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 771 * 772 * After the first file extent item is replayed, the 773 * csum tree gets the following csum item: 774 * 775 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 776 * 777 * Which covers the 20K sub-range starting at offset 20K 778 * of our extent. Now when we replay the second file 779 * extent item, if we do not delete existing csum items 780 * that cover any of its blocks, we end up getting two 781 * csum items in our csum tree that overlap each other: 782 * 783 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 784 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 785 * 786 * Which is a problem, because after this anyone trying 787 * to lookup up for the checksum of any block of our 788 * extent starting at an offset of 40K or higher, will 789 * end up looking at the second csum item only, which 790 * does not contain the checksum for any block starting 791 * at offset 40K or higher of our extent. 792 */ 793 while (!list_empty(&ordered_sums)) { 794 struct btrfs_ordered_sum *sums; 795 sums = list_entry(ordered_sums.next, 796 struct btrfs_ordered_sum, 797 list); 798 if (!ret) 799 ret = btrfs_del_csums(trans, fs_info, 800 sums->bytenr, 801 sums->len); 802 if (!ret) 803 ret = btrfs_csum_file_blocks(trans, 804 fs_info->csum_root, sums); 805 list_del(&sums->list); 806 kfree(sums); 807 } 808 if (ret) 809 goto out; 810 } else { 811 btrfs_release_path(path); 812 } 813 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 814 /* inline extents are easy, we just overwrite them */ 815 ret = overwrite_item(trans, root, path, eb, slot, key); 816 if (ret) 817 goto out; 818 } 819 820 inode_add_bytes(inode, nbytes); 821 update_inode: 822 ret = btrfs_update_inode(trans, root, inode); 823 out: 824 if (inode) 825 iput(inode); 826 return ret; 827 } 828 829 /* 830 * when cleaning up conflicts between the directory names in the 831 * subvolume, directory names in the log and directory names in the 832 * inode back references, we may have to unlink inodes from directories. 833 * 834 * This is a helper function to do the unlink of a specific directory 835 * item 836 */ 837 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 838 struct btrfs_root *root, 839 struct btrfs_path *path, 840 struct btrfs_inode *dir, 841 struct btrfs_dir_item *di) 842 { 843 struct inode *inode; 844 char *name; 845 int name_len; 846 struct extent_buffer *leaf; 847 struct btrfs_key location; 848 int ret; 849 850 leaf = path->nodes[0]; 851 852 btrfs_dir_item_key_to_cpu(leaf, di, &location); 853 name_len = btrfs_dir_name_len(leaf, di); 854 name = kmalloc(name_len, GFP_NOFS); 855 if (!name) 856 return -ENOMEM; 857 858 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 859 btrfs_release_path(path); 860 861 inode = read_one_inode(root, location.objectid); 862 if (!inode) { 863 ret = -EIO; 864 goto out; 865 } 866 867 ret = link_to_fixup_dir(trans, root, path, location.objectid); 868 if (ret) 869 goto out; 870 871 ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name, 872 name_len); 873 if (ret) 874 goto out; 875 else 876 ret = btrfs_run_delayed_items(trans); 877 out: 878 kfree(name); 879 iput(inode); 880 return ret; 881 } 882 883 /* 884 * helper function to see if a given name and sequence number found 885 * in an inode back reference are already in a directory and correctly 886 * point to this inode 887 */ 888 static noinline int inode_in_dir(struct btrfs_root *root, 889 struct btrfs_path *path, 890 u64 dirid, u64 objectid, u64 index, 891 const char *name, int name_len) 892 { 893 struct btrfs_dir_item *di; 894 struct btrfs_key location; 895 int match = 0; 896 897 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 898 index, name, name_len, 0); 899 if (di && !IS_ERR(di)) { 900 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 901 if (location.objectid != objectid) 902 goto out; 903 } else 904 goto out; 905 btrfs_release_path(path); 906 907 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 908 if (di && !IS_ERR(di)) { 909 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 910 if (location.objectid != objectid) 911 goto out; 912 } else 913 goto out; 914 match = 1; 915 out: 916 btrfs_release_path(path); 917 return match; 918 } 919 920 /* 921 * helper function to check a log tree for a named back reference in 922 * an inode. This is used to decide if a back reference that is 923 * found in the subvolume conflicts with what we find in the log. 924 * 925 * inode backreferences may have multiple refs in a single item, 926 * during replay we process one reference at a time, and we don't 927 * want to delete valid links to a file from the subvolume if that 928 * link is also in the log. 929 */ 930 static noinline int backref_in_log(struct btrfs_root *log, 931 struct btrfs_key *key, 932 u64 ref_objectid, 933 const char *name, int namelen) 934 { 935 struct btrfs_path *path; 936 struct btrfs_inode_ref *ref; 937 unsigned long ptr; 938 unsigned long ptr_end; 939 unsigned long name_ptr; 940 int found_name_len; 941 int item_size; 942 int ret; 943 int match = 0; 944 945 path = btrfs_alloc_path(); 946 if (!path) 947 return -ENOMEM; 948 949 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 950 if (ret != 0) 951 goto out; 952 953 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 954 955 if (key->type == BTRFS_INODE_EXTREF_KEY) { 956 if (btrfs_find_name_in_ext_backref(path->nodes[0], 957 path->slots[0], 958 ref_objectid, 959 name, namelen, NULL)) 960 match = 1; 961 962 goto out; 963 } 964 965 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 966 ptr_end = ptr + item_size; 967 while (ptr < ptr_end) { 968 ref = (struct btrfs_inode_ref *)ptr; 969 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 970 if (found_name_len == namelen) { 971 name_ptr = (unsigned long)(ref + 1); 972 ret = memcmp_extent_buffer(path->nodes[0], name, 973 name_ptr, namelen); 974 if (ret == 0) { 975 match = 1; 976 goto out; 977 } 978 } 979 ptr = (unsigned long)(ref + 1) + found_name_len; 980 } 981 out: 982 btrfs_free_path(path); 983 return match; 984 } 985 986 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 987 struct btrfs_root *root, 988 struct btrfs_path *path, 989 struct btrfs_root *log_root, 990 struct btrfs_inode *dir, 991 struct btrfs_inode *inode, 992 u64 inode_objectid, u64 parent_objectid, 993 u64 ref_index, char *name, int namelen, 994 int *search_done) 995 { 996 int ret; 997 char *victim_name; 998 int victim_name_len; 999 struct extent_buffer *leaf; 1000 struct btrfs_dir_item *di; 1001 struct btrfs_key search_key; 1002 struct btrfs_inode_extref *extref; 1003 1004 again: 1005 /* Search old style refs */ 1006 search_key.objectid = inode_objectid; 1007 search_key.type = BTRFS_INODE_REF_KEY; 1008 search_key.offset = parent_objectid; 1009 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 1010 if (ret == 0) { 1011 struct btrfs_inode_ref *victim_ref; 1012 unsigned long ptr; 1013 unsigned long ptr_end; 1014 1015 leaf = path->nodes[0]; 1016 1017 /* are we trying to overwrite a back ref for the root directory 1018 * if so, just jump out, we're done 1019 */ 1020 if (search_key.objectid == search_key.offset) 1021 return 1; 1022 1023 /* check all the names in this back reference to see 1024 * if they are in the log. if so, we allow them to stay 1025 * otherwise they must be unlinked as a conflict 1026 */ 1027 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1028 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 1029 while (ptr < ptr_end) { 1030 victim_ref = (struct btrfs_inode_ref *)ptr; 1031 victim_name_len = btrfs_inode_ref_name_len(leaf, 1032 victim_ref); 1033 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1034 if (!victim_name) 1035 return -ENOMEM; 1036 1037 read_extent_buffer(leaf, victim_name, 1038 (unsigned long)(victim_ref + 1), 1039 victim_name_len); 1040 1041 if (!backref_in_log(log_root, &search_key, 1042 parent_objectid, 1043 victim_name, 1044 victim_name_len)) { 1045 inc_nlink(&inode->vfs_inode); 1046 btrfs_release_path(path); 1047 1048 ret = btrfs_unlink_inode(trans, root, dir, inode, 1049 victim_name, victim_name_len); 1050 kfree(victim_name); 1051 if (ret) 1052 return ret; 1053 ret = btrfs_run_delayed_items(trans); 1054 if (ret) 1055 return ret; 1056 *search_done = 1; 1057 goto again; 1058 } 1059 kfree(victim_name); 1060 1061 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 1062 } 1063 1064 /* 1065 * NOTE: we have searched root tree and checked the 1066 * corresponding ref, it does not need to check again. 1067 */ 1068 *search_done = 1; 1069 } 1070 btrfs_release_path(path); 1071 1072 /* Same search but for extended refs */ 1073 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 1074 inode_objectid, parent_objectid, 0, 1075 0); 1076 if (!IS_ERR_OR_NULL(extref)) { 1077 u32 item_size; 1078 u32 cur_offset = 0; 1079 unsigned long base; 1080 struct inode *victim_parent; 1081 1082 leaf = path->nodes[0]; 1083 1084 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1085 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1086 1087 while (cur_offset < item_size) { 1088 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1089 1090 victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 1091 1092 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1093 goto next; 1094 1095 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1096 if (!victim_name) 1097 return -ENOMEM; 1098 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 1099 victim_name_len); 1100 1101 search_key.objectid = inode_objectid; 1102 search_key.type = BTRFS_INODE_EXTREF_KEY; 1103 search_key.offset = btrfs_extref_hash(parent_objectid, 1104 victim_name, 1105 victim_name_len); 1106 ret = 0; 1107 if (!backref_in_log(log_root, &search_key, 1108 parent_objectid, victim_name, 1109 victim_name_len)) { 1110 ret = -ENOENT; 1111 victim_parent = read_one_inode(root, 1112 parent_objectid); 1113 if (victim_parent) { 1114 inc_nlink(&inode->vfs_inode); 1115 btrfs_release_path(path); 1116 1117 ret = btrfs_unlink_inode(trans, root, 1118 BTRFS_I(victim_parent), 1119 inode, 1120 victim_name, 1121 victim_name_len); 1122 if (!ret) 1123 ret = btrfs_run_delayed_items( 1124 trans); 1125 } 1126 iput(victim_parent); 1127 kfree(victim_name); 1128 if (ret) 1129 return ret; 1130 *search_done = 1; 1131 goto again; 1132 } 1133 kfree(victim_name); 1134 next: 1135 cur_offset += victim_name_len + sizeof(*extref); 1136 } 1137 *search_done = 1; 1138 } 1139 btrfs_release_path(path); 1140 1141 /* look for a conflicting sequence number */ 1142 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1143 ref_index, name, namelen, 0); 1144 if (di && !IS_ERR(di)) { 1145 ret = drop_one_dir_item(trans, root, path, dir, di); 1146 if (ret) 1147 return ret; 1148 } 1149 btrfs_release_path(path); 1150 1151 /* look for a conflicing name */ 1152 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), 1153 name, namelen, 0); 1154 if (di && !IS_ERR(di)) { 1155 ret = drop_one_dir_item(trans, root, path, dir, di); 1156 if (ret) 1157 return ret; 1158 } 1159 btrfs_release_path(path); 1160 1161 return 0; 1162 } 1163 1164 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1165 u32 *namelen, char **name, u64 *index, 1166 u64 *parent_objectid) 1167 { 1168 struct btrfs_inode_extref *extref; 1169 1170 extref = (struct btrfs_inode_extref *)ref_ptr; 1171 1172 *namelen = btrfs_inode_extref_name_len(eb, extref); 1173 *name = kmalloc(*namelen, GFP_NOFS); 1174 if (*name == NULL) 1175 return -ENOMEM; 1176 1177 read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1178 *namelen); 1179 1180 if (index) 1181 *index = btrfs_inode_extref_index(eb, extref); 1182 if (parent_objectid) 1183 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1184 1185 return 0; 1186 } 1187 1188 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1189 u32 *namelen, char **name, u64 *index) 1190 { 1191 struct btrfs_inode_ref *ref; 1192 1193 ref = (struct btrfs_inode_ref *)ref_ptr; 1194 1195 *namelen = btrfs_inode_ref_name_len(eb, ref); 1196 *name = kmalloc(*namelen, GFP_NOFS); 1197 if (*name == NULL) 1198 return -ENOMEM; 1199 1200 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1201 1202 if (index) 1203 *index = btrfs_inode_ref_index(eb, ref); 1204 1205 return 0; 1206 } 1207 1208 /* 1209 * Take an inode reference item from the log tree and iterate all names from the 1210 * inode reference item in the subvolume tree with the same key (if it exists). 1211 * For any name that is not in the inode reference item from the log tree, do a 1212 * proper unlink of that name (that is, remove its entry from the inode 1213 * reference item and both dir index keys). 1214 */ 1215 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, 1216 struct btrfs_root *root, 1217 struct btrfs_path *path, 1218 struct btrfs_inode *inode, 1219 struct extent_buffer *log_eb, 1220 int log_slot, 1221 struct btrfs_key *key) 1222 { 1223 int ret; 1224 unsigned long ref_ptr; 1225 unsigned long ref_end; 1226 struct extent_buffer *eb; 1227 1228 again: 1229 btrfs_release_path(path); 1230 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 1231 if (ret > 0) { 1232 ret = 0; 1233 goto out; 1234 } 1235 if (ret < 0) 1236 goto out; 1237 1238 eb = path->nodes[0]; 1239 ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 1240 ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); 1241 while (ref_ptr < ref_end) { 1242 char *name = NULL; 1243 int namelen; 1244 u64 parent_id; 1245 1246 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1247 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1248 NULL, &parent_id); 1249 } else { 1250 parent_id = key->offset; 1251 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1252 NULL); 1253 } 1254 if (ret) 1255 goto out; 1256 1257 if (key->type == BTRFS_INODE_EXTREF_KEY) 1258 ret = btrfs_find_name_in_ext_backref(log_eb, log_slot, 1259 parent_id, name, 1260 namelen, NULL); 1261 else 1262 ret = btrfs_find_name_in_backref(log_eb, log_slot, name, 1263 namelen, NULL); 1264 1265 if (!ret) { 1266 struct inode *dir; 1267 1268 btrfs_release_path(path); 1269 dir = read_one_inode(root, parent_id); 1270 if (!dir) { 1271 ret = -ENOENT; 1272 kfree(name); 1273 goto out; 1274 } 1275 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 1276 inode, name, namelen); 1277 kfree(name); 1278 iput(dir); 1279 if (ret) 1280 goto out; 1281 goto again; 1282 } 1283 1284 kfree(name); 1285 ref_ptr += namelen; 1286 if (key->type == BTRFS_INODE_EXTREF_KEY) 1287 ref_ptr += sizeof(struct btrfs_inode_extref); 1288 else 1289 ref_ptr += sizeof(struct btrfs_inode_ref); 1290 } 1291 ret = 0; 1292 out: 1293 btrfs_release_path(path); 1294 return ret; 1295 } 1296 1297 /* 1298 * replay one inode back reference item found in the log tree. 1299 * eb, slot and key refer to the buffer and key found in the log tree. 1300 * root is the destination we are replaying into, and path is for temp 1301 * use by this function. (it should be released on return). 1302 */ 1303 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1304 struct btrfs_root *root, 1305 struct btrfs_root *log, 1306 struct btrfs_path *path, 1307 struct extent_buffer *eb, int slot, 1308 struct btrfs_key *key) 1309 { 1310 struct inode *dir = NULL; 1311 struct inode *inode = NULL; 1312 unsigned long ref_ptr; 1313 unsigned long ref_end; 1314 char *name = NULL; 1315 int namelen; 1316 int ret; 1317 int search_done = 0; 1318 int log_ref_ver = 0; 1319 u64 parent_objectid; 1320 u64 inode_objectid; 1321 u64 ref_index = 0; 1322 int ref_struct_size; 1323 1324 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1325 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 1326 1327 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1328 struct btrfs_inode_extref *r; 1329 1330 ref_struct_size = sizeof(struct btrfs_inode_extref); 1331 log_ref_ver = 1; 1332 r = (struct btrfs_inode_extref *)ref_ptr; 1333 parent_objectid = btrfs_inode_extref_parent(eb, r); 1334 } else { 1335 ref_struct_size = sizeof(struct btrfs_inode_ref); 1336 parent_objectid = key->offset; 1337 } 1338 inode_objectid = key->objectid; 1339 1340 /* 1341 * it is possible that we didn't log all the parent directories 1342 * for a given inode. If we don't find the dir, just don't 1343 * copy the back ref in. The link count fixup code will take 1344 * care of the rest 1345 */ 1346 dir = read_one_inode(root, parent_objectid); 1347 if (!dir) { 1348 ret = -ENOENT; 1349 goto out; 1350 } 1351 1352 inode = read_one_inode(root, inode_objectid); 1353 if (!inode) { 1354 ret = -EIO; 1355 goto out; 1356 } 1357 1358 while (ref_ptr < ref_end) { 1359 if (log_ref_ver) { 1360 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1361 &ref_index, &parent_objectid); 1362 /* 1363 * parent object can change from one array 1364 * item to another. 1365 */ 1366 if (!dir) 1367 dir = read_one_inode(root, parent_objectid); 1368 if (!dir) { 1369 ret = -ENOENT; 1370 goto out; 1371 } 1372 } else { 1373 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1374 &ref_index); 1375 } 1376 if (ret) 1377 goto out; 1378 1379 /* if we already have a perfect match, we're done */ 1380 if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), 1381 btrfs_ino(BTRFS_I(inode)), ref_index, 1382 name, namelen)) { 1383 /* 1384 * look for a conflicting back reference in the 1385 * metadata. if we find one we have to unlink that name 1386 * of the file before we add our new link. Later on, we 1387 * overwrite any existing back reference, and we don't 1388 * want to create dangling pointers in the directory. 1389 */ 1390 1391 if (!search_done) { 1392 ret = __add_inode_ref(trans, root, path, log, 1393 BTRFS_I(dir), 1394 BTRFS_I(inode), 1395 inode_objectid, 1396 parent_objectid, 1397 ref_index, name, namelen, 1398 &search_done); 1399 if (ret) { 1400 if (ret == 1) 1401 ret = 0; 1402 goto out; 1403 } 1404 } 1405 1406 /* insert our name */ 1407 ret = btrfs_add_link(trans, BTRFS_I(dir), 1408 BTRFS_I(inode), 1409 name, namelen, 0, ref_index); 1410 if (ret) 1411 goto out; 1412 1413 btrfs_update_inode(trans, root, inode); 1414 } 1415 1416 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1417 kfree(name); 1418 name = NULL; 1419 if (log_ref_ver) { 1420 iput(dir); 1421 dir = NULL; 1422 } 1423 } 1424 1425 /* 1426 * Before we overwrite the inode reference item in the subvolume tree 1427 * with the item from the log tree, we must unlink all names from the 1428 * parent directory that are in the subvolume's tree inode reference 1429 * item, otherwise we end up with an inconsistent subvolume tree where 1430 * dir index entries exist for a name but there is no inode reference 1431 * item with the same name. 1432 */ 1433 ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, 1434 key); 1435 if (ret) 1436 goto out; 1437 1438 /* finally write the back reference in the inode */ 1439 ret = overwrite_item(trans, root, path, eb, slot, key); 1440 out: 1441 btrfs_release_path(path); 1442 kfree(name); 1443 iput(dir); 1444 iput(inode); 1445 return ret; 1446 } 1447 1448 static int insert_orphan_item(struct btrfs_trans_handle *trans, 1449 struct btrfs_root *root, u64 ino) 1450 { 1451 int ret; 1452 1453 ret = btrfs_insert_orphan_item(trans, root, ino); 1454 if (ret == -EEXIST) 1455 ret = 0; 1456 1457 return ret; 1458 } 1459 1460 static int count_inode_extrefs(struct btrfs_root *root, 1461 struct btrfs_inode *inode, struct btrfs_path *path) 1462 { 1463 int ret = 0; 1464 int name_len; 1465 unsigned int nlink = 0; 1466 u32 item_size; 1467 u32 cur_offset = 0; 1468 u64 inode_objectid = btrfs_ino(inode); 1469 u64 offset = 0; 1470 unsigned long ptr; 1471 struct btrfs_inode_extref *extref; 1472 struct extent_buffer *leaf; 1473 1474 while (1) { 1475 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1476 &extref, &offset); 1477 if (ret) 1478 break; 1479 1480 leaf = path->nodes[0]; 1481 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1482 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1483 cur_offset = 0; 1484 1485 while (cur_offset < item_size) { 1486 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1487 name_len = btrfs_inode_extref_name_len(leaf, extref); 1488 1489 nlink++; 1490 1491 cur_offset += name_len + sizeof(*extref); 1492 } 1493 1494 offset++; 1495 btrfs_release_path(path); 1496 } 1497 btrfs_release_path(path); 1498 1499 if (ret < 0 && ret != -ENOENT) 1500 return ret; 1501 return nlink; 1502 } 1503 1504 static int count_inode_refs(struct btrfs_root *root, 1505 struct btrfs_inode *inode, struct btrfs_path *path) 1506 { 1507 int ret; 1508 struct btrfs_key key; 1509 unsigned int nlink = 0; 1510 unsigned long ptr; 1511 unsigned long ptr_end; 1512 int name_len; 1513 u64 ino = btrfs_ino(inode); 1514 1515 key.objectid = ino; 1516 key.type = BTRFS_INODE_REF_KEY; 1517 key.offset = (u64)-1; 1518 1519 while (1) { 1520 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1521 if (ret < 0) 1522 break; 1523 if (ret > 0) { 1524 if (path->slots[0] == 0) 1525 break; 1526 path->slots[0]--; 1527 } 1528 process_slot: 1529 btrfs_item_key_to_cpu(path->nodes[0], &key, 1530 path->slots[0]); 1531 if (key.objectid != ino || 1532 key.type != BTRFS_INODE_REF_KEY) 1533 break; 1534 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1535 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 1536 path->slots[0]); 1537 while (ptr < ptr_end) { 1538 struct btrfs_inode_ref *ref; 1539 1540 ref = (struct btrfs_inode_ref *)ptr; 1541 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1542 ref); 1543 ptr = (unsigned long)(ref + 1) + name_len; 1544 nlink++; 1545 } 1546 1547 if (key.offset == 0) 1548 break; 1549 if (path->slots[0] > 0) { 1550 path->slots[0]--; 1551 goto process_slot; 1552 } 1553 key.offset--; 1554 btrfs_release_path(path); 1555 } 1556 btrfs_release_path(path); 1557 1558 return nlink; 1559 } 1560 1561 /* 1562 * There are a few corners where the link count of the file can't 1563 * be properly maintained during replay. So, instead of adding 1564 * lots of complexity to the log code, we just scan the backrefs 1565 * for any file that has been through replay. 1566 * 1567 * The scan will update the link count on the inode to reflect the 1568 * number of back refs found. If it goes down to zero, the iput 1569 * will free the inode. 1570 */ 1571 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1572 struct btrfs_root *root, 1573 struct inode *inode) 1574 { 1575 struct btrfs_path *path; 1576 int ret; 1577 u64 nlink = 0; 1578 u64 ino = btrfs_ino(BTRFS_I(inode)); 1579 1580 path = btrfs_alloc_path(); 1581 if (!path) 1582 return -ENOMEM; 1583 1584 ret = count_inode_refs(root, BTRFS_I(inode), path); 1585 if (ret < 0) 1586 goto out; 1587 1588 nlink = ret; 1589 1590 ret = count_inode_extrefs(root, BTRFS_I(inode), path); 1591 if (ret < 0) 1592 goto out; 1593 1594 nlink += ret; 1595 1596 ret = 0; 1597 1598 if (nlink != inode->i_nlink) { 1599 set_nlink(inode, nlink); 1600 btrfs_update_inode(trans, root, inode); 1601 } 1602 BTRFS_I(inode)->index_cnt = (u64)-1; 1603 1604 if (inode->i_nlink == 0) { 1605 if (S_ISDIR(inode->i_mode)) { 1606 ret = replay_dir_deletes(trans, root, NULL, path, 1607 ino, 1); 1608 if (ret) 1609 goto out; 1610 } 1611 ret = insert_orphan_item(trans, root, ino); 1612 } 1613 1614 out: 1615 btrfs_free_path(path); 1616 return ret; 1617 } 1618 1619 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1620 struct btrfs_root *root, 1621 struct btrfs_path *path) 1622 { 1623 int ret; 1624 struct btrfs_key key; 1625 struct inode *inode; 1626 1627 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1628 key.type = BTRFS_ORPHAN_ITEM_KEY; 1629 key.offset = (u64)-1; 1630 while (1) { 1631 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1632 if (ret < 0) 1633 break; 1634 1635 if (ret == 1) { 1636 if (path->slots[0] == 0) 1637 break; 1638 path->slots[0]--; 1639 } 1640 1641 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1642 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1643 key.type != BTRFS_ORPHAN_ITEM_KEY) 1644 break; 1645 1646 ret = btrfs_del_item(trans, root, path); 1647 if (ret) 1648 goto out; 1649 1650 btrfs_release_path(path); 1651 inode = read_one_inode(root, key.offset); 1652 if (!inode) 1653 return -EIO; 1654 1655 ret = fixup_inode_link_count(trans, root, inode); 1656 iput(inode); 1657 if (ret) 1658 goto out; 1659 1660 /* 1661 * fixup on a directory may create new entries, 1662 * make sure we always look for the highset possible 1663 * offset 1664 */ 1665 key.offset = (u64)-1; 1666 } 1667 ret = 0; 1668 out: 1669 btrfs_release_path(path); 1670 return ret; 1671 } 1672 1673 1674 /* 1675 * record a given inode in the fixup dir so we can check its link 1676 * count when replay is done. The link count is incremented here 1677 * so the inode won't go away until we check it 1678 */ 1679 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1680 struct btrfs_root *root, 1681 struct btrfs_path *path, 1682 u64 objectid) 1683 { 1684 struct btrfs_key key; 1685 int ret = 0; 1686 struct inode *inode; 1687 1688 inode = read_one_inode(root, objectid); 1689 if (!inode) 1690 return -EIO; 1691 1692 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1693 key.type = BTRFS_ORPHAN_ITEM_KEY; 1694 key.offset = objectid; 1695 1696 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1697 1698 btrfs_release_path(path); 1699 if (ret == 0) { 1700 if (!inode->i_nlink) 1701 set_nlink(inode, 1); 1702 else 1703 inc_nlink(inode); 1704 ret = btrfs_update_inode(trans, root, inode); 1705 } else if (ret == -EEXIST) { 1706 ret = 0; 1707 } else { 1708 BUG(); /* Logic Error */ 1709 } 1710 iput(inode); 1711 1712 return ret; 1713 } 1714 1715 /* 1716 * when replaying the log for a directory, we only insert names 1717 * for inodes that actually exist. This means an fsync on a directory 1718 * does not implicitly fsync all the new files in it 1719 */ 1720 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1721 struct btrfs_root *root, 1722 u64 dirid, u64 index, 1723 char *name, int name_len, 1724 struct btrfs_key *location) 1725 { 1726 struct inode *inode; 1727 struct inode *dir; 1728 int ret; 1729 1730 inode = read_one_inode(root, location->objectid); 1731 if (!inode) 1732 return -ENOENT; 1733 1734 dir = read_one_inode(root, dirid); 1735 if (!dir) { 1736 iput(inode); 1737 return -EIO; 1738 } 1739 1740 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 1741 name_len, 1, index); 1742 1743 /* FIXME, put inode into FIXUP list */ 1744 1745 iput(inode); 1746 iput(dir); 1747 return ret; 1748 } 1749 1750 /* 1751 * Return true if an inode reference exists in the log for the given name, 1752 * inode and parent inode. 1753 */ 1754 static bool name_in_log_ref(struct btrfs_root *log_root, 1755 const char *name, const int name_len, 1756 const u64 dirid, const u64 ino) 1757 { 1758 struct btrfs_key search_key; 1759 1760 search_key.objectid = ino; 1761 search_key.type = BTRFS_INODE_REF_KEY; 1762 search_key.offset = dirid; 1763 if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1764 return true; 1765 1766 search_key.type = BTRFS_INODE_EXTREF_KEY; 1767 search_key.offset = btrfs_extref_hash(dirid, name, name_len); 1768 if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1769 return true; 1770 1771 return false; 1772 } 1773 1774 /* 1775 * take a single entry in a log directory item and replay it into 1776 * the subvolume. 1777 * 1778 * if a conflicting item exists in the subdirectory already, 1779 * the inode it points to is unlinked and put into the link count 1780 * fix up tree. 1781 * 1782 * If a name from the log points to a file or directory that does 1783 * not exist in the FS, it is skipped. fsyncs on directories 1784 * do not force down inodes inside that directory, just changes to the 1785 * names or unlinks in a directory. 1786 * 1787 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1788 * non-existing inode) and 1 if the name was replayed. 1789 */ 1790 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1791 struct btrfs_root *root, 1792 struct btrfs_path *path, 1793 struct extent_buffer *eb, 1794 struct btrfs_dir_item *di, 1795 struct btrfs_key *key) 1796 { 1797 char *name; 1798 int name_len; 1799 struct btrfs_dir_item *dst_di; 1800 struct btrfs_key found_key; 1801 struct btrfs_key log_key; 1802 struct inode *dir; 1803 u8 log_type; 1804 int exists; 1805 int ret = 0; 1806 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); 1807 bool name_added = false; 1808 1809 dir = read_one_inode(root, key->objectid); 1810 if (!dir) 1811 return -EIO; 1812 1813 name_len = btrfs_dir_name_len(eb, di); 1814 name = kmalloc(name_len, GFP_NOFS); 1815 if (!name) { 1816 ret = -ENOMEM; 1817 goto out; 1818 } 1819 1820 log_type = btrfs_dir_type(eb, di); 1821 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1822 name_len); 1823 1824 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1825 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1826 if (exists == 0) 1827 exists = 1; 1828 else 1829 exists = 0; 1830 btrfs_release_path(path); 1831 1832 if (key->type == BTRFS_DIR_ITEM_KEY) { 1833 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1834 name, name_len, 1); 1835 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1836 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1837 key->objectid, 1838 key->offset, name, 1839 name_len, 1); 1840 } else { 1841 /* Corruption */ 1842 ret = -EINVAL; 1843 goto out; 1844 } 1845 if (IS_ERR_OR_NULL(dst_di)) { 1846 /* we need a sequence number to insert, so we only 1847 * do inserts for the BTRFS_DIR_INDEX_KEY types 1848 */ 1849 if (key->type != BTRFS_DIR_INDEX_KEY) 1850 goto out; 1851 goto insert; 1852 } 1853 1854 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1855 /* the existing item matches the logged item */ 1856 if (found_key.objectid == log_key.objectid && 1857 found_key.type == log_key.type && 1858 found_key.offset == log_key.offset && 1859 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1860 update_size = false; 1861 goto out; 1862 } 1863 1864 /* 1865 * don't drop the conflicting directory entry if the inode 1866 * for the new entry doesn't exist 1867 */ 1868 if (!exists) 1869 goto out; 1870 1871 ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di); 1872 if (ret) 1873 goto out; 1874 1875 if (key->type == BTRFS_DIR_INDEX_KEY) 1876 goto insert; 1877 out: 1878 btrfs_release_path(path); 1879 if (!ret && update_size) { 1880 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); 1881 ret = btrfs_update_inode(trans, root, dir); 1882 } 1883 kfree(name); 1884 iput(dir); 1885 if (!ret && name_added) 1886 ret = 1; 1887 return ret; 1888 1889 insert: 1890 if (name_in_log_ref(root->log_root, name, name_len, 1891 key->objectid, log_key.objectid)) { 1892 /* The dentry will be added later. */ 1893 ret = 0; 1894 update_size = false; 1895 goto out; 1896 } 1897 btrfs_release_path(path); 1898 ret = insert_one_name(trans, root, key->objectid, key->offset, 1899 name, name_len, &log_key); 1900 if (ret && ret != -ENOENT && ret != -EEXIST) 1901 goto out; 1902 if (!ret) 1903 name_added = true; 1904 update_size = false; 1905 ret = 0; 1906 goto out; 1907 } 1908 1909 /* 1910 * find all the names in a directory item and reconcile them into 1911 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1912 * one name in a directory item, but the same code gets used for 1913 * both directory index types 1914 */ 1915 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1916 struct btrfs_root *root, 1917 struct btrfs_path *path, 1918 struct extent_buffer *eb, int slot, 1919 struct btrfs_key *key) 1920 { 1921 int ret = 0; 1922 u32 item_size = btrfs_item_size_nr(eb, slot); 1923 struct btrfs_dir_item *di; 1924 int name_len; 1925 unsigned long ptr; 1926 unsigned long ptr_end; 1927 struct btrfs_path *fixup_path = NULL; 1928 1929 ptr = btrfs_item_ptr_offset(eb, slot); 1930 ptr_end = ptr + item_size; 1931 while (ptr < ptr_end) { 1932 di = (struct btrfs_dir_item *)ptr; 1933 name_len = btrfs_dir_name_len(eb, di); 1934 ret = replay_one_name(trans, root, path, eb, di, key); 1935 if (ret < 0) 1936 break; 1937 ptr = (unsigned long)(di + 1); 1938 ptr += name_len; 1939 1940 /* 1941 * If this entry refers to a non-directory (directories can not 1942 * have a link count > 1) and it was added in the transaction 1943 * that was not committed, make sure we fixup the link count of 1944 * the inode it the entry points to. Otherwise something like 1945 * the following would result in a directory pointing to an 1946 * inode with a wrong link that does not account for this dir 1947 * entry: 1948 * 1949 * mkdir testdir 1950 * touch testdir/foo 1951 * touch testdir/bar 1952 * sync 1953 * 1954 * ln testdir/bar testdir/bar_link 1955 * ln testdir/foo testdir/foo_link 1956 * xfs_io -c "fsync" testdir/bar 1957 * 1958 * <power failure> 1959 * 1960 * mount fs, log replay happens 1961 * 1962 * File foo would remain with a link count of 1 when it has two 1963 * entries pointing to it in the directory testdir. This would 1964 * make it impossible to ever delete the parent directory has 1965 * it would result in stale dentries that can never be deleted. 1966 */ 1967 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 1968 struct btrfs_key di_key; 1969 1970 if (!fixup_path) { 1971 fixup_path = btrfs_alloc_path(); 1972 if (!fixup_path) { 1973 ret = -ENOMEM; 1974 break; 1975 } 1976 } 1977 1978 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 1979 ret = link_to_fixup_dir(trans, root, fixup_path, 1980 di_key.objectid); 1981 if (ret) 1982 break; 1983 } 1984 ret = 0; 1985 } 1986 btrfs_free_path(fixup_path); 1987 return ret; 1988 } 1989 1990 /* 1991 * directory replay has two parts. There are the standard directory 1992 * items in the log copied from the subvolume, and range items 1993 * created in the log while the subvolume was logged. 1994 * 1995 * The range items tell us which parts of the key space the log 1996 * is authoritative for. During replay, if a key in the subvolume 1997 * directory is in a logged range item, but not actually in the log 1998 * that means it was deleted from the directory before the fsync 1999 * and should be removed. 2000 */ 2001 static noinline int find_dir_range(struct btrfs_root *root, 2002 struct btrfs_path *path, 2003 u64 dirid, int key_type, 2004 u64 *start_ret, u64 *end_ret) 2005 { 2006 struct btrfs_key key; 2007 u64 found_end; 2008 struct btrfs_dir_log_item *item; 2009 int ret; 2010 int nritems; 2011 2012 if (*start_ret == (u64)-1) 2013 return 1; 2014 2015 key.objectid = dirid; 2016 key.type = key_type; 2017 key.offset = *start_ret; 2018 2019 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2020 if (ret < 0) 2021 goto out; 2022 if (ret > 0) { 2023 if (path->slots[0] == 0) 2024 goto out; 2025 path->slots[0]--; 2026 } 2027 if (ret != 0) 2028 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2029 2030 if (key.type != key_type || key.objectid != dirid) { 2031 ret = 1; 2032 goto next; 2033 } 2034 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2035 struct btrfs_dir_log_item); 2036 found_end = btrfs_dir_log_end(path->nodes[0], item); 2037 2038 if (*start_ret >= key.offset && *start_ret <= found_end) { 2039 ret = 0; 2040 *start_ret = key.offset; 2041 *end_ret = found_end; 2042 goto out; 2043 } 2044 ret = 1; 2045 next: 2046 /* check the next slot in the tree to see if it is a valid item */ 2047 nritems = btrfs_header_nritems(path->nodes[0]); 2048 path->slots[0]++; 2049 if (path->slots[0] >= nritems) { 2050 ret = btrfs_next_leaf(root, path); 2051 if (ret) 2052 goto out; 2053 } 2054 2055 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2056 2057 if (key.type != key_type || key.objectid != dirid) { 2058 ret = 1; 2059 goto out; 2060 } 2061 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2062 struct btrfs_dir_log_item); 2063 found_end = btrfs_dir_log_end(path->nodes[0], item); 2064 *start_ret = key.offset; 2065 *end_ret = found_end; 2066 ret = 0; 2067 out: 2068 btrfs_release_path(path); 2069 return ret; 2070 } 2071 2072 /* 2073 * this looks for a given directory item in the log. If the directory 2074 * item is not in the log, the item is removed and the inode it points 2075 * to is unlinked 2076 */ 2077 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2078 struct btrfs_root *root, 2079 struct btrfs_root *log, 2080 struct btrfs_path *path, 2081 struct btrfs_path *log_path, 2082 struct inode *dir, 2083 struct btrfs_key *dir_key) 2084 { 2085 int ret; 2086 struct extent_buffer *eb; 2087 int slot; 2088 u32 item_size; 2089 struct btrfs_dir_item *di; 2090 struct btrfs_dir_item *log_di; 2091 int name_len; 2092 unsigned long ptr; 2093 unsigned long ptr_end; 2094 char *name; 2095 struct inode *inode; 2096 struct btrfs_key location; 2097 2098 again: 2099 eb = path->nodes[0]; 2100 slot = path->slots[0]; 2101 item_size = btrfs_item_size_nr(eb, slot); 2102 ptr = btrfs_item_ptr_offset(eb, slot); 2103 ptr_end = ptr + item_size; 2104 while (ptr < ptr_end) { 2105 di = (struct btrfs_dir_item *)ptr; 2106 name_len = btrfs_dir_name_len(eb, di); 2107 name = kmalloc(name_len, GFP_NOFS); 2108 if (!name) { 2109 ret = -ENOMEM; 2110 goto out; 2111 } 2112 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2113 name_len); 2114 log_di = NULL; 2115 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2116 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2117 dir_key->objectid, 2118 name, name_len, 0); 2119 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2120 log_di = btrfs_lookup_dir_index_item(trans, log, 2121 log_path, 2122 dir_key->objectid, 2123 dir_key->offset, 2124 name, name_len, 0); 2125 } 2126 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2127 btrfs_dir_item_key_to_cpu(eb, di, &location); 2128 btrfs_release_path(path); 2129 btrfs_release_path(log_path); 2130 inode = read_one_inode(root, location.objectid); 2131 if (!inode) { 2132 kfree(name); 2133 return -EIO; 2134 } 2135 2136 ret = link_to_fixup_dir(trans, root, 2137 path, location.objectid); 2138 if (ret) { 2139 kfree(name); 2140 iput(inode); 2141 goto out; 2142 } 2143 2144 inc_nlink(inode); 2145 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2146 BTRFS_I(inode), name, name_len); 2147 if (!ret) 2148 ret = btrfs_run_delayed_items(trans); 2149 kfree(name); 2150 iput(inode); 2151 if (ret) 2152 goto out; 2153 2154 /* there might still be more names under this key 2155 * check and repeat if required 2156 */ 2157 ret = btrfs_search_slot(NULL, root, dir_key, path, 2158 0, 0); 2159 if (ret == 0) 2160 goto again; 2161 ret = 0; 2162 goto out; 2163 } else if (IS_ERR(log_di)) { 2164 kfree(name); 2165 return PTR_ERR(log_di); 2166 } 2167 btrfs_release_path(log_path); 2168 kfree(name); 2169 2170 ptr = (unsigned long)(di + 1); 2171 ptr += name_len; 2172 } 2173 ret = 0; 2174 out: 2175 btrfs_release_path(path); 2176 btrfs_release_path(log_path); 2177 return ret; 2178 } 2179 2180 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2181 struct btrfs_root *root, 2182 struct btrfs_root *log, 2183 struct btrfs_path *path, 2184 const u64 ino) 2185 { 2186 struct btrfs_key search_key; 2187 struct btrfs_path *log_path; 2188 int i; 2189 int nritems; 2190 int ret; 2191 2192 log_path = btrfs_alloc_path(); 2193 if (!log_path) 2194 return -ENOMEM; 2195 2196 search_key.objectid = ino; 2197 search_key.type = BTRFS_XATTR_ITEM_KEY; 2198 search_key.offset = 0; 2199 again: 2200 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2201 if (ret < 0) 2202 goto out; 2203 process_leaf: 2204 nritems = btrfs_header_nritems(path->nodes[0]); 2205 for (i = path->slots[0]; i < nritems; i++) { 2206 struct btrfs_key key; 2207 struct btrfs_dir_item *di; 2208 struct btrfs_dir_item *log_di; 2209 u32 total_size; 2210 u32 cur; 2211 2212 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2213 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2214 ret = 0; 2215 goto out; 2216 } 2217 2218 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2219 total_size = btrfs_item_size_nr(path->nodes[0], i); 2220 cur = 0; 2221 while (cur < total_size) { 2222 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2223 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2224 u32 this_len = sizeof(*di) + name_len + data_len; 2225 char *name; 2226 2227 name = kmalloc(name_len, GFP_NOFS); 2228 if (!name) { 2229 ret = -ENOMEM; 2230 goto out; 2231 } 2232 read_extent_buffer(path->nodes[0], name, 2233 (unsigned long)(di + 1), name_len); 2234 2235 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2236 name, name_len, 0); 2237 btrfs_release_path(log_path); 2238 if (!log_di) { 2239 /* Doesn't exist in log tree, so delete it. */ 2240 btrfs_release_path(path); 2241 di = btrfs_lookup_xattr(trans, root, path, ino, 2242 name, name_len, -1); 2243 kfree(name); 2244 if (IS_ERR(di)) { 2245 ret = PTR_ERR(di); 2246 goto out; 2247 } 2248 ASSERT(di); 2249 ret = btrfs_delete_one_dir_name(trans, root, 2250 path, di); 2251 if (ret) 2252 goto out; 2253 btrfs_release_path(path); 2254 search_key = key; 2255 goto again; 2256 } 2257 kfree(name); 2258 if (IS_ERR(log_di)) { 2259 ret = PTR_ERR(log_di); 2260 goto out; 2261 } 2262 cur += this_len; 2263 di = (struct btrfs_dir_item *)((char *)di + this_len); 2264 } 2265 } 2266 ret = btrfs_next_leaf(root, path); 2267 if (ret > 0) 2268 ret = 0; 2269 else if (ret == 0) 2270 goto process_leaf; 2271 out: 2272 btrfs_free_path(log_path); 2273 btrfs_release_path(path); 2274 return ret; 2275 } 2276 2277 2278 /* 2279 * deletion replay happens before we copy any new directory items 2280 * out of the log or out of backreferences from inodes. It 2281 * scans the log to find ranges of keys that log is authoritative for, 2282 * and then scans the directory to find items in those ranges that are 2283 * not present in the log. 2284 * 2285 * Anything we don't find in the log is unlinked and removed from the 2286 * directory. 2287 */ 2288 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2289 struct btrfs_root *root, 2290 struct btrfs_root *log, 2291 struct btrfs_path *path, 2292 u64 dirid, int del_all) 2293 { 2294 u64 range_start; 2295 u64 range_end; 2296 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2297 int ret = 0; 2298 struct btrfs_key dir_key; 2299 struct btrfs_key found_key; 2300 struct btrfs_path *log_path; 2301 struct inode *dir; 2302 2303 dir_key.objectid = dirid; 2304 dir_key.type = BTRFS_DIR_ITEM_KEY; 2305 log_path = btrfs_alloc_path(); 2306 if (!log_path) 2307 return -ENOMEM; 2308 2309 dir = read_one_inode(root, dirid); 2310 /* it isn't an error if the inode isn't there, that can happen 2311 * because we replay the deletes before we copy in the inode item 2312 * from the log 2313 */ 2314 if (!dir) { 2315 btrfs_free_path(log_path); 2316 return 0; 2317 } 2318 again: 2319 range_start = 0; 2320 range_end = 0; 2321 while (1) { 2322 if (del_all) 2323 range_end = (u64)-1; 2324 else { 2325 ret = find_dir_range(log, path, dirid, key_type, 2326 &range_start, &range_end); 2327 if (ret != 0) 2328 break; 2329 } 2330 2331 dir_key.offset = range_start; 2332 while (1) { 2333 int nritems; 2334 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2335 0, 0); 2336 if (ret < 0) 2337 goto out; 2338 2339 nritems = btrfs_header_nritems(path->nodes[0]); 2340 if (path->slots[0] >= nritems) { 2341 ret = btrfs_next_leaf(root, path); 2342 if (ret == 1) 2343 break; 2344 else if (ret < 0) 2345 goto out; 2346 } 2347 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2348 path->slots[0]); 2349 if (found_key.objectid != dirid || 2350 found_key.type != dir_key.type) 2351 goto next_type; 2352 2353 if (found_key.offset > range_end) 2354 break; 2355 2356 ret = check_item_in_log(trans, root, log, path, 2357 log_path, dir, 2358 &found_key); 2359 if (ret) 2360 goto out; 2361 if (found_key.offset == (u64)-1) 2362 break; 2363 dir_key.offset = found_key.offset + 1; 2364 } 2365 btrfs_release_path(path); 2366 if (range_end == (u64)-1) 2367 break; 2368 range_start = range_end + 1; 2369 } 2370 2371 next_type: 2372 ret = 0; 2373 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2374 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2375 dir_key.type = BTRFS_DIR_INDEX_KEY; 2376 btrfs_release_path(path); 2377 goto again; 2378 } 2379 out: 2380 btrfs_release_path(path); 2381 btrfs_free_path(log_path); 2382 iput(dir); 2383 return ret; 2384 } 2385 2386 /* 2387 * the process_func used to replay items from the log tree. This 2388 * gets called in two different stages. The first stage just looks 2389 * for inodes and makes sure they are all copied into the subvolume. 2390 * 2391 * The second stage copies all the other item types from the log into 2392 * the subvolume. The two stage approach is slower, but gets rid of 2393 * lots of complexity around inodes referencing other inodes that exist 2394 * only in the log (references come from either directory items or inode 2395 * back refs). 2396 */ 2397 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2398 struct walk_control *wc, u64 gen, int level) 2399 { 2400 int nritems; 2401 struct btrfs_path *path; 2402 struct btrfs_root *root = wc->replay_dest; 2403 struct btrfs_key key; 2404 int i; 2405 int ret; 2406 2407 ret = btrfs_read_buffer(eb, gen, level, NULL); 2408 if (ret) 2409 return ret; 2410 2411 level = btrfs_header_level(eb); 2412 2413 if (level != 0) 2414 return 0; 2415 2416 path = btrfs_alloc_path(); 2417 if (!path) 2418 return -ENOMEM; 2419 2420 nritems = btrfs_header_nritems(eb); 2421 for (i = 0; i < nritems; i++) { 2422 btrfs_item_key_to_cpu(eb, &key, i); 2423 2424 /* inode keys are done during the first stage */ 2425 if (key.type == BTRFS_INODE_ITEM_KEY && 2426 wc->stage == LOG_WALK_REPLAY_INODES) { 2427 struct btrfs_inode_item *inode_item; 2428 u32 mode; 2429 2430 inode_item = btrfs_item_ptr(eb, i, 2431 struct btrfs_inode_item); 2432 ret = replay_xattr_deletes(wc->trans, root, log, 2433 path, key.objectid); 2434 if (ret) 2435 break; 2436 mode = btrfs_inode_mode(eb, inode_item); 2437 if (S_ISDIR(mode)) { 2438 ret = replay_dir_deletes(wc->trans, 2439 root, log, path, key.objectid, 0); 2440 if (ret) 2441 break; 2442 } 2443 ret = overwrite_item(wc->trans, root, path, 2444 eb, i, &key); 2445 if (ret) 2446 break; 2447 2448 /* 2449 * Before replaying extents, truncate the inode to its 2450 * size. We need to do it now and not after log replay 2451 * because before an fsync we can have prealloc extents 2452 * added beyond the inode's i_size. If we did it after, 2453 * through orphan cleanup for example, we would drop 2454 * those prealloc extents just after replaying them. 2455 */ 2456 if (S_ISREG(mode)) { 2457 struct inode *inode; 2458 u64 from; 2459 2460 inode = read_one_inode(root, key.objectid); 2461 if (!inode) { 2462 ret = -EIO; 2463 break; 2464 } 2465 from = ALIGN(i_size_read(inode), 2466 root->fs_info->sectorsize); 2467 ret = btrfs_drop_extents(wc->trans, root, inode, 2468 from, (u64)-1, 1); 2469 /* 2470 * If the nlink count is zero here, the iput 2471 * will free the inode. We bump it to make 2472 * sure it doesn't get freed until the link 2473 * count fixup is done. 2474 */ 2475 if (!ret) { 2476 if (inode->i_nlink == 0) 2477 inc_nlink(inode); 2478 /* Update link count and nbytes. */ 2479 ret = btrfs_update_inode(wc->trans, 2480 root, inode); 2481 } 2482 iput(inode); 2483 if (ret) 2484 break; 2485 } 2486 2487 ret = link_to_fixup_dir(wc->trans, root, 2488 path, key.objectid); 2489 if (ret) 2490 break; 2491 } 2492 2493 if (key.type == BTRFS_DIR_INDEX_KEY && 2494 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2495 ret = replay_one_dir_item(wc->trans, root, path, 2496 eb, i, &key); 2497 if (ret) 2498 break; 2499 } 2500 2501 if (wc->stage < LOG_WALK_REPLAY_ALL) 2502 continue; 2503 2504 /* these keys are simply copied */ 2505 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2506 ret = overwrite_item(wc->trans, root, path, 2507 eb, i, &key); 2508 if (ret) 2509 break; 2510 } else if (key.type == BTRFS_INODE_REF_KEY || 2511 key.type == BTRFS_INODE_EXTREF_KEY) { 2512 ret = add_inode_ref(wc->trans, root, log, path, 2513 eb, i, &key); 2514 if (ret && ret != -ENOENT) 2515 break; 2516 ret = 0; 2517 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2518 ret = replay_one_extent(wc->trans, root, path, 2519 eb, i, &key); 2520 if (ret) 2521 break; 2522 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2523 ret = replay_one_dir_item(wc->trans, root, path, 2524 eb, i, &key); 2525 if (ret) 2526 break; 2527 } 2528 } 2529 btrfs_free_path(path); 2530 return ret; 2531 } 2532 2533 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2534 struct btrfs_root *root, 2535 struct btrfs_path *path, int *level, 2536 struct walk_control *wc) 2537 { 2538 struct btrfs_fs_info *fs_info = root->fs_info; 2539 u64 root_owner; 2540 u64 bytenr; 2541 u64 ptr_gen; 2542 struct extent_buffer *next; 2543 struct extent_buffer *cur; 2544 struct extent_buffer *parent; 2545 u32 blocksize; 2546 int ret = 0; 2547 2548 WARN_ON(*level < 0); 2549 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2550 2551 while (*level > 0) { 2552 struct btrfs_key first_key; 2553 2554 WARN_ON(*level < 0); 2555 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2556 cur = path->nodes[*level]; 2557 2558 WARN_ON(btrfs_header_level(cur) != *level); 2559 2560 if (path->slots[*level] >= 2561 btrfs_header_nritems(cur)) 2562 break; 2563 2564 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2565 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2566 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2567 blocksize = fs_info->nodesize; 2568 2569 parent = path->nodes[*level]; 2570 root_owner = btrfs_header_owner(parent); 2571 2572 next = btrfs_find_create_tree_block(fs_info, bytenr); 2573 if (IS_ERR(next)) 2574 return PTR_ERR(next); 2575 2576 if (*level == 1) { 2577 ret = wc->process_func(root, next, wc, ptr_gen, 2578 *level - 1); 2579 if (ret) { 2580 free_extent_buffer(next); 2581 return ret; 2582 } 2583 2584 path->slots[*level]++; 2585 if (wc->free) { 2586 ret = btrfs_read_buffer(next, ptr_gen, 2587 *level - 1, &first_key); 2588 if (ret) { 2589 free_extent_buffer(next); 2590 return ret; 2591 } 2592 2593 if (trans) { 2594 btrfs_tree_lock(next); 2595 btrfs_set_lock_blocking(next); 2596 clean_tree_block(fs_info, next); 2597 btrfs_wait_tree_block_writeback(next); 2598 btrfs_tree_unlock(next); 2599 } else { 2600 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2601 clear_extent_buffer_dirty(next); 2602 } 2603 2604 WARN_ON(root_owner != 2605 BTRFS_TREE_LOG_OBJECTID); 2606 ret = btrfs_free_and_pin_reserved_extent( 2607 fs_info, bytenr, 2608 blocksize); 2609 if (ret) { 2610 free_extent_buffer(next); 2611 return ret; 2612 } 2613 } 2614 free_extent_buffer(next); 2615 continue; 2616 } 2617 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2618 if (ret) { 2619 free_extent_buffer(next); 2620 return ret; 2621 } 2622 2623 WARN_ON(*level <= 0); 2624 if (path->nodes[*level-1]) 2625 free_extent_buffer(path->nodes[*level-1]); 2626 path->nodes[*level-1] = next; 2627 *level = btrfs_header_level(next); 2628 path->slots[*level] = 0; 2629 cond_resched(); 2630 } 2631 WARN_ON(*level < 0); 2632 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2633 2634 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2635 2636 cond_resched(); 2637 return 0; 2638 } 2639 2640 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2641 struct btrfs_root *root, 2642 struct btrfs_path *path, int *level, 2643 struct walk_control *wc) 2644 { 2645 struct btrfs_fs_info *fs_info = root->fs_info; 2646 u64 root_owner; 2647 int i; 2648 int slot; 2649 int ret; 2650 2651 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2652 slot = path->slots[i]; 2653 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2654 path->slots[i]++; 2655 *level = i; 2656 WARN_ON(*level == 0); 2657 return 0; 2658 } else { 2659 struct extent_buffer *parent; 2660 if (path->nodes[*level] == root->node) 2661 parent = path->nodes[*level]; 2662 else 2663 parent = path->nodes[*level + 1]; 2664 2665 root_owner = btrfs_header_owner(parent); 2666 ret = wc->process_func(root, path->nodes[*level], wc, 2667 btrfs_header_generation(path->nodes[*level]), 2668 *level); 2669 if (ret) 2670 return ret; 2671 2672 if (wc->free) { 2673 struct extent_buffer *next; 2674 2675 next = path->nodes[*level]; 2676 2677 if (trans) { 2678 btrfs_tree_lock(next); 2679 btrfs_set_lock_blocking(next); 2680 clean_tree_block(fs_info, next); 2681 btrfs_wait_tree_block_writeback(next); 2682 btrfs_tree_unlock(next); 2683 } else { 2684 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2685 clear_extent_buffer_dirty(next); 2686 } 2687 2688 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2689 ret = btrfs_free_and_pin_reserved_extent( 2690 fs_info, 2691 path->nodes[*level]->start, 2692 path->nodes[*level]->len); 2693 if (ret) 2694 return ret; 2695 } 2696 free_extent_buffer(path->nodes[*level]); 2697 path->nodes[*level] = NULL; 2698 *level = i + 1; 2699 } 2700 } 2701 return 1; 2702 } 2703 2704 /* 2705 * drop the reference count on the tree rooted at 'snap'. This traverses 2706 * the tree freeing any blocks that have a ref count of zero after being 2707 * decremented. 2708 */ 2709 static int walk_log_tree(struct btrfs_trans_handle *trans, 2710 struct btrfs_root *log, struct walk_control *wc) 2711 { 2712 struct btrfs_fs_info *fs_info = log->fs_info; 2713 int ret = 0; 2714 int wret; 2715 int level; 2716 struct btrfs_path *path; 2717 int orig_level; 2718 2719 path = btrfs_alloc_path(); 2720 if (!path) 2721 return -ENOMEM; 2722 2723 level = btrfs_header_level(log->node); 2724 orig_level = level; 2725 path->nodes[level] = log->node; 2726 extent_buffer_get(log->node); 2727 path->slots[level] = 0; 2728 2729 while (1) { 2730 wret = walk_down_log_tree(trans, log, path, &level, wc); 2731 if (wret > 0) 2732 break; 2733 if (wret < 0) { 2734 ret = wret; 2735 goto out; 2736 } 2737 2738 wret = walk_up_log_tree(trans, log, path, &level, wc); 2739 if (wret > 0) 2740 break; 2741 if (wret < 0) { 2742 ret = wret; 2743 goto out; 2744 } 2745 } 2746 2747 /* was the root node processed? if not, catch it here */ 2748 if (path->nodes[orig_level]) { 2749 ret = wc->process_func(log, path->nodes[orig_level], wc, 2750 btrfs_header_generation(path->nodes[orig_level]), 2751 orig_level); 2752 if (ret) 2753 goto out; 2754 if (wc->free) { 2755 struct extent_buffer *next; 2756 2757 next = path->nodes[orig_level]; 2758 2759 if (trans) { 2760 btrfs_tree_lock(next); 2761 btrfs_set_lock_blocking(next); 2762 clean_tree_block(fs_info, next); 2763 btrfs_wait_tree_block_writeback(next); 2764 btrfs_tree_unlock(next); 2765 } else { 2766 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2767 clear_extent_buffer_dirty(next); 2768 } 2769 2770 WARN_ON(log->root_key.objectid != 2771 BTRFS_TREE_LOG_OBJECTID); 2772 ret = btrfs_free_and_pin_reserved_extent(fs_info, 2773 next->start, next->len); 2774 if (ret) 2775 goto out; 2776 } 2777 } 2778 2779 out: 2780 btrfs_free_path(path); 2781 return ret; 2782 } 2783 2784 /* 2785 * helper function to update the item for a given subvolumes log root 2786 * in the tree of log roots 2787 */ 2788 static int update_log_root(struct btrfs_trans_handle *trans, 2789 struct btrfs_root *log) 2790 { 2791 struct btrfs_fs_info *fs_info = log->fs_info; 2792 int ret; 2793 2794 if (log->log_transid == 1) { 2795 /* insert root item on the first sync */ 2796 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2797 &log->root_key, &log->root_item); 2798 } else { 2799 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2800 &log->root_key, &log->root_item); 2801 } 2802 return ret; 2803 } 2804 2805 static void wait_log_commit(struct btrfs_root *root, int transid) 2806 { 2807 DEFINE_WAIT(wait); 2808 int index = transid % 2; 2809 2810 /* 2811 * we only allow two pending log transactions at a time, 2812 * so we know that if ours is more than 2 older than the 2813 * current transaction, we're done 2814 */ 2815 for (;;) { 2816 prepare_to_wait(&root->log_commit_wait[index], 2817 &wait, TASK_UNINTERRUPTIBLE); 2818 2819 if (!(root->log_transid_committed < transid && 2820 atomic_read(&root->log_commit[index]))) 2821 break; 2822 2823 mutex_unlock(&root->log_mutex); 2824 schedule(); 2825 mutex_lock(&root->log_mutex); 2826 } 2827 finish_wait(&root->log_commit_wait[index], &wait); 2828 } 2829 2830 static void wait_for_writer(struct btrfs_root *root) 2831 { 2832 DEFINE_WAIT(wait); 2833 2834 for (;;) { 2835 prepare_to_wait(&root->log_writer_wait, &wait, 2836 TASK_UNINTERRUPTIBLE); 2837 if (!atomic_read(&root->log_writers)) 2838 break; 2839 2840 mutex_unlock(&root->log_mutex); 2841 schedule(); 2842 mutex_lock(&root->log_mutex); 2843 } 2844 finish_wait(&root->log_writer_wait, &wait); 2845 } 2846 2847 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2848 struct btrfs_log_ctx *ctx) 2849 { 2850 if (!ctx) 2851 return; 2852 2853 mutex_lock(&root->log_mutex); 2854 list_del_init(&ctx->list); 2855 mutex_unlock(&root->log_mutex); 2856 } 2857 2858 /* 2859 * Invoked in log mutex context, or be sure there is no other task which 2860 * can access the list. 2861 */ 2862 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2863 int index, int error) 2864 { 2865 struct btrfs_log_ctx *ctx; 2866 struct btrfs_log_ctx *safe; 2867 2868 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2869 list_del_init(&ctx->list); 2870 ctx->log_ret = error; 2871 } 2872 2873 INIT_LIST_HEAD(&root->log_ctxs[index]); 2874 } 2875 2876 /* 2877 * btrfs_sync_log does sends a given tree log down to the disk and 2878 * updates the super blocks to record it. When this call is done, 2879 * you know that any inodes previously logged are safely on disk only 2880 * if it returns 0. 2881 * 2882 * Any other return value means you need to call btrfs_commit_transaction. 2883 * Some of the edge cases for fsyncing directories that have had unlinks 2884 * or renames done in the past mean that sometimes the only safe 2885 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2886 * that has happened. 2887 */ 2888 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2889 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2890 { 2891 int index1; 2892 int index2; 2893 int mark; 2894 int ret; 2895 struct btrfs_fs_info *fs_info = root->fs_info; 2896 struct btrfs_root *log = root->log_root; 2897 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2898 int log_transid = 0; 2899 struct btrfs_log_ctx root_log_ctx; 2900 struct blk_plug plug; 2901 2902 mutex_lock(&root->log_mutex); 2903 log_transid = ctx->log_transid; 2904 if (root->log_transid_committed >= log_transid) { 2905 mutex_unlock(&root->log_mutex); 2906 return ctx->log_ret; 2907 } 2908 2909 index1 = log_transid % 2; 2910 if (atomic_read(&root->log_commit[index1])) { 2911 wait_log_commit(root, log_transid); 2912 mutex_unlock(&root->log_mutex); 2913 return ctx->log_ret; 2914 } 2915 ASSERT(log_transid == root->log_transid); 2916 atomic_set(&root->log_commit[index1], 1); 2917 2918 /* wait for previous tree log sync to complete */ 2919 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2920 wait_log_commit(root, log_transid - 1); 2921 2922 while (1) { 2923 int batch = atomic_read(&root->log_batch); 2924 /* when we're on an ssd, just kick the log commit out */ 2925 if (!btrfs_test_opt(fs_info, SSD) && 2926 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2927 mutex_unlock(&root->log_mutex); 2928 schedule_timeout_uninterruptible(1); 2929 mutex_lock(&root->log_mutex); 2930 } 2931 wait_for_writer(root); 2932 if (batch == atomic_read(&root->log_batch)) 2933 break; 2934 } 2935 2936 /* bail out if we need to do a full commit */ 2937 if (btrfs_need_log_full_commit(fs_info, trans)) { 2938 ret = -EAGAIN; 2939 btrfs_free_logged_extents(log, log_transid); 2940 mutex_unlock(&root->log_mutex); 2941 goto out; 2942 } 2943 2944 if (log_transid % 2 == 0) 2945 mark = EXTENT_DIRTY; 2946 else 2947 mark = EXTENT_NEW; 2948 2949 /* we start IO on all the marked extents here, but we don't actually 2950 * wait for them until later. 2951 */ 2952 blk_start_plug(&plug); 2953 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2954 if (ret) { 2955 blk_finish_plug(&plug); 2956 btrfs_abort_transaction(trans, ret); 2957 btrfs_free_logged_extents(log, log_transid); 2958 btrfs_set_log_full_commit(fs_info, trans); 2959 mutex_unlock(&root->log_mutex); 2960 goto out; 2961 } 2962 2963 btrfs_set_root_node(&log->root_item, log->node); 2964 2965 root->log_transid++; 2966 log->log_transid = root->log_transid; 2967 root->log_start_pid = 0; 2968 /* 2969 * IO has been started, blocks of the log tree have WRITTEN flag set 2970 * in their headers. new modifications of the log will be written to 2971 * new positions. so it's safe to allow log writers to go in. 2972 */ 2973 mutex_unlock(&root->log_mutex); 2974 2975 btrfs_init_log_ctx(&root_log_ctx, NULL); 2976 2977 mutex_lock(&log_root_tree->log_mutex); 2978 atomic_inc(&log_root_tree->log_batch); 2979 atomic_inc(&log_root_tree->log_writers); 2980 2981 index2 = log_root_tree->log_transid % 2; 2982 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2983 root_log_ctx.log_transid = log_root_tree->log_transid; 2984 2985 mutex_unlock(&log_root_tree->log_mutex); 2986 2987 ret = update_log_root(trans, log); 2988 2989 mutex_lock(&log_root_tree->log_mutex); 2990 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2991 /* 2992 * Implicit memory barrier after atomic_dec_and_test 2993 */ 2994 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2995 wake_up(&log_root_tree->log_writer_wait); 2996 } 2997 2998 if (ret) { 2999 if (!list_empty(&root_log_ctx.list)) 3000 list_del_init(&root_log_ctx.list); 3001 3002 blk_finish_plug(&plug); 3003 btrfs_set_log_full_commit(fs_info, trans); 3004 3005 if (ret != -ENOSPC) { 3006 btrfs_abort_transaction(trans, ret); 3007 mutex_unlock(&log_root_tree->log_mutex); 3008 goto out; 3009 } 3010 btrfs_wait_tree_log_extents(log, mark); 3011 btrfs_free_logged_extents(log, log_transid); 3012 mutex_unlock(&log_root_tree->log_mutex); 3013 ret = -EAGAIN; 3014 goto out; 3015 } 3016 3017 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3018 blk_finish_plug(&plug); 3019 list_del_init(&root_log_ctx.list); 3020 mutex_unlock(&log_root_tree->log_mutex); 3021 ret = root_log_ctx.log_ret; 3022 goto out; 3023 } 3024 3025 index2 = root_log_ctx.log_transid % 2; 3026 if (atomic_read(&log_root_tree->log_commit[index2])) { 3027 blk_finish_plug(&plug); 3028 ret = btrfs_wait_tree_log_extents(log, mark); 3029 btrfs_wait_logged_extents(trans, log, log_transid); 3030 wait_log_commit(log_root_tree, 3031 root_log_ctx.log_transid); 3032 mutex_unlock(&log_root_tree->log_mutex); 3033 if (!ret) 3034 ret = root_log_ctx.log_ret; 3035 goto out; 3036 } 3037 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3038 atomic_set(&log_root_tree->log_commit[index2], 1); 3039 3040 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3041 wait_log_commit(log_root_tree, 3042 root_log_ctx.log_transid - 1); 3043 } 3044 3045 wait_for_writer(log_root_tree); 3046 3047 /* 3048 * now that we've moved on to the tree of log tree roots, 3049 * check the full commit flag again 3050 */ 3051 if (btrfs_need_log_full_commit(fs_info, trans)) { 3052 blk_finish_plug(&plug); 3053 btrfs_wait_tree_log_extents(log, mark); 3054 btrfs_free_logged_extents(log, log_transid); 3055 mutex_unlock(&log_root_tree->log_mutex); 3056 ret = -EAGAIN; 3057 goto out_wake_log_root; 3058 } 3059 3060 ret = btrfs_write_marked_extents(fs_info, 3061 &log_root_tree->dirty_log_pages, 3062 EXTENT_DIRTY | EXTENT_NEW); 3063 blk_finish_plug(&plug); 3064 if (ret) { 3065 btrfs_set_log_full_commit(fs_info, trans); 3066 btrfs_abort_transaction(trans, ret); 3067 btrfs_free_logged_extents(log, log_transid); 3068 mutex_unlock(&log_root_tree->log_mutex); 3069 goto out_wake_log_root; 3070 } 3071 ret = btrfs_wait_tree_log_extents(log, mark); 3072 if (!ret) 3073 ret = btrfs_wait_tree_log_extents(log_root_tree, 3074 EXTENT_NEW | EXTENT_DIRTY); 3075 if (ret) { 3076 btrfs_set_log_full_commit(fs_info, trans); 3077 btrfs_free_logged_extents(log, log_transid); 3078 mutex_unlock(&log_root_tree->log_mutex); 3079 goto out_wake_log_root; 3080 } 3081 btrfs_wait_logged_extents(trans, log, log_transid); 3082 3083 btrfs_set_super_log_root(fs_info->super_for_commit, 3084 log_root_tree->node->start); 3085 btrfs_set_super_log_root_level(fs_info->super_for_commit, 3086 btrfs_header_level(log_root_tree->node)); 3087 3088 log_root_tree->log_transid++; 3089 mutex_unlock(&log_root_tree->log_mutex); 3090 3091 /* 3092 * nobody else is going to jump in and write the the ctree 3093 * super here because the log_commit atomic below is protecting 3094 * us. We must be called with a transaction handle pinning 3095 * the running transaction open, so a full commit can't hop 3096 * in and cause problems either. 3097 */ 3098 ret = write_all_supers(fs_info, 1); 3099 if (ret) { 3100 btrfs_set_log_full_commit(fs_info, trans); 3101 btrfs_abort_transaction(trans, ret); 3102 goto out_wake_log_root; 3103 } 3104 3105 mutex_lock(&root->log_mutex); 3106 if (root->last_log_commit < log_transid) 3107 root->last_log_commit = log_transid; 3108 mutex_unlock(&root->log_mutex); 3109 3110 out_wake_log_root: 3111 mutex_lock(&log_root_tree->log_mutex); 3112 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3113 3114 log_root_tree->log_transid_committed++; 3115 atomic_set(&log_root_tree->log_commit[index2], 0); 3116 mutex_unlock(&log_root_tree->log_mutex); 3117 3118 /* 3119 * The barrier before waitqueue_active is implied by mutex_unlock 3120 */ 3121 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 3122 wake_up(&log_root_tree->log_commit_wait[index2]); 3123 out: 3124 mutex_lock(&root->log_mutex); 3125 btrfs_remove_all_log_ctxs(root, index1, ret); 3126 root->log_transid_committed++; 3127 atomic_set(&root->log_commit[index1], 0); 3128 mutex_unlock(&root->log_mutex); 3129 3130 /* 3131 * The barrier before waitqueue_active is implied by mutex_unlock 3132 */ 3133 if (waitqueue_active(&root->log_commit_wait[index1])) 3134 wake_up(&root->log_commit_wait[index1]); 3135 return ret; 3136 } 3137 3138 static void free_log_tree(struct btrfs_trans_handle *trans, 3139 struct btrfs_root *log) 3140 { 3141 int ret; 3142 u64 start; 3143 u64 end; 3144 struct walk_control wc = { 3145 .free = 1, 3146 .process_func = process_one_buffer 3147 }; 3148 3149 ret = walk_log_tree(trans, log, &wc); 3150 /* I don't think this can happen but just in case */ 3151 if (ret) 3152 btrfs_abort_transaction(trans, ret); 3153 3154 while (1) { 3155 ret = find_first_extent_bit(&log->dirty_log_pages, 3156 0, &start, &end, 3157 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, 3158 NULL); 3159 if (ret) 3160 break; 3161 3162 clear_extent_bits(&log->dirty_log_pages, start, end, 3163 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3164 } 3165 3166 /* 3167 * We may have short-circuited the log tree with the full commit logic 3168 * and left ordered extents on our list, so clear these out to keep us 3169 * from leaking inodes and memory. 3170 */ 3171 btrfs_free_logged_extents(log, 0); 3172 btrfs_free_logged_extents(log, 1); 3173 3174 free_extent_buffer(log->node); 3175 kfree(log); 3176 } 3177 3178 /* 3179 * free all the extents used by the tree log. This should be called 3180 * at commit time of the full transaction 3181 */ 3182 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3183 { 3184 if (root->log_root) { 3185 free_log_tree(trans, root->log_root); 3186 root->log_root = NULL; 3187 } 3188 return 0; 3189 } 3190 3191 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3192 struct btrfs_fs_info *fs_info) 3193 { 3194 if (fs_info->log_root_tree) { 3195 free_log_tree(trans, fs_info->log_root_tree); 3196 fs_info->log_root_tree = NULL; 3197 } 3198 return 0; 3199 } 3200 3201 /* 3202 * If both a file and directory are logged, and unlinks or renames are 3203 * mixed in, we have a few interesting corners: 3204 * 3205 * create file X in dir Y 3206 * link file X to X.link in dir Y 3207 * fsync file X 3208 * unlink file X but leave X.link 3209 * fsync dir Y 3210 * 3211 * After a crash we would expect only X.link to exist. But file X 3212 * didn't get fsync'd again so the log has back refs for X and X.link. 3213 * 3214 * We solve this by removing directory entries and inode backrefs from the 3215 * log when a file that was logged in the current transaction is 3216 * unlinked. Any later fsync will include the updated log entries, and 3217 * we'll be able to reconstruct the proper directory items from backrefs. 3218 * 3219 * This optimizations allows us to avoid relogging the entire inode 3220 * or the entire directory. 3221 */ 3222 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3223 struct btrfs_root *root, 3224 const char *name, int name_len, 3225 struct btrfs_inode *dir, u64 index) 3226 { 3227 struct btrfs_root *log; 3228 struct btrfs_dir_item *di; 3229 struct btrfs_path *path; 3230 int ret; 3231 int err = 0; 3232 int bytes_del = 0; 3233 u64 dir_ino = btrfs_ino(dir); 3234 3235 if (dir->logged_trans < trans->transid) 3236 return 0; 3237 3238 ret = join_running_log_trans(root); 3239 if (ret) 3240 return 0; 3241 3242 mutex_lock(&dir->log_mutex); 3243 3244 log = root->log_root; 3245 path = btrfs_alloc_path(); 3246 if (!path) { 3247 err = -ENOMEM; 3248 goto out_unlock; 3249 } 3250 3251 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3252 name, name_len, -1); 3253 if (IS_ERR(di)) { 3254 err = PTR_ERR(di); 3255 goto fail; 3256 } 3257 if (di) { 3258 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3259 bytes_del += name_len; 3260 if (ret) { 3261 err = ret; 3262 goto fail; 3263 } 3264 } 3265 btrfs_release_path(path); 3266 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3267 index, name, name_len, -1); 3268 if (IS_ERR(di)) { 3269 err = PTR_ERR(di); 3270 goto fail; 3271 } 3272 if (di) { 3273 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3274 bytes_del += name_len; 3275 if (ret) { 3276 err = ret; 3277 goto fail; 3278 } 3279 } 3280 3281 /* update the directory size in the log to reflect the names 3282 * we have removed 3283 */ 3284 if (bytes_del) { 3285 struct btrfs_key key; 3286 3287 key.objectid = dir_ino; 3288 key.offset = 0; 3289 key.type = BTRFS_INODE_ITEM_KEY; 3290 btrfs_release_path(path); 3291 3292 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3293 if (ret < 0) { 3294 err = ret; 3295 goto fail; 3296 } 3297 if (ret == 0) { 3298 struct btrfs_inode_item *item; 3299 u64 i_size; 3300 3301 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3302 struct btrfs_inode_item); 3303 i_size = btrfs_inode_size(path->nodes[0], item); 3304 if (i_size > bytes_del) 3305 i_size -= bytes_del; 3306 else 3307 i_size = 0; 3308 btrfs_set_inode_size(path->nodes[0], item, i_size); 3309 btrfs_mark_buffer_dirty(path->nodes[0]); 3310 } else 3311 ret = 0; 3312 btrfs_release_path(path); 3313 } 3314 fail: 3315 btrfs_free_path(path); 3316 out_unlock: 3317 mutex_unlock(&dir->log_mutex); 3318 if (ret == -ENOSPC) { 3319 btrfs_set_log_full_commit(root->fs_info, trans); 3320 ret = 0; 3321 } else if (ret < 0) 3322 btrfs_abort_transaction(trans, ret); 3323 3324 btrfs_end_log_trans(root); 3325 3326 return err; 3327 } 3328 3329 /* see comments for btrfs_del_dir_entries_in_log */ 3330 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3331 struct btrfs_root *root, 3332 const char *name, int name_len, 3333 struct btrfs_inode *inode, u64 dirid) 3334 { 3335 struct btrfs_fs_info *fs_info = root->fs_info; 3336 struct btrfs_root *log; 3337 u64 index; 3338 int ret; 3339 3340 if (inode->logged_trans < trans->transid) 3341 return 0; 3342 3343 ret = join_running_log_trans(root); 3344 if (ret) 3345 return 0; 3346 log = root->log_root; 3347 mutex_lock(&inode->log_mutex); 3348 3349 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3350 dirid, &index); 3351 mutex_unlock(&inode->log_mutex); 3352 if (ret == -ENOSPC) { 3353 btrfs_set_log_full_commit(fs_info, trans); 3354 ret = 0; 3355 } else if (ret < 0 && ret != -ENOENT) 3356 btrfs_abort_transaction(trans, ret); 3357 btrfs_end_log_trans(root); 3358 3359 return ret; 3360 } 3361 3362 /* 3363 * creates a range item in the log for 'dirid'. first_offset and 3364 * last_offset tell us which parts of the key space the log should 3365 * be considered authoritative for. 3366 */ 3367 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3368 struct btrfs_root *log, 3369 struct btrfs_path *path, 3370 int key_type, u64 dirid, 3371 u64 first_offset, u64 last_offset) 3372 { 3373 int ret; 3374 struct btrfs_key key; 3375 struct btrfs_dir_log_item *item; 3376 3377 key.objectid = dirid; 3378 key.offset = first_offset; 3379 if (key_type == BTRFS_DIR_ITEM_KEY) 3380 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3381 else 3382 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3383 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3384 if (ret) 3385 return ret; 3386 3387 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3388 struct btrfs_dir_log_item); 3389 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3390 btrfs_mark_buffer_dirty(path->nodes[0]); 3391 btrfs_release_path(path); 3392 return 0; 3393 } 3394 3395 /* 3396 * log all the items included in the current transaction for a given 3397 * directory. This also creates the range items in the log tree required 3398 * to replay anything deleted before the fsync 3399 */ 3400 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3401 struct btrfs_root *root, struct btrfs_inode *inode, 3402 struct btrfs_path *path, 3403 struct btrfs_path *dst_path, int key_type, 3404 struct btrfs_log_ctx *ctx, 3405 u64 min_offset, u64 *last_offset_ret) 3406 { 3407 struct btrfs_key min_key; 3408 struct btrfs_root *log = root->log_root; 3409 struct extent_buffer *src; 3410 int err = 0; 3411 int ret; 3412 int i; 3413 int nritems; 3414 u64 first_offset = min_offset; 3415 u64 last_offset = (u64)-1; 3416 u64 ino = btrfs_ino(inode); 3417 3418 log = root->log_root; 3419 3420 min_key.objectid = ino; 3421 min_key.type = key_type; 3422 min_key.offset = min_offset; 3423 3424 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3425 3426 /* 3427 * we didn't find anything from this transaction, see if there 3428 * is anything at all 3429 */ 3430 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3431 min_key.objectid = ino; 3432 min_key.type = key_type; 3433 min_key.offset = (u64)-1; 3434 btrfs_release_path(path); 3435 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3436 if (ret < 0) { 3437 btrfs_release_path(path); 3438 return ret; 3439 } 3440 ret = btrfs_previous_item(root, path, ino, key_type); 3441 3442 /* if ret == 0 there are items for this type, 3443 * create a range to tell us the last key of this type. 3444 * otherwise, there are no items in this directory after 3445 * *min_offset, and we create a range to indicate that. 3446 */ 3447 if (ret == 0) { 3448 struct btrfs_key tmp; 3449 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3450 path->slots[0]); 3451 if (key_type == tmp.type) 3452 first_offset = max(min_offset, tmp.offset) + 1; 3453 } 3454 goto done; 3455 } 3456 3457 /* go backward to find any previous key */ 3458 ret = btrfs_previous_item(root, path, ino, key_type); 3459 if (ret == 0) { 3460 struct btrfs_key tmp; 3461 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3462 if (key_type == tmp.type) { 3463 first_offset = tmp.offset; 3464 ret = overwrite_item(trans, log, dst_path, 3465 path->nodes[0], path->slots[0], 3466 &tmp); 3467 if (ret) { 3468 err = ret; 3469 goto done; 3470 } 3471 } 3472 } 3473 btrfs_release_path(path); 3474 3475 /* find the first key from this transaction again */ 3476 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3477 if (WARN_ON(ret != 0)) 3478 goto done; 3479 3480 /* 3481 * we have a block from this transaction, log every item in it 3482 * from our directory 3483 */ 3484 while (1) { 3485 struct btrfs_key tmp; 3486 src = path->nodes[0]; 3487 nritems = btrfs_header_nritems(src); 3488 for (i = path->slots[0]; i < nritems; i++) { 3489 struct btrfs_dir_item *di; 3490 3491 btrfs_item_key_to_cpu(src, &min_key, i); 3492 3493 if (min_key.objectid != ino || min_key.type != key_type) 3494 goto done; 3495 ret = overwrite_item(trans, log, dst_path, src, i, 3496 &min_key); 3497 if (ret) { 3498 err = ret; 3499 goto done; 3500 } 3501 3502 /* 3503 * We must make sure that when we log a directory entry, 3504 * the corresponding inode, after log replay, has a 3505 * matching link count. For example: 3506 * 3507 * touch foo 3508 * mkdir mydir 3509 * sync 3510 * ln foo mydir/bar 3511 * xfs_io -c "fsync" mydir 3512 * <crash> 3513 * <mount fs and log replay> 3514 * 3515 * Would result in a fsync log that when replayed, our 3516 * file inode would have a link count of 1, but we get 3517 * two directory entries pointing to the same inode. 3518 * After removing one of the names, it would not be 3519 * possible to remove the other name, which resulted 3520 * always in stale file handle errors, and would not 3521 * be possible to rmdir the parent directory, since 3522 * its i_size could never decrement to the value 3523 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 3524 */ 3525 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3526 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3527 if (ctx && 3528 (btrfs_dir_transid(src, di) == trans->transid || 3529 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3530 tmp.type != BTRFS_ROOT_ITEM_KEY) 3531 ctx->log_new_dentries = true; 3532 } 3533 path->slots[0] = nritems; 3534 3535 /* 3536 * look ahead to the next item and see if it is also 3537 * from this directory and from this transaction 3538 */ 3539 ret = btrfs_next_leaf(root, path); 3540 if (ret) { 3541 if (ret == 1) 3542 last_offset = (u64)-1; 3543 else 3544 err = ret; 3545 goto done; 3546 } 3547 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3548 if (tmp.objectid != ino || tmp.type != key_type) { 3549 last_offset = (u64)-1; 3550 goto done; 3551 } 3552 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3553 ret = overwrite_item(trans, log, dst_path, 3554 path->nodes[0], path->slots[0], 3555 &tmp); 3556 if (ret) 3557 err = ret; 3558 else 3559 last_offset = tmp.offset; 3560 goto done; 3561 } 3562 } 3563 done: 3564 btrfs_release_path(path); 3565 btrfs_release_path(dst_path); 3566 3567 if (err == 0) { 3568 *last_offset_ret = last_offset; 3569 /* 3570 * insert the log range keys to indicate where the log 3571 * is valid 3572 */ 3573 ret = insert_dir_log_key(trans, log, path, key_type, 3574 ino, first_offset, last_offset); 3575 if (ret) 3576 err = ret; 3577 } 3578 return err; 3579 } 3580 3581 /* 3582 * logging directories is very similar to logging inodes, We find all the items 3583 * from the current transaction and write them to the log. 3584 * 3585 * The recovery code scans the directory in the subvolume, and if it finds a 3586 * key in the range logged that is not present in the log tree, then it means 3587 * that dir entry was unlinked during the transaction. 3588 * 3589 * In order for that scan to work, we must include one key smaller than 3590 * the smallest logged by this transaction and one key larger than the largest 3591 * key logged by this transaction. 3592 */ 3593 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3594 struct btrfs_root *root, struct btrfs_inode *inode, 3595 struct btrfs_path *path, 3596 struct btrfs_path *dst_path, 3597 struct btrfs_log_ctx *ctx) 3598 { 3599 u64 min_key; 3600 u64 max_key; 3601 int ret; 3602 int key_type = BTRFS_DIR_ITEM_KEY; 3603 3604 again: 3605 min_key = 0; 3606 max_key = 0; 3607 while (1) { 3608 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3609 ctx, min_key, &max_key); 3610 if (ret) 3611 return ret; 3612 if (max_key == (u64)-1) 3613 break; 3614 min_key = max_key + 1; 3615 } 3616 3617 if (key_type == BTRFS_DIR_ITEM_KEY) { 3618 key_type = BTRFS_DIR_INDEX_KEY; 3619 goto again; 3620 } 3621 return 0; 3622 } 3623 3624 /* 3625 * a helper function to drop items from the log before we relog an 3626 * inode. max_key_type indicates the highest item type to remove. 3627 * This cannot be run for file data extents because it does not 3628 * free the extents they point to. 3629 */ 3630 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3631 struct btrfs_root *log, 3632 struct btrfs_path *path, 3633 u64 objectid, int max_key_type) 3634 { 3635 int ret; 3636 struct btrfs_key key; 3637 struct btrfs_key found_key; 3638 int start_slot; 3639 3640 key.objectid = objectid; 3641 key.type = max_key_type; 3642 key.offset = (u64)-1; 3643 3644 while (1) { 3645 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3646 BUG_ON(ret == 0); /* Logic error */ 3647 if (ret < 0) 3648 break; 3649 3650 if (path->slots[0] == 0) 3651 break; 3652 3653 path->slots[0]--; 3654 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3655 path->slots[0]); 3656 3657 if (found_key.objectid != objectid) 3658 break; 3659 3660 found_key.offset = 0; 3661 found_key.type = 0; 3662 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3663 &start_slot); 3664 3665 ret = btrfs_del_items(trans, log, path, start_slot, 3666 path->slots[0] - start_slot + 1); 3667 /* 3668 * If start slot isn't 0 then we don't need to re-search, we've 3669 * found the last guy with the objectid in this tree. 3670 */ 3671 if (ret || start_slot != 0) 3672 break; 3673 btrfs_release_path(path); 3674 } 3675 btrfs_release_path(path); 3676 if (ret > 0) 3677 ret = 0; 3678 return ret; 3679 } 3680 3681 static void fill_inode_item(struct btrfs_trans_handle *trans, 3682 struct extent_buffer *leaf, 3683 struct btrfs_inode_item *item, 3684 struct inode *inode, int log_inode_only, 3685 u64 logged_isize) 3686 { 3687 struct btrfs_map_token token; 3688 3689 btrfs_init_map_token(&token); 3690 3691 if (log_inode_only) { 3692 /* set the generation to zero so the recover code 3693 * can tell the difference between an logging 3694 * just to say 'this inode exists' and a logging 3695 * to say 'update this inode with these values' 3696 */ 3697 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3698 btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 3699 } else { 3700 btrfs_set_token_inode_generation(leaf, item, 3701 BTRFS_I(inode)->generation, 3702 &token); 3703 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3704 } 3705 3706 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3707 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3708 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3709 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3710 3711 btrfs_set_token_timespec_sec(leaf, &item->atime, 3712 inode->i_atime.tv_sec, &token); 3713 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3714 inode->i_atime.tv_nsec, &token); 3715 3716 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3717 inode->i_mtime.tv_sec, &token); 3718 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3719 inode->i_mtime.tv_nsec, &token); 3720 3721 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3722 inode->i_ctime.tv_sec, &token); 3723 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3724 inode->i_ctime.tv_nsec, &token); 3725 3726 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3727 &token); 3728 3729 btrfs_set_token_inode_sequence(leaf, item, 3730 inode_peek_iversion(inode), &token); 3731 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3732 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3733 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3734 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3735 } 3736 3737 static int log_inode_item(struct btrfs_trans_handle *trans, 3738 struct btrfs_root *log, struct btrfs_path *path, 3739 struct btrfs_inode *inode) 3740 { 3741 struct btrfs_inode_item *inode_item; 3742 int ret; 3743 3744 ret = btrfs_insert_empty_item(trans, log, path, 3745 &inode->location, sizeof(*inode_item)); 3746 if (ret && ret != -EEXIST) 3747 return ret; 3748 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3749 struct btrfs_inode_item); 3750 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3751 0, 0); 3752 btrfs_release_path(path); 3753 return 0; 3754 } 3755 3756 static noinline int copy_items(struct btrfs_trans_handle *trans, 3757 struct btrfs_inode *inode, 3758 struct btrfs_path *dst_path, 3759 struct btrfs_path *src_path, u64 *last_extent, 3760 int start_slot, int nr, int inode_only, 3761 u64 logged_isize) 3762 { 3763 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 3764 unsigned long src_offset; 3765 unsigned long dst_offset; 3766 struct btrfs_root *log = inode->root->log_root; 3767 struct btrfs_file_extent_item *extent; 3768 struct btrfs_inode_item *inode_item; 3769 struct extent_buffer *src = src_path->nodes[0]; 3770 struct btrfs_key first_key, last_key, key; 3771 int ret; 3772 struct btrfs_key *ins_keys; 3773 u32 *ins_sizes; 3774 char *ins_data; 3775 int i; 3776 struct list_head ordered_sums; 3777 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3778 bool has_extents = false; 3779 bool need_find_last_extent = true; 3780 bool done = false; 3781 3782 INIT_LIST_HEAD(&ordered_sums); 3783 3784 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3785 nr * sizeof(u32), GFP_NOFS); 3786 if (!ins_data) 3787 return -ENOMEM; 3788 3789 first_key.objectid = (u64)-1; 3790 3791 ins_sizes = (u32 *)ins_data; 3792 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3793 3794 for (i = 0; i < nr; i++) { 3795 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3796 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3797 } 3798 ret = btrfs_insert_empty_items(trans, log, dst_path, 3799 ins_keys, ins_sizes, nr); 3800 if (ret) { 3801 kfree(ins_data); 3802 return ret; 3803 } 3804 3805 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3806 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3807 dst_path->slots[0]); 3808 3809 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3810 3811 if (i == nr - 1) 3812 last_key = ins_keys[i]; 3813 3814 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3815 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3816 dst_path->slots[0], 3817 struct btrfs_inode_item); 3818 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3819 &inode->vfs_inode, 3820 inode_only == LOG_INODE_EXISTS, 3821 logged_isize); 3822 } else { 3823 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3824 src_offset, ins_sizes[i]); 3825 } 3826 3827 /* 3828 * We set need_find_last_extent here in case we know we were 3829 * processing other items and then walk into the first extent in 3830 * the inode. If we don't hit an extent then nothing changes, 3831 * we'll do the last search the next time around. 3832 */ 3833 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3834 has_extents = true; 3835 if (first_key.objectid == (u64)-1) 3836 first_key = ins_keys[i]; 3837 } else { 3838 need_find_last_extent = false; 3839 } 3840 3841 /* take a reference on file data extents so that truncates 3842 * or deletes of this inode don't have to relog the inode 3843 * again 3844 */ 3845 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3846 !skip_csum) { 3847 int found_type; 3848 extent = btrfs_item_ptr(src, start_slot + i, 3849 struct btrfs_file_extent_item); 3850 3851 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3852 continue; 3853 3854 found_type = btrfs_file_extent_type(src, extent); 3855 if (found_type == BTRFS_FILE_EXTENT_REG) { 3856 u64 ds, dl, cs, cl; 3857 ds = btrfs_file_extent_disk_bytenr(src, 3858 extent); 3859 /* ds == 0 is a hole */ 3860 if (ds == 0) 3861 continue; 3862 3863 dl = btrfs_file_extent_disk_num_bytes(src, 3864 extent); 3865 cs = btrfs_file_extent_offset(src, extent); 3866 cl = btrfs_file_extent_num_bytes(src, 3867 extent); 3868 if (btrfs_file_extent_compression(src, 3869 extent)) { 3870 cs = 0; 3871 cl = dl; 3872 } 3873 3874 ret = btrfs_lookup_csums_range( 3875 fs_info->csum_root, 3876 ds + cs, ds + cs + cl - 1, 3877 &ordered_sums, 0); 3878 if (ret) { 3879 btrfs_release_path(dst_path); 3880 kfree(ins_data); 3881 return ret; 3882 } 3883 } 3884 } 3885 } 3886 3887 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3888 btrfs_release_path(dst_path); 3889 kfree(ins_data); 3890 3891 /* 3892 * we have to do this after the loop above to avoid changing the 3893 * log tree while trying to change the log tree. 3894 */ 3895 ret = 0; 3896 while (!list_empty(&ordered_sums)) { 3897 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3898 struct btrfs_ordered_sum, 3899 list); 3900 if (!ret) 3901 ret = btrfs_csum_file_blocks(trans, log, sums); 3902 list_del(&sums->list); 3903 kfree(sums); 3904 } 3905 3906 if (!has_extents) 3907 return ret; 3908 3909 if (need_find_last_extent && *last_extent == first_key.offset) { 3910 /* 3911 * We don't have any leafs between our current one and the one 3912 * we processed before that can have file extent items for our 3913 * inode (and have a generation number smaller than our current 3914 * transaction id). 3915 */ 3916 need_find_last_extent = false; 3917 } 3918 3919 /* 3920 * Because we use btrfs_search_forward we could skip leaves that were 3921 * not modified and then assume *last_extent is valid when it really 3922 * isn't. So back up to the previous leaf and read the end of the last 3923 * extent before we go and fill in holes. 3924 */ 3925 if (need_find_last_extent) { 3926 u64 len; 3927 3928 ret = btrfs_prev_leaf(inode->root, src_path); 3929 if (ret < 0) 3930 return ret; 3931 if (ret) 3932 goto fill_holes; 3933 if (src_path->slots[0]) 3934 src_path->slots[0]--; 3935 src = src_path->nodes[0]; 3936 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3937 if (key.objectid != btrfs_ino(inode) || 3938 key.type != BTRFS_EXTENT_DATA_KEY) 3939 goto fill_holes; 3940 extent = btrfs_item_ptr(src, src_path->slots[0], 3941 struct btrfs_file_extent_item); 3942 if (btrfs_file_extent_type(src, extent) == 3943 BTRFS_FILE_EXTENT_INLINE) { 3944 len = btrfs_file_extent_inline_len(src, 3945 src_path->slots[0], 3946 extent); 3947 *last_extent = ALIGN(key.offset + len, 3948 fs_info->sectorsize); 3949 } else { 3950 len = btrfs_file_extent_num_bytes(src, extent); 3951 *last_extent = key.offset + len; 3952 } 3953 } 3954 fill_holes: 3955 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3956 * things could have happened 3957 * 3958 * 1) A merge could have happened, so we could currently be on a leaf 3959 * that holds what we were copying in the first place. 3960 * 2) A split could have happened, and now not all of the items we want 3961 * are on the same leaf. 3962 * 3963 * So we need to adjust how we search for holes, we need to drop the 3964 * path and re-search for the first extent key we found, and then walk 3965 * forward until we hit the last one we copied. 3966 */ 3967 if (need_find_last_extent) { 3968 /* btrfs_prev_leaf could return 1 without releasing the path */ 3969 btrfs_release_path(src_path); 3970 ret = btrfs_search_slot(NULL, inode->root, &first_key, 3971 src_path, 0, 0); 3972 if (ret < 0) 3973 return ret; 3974 ASSERT(ret == 0); 3975 src = src_path->nodes[0]; 3976 i = src_path->slots[0]; 3977 } else { 3978 i = start_slot; 3979 } 3980 3981 /* 3982 * Ok so here we need to go through and fill in any holes we may have 3983 * to make sure that holes are punched for those areas in case they had 3984 * extents previously. 3985 */ 3986 while (!done) { 3987 u64 offset, len; 3988 u64 extent_end; 3989 3990 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3991 ret = btrfs_next_leaf(inode->root, src_path); 3992 if (ret < 0) 3993 return ret; 3994 ASSERT(ret == 0); 3995 src = src_path->nodes[0]; 3996 i = 0; 3997 need_find_last_extent = true; 3998 } 3999 4000 btrfs_item_key_to_cpu(src, &key, i); 4001 if (!btrfs_comp_cpu_keys(&key, &last_key)) 4002 done = true; 4003 if (key.objectid != btrfs_ino(inode) || 4004 key.type != BTRFS_EXTENT_DATA_KEY) { 4005 i++; 4006 continue; 4007 } 4008 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 4009 if (btrfs_file_extent_type(src, extent) == 4010 BTRFS_FILE_EXTENT_INLINE) { 4011 len = btrfs_file_extent_inline_len(src, i, extent); 4012 extent_end = ALIGN(key.offset + len, 4013 fs_info->sectorsize); 4014 } else { 4015 len = btrfs_file_extent_num_bytes(src, extent); 4016 extent_end = key.offset + len; 4017 } 4018 i++; 4019 4020 if (*last_extent == key.offset) { 4021 *last_extent = extent_end; 4022 continue; 4023 } 4024 offset = *last_extent; 4025 len = key.offset - *last_extent; 4026 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 4027 offset, 0, 0, len, 0, len, 0, 0, 0); 4028 if (ret) 4029 break; 4030 *last_extent = extent_end; 4031 } 4032 4033 /* 4034 * Check if there is a hole between the last extent found in our leaf 4035 * and the first extent in the next leaf. If there is one, we need to 4036 * log an explicit hole so that at replay time we can punch the hole. 4037 */ 4038 if (ret == 0 && 4039 key.objectid == btrfs_ino(inode) && 4040 key.type == BTRFS_EXTENT_DATA_KEY && 4041 i == btrfs_header_nritems(src_path->nodes[0])) { 4042 ret = btrfs_next_leaf(inode->root, src_path); 4043 need_find_last_extent = true; 4044 if (ret > 0) { 4045 ret = 0; 4046 } else if (ret == 0) { 4047 btrfs_item_key_to_cpu(src_path->nodes[0], &key, 4048 src_path->slots[0]); 4049 if (key.objectid == btrfs_ino(inode) && 4050 key.type == BTRFS_EXTENT_DATA_KEY && 4051 *last_extent < key.offset) { 4052 const u64 len = key.offset - *last_extent; 4053 4054 ret = btrfs_insert_file_extent(trans, log, 4055 btrfs_ino(inode), 4056 *last_extent, 0, 4057 0, len, 0, len, 4058 0, 0, 0); 4059 } 4060 } 4061 } 4062 /* 4063 * Need to let the callers know we dropped the path so they should 4064 * re-search. 4065 */ 4066 if (!ret && need_find_last_extent) 4067 ret = 1; 4068 return ret; 4069 } 4070 4071 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 4072 { 4073 struct extent_map *em1, *em2; 4074 4075 em1 = list_entry(a, struct extent_map, list); 4076 em2 = list_entry(b, struct extent_map, list); 4077 4078 if (em1->start < em2->start) 4079 return -1; 4080 else if (em1->start > em2->start) 4081 return 1; 4082 return 0; 4083 } 4084 4085 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 4086 struct inode *inode, 4087 struct btrfs_root *root, 4088 const struct extent_map *em, 4089 const struct list_head *logged_list, 4090 bool *ordered_io_error) 4091 { 4092 struct btrfs_fs_info *fs_info = root->fs_info; 4093 struct btrfs_ordered_extent *ordered; 4094 struct btrfs_root *log = root->log_root; 4095 u64 mod_start = em->mod_start; 4096 u64 mod_len = em->mod_len; 4097 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 4098 u64 csum_offset; 4099 u64 csum_len; 4100 LIST_HEAD(ordered_sums); 4101 int ret = 0; 4102 4103 *ordered_io_error = false; 4104 4105 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4106 em->block_start == EXTENT_MAP_HOLE) 4107 return 0; 4108 4109 /* 4110 * Wait far any ordered extent that covers our extent map. If it 4111 * finishes without an error, first check and see if our csums are on 4112 * our outstanding ordered extents. 4113 */ 4114 list_for_each_entry(ordered, logged_list, log_list) { 4115 struct btrfs_ordered_sum *sum; 4116 4117 if (!mod_len) 4118 break; 4119 4120 if (ordered->file_offset + ordered->len <= mod_start || 4121 mod_start + mod_len <= ordered->file_offset) 4122 continue; 4123 4124 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 4125 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 4126 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 4127 const u64 start = ordered->file_offset; 4128 const u64 end = ordered->file_offset + ordered->len - 1; 4129 4130 WARN_ON(ordered->inode != inode); 4131 filemap_fdatawrite_range(inode->i_mapping, start, end); 4132 } 4133 4134 wait_event(ordered->wait, 4135 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 4136 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 4137 4138 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 4139 /* 4140 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 4141 * i_mapping flags, so that the next fsync won't get 4142 * an outdated io error too. 4143 */ 4144 filemap_check_errors(inode->i_mapping); 4145 *ordered_io_error = true; 4146 break; 4147 } 4148 /* 4149 * We are going to copy all the csums on this ordered extent, so 4150 * go ahead and adjust mod_start and mod_len in case this 4151 * ordered extent has already been logged. 4152 */ 4153 if (ordered->file_offset > mod_start) { 4154 if (ordered->file_offset + ordered->len >= 4155 mod_start + mod_len) 4156 mod_len = ordered->file_offset - mod_start; 4157 /* 4158 * If we have this case 4159 * 4160 * |--------- logged extent ---------| 4161 * |----- ordered extent ----| 4162 * 4163 * Just don't mess with mod_start and mod_len, we'll 4164 * just end up logging more csums than we need and it 4165 * will be ok. 4166 */ 4167 } else { 4168 if (ordered->file_offset + ordered->len < 4169 mod_start + mod_len) { 4170 mod_len = (mod_start + mod_len) - 4171 (ordered->file_offset + ordered->len); 4172 mod_start = ordered->file_offset + 4173 ordered->len; 4174 } else { 4175 mod_len = 0; 4176 } 4177 } 4178 4179 if (skip_csum) 4180 continue; 4181 4182 /* 4183 * To keep us from looping for the above case of an ordered 4184 * extent that falls inside of the logged extent. 4185 */ 4186 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 4187 &ordered->flags)) 4188 continue; 4189 4190 list_for_each_entry(sum, &ordered->list, list) { 4191 ret = btrfs_csum_file_blocks(trans, log, sum); 4192 if (ret) 4193 break; 4194 } 4195 } 4196 4197 if (*ordered_io_error || !mod_len || ret || skip_csum) 4198 return ret; 4199 4200 if (em->compress_type) { 4201 csum_offset = 0; 4202 csum_len = max(em->block_len, em->orig_block_len); 4203 } else { 4204 csum_offset = mod_start - em->start; 4205 csum_len = mod_len; 4206 } 4207 4208 /* block start is already adjusted for the file extent offset. */ 4209 ret = btrfs_lookup_csums_range(fs_info->csum_root, 4210 em->block_start + csum_offset, 4211 em->block_start + csum_offset + 4212 csum_len - 1, &ordered_sums, 0); 4213 if (ret) 4214 return ret; 4215 4216 while (!list_empty(&ordered_sums)) { 4217 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4218 struct btrfs_ordered_sum, 4219 list); 4220 if (!ret) 4221 ret = btrfs_csum_file_blocks(trans, log, sums); 4222 list_del(&sums->list); 4223 kfree(sums); 4224 } 4225 4226 return ret; 4227 } 4228 4229 static int log_one_extent(struct btrfs_trans_handle *trans, 4230 struct btrfs_inode *inode, struct btrfs_root *root, 4231 const struct extent_map *em, 4232 struct btrfs_path *path, 4233 const struct list_head *logged_list, 4234 struct btrfs_log_ctx *ctx) 4235 { 4236 struct btrfs_root *log = root->log_root; 4237 struct btrfs_file_extent_item *fi; 4238 struct extent_buffer *leaf; 4239 struct btrfs_map_token token; 4240 struct btrfs_key key; 4241 u64 extent_offset = em->start - em->orig_start; 4242 u64 block_len; 4243 int ret; 4244 int extent_inserted = 0; 4245 bool ordered_io_err = false; 4246 4247 ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, 4248 logged_list, &ordered_io_err); 4249 if (ret) 4250 return ret; 4251 4252 if (ordered_io_err) { 4253 ctx->io_err = -EIO; 4254 return ctx->io_err; 4255 } 4256 4257 btrfs_init_map_token(&token); 4258 4259 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4260 em->start + em->len, NULL, 0, 1, 4261 sizeof(*fi), &extent_inserted); 4262 if (ret) 4263 return ret; 4264 4265 if (!extent_inserted) { 4266 key.objectid = btrfs_ino(inode); 4267 key.type = BTRFS_EXTENT_DATA_KEY; 4268 key.offset = em->start; 4269 4270 ret = btrfs_insert_empty_item(trans, log, path, &key, 4271 sizeof(*fi)); 4272 if (ret) 4273 return ret; 4274 } 4275 leaf = path->nodes[0]; 4276 fi = btrfs_item_ptr(leaf, path->slots[0], 4277 struct btrfs_file_extent_item); 4278 4279 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4280 &token); 4281 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4282 btrfs_set_token_file_extent_type(leaf, fi, 4283 BTRFS_FILE_EXTENT_PREALLOC, 4284 &token); 4285 else 4286 btrfs_set_token_file_extent_type(leaf, fi, 4287 BTRFS_FILE_EXTENT_REG, 4288 &token); 4289 4290 block_len = max(em->block_len, em->orig_block_len); 4291 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4292 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4293 em->block_start, 4294 &token); 4295 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4296 &token); 4297 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4298 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4299 em->block_start - 4300 extent_offset, &token); 4301 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4302 &token); 4303 } else { 4304 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4305 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4306 &token); 4307 } 4308 4309 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4310 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4311 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4312 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4313 &token); 4314 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4315 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4316 btrfs_mark_buffer_dirty(leaf); 4317 4318 btrfs_release_path(path); 4319 4320 return ret; 4321 } 4322 4323 /* 4324 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4325 * lose them after doing a fast fsync and replaying the log. We scan the 4326 * subvolume's root instead of iterating the inode's extent map tree because 4327 * otherwise we can log incorrect extent items based on extent map conversion. 4328 * That can happen due to the fact that extent maps are merged when they 4329 * are not in the extent map tree's list of modified extents. 4330 */ 4331 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4332 struct btrfs_inode *inode, 4333 struct btrfs_path *path) 4334 { 4335 struct btrfs_root *root = inode->root; 4336 struct btrfs_key key; 4337 const u64 i_size = i_size_read(&inode->vfs_inode); 4338 const u64 ino = btrfs_ino(inode); 4339 struct btrfs_path *dst_path = NULL; 4340 u64 last_extent = (u64)-1; 4341 int ins_nr = 0; 4342 int start_slot; 4343 int ret; 4344 4345 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4346 return 0; 4347 4348 key.objectid = ino; 4349 key.type = BTRFS_EXTENT_DATA_KEY; 4350 key.offset = i_size; 4351 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4352 if (ret < 0) 4353 goto out; 4354 4355 while (true) { 4356 struct extent_buffer *leaf = path->nodes[0]; 4357 int slot = path->slots[0]; 4358 4359 if (slot >= btrfs_header_nritems(leaf)) { 4360 if (ins_nr > 0) { 4361 ret = copy_items(trans, inode, dst_path, path, 4362 &last_extent, start_slot, 4363 ins_nr, 1, 0); 4364 if (ret < 0) 4365 goto out; 4366 ins_nr = 0; 4367 } 4368 ret = btrfs_next_leaf(root, path); 4369 if (ret < 0) 4370 goto out; 4371 if (ret > 0) { 4372 ret = 0; 4373 break; 4374 } 4375 continue; 4376 } 4377 4378 btrfs_item_key_to_cpu(leaf, &key, slot); 4379 if (key.objectid > ino) 4380 break; 4381 if (WARN_ON_ONCE(key.objectid < ino) || 4382 key.type < BTRFS_EXTENT_DATA_KEY || 4383 key.offset < i_size) { 4384 path->slots[0]++; 4385 continue; 4386 } 4387 if (last_extent == (u64)-1) { 4388 last_extent = key.offset; 4389 /* 4390 * Avoid logging extent items logged in past fsync calls 4391 * and leading to duplicate keys in the log tree. 4392 */ 4393 do { 4394 ret = btrfs_truncate_inode_items(trans, 4395 root->log_root, 4396 &inode->vfs_inode, 4397 i_size, 4398 BTRFS_EXTENT_DATA_KEY); 4399 } while (ret == -EAGAIN); 4400 if (ret) 4401 goto out; 4402 } 4403 if (ins_nr == 0) 4404 start_slot = slot; 4405 ins_nr++; 4406 path->slots[0]++; 4407 if (!dst_path) { 4408 dst_path = btrfs_alloc_path(); 4409 if (!dst_path) { 4410 ret = -ENOMEM; 4411 goto out; 4412 } 4413 } 4414 } 4415 if (ins_nr > 0) { 4416 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4417 start_slot, ins_nr, 1, 0); 4418 if (ret > 0) 4419 ret = 0; 4420 } 4421 out: 4422 btrfs_release_path(path); 4423 btrfs_free_path(dst_path); 4424 return ret; 4425 } 4426 4427 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4428 struct btrfs_root *root, 4429 struct btrfs_inode *inode, 4430 struct btrfs_path *path, 4431 struct list_head *logged_list, 4432 struct btrfs_log_ctx *ctx, 4433 const u64 start, 4434 const u64 end) 4435 { 4436 struct extent_map *em, *n; 4437 struct list_head extents; 4438 struct extent_map_tree *tree = &inode->extent_tree; 4439 u64 logged_start, logged_end; 4440 u64 test_gen; 4441 int ret = 0; 4442 int num = 0; 4443 4444 INIT_LIST_HEAD(&extents); 4445 4446 down_write(&inode->dio_sem); 4447 write_lock(&tree->lock); 4448 test_gen = root->fs_info->last_trans_committed; 4449 logged_start = start; 4450 logged_end = end; 4451 4452 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4453 list_del_init(&em->list); 4454 /* 4455 * Just an arbitrary number, this can be really CPU intensive 4456 * once we start getting a lot of extents, and really once we 4457 * have a bunch of extents we just want to commit since it will 4458 * be faster. 4459 */ 4460 if (++num > 32768) { 4461 list_del_init(&tree->modified_extents); 4462 ret = -EFBIG; 4463 goto process; 4464 } 4465 4466 if (em->generation <= test_gen) 4467 continue; 4468 4469 /* We log prealloc extents beyond eof later. */ 4470 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4471 em->start >= i_size_read(&inode->vfs_inode)) 4472 continue; 4473 4474 if (em->start < logged_start) 4475 logged_start = em->start; 4476 if ((em->start + em->len - 1) > logged_end) 4477 logged_end = em->start + em->len - 1; 4478 4479 /* Need a ref to keep it from getting evicted from cache */ 4480 refcount_inc(&em->refs); 4481 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4482 list_add_tail(&em->list, &extents); 4483 num++; 4484 } 4485 4486 list_sort(NULL, &extents, extent_cmp); 4487 btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); 4488 /* 4489 * Some ordered extents started by fsync might have completed 4490 * before we could collect them into the list logged_list, which 4491 * means they're gone, not in our logged_list nor in the inode's 4492 * ordered tree. We want the application/user space to know an 4493 * error happened while attempting to persist file data so that 4494 * it can take proper action. If such error happened, we leave 4495 * without writing to the log tree and the fsync must report the 4496 * file data write error and not commit the current transaction. 4497 */ 4498 ret = filemap_check_errors(inode->vfs_inode.i_mapping); 4499 if (ret) 4500 ctx->io_err = ret; 4501 process: 4502 while (!list_empty(&extents)) { 4503 em = list_entry(extents.next, struct extent_map, list); 4504 4505 list_del_init(&em->list); 4506 4507 /* 4508 * If we had an error we just need to delete everybody from our 4509 * private list. 4510 */ 4511 if (ret) { 4512 clear_em_logging(tree, em); 4513 free_extent_map(em); 4514 continue; 4515 } 4516 4517 write_unlock(&tree->lock); 4518 4519 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4520 ctx); 4521 write_lock(&tree->lock); 4522 clear_em_logging(tree, em); 4523 free_extent_map(em); 4524 } 4525 WARN_ON(!list_empty(&extents)); 4526 write_unlock(&tree->lock); 4527 up_write(&inode->dio_sem); 4528 4529 btrfs_release_path(path); 4530 if (!ret) 4531 ret = btrfs_log_prealloc_extents(trans, inode, path); 4532 4533 return ret; 4534 } 4535 4536 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4537 struct btrfs_path *path, u64 *size_ret) 4538 { 4539 struct btrfs_key key; 4540 int ret; 4541 4542 key.objectid = btrfs_ino(inode); 4543 key.type = BTRFS_INODE_ITEM_KEY; 4544 key.offset = 0; 4545 4546 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4547 if (ret < 0) { 4548 return ret; 4549 } else if (ret > 0) { 4550 *size_ret = 0; 4551 } else { 4552 struct btrfs_inode_item *item; 4553 4554 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4555 struct btrfs_inode_item); 4556 *size_ret = btrfs_inode_size(path->nodes[0], item); 4557 } 4558 4559 btrfs_release_path(path); 4560 return 0; 4561 } 4562 4563 /* 4564 * At the moment we always log all xattrs. This is to figure out at log replay 4565 * time which xattrs must have their deletion replayed. If a xattr is missing 4566 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4567 * because if a xattr is deleted, the inode is fsynced and a power failure 4568 * happens, causing the log to be replayed the next time the fs is mounted, 4569 * we want the xattr to not exist anymore (same behaviour as other filesystems 4570 * with a journal, ext3/4, xfs, f2fs, etc). 4571 */ 4572 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4573 struct btrfs_root *root, 4574 struct btrfs_inode *inode, 4575 struct btrfs_path *path, 4576 struct btrfs_path *dst_path) 4577 { 4578 int ret; 4579 struct btrfs_key key; 4580 const u64 ino = btrfs_ino(inode); 4581 int ins_nr = 0; 4582 int start_slot = 0; 4583 4584 key.objectid = ino; 4585 key.type = BTRFS_XATTR_ITEM_KEY; 4586 key.offset = 0; 4587 4588 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4589 if (ret < 0) 4590 return ret; 4591 4592 while (true) { 4593 int slot = path->slots[0]; 4594 struct extent_buffer *leaf = path->nodes[0]; 4595 int nritems = btrfs_header_nritems(leaf); 4596 4597 if (slot >= nritems) { 4598 if (ins_nr > 0) { 4599 u64 last_extent = 0; 4600 4601 ret = copy_items(trans, inode, dst_path, path, 4602 &last_extent, start_slot, 4603 ins_nr, 1, 0); 4604 /* can't be 1, extent items aren't processed */ 4605 ASSERT(ret <= 0); 4606 if (ret < 0) 4607 return ret; 4608 ins_nr = 0; 4609 } 4610 ret = btrfs_next_leaf(root, path); 4611 if (ret < 0) 4612 return ret; 4613 else if (ret > 0) 4614 break; 4615 continue; 4616 } 4617 4618 btrfs_item_key_to_cpu(leaf, &key, slot); 4619 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4620 break; 4621 4622 if (ins_nr == 0) 4623 start_slot = slot; 4624 ins_nr++; 4625 path->slots[0]++; 4626 cond_resched(); 4627 } 4628 if (ins_nr > 0) { 4629 u64 last_extent = 0; 4630 4631 ret = copy_items(trans, inode, dst_path, path, 4632 &last_extent, start_slot, 4633 ins_nr, 1, 0); 4634 /* can't be 1, extent items aren't processed */ 4635 ASSERT(ret <= 0); 4636 if (ret < 0) 4637 return ret; 4638 } 4639 4640 return 0; 4641 } 4642 4643 /* 4644 * If the no holes feature is enabled we need to make sure any hole between the 4645 * last extent and the i_size of our inode is explicitly marked in the log. This 4646 * is to make sure that doing something like: 4647 * 4648 * 1) create file with 128Kb of data 4649 * 2) truncate file to 64Kb 4650 * 3) truncate file to 256Kb 4651 * 4) fsync file 4652 * 5) <crash/power failure> 4653 * 6) mount fs and trigger log replay 4654 * 4655 * Will give us a file with a size of 256Kb, the first 64Kb of data match what 4656 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4657 * file correspond to a hole. The presence of explicit holes in a log tree is 4658 * what guarantees that log replay will remove/adjust file extent items in the 4659 * fs/subvol tree. 4660 * 4661 * Here we do not need to care about holes between extents, that is already done 4662 * by copy_items(). We also only need to do this in the full sync path, where we 4663 * lookup for extents from the fs/subvol tree only. In the fast path case, we 4664 * lookup the list of modified extent maps and if any represents a hole, we 4665 * insert a corresponding extent representing a hole in the log tree. 4666 */ 4667 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4668 struct btrfs_root *root, 4669 struct btrfs_inode *inode, 4670 struct btrfs_path *path) 4671 { 4672 struct btrfs_fs_info *fs_info = root->fs_info; 4673 int ret; 4674 struct btrfs_key key; 4675 u64 hole_start; 4676 u64 hole_size; 4677 struct extent_buffer *leaf; 4678 struct btrfs_root *log = root->log_root; 4679 const u64 ino = btrfs_ino(inode); 4680 const u64 i_size = i_size_read(&inode->vfs_inode); 4681 4682 if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4683 return 0; 4684 4685 key.objectid = ino; 4686 key.type = BTRFS_EXTENT_DATA_KEY; 4687 key.offset = (u64)-1; 4688 4689 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4690 ASSERT(ret != 0); 4691 if (ret < 0) 4692 return ret; 4693 4694 ASSERT(path->slots[0] > 0); 4695 path->slots[0]--; 4696 leaf = path->nodes[0]; 4697 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4698 4699 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4700 /* inode does not have any extents */ 4701 hole_start = 0; 4702 hole_size = i_size; 4703 } else { 4704 struct btrfs_file_extent_item *extent; 4705 u64 len; 4706 4707 /* 4708 * If there's an extent beyond i_size, an explicit hole was 4709 * already inserted by copy_items(). 4710 */ 4711 if (key.offset >= i_size) 4712 return 0; 4713 4714 extent = btrfs_item_ptr(leaf, path->slots[0], 4715 struct btrfs_file_extent_item); 4716 4717 if (btrfs_file_extent_type(leaf, extent) == 4718 BTRFS_FILE_EXTENT_INLINE) { 4719 len = btrfs_file_extent_inline_len(leaf, 4720 path->slots[0], 4721 extent); 4722 ASSERT(len == i_size || 4723 (len == fs_info->sectorsize && 4724 btrfs_file_extent_compression(leaf, extent) != 4725 BTRFS_COMPRESS_NONE)); 4726 return 0; 4727 } 4728 4729 len = btrfs_file_extent_num_bytes(leaf, extent); 4730 /* Last extent goes beyond i_size, no need to log a hole. */ 4731 if (key.offset + len > i_size) 4732 return 0; 4733 hole_start = key.offset + len; 4734 hole_size = i_size - hole_start; 4735 } 4736 btrfs_release_path(path); 4737 4738 /* Last extent ends at i_size. */ 4739 if (hole_size == 0) 4740 return 0; 4741 4742 hole_size = ALIGN(hole_size, fs_info->sectorsize); 4743 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4744 hole_size, 0, hole_size, 0, 0, 0); 4745 return ret; 4746 } 4747 4748 /* 4749 * When we are logging a new inode X, check if it doesn't have a reference that 4750 * matches the reference from some other inode Y created in a past transaction 4751 * and that was renamed in the current transaction. If we don't do this, then at 4752 * log replay time we can lose inode Y (and all its files if it's a directory): 4753 * 4754 * mkdir /mnt/x 4755 * echo "hello world" > /mnt/x/foobar 4756 * sync 4757 * mv /mnt/x /mnt/y 4758 * mkdir /mnt/x # or touch /mnt/x 4759 * xfs_io -c fsync /mnt/x 4760 * <power fail> 4761 * mount fs, trigger log replay 4762 * 4763 * After the log replay procedure, we would lose the first directory and all its 4764 * files (file foobar). 4765 * For the case where inode Y is not a directory we simply end up losing it: 4766 * 4767 * echo "123" > /mnt/foo 4768 * sync 4769 * mv /mnt/foo /mnt/bar 4770 * echo "abc" > /mnt/foo 4771 * xfs_io -c fsync /mnt/foo 4772 * <power fail> 4773 * 4774 * We also need this for cases where a snapshot entry is replaced by some other 4775 * entry (file or directory) otherwise we end up with an unreplayable log due to 4776 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4777 * if it were a regular entry: 4778 * 4779 * mkdir /mnt/x 4780 * btrfs subvolume snapshot /mnt /mnt/x/snap 4781 * btrfs subvolume delete /mnt/x/snap 4782 * rmdir /mnt/x 4783 * mkdir /mnt/x 4784 * fsync /mnt/x or fsync some new file inside it 4785 * <power fail> 4786 * 4787 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4788 * the same transaction. 4789 */ 4790 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4791 const int slot, 4792 const struct btrfs_key *key, 4793 struct btrfs_inode *inode, 4794 u64 *other_ino) 4795 { 4796 int ret; 4797 struct btrfs_path *search_path; 4798 char *name = NULL; 4799 u32 name_len = 0; 4800 u32 item_size = btrfs_item_size_nr(eb, slot); 4801 u32 cur_offset = 0; 4802 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4803 4804 search_path = btrfs_alloc_path(); 4805 if (!search_path) 4806 return -ENOMEM; 4807 search_path->search_commit_root = 1; 4808 search_path->skip_locking = 1; 4809 4810 while (cur_offset < item_size) { 4811 u64 parent; 4812 u32 this_name_len; 4813 u32 this_len; 4814 unsigned long name_ptr; 4815 struct btrfs_dir_item *di; 4816 4817 if (key->type == BTRFS_INODE_REF_KEY) { 4818 struct btrfs_inode_ref *iref; 4819 4820 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4821 parent = key->offset; 4822 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4823 name_ptr = (unsigned long)(iref + 1); 4824 this_len = sizeof(*iref) + this_name_len; 4825 } else { 4826 struct btrfs_inode_extref *extref; 4827 4828 extref = (struct btrfs_inode_extref *)(ptr + 4829 cur_offset); 4830 parent = btrfs_inode_extref_parent(eb, extref); 4831 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4832 name_ptr = (unsigned long)&extref->name; 4833 this_len = sizeof(*extref) + this_name_len; 4834 } 4835 4836 if (this_name_len > name_len) { 4837 char *new_name; 4838 4839 new_name = krealloc(name, this_name_len, GFP_NOFS); 4840 if (!new_name) { 4841 ret = -ENOMEM; 4842 goto out; 4843 } 4844 name_len = this_name_len; 4845 name = new_name; 4846 } 4847 4848 read_extent_buffer(eb, name, name_ptr, this_name_len); 4849 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4850 parent, name, this_name_len, 0); 4851 if (di && !IS_ERR(di)) { 4852 struct btrfs_key di_key; 4853 4854 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4855 di, &di_key); 4856 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4857 ret = 1; 4858 *other_ino = di_key.objectid; 4859 } else { 4860 ret = -EAGAIN; 4861 } 4862 goto out; 4863 } else if (IS_ERR(di)) { 4864 ret = PTR_ERR(di); 4865 goto out; 4866 } 4867 btrfs_release_path(search_path); 4868 4869 cur_offset += this_len; 4870 } 4871 ret = 0; 4872 out: 4873 btrfs_free_path(search_path); 4874 kfree(name); 4875 return ret; 4876 } 4877 4878 /* log a single inode in the tree log. 4879 * At least one parent directory for this inode must exist in the tree 4880 * or be logged already. 4881 * 4882 * Any items from this inode changed by the current transaction are copied 4883 * to the log tree. An extra reference is taken on any extents in this 4884 * file, allowing us to avoid a whole pile of corner cases around logging 4885 * blocks that have been removed from the tree. 4886 * 4887 * See LOG_INODE_ALL and related defines for a description of what inode_only 4888 * does. 4889 * 4890 * This handles both files and directories. 4891 */ 4892 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4893 struct btrfs_root *root, struct btrfs_inode *inode, 4894 int inode_only, 4895 const loff_t start, 4896 const loff_t end, 4897 struct btrfs_log_ctx *ctx) 4898 { 4899 struct btrfs_fs_info *fs_info = root->fs_info; 4900 struct btrfs_path *path; 4901 struct btrfs_path *dst_path; 4902 struct btrfs_key min_key; 4903 struct btrfs_key max_key; 4904 struct btrfs_root *log = root->log_root; 4905 LIST_HEAD(logged_list); 4906 u64 last_extent = 0; 4907 int err = 0; 4908 int ret; 4909 int nritems; 4910 int ins_start_slot = 0; 4911 int ins_nr; 4912 bool fast_search = false; 4913 u64 ino = btrfs_ino(inode); 4914 struct extent_map_tree *em_tree = &inode->extent_tree; 4915 u64 logged_isize = 0; 4916 bool need_log_inode_item = true; 4917 bool xattrs_logged = false; 4918 4919 path = btrfs_alloc_path(); 4920 if (!path) 4921 return -ENOMEM; 4922 dst_path = btrfs_alloc_path(); 4923 if (!dst_path) { 4924 btrfs_free_path(path); 4925 return -ENOMEM; 4926 } 4927 4928 min_key.objectid = ino; 4929 min_key.type = BTRFS_INODE_ITEM_KEY; 4930 min_key.offset = 0; 4931 4932 max_key.objectid = ino; 4933 4934 4935 /* today the code can only do partial logging of directories */ 4936 if (S_ISDIR(inode->vfs_inode.i_mode) || 4937 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4938 &inode->runtime_flags) && 4939 inode_only >= LOG_INODE_EXISTS)) 4940 max_key.type = BTRFS_XATTR_ITEM_KEY; 4941 else 4942 max_key.type = (u8)-1; 4943 max_key.offset = (u64)-1; 4944 4945 /* 4946 * Only run delayed items if we are a dir or a new file. 4947 * Otherwise commit the delayed inode only, which is needed in 4948 * order for the log replay code to mark inodes for link count 4949 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4950 */ 4951 if (S_ISDIR(inode->vfs_inode.i_mode) || 4952 inode->generation > fs_info->last_trans_committed) 4953 ret = btrfs_commit_inode_delayed_items(trans, inode); 4954 else 4955 ret = btrfs_commit_inode_delayed_inode(inode); 4956 4957 if (ret) { 4958 btrfs_free_path(path); 4959 btrfs_free_path(dst_path); 4960 return ret; 4961 } 4962 4963 if (inode_only == LOG_OTHER_INODE) { 4964 inode_only = LOG_INODE_EXISTS; 4965 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4966 } else { 4967 mutex_lock(&inode->log_mutex); 4968 } 4969 4970 /* 4971 * a brute force approach to making sure we get the most uptodate 4972 * copies of everything. 4973 */ 4974 if (S_ISDIR(inode->vfs_inode.i_mode)) { 4975 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4976 4977 if (inode_only == LOG_INODE_EXISTS) 4978 max_key_type = BTRFS_XATTR_ITEM_KEY; 4979 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4980 } else { 4981 if (inode_only == LOG_INODE_EXISTS) { 4982 /* 4983 * Make sure the new inode item we write to the log has 4984 * the same isize as the current one (if it exists). 4985 * This is necessary to prevent data loss after log 4986 * replay, and also to prevent doing a wrong expanding 4987 * truncate - for e.g. create file, write 4K into offset 4988 * 0, fsync, write 4K into offset 4096, add hard link, 4989 * fsync some other file (to sync log), power fail - if 4990 * we use the inode's current i_size, after log replay 4991 * we get a 8Kb file, with the last 4Kb extent as a hole 4992 * (zeroes), as if an expanding truncate happened, 4993 * instead of getting a file of 4Kb only. 4994 */ 4995 err = logged_inode_size(log, inode, path, &logged_isize); 4996 if (err) 4997 goto out_unlock; 4998 } 4999 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5000 &inode->runtime_flags)) { 5001 if (inode_only == LOG_INODE_EXISTS) { 5002 max_key.type = BTRFS_XATTR_ITEM_KEY; 5003 ret = drop_objectid_items(trans, log, path, ino, 5004 max_key.type); 5005 } else { 5006 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5007 &inode->runtime_flags); 5008 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5009 &inode->runtime_flags); 5010 while(1) { 5011 ret = btrfs_truncate_inode_items(trans, 5012 log, &inode->vfs_inode, 0, 0); 5013 if (ret != -EAGAIN) 5014 break; 5015 } 5016 } 5017 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5018 &inode->runtime_flags) || 5019 inode_only == LOG_INODE_EXISTS) { 5020 if (inode_only == LOG_INODE_ALL) 5021 fast_search = true; 5022 max_key.type = BTRFS_XATTR_ITEM_KEY; 5023 ret = drop_objectid_items(trans, log, path, ino, 5024 max_key.type); 5025 } else { 5026 if (inode_only == LOG_INODE_ALL) 5027 fast_search = true; 5028 goto log_extents; 5029 } 5030 5031 } 5032 if (ret) { 5033 err = ret; 5034 goto out_unlock; 5035 } 5036 5037 while (1) { 5038 ins_nr = 0; 5039 ret = btrfs_search_forward(root, &min_key, 5040 path, trans->transid); 5041 if (ret < 0) { 5042 err = ret; 5043 goto out_unlock; 5044 } 5045 if (ret != 0) 5046 break; 5047 again: 5048 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 5049 if (min_key.objectid != ino) 5050 break; 5051 if (min_key.type > max_key.type) 5052 break; 5053 5054 if (min_key.type == BTRFS_INODE_ITEM_KEY) 5055 need_log_inode_item = false; 5056 5057 if ((min_key.type == BTRFS_INODE_REF_KEY || 5058 min_key.type == BTRFS_INODE_EXTREF_KEY) && 5059 inode->generation == trans->transid) { 5060 u64 other_ino = 0; 5061 5062 ret = btrfs_check_ref_name_override(path->nodes[0], 5063 path->slots[0], &min_key, inode, 5064 &other_ino); 5065 if (ret < 0) { 5066 err = ret; 5067 goto out_unlock; 5068 } else if (ret > 0 && ctx && 5069 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5070 struct btrfs_key inode_key; 5071 struct inode *other_inode; 5072 5073 if (ins_nr > 0) { 5074 ins_nr++; 5075 } else { 5076 ins_nr = 1; 5077 ins_start_slot = path->slots[0]; 5078 } 5079 ret = copy_items(trans, inode, dst_path, path, 5080 &last_extent, ins_start_slot, 5081 ins_nr, inode_only, 5082 logged_isize); 5083 if (ret < 0) { 5084 err = ret; 5085 goto out_unlock; 5086 } 5087 ins_nr = 0; 5088 btrfs_release_path(path); 5089 inode_key.objectid = other_ino; 5090 inode_key.type = BTRFS_INODE_ITEM_KEY; 5091 inode_key.offset = 0; 5092 other_inode = btrfs_iget(fs_info->sb, 5093 &inode_key, root, 5094 NULL); 5095 /* 5096 * If the other inode that had a conflicting dir 5097 * entry was deleted in the current transaction, 5098 * we don't need to do more work nor fallback to 5099 * a transaction commit. 5100 */ 5101 if (IS_ERR(other_inode) && 5102 PTR_ERR(other_inode) == -ENOENT) { 5103 goto next_key; 5104 } else if (IS_ERR(other_inode)) { 5105 err = PTR_ERR(other_inode); 5106 goto out_unlock; 5107 } 5108 /* 5109 * We are safe logging the other inode without 5110 * acquiring its i_mutex as long as we log with 5111 * the LOG_INODE_EXISTS mode. We're safe against 5112 * concurrent renames of the other inode as well 5113 * because during a rename we pin the log and 5114 * update the log with the new name before we 5115 * unpin it. 5116 */ 5117 err = btrfs_log_inode(trans, root, 5118 BTRFS_I(other_inode), 5119 LOG_OTHER_INODE, 0, LLONG_MAX, 5120 ctx); 5121 iput(other_inode); 5122 if (err) 5123 goto out_unlock; 5124 else 5125 goto next_key; 5126 } 5127 } 5128 5129 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 5130 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 5131 if (ins_nr == 0) 5132 goto next_slot; 5133 ret = copy_items(trans, inode, dst_path, path, 5134 &last_extent, ins_start_slot, 5135 ins_nr, inode_only, logged_isize); 5136 if (ret < 0) { 5137 err = ret; 5138 goto out_unlock; 5139 } 5140 ins_nr = 0; 5141 if (ret) { 5142 btrfs_release_path(path); 5143 continue; 5144 } 5145 goto next_slot; 5146 } 5147 5148 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5149 ins_nr++; 5150 goto next_slot; 5151 } else if (!ins_nr) { 5152 ins_start_slot = path->slots[0]; 5153 ins_nr = 1; 5154 goto next_slot; 5155 } 5156 5157 ret = copy_items(trans, inode, dst_path, path, &last_extent, 5158 ins_start_slot, ins_nr, inode_only, 5159 logged_isize); 5160 if (ret < 0) { 5161 err = ret; 5162 goto out_unlock; 5163 } 5164 if (ret) { 5165 ins_nr = 0; 5166 btrfs_release_path(path); 5167 continue; 5168 } 5169 ins_nr = 1; 5170 ins_start_slot = path->slots[0]; 5171 next_slot: 5172 5173 nritems = btrfs_header_nritems(path->nodes[0]); 5174 path->slots[0]++; 5175 if (path->slots[0] < nritems) { 5176 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 5177 path->slots[0]); 5178 goto again; 5179 } 5180 if (ins_nr) { 5181 ret = copy_items(trans, inode, dst_path, path, 5182 &last_extent, ins_start_slot, 5183 ins_nr, inode_only, logged_isize); 5184 if (ret < 0) { 5185 err = ret; 5186 goto out_unlock; 5187 } 5188 ret = 0; 5189 ins_nr = 0; 5190 } 5191 btrfs_release_path(path); 5192 next_key: 5193 if (min_key.offset < (u64)-1) { 5194 min_key.offset++; 5195 } else if (min_key.type < max_key.type) { 5196 min_key.type++; 5197 min_key.offset = 0; 5198 } else { 5199 break; 5200 } 5201 } 5202 if (ins_nr) { 5203 ret = copy_items(trans, inode, dst_path, path, &last_extent, 5204 ins_start_slot, ins_nr, inode_only, 5205 logged_isize); 5206 if (ret < 0) { 5207 err = ret; 5208 goto out_unlock; 5209 } 5210 ret = 0; 5211 ins_nr = 0; 5212 } 5213 5214 btrfs_release_path(path); 5215 btrfs_release_path(dst_path); 5216 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5217 if (err) 5218 goto out_unlock; 5219 xattrs_logged = true; 5220 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5221 btrfs_release_path(path); 5222 btrfs_release_path(dst_path); 5223 err = btrfs_log_trailing_hole(trans, root, inode, path); 5224 if (err) 5225 goto out_unlock; 5226 } 5227 log_extents: 5228 btrfs_release_path(path); 5229 btrfs_release_path(dst_path); 5230 if (need_log_inode_item) { 5231 err = log_inode_item(trans, log, dst_path, inode); 5232 if (!err && !xattrs_logged) { 5233 err = btrfs_log_all_xattrs(trans, root, inode, path, 5234 dst_path); 5235 btrfs_release_path(path); 5236 } 5237 if (err) 5238 goto out_unlock; 5239 } 5240 if (fast_search) { 5241 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5242 &logged_list, ctx, start, end); 5243 if (ret) { 5244 err = ret; 5245 goto out_unlock; 5246 } 5247 } else if (inode_only == LOG_INODE_ALL) { 5248 struct extent_map *em, *n; 5249 5250 write_lock(&em_tree->lock); 5251 /* 5252 * We can't just remove every em if we're called for a ranged 5253 * fsync - that is, one that doesn't cover the whole possible 5254 * file range (0 to LLONG_MAX). This is because we can have 5255 * em's that fall outside the range we're logging and therefore 5256 * their ordered operations haven't completed yet 5257 * (btrfs_finish_ordered_io() not invoked yet). This means we 5258 * didn't get their respective file extent item in the fs/subvol 5259 * tree yet, and need to let the next fast fsync (one which 5260 * consults the list of modified extent maps) find the em so 5261 * that it logs a matching file extent item and waits for the 5262 * respective ordered operation to complete (if it's still 5263 * running). 5264 * 5265 * Removing every em outside the range we're logging would make 5266 * the next fast fsync not log their matching file extent items, 5267 * therefore making us lose data after a log replay. 5268 */ 5269 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5270 list) { 5271 const u64 mod_end = em->mod_start + em->mod_len - 1; 5272 5273 if (em->mod_start >= start && mod_end <= end) 5274 list_del_init(&em->list); 5275 } 5276 write_unlock(&em_tree->lock); 5277 } 5278 5279 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5280 ret = log_directory_changes(trans, root, inode, path, dst_path, 5281 ctx); 5282 if (ret) { 5283 err = ret; 5284 goto out_unlock; 5285 } 5286 } 5287 5288 spin_lock(&inode->lock); 5289 inode->logged_trans = trans->transid; 5290 inode->last_log_commit = inode->last_sub_trans; 5291 spin_unlock(&inode->lock); 5292 out_unlock: 5293 if (unlikely(err)) 5294 btrfs_put_logged_extents(&logged_list); 5295 else 5296 btrfs_submit_logged_extents(&logged_list, log); 5297 mutex_unlock(&inode->log_mutex); 5298 5299 btrfs_free_path(path); 5300 btrfs_free_path(dst_path); 5301 return err; 5302 } 5303 5304 /* 5305 * Check if we must fallback to a transaction commit when logging an inode. 5306 * This must be called after logging the inode and is used only in the context 5307 * when fsyncing an inode requires the need to log some other inode - in which 5308 * case we can't lock the i_mutex of each other inode we need to log as that 5309 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5310 * log inodes up or down in the hierarchy) or rename operations for example. So 5311 * we take the log_mutex of the inode after we have logged it and then check for 5312 * its last_unlink_trans value - this is safe because any task setting 5313 * last_unlink_trans must take the log_mutex and it must do this before it does 5314 * the actual unlink operation, so if we do this check before a concurrent task 5315 * sets last_unlink_trans it means we've logged a consistent version/state of 5316 * all the inode items, otherwise we are not sure and must do a transaction 5317 * commit (the concurrent task might have only updated last_unlink_trans before 5318 * we logged the inode or it might have also done the unlink). 5319 */ 5320 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5321 struct btrfs_inode *inode) 5322 { 5323 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5324 bool ret = false; 5325 5326 mutex_lock(&inode->log_mutex); 5327 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5328 /* 5329 * Make sure any commits to the log are forced to be full 5330 * commits. 5331 */ 5332 btrfs_set_log_full_commit(fs_info, trans); 5333 ret = true; 5334 } 5335 mutex_unlock(&inode->log_mutex); 5336 5337 return ret; 5338 } 5339 5340 /* 5341 * follow the dentry parent pointers up the chain and see if any 5342 * of the directories in it require a full commit before they can 5343 * be logged. Returns zero if nothing special needs to be done or 1 if 5344 * a full commit is required. 5345 */ 5346 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5347 struct btrfs_inode *inode, 5348 struct dentry *parent, 5349 struct super_block *sb, 5350 u64 last_committed) 5351 { 5352 int ret = 0; 5353 struct dentry *old_parent = NULL; 5354 struct btrfs_inode *orig_inode = inode; 5355 5356 /* 5357 * for regular files, if its inode is already on disk, we don't 5358 * have to worry about the parents at all. This is because 5359 * we can use the last_unlink_trans field to record renames 5360 * and other fun in this file. 5361 */ 5362 if (S_ISREG(inode->vfs_inode.i_mode) && 5363 inode->generation <= last_committed && 5364 inode->last_unlink_trans <= last_committed) 5365 goto out; 5366 5367 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5368 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5369 goto out; 5370 inode = BTRFS_I(d_inode(parent)); 5371 } 5372 5373 while (1) { 5374 /* 5375 * If we are logging a directory then we start with our inode, 5376 * not our parent's inode, so we need to skip setting the 5377 * logged_trans so that further down in the log code we don't 5378 * think this inode has already been logged. 5379 */ 5380 if (inode != orig_inode) 5381 inode->logged_trans = trans->transid; 5382 smp_mb(); 5383 5384 if (btrfs_must_commit_transaction(trans, inode)) { 5385 ret = 1; 5386 break; 5387 } 5388 5389 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5390 break; 5391 5392 if (IS_ROOT(parent)) { 5393 inode = BTRFS_I(d_inode(parent)); 5394 if (btrfs_must_commit_transaction(trans, inode)) 5395 ret = 1; 5396 break; 5397 } 5398 5399 parent = dget_parent(parent); 5400 dput(old_parent); 5401 old_parent = parent; 5402 inode = BTRFS_I(d_inode(parent)); 5403 5404 } 5405 dput(old_parent); 5406 out: 5407 return ret; 5408 } 5409 5410 struct btrfs_dir_list { 5411 u64 ino; 5412 struct list_head list; 5413 }; 5414 5415 /* 5416 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5417 * details about the why it is needed. 5418 * This is a recursive operation - if an existing dentry corresponds to a 5419 * directory, that directory's new entries are logged too (same behaviour as 5420 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5421 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5422 * complains about the following circular lock dependency / possible deadlock: 5423 * 5424 * CPU0 CPU1 5425 * ---- ---- 5426 * lock(&type->i_mutex_dir_key#3/2); 5427 * lock(sb_internal#2); 5428 * lock(&type->i_mutex_dir_key#3/2); 5429 * lock(&sb->s_type->i_mutex_key#14); 5430 * 5431 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5432 * sb_start_intwrite() in btrfs_start_transaction(). 5433 * Not locking i_mutex of the inodes is still safe because: 5434 * 5435 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5436 * that while logging the inode new references (names) are added or removed 5437 * from the inode, leaving the logged inode item with a link count that does 5438 * not match the number of logged inode reference items. This is fine because 5439 * at log replay time we compute the real number of links and correct the 5440 * link count in the inode item (see replay_one_buffer() and 5441 * link_to_fixup_dir()); 5442 * 5443 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5444 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5445 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5446 * has a size that doesn't match the sum of the lengths of all the logged 5447 * names. This does not result in a problem because if a dir_item key is 5448 * logged but its matching dir_index key is not logged, at log replay time we 5449 * don't use it to replay the respective name (see replay_one_name()). On the 5450 * other hand if only the dir_index key ends up being logged, the respective 5451 * name is added to the fs/subvol tree with both the dir_item and dir_index 5452 * keys created (see replay_one_name()). 5453 * The directory's inode item with a wrong i_size is not a problem as well, 5454 * since we don't use it at log replay time to set the i_size in the inode 5455 * item of the fs/subvol tree (see overwrite_item()). 5456 */ 5457 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5458 struct btrfs_root *root, 5459 struct btrfs_inode *start_inode, 5460 struct btrfs_log_ctx *ctx) 5461 { 5462 struct btrfs_fs_info *fs_info = root->fs_info; 5463 struct btrfs_root *log = root->log_root; 5464 struct btrfs_path *path; 5465 LIST_HEAD(dir_list); 5466 struct btrfs_dir_list *dir_elem; 5467 int ret = 0; 5468 5469 path = btrfs_alloc_path(); 5470 if (!path) 5471 return -ENOMEM; 5472 5473 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5474 if (!dir_elem) { 5475 btrfs_free_path(path); 5476 return -ENOMEM; 5477 } 5478 dir_elem->ino = btrfs_ino(start_inode); 5479 list_add_tail(&dir_elem->list, &dir_list); 5480 5481 while (!list_empty(&dir_list)) { 5482 struct extent_buffer *leaf; 5483 struct btrfs_key min_key; 5484 int nritems; 5485 int i; 5486 5487 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5488 list); 5489 if (ret) 5490 goto next_dir_inode; 5491 5492 min_key.objectid = dir_elem->ino; 5493 min_key.type = BTRFS_DIR_ITEM_KEY; 5494 min_key.offset = 0; 5495 again: 5496 btrfs_release_path(path); 5497 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5498 if (ret < 0) { 5499 goto next_dir_inode; 5500 } else if (ret > 0) { 5501 ret = 0; 5502 goto next_dir_inode; 5503 } 5504 5505 process_leaf: 5506 leaf = path->nodes[0]; 5507 nritems = btrfs_header_nritems(leaf); 5508 for (i = path->slots[0]; i < nritems; i++) { 5509 struct btrfs_dir_item *di; 5510 struct btrfs_key di_key; 5511 struct inode *di_inode; 5512 struct btrfs_dir_list *new_dir_elem; 5513 int log_mode = LOG_INODE_EXISTS; 5514 int type; 5515 5516 btrfs_item_key_to_cpu(leaf, &min_key, i); 5517 if (min_key.objectid != dir_elem->ino || 5518 min_key.type != BTRFS_DIR_ITEM_KEY) 5519 goto next_dir_inode; 5520 5521 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5522 type = btrfs_dir_type(leaf, di); 5523 if (btrfs_dir_transid(leaf, di) < trans->transid && 5524 type != BTRFS_FT_DIR) 5525 continue; 5526 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5527 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5528 continue; 5529 5530 btrfs_release_path(path); 5531 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5532 if (IS_ERR(di_inode)) { 5533 ret = PTR_ERR(di_inode); 5534 goto next_dir_inode; 5535 } 5536 5537 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5538 iput(di_inode); 5539 break; 5540 } 5541 5542 ctx->log_new_dentries = false; 5543 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5544 log_mode = LOG_INODE_ALL; 5545 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5546 log_mode, 0, LLONG_MAX, ctx); 5547 if (!ret && 5548 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5549 ret = 1; 5550 iput(di_inode); 5551 if (ret) 5552 goto next_dir_inode; 5553 if (ctx->log_new_dentries) { 5554 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5555 GFP_NOFS); 5556 if (!new_dir_elem) { 5557 ret = -ENOMEM; 5558 goto next_dir_inode; 5559 } 5560 new_dir_elem->ino = di_key.objectid; 5561 list_add_tail(&new_dir_elem->list, &dir_list); 5562 } 5563 break; 5564 } 5565 if (i == nritems) { 5566 ret = btrfs_next_leaf(log, path); 5567 if (ret < 0) { 5568 goto next_dir_inode; 5569 } else if (ret > 0) { 5570 ret = 0; 5571 goto next_dir_inode; 5572 } 5573 goto process_leaf; 5574 } 5575 if (min_key.offset < (u64)-1) { 5576 min_key.offset++; 5577 goto again; 5578 } 5579 next_dir_inode: 5580 list_del(&dir_elem->list); 5581 kfree(dir_elem); 5582 } 5583 5584 btrfs_free_path(path); 5585 return ret; 5586 } 5587 5588 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5589 struct btrfs_inode *inode, 5590 struct btrfs_log_ctx *ctx) 5591 { 5592 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5593 int ret; 5594 struct btrfs_path *path; 5595 struct btrfs_key key; 5596 struct btrfs_root *root = inode->root; 5597 const u64 ino = btrfs_ino(inode); 5598 5599 path = btrfs_alloc_path(); 5600 if (!path) 5601 return -ENOMEM; 5602 path->skip_locking = 1; 5603 path->search_commit_root = 1; 5604 5605 key.objectid = ino; 5606 key.type = BTRFS_INODE_REF_KEY; 5607 key.offset = 0; 5608 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5609 if (ret < 0) 5610 goto out; 5611 5612 while (true) { 5613 struct extent_buffer *leaf = path->nodes[0]; 5614 int slot = path->slots[0]; 5615 u32 cur_offset = 0; 5616 u32 item_size; 5617 unsigned long ptr; 5618 5619 if (slot >= btrfs_header_nritems(leaf)) { 5620 ret = btrfs_next_leaf(root, path); 5621 if (ret < 0) 5622 goto out; 5623 else if (ret > 0) 5624 break; 5625 continue; 5626 } 5627 5628 btrfs_item_key_to_cpu(leaf, &key, slot); 5629 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5630 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5631 break; 5632 5633 item_size = btrfs_item_size_nr(leaf, slot); 5634 ptr = btrfs_item_ptr_offset(leaf, slot); 5635 while (cur_offset < item_size) { 5636 struct btrfs_key inode_key; 5637 struct inode *dir_inode; 5638 5639 inode_key.type = BTRFS_INODE_ITEM_KEY; 5640 inode_key.offset = 0; 5641 5642 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5643 struct btrfs_inode_extref *extref; 5644 5645 extref = (struct btrfs_inode_extref *) 5646 (ptr + cur_offset); 5647 inode_key.objectid = btrfs_inode_extref_parent( 5648 leaf, extref); 5649 cur_offset += sizeof(*extref); 5650 cur_offset += btrfs_inode_extref_name_len(leaf, 5651 extref); 5652 } else { 5653 inode_key.objectid = key.offset; 5654 cur_offset = item_size; 5655 } 5656 5657 dir_inode = btrfs_iget(fs_info->sb, &inode_key, 5658 root, NULL); 5659 /* If parent inode was deleted, skip it. */ 5660 if (IS_ERR(dir_inode)) 5661 continue; 5662 5663 if (ctx) 5664 ctx->log_new_dentries = false; 5665 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5666 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5667 if (!ret && 5668 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5669 ret = 1; 5670 if (!ret && ctx && ctx->log_new_dentries) 5671 ret = log_new_dir_dentries(trans, root, 5672 BTRFS_I(dir_inode), ctx); 5673 iput(dir_inode); 5674 if (ret) 5675 goto out; 5676 } 5677 path->slots[0]++; 5678 } 5679 ret = 0; 5680 out: 5681 btrfs_free_path(path); 5682 return ret; 5683 } 5684 5685 /* 5686 * helper function around btrfs_log_inode to make sure newly created 5687 * parent directories also end up in the log. A minimal inode and backref 5688 * only logging is done of any parent directories that are older than 5689 * the last committed transaction 5690 */ 5691 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5692 struct btrfs_inode *inode, 5693 struct dentry *parent, 5694 const loff_t start, 5695 const loff_t end, 5696 int inode_only, 5697 struct btrfs_log_ctx *ctx) 5698 { 5699 struct btrfs_root *root = inode->root; 5700 struct btrfs_fs_info *fs_info = root->fs_info; 5701 struct super_block *sb; 5702 struct dentry *old_parent = NULL; 5703 int ret = 0; 5704 u64 last_committed = fs_info->last_trans_committed; 5705 bool log_dentries = false; 5706 struct btrfs_inode *orig_inode = inode; 5707 5708 sb = inode->vfs_inode.i_sb; 5709 5710 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5711 ret = 1; 5712 goto end_no_trans; 5713 } 5714 5715 /* 5716 * The prev transaction commit doesn't complete, we need do 5717 * full commit by ourselves. 5718 */ 5719 if (fs_info->last_trans_log_full_commit > 5720 fs_info->last_trans_committed) { 5721 ret = 1; 5722 goto end_no_trans; 5723 } 5724 5725 if (btrfs_root_refs(&root->root_item) == 0) { 5726 ret = 1; 5727 goto end_no_trans; 5728 } 5729 5730 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5731 last_committed); 5732 if (ret) 5733 goto end_no_trans; 5734 5735 if (btrfs_inode_in_log(inode, trans->transid)) { 5736 ret = BTRFS_NO_LOG_SYNC; 5737 goto end_no_trans; 5738 } 5739 5740 ret = start_log_trans(trans, root, ctx); 5741 if (ret) 5742 goto end_no_trans; 5743 5744 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5745 if (ret) 5746 goto end_trans; 5747 5748 /* 5749 * for regular files, if its inode is already on disk, we don't 5750 * have to worry about the parents at all. This is because 5751 * we can use the last_unlink_trans field to record renames 5752 * and other fun in this file. 5753 */ 5754 if (S_ISREG(inode->vfs_inode.i_mode) && 5755 inode->generation <= last_committed && 5756 inode->last_unlink_trans <= last_committed) { 5757 ret = 0; 5758 goto end_trans; 5759 } 5760 5761 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 5762 log_dentries = true; 5763 5764 /* 5765 * On unlink we must make sure all our current and old parent directory 5766 * inodes are fully logged. This is to prevent leaving dangling 5767 * directory index entries in directories that were our parents but are 5768 * not anymore. Not doing this results in old parent directory being 5769 * impossible to delete after log replay (rmdir will always fail with 5770 * error -ENOTEMPTY). 5771 * 5772 * Example 1: 5773 * 5774 * mkdir testdir 5775 * touch testdir/foo 5776 * ln testdir/foo testdir/bar 5777 * sync 5778 * unlink testdir/bar 5779 * xfs_io -c fsync testdir/foo 5780 * <power failure> 5781 * mount fs, triggers log replay 5782 * 5783 * If we don't log the parent directory (testdir), after log replay the 5784 * directory still has an entry pointing to the file inode using the bar 5785 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5786 * the file inode has a link count of 1. 5787 * 5788 * Example 2: 5789 * 5790 * mkdir testdir 5791 * touch foo 5792 * ln foo testdir/foo2 5793 * ln foo testdir/foo3 5794 * sync 5795 * unlink testdir/foo3 5796 * xfs_io -c fsync foo 5797 * <power failure> 5798 * mount fs, triggers log replay 5799 * 5800 * Similar as the first example, after log replay the parent directory 5801 * testdir still has an entry pointing to the inode file with name foo3 5802 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5803 * and has a link count of 2. 5804 */ 5805 if (inode->last_unlink_trans > last_committed) { 5806 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5807 if (ret) 5808 goto end_trans; 5809 } 5810 5811 while (1) { 5812 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5813 break; 5814 5815 inode = BTRFS_I(d_inode(parent)); 5816 if (root != inode->root) 5817 break; 5818 5819 if (inode->generation > last_committed) { 5820 ret = btrfs_log_inode(trans, root, inode, 5821 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5822 if (ret) 5823 goto end_trans; 5824 } 5825 if (IS_ROOT(parent)) 5826 break; 5827 5828 parent = dget_parent(parent); 5829 dput(old_parent); 5830 old_parent = parent; 5831 } 5832 if (log_dentries) 5833 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5834 else 5835 ret = 0; 5836 end_trans: 5837 dput(old_parent); 5838 if (ret < 0) { 5839 btrfs_set_log_full_commit(fs_info, trans); 5840 ret = 1; 5841 } 5842 5843 if (ret) 5844 btrfs_remove_log_ctx(root, ctx); 5845 btrfs_end_log_trans(root); 5846 end_no_trans: 5847 return ret; 5848 } 5849 5850 /* 5851 * it is not safe to log dentry if the chunk root has added new 5852 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5853 * If this returns 1, you must commit the transaction to safely get your 5854 * data on disk. 5855 */ 5856 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5857 struct dentry *dentry, 5858 const loff_t start, 5859 const loff_t end, 5860 struct btrfs_log_ctx *ctx) 5861 { 5862 struct dentry *parent = dget_parent(dentry); 5863 int ret; 5864 5865 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 5866 start, end, LOG_INODE_ALL, ctx); 5867 dput(parent); 5868 5869 return ret; 5870 } 5871 5872 /* 5873 * should be called during mount to recover any replay any log trees 5874 * from the FS 5875 */ 5876 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5877 { 5878 int ret; 5879 struct btrfs_path *path; 5880 struct btrfs_trans_handle *trans; 5881 struct btrfs_key key; 5882 struct btrfs_key found_key; 5883 struct btrfs_key tmp_key; 5884 struct btrfs_root *log; 5885 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5886 struct walk_control wc = { 5887 .process_func = process_one_buffer, 5888 .stage = 0, 5889 }; 5890 5891 path = btrfs_alloc_path(); 5892 if (!path) 5893 return -ENOMEM; 5894 5895 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5896 5897 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5898 if (IS_ERR(trans)) { 5899 ret = PTR_ERR(trans); 5900 goto error; 5901 } 5902 5903 wc.trans = trans; 5904 wc.pin = 1; 5905 5906 ret = walk_log_tree(trans, log_root_tree, &wc); 5907 if (ret) { 5908 btrfs_handle_fs_error(fs_info, ret, 5909 "Failed to pin buffers while recovering log root tree."); 5910 goto error; 5911 } 5912 5913 again: 5914 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5915 key.offset = (u64)-1; 5916 key.type = BTRFS_ROOT_ITEM_KEY; 5917 5918 while (1) { 5919 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5920 5921 if (ret < 0) { 5922 btrfs_handle_fs_error(fs_info, ret, 5923 "Couldn't find tree log root."); 5924 goto error; 5925 } 5926 if (ret > 0) { 5927 if (path->slots[0] == 0) 5928 break; 5929 path->slots[0]--; 5930 } 5931 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5932 path->slots[0]); 5933 btrfs_release_path(path); 5934 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5935 break; 5936 5937 log = btrfs_read_fs_root(log_root_tree, &found_key); 5938 if (IS_ERR(log)) { 5939 ret = PTR_ERR(log); 5940 btrfs_handle_fs_error(fs_info, ret, 5941 "Couldn't read tree log root."); 5942 goto error; 5943 } 5944 5945 tmp_key.objectid = found_key.offset; 5946 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5947 tmp_key.offset = (u64)-1; 5948 5949 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5950 if (IS_ERR(wc.replay_dest)) { 5951 ret = PTR_ERR(wc.replay_dest); 5952 free_extent_buffer(log->node); 5953 free_extent_buffer(log->commit_root); 5954 kfree(log); 5955 btrfs_handle_fs_error(fs_info, ret, 5956 "Couldn't read target root for tree log recovery."); 5957 goto error; 5958 } 5959 5960 wc.replay_dest->log_root = log; 5961 btrfs_record_root_in_trans(trans, wc.replay_dest); 5962 ret = walk_log_tree(trans, log, &wc); 5963 5964 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5965 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5966 path); 5967 } 5968 5969 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5970 struct btrfs_root *root = wc.replay_dest; 5971 5972 btrfs_release_path(path); 5973 5974 /* 5975 * We have just replayed everything, and the highest 5976 * objectid of fs roots probably has changed in case 5977 * some inode_item's got replayed. 5978 * 5979 * root->objectid_mutex is not acquired as log replay 5980 * could only happen during mount. 5981 */ 5982 ret = btrfs_find_highest_objectid(root, 5983 &root->highest_objectid); 5984 } 5985 5986 key.offset = found_key.offset - 1; 5987 wc.replay_dest->log_root = NULL; 5988 free_extent_buffer(log->node); 5989 free_extent_buffer(log->commit_root); 5990 kfree(log); 5991 5992 if (ret) 5993 goto error; 5994 5995 if (found_key.offset == 0) 5996 break; 5997 } 5998 btrfs_release_path(path); 5999 6000 /* step one is to pin it all, step two is to replay just inodes */ 6001 if (wc.pin) { 6002 wc.pin = 0; 6003 wc.process_func = replay_one_buffer; 6004 wc.stage = LOG_WALK_REPLAY_INODES; 6005 goto again; 6006 } 6007 /* step three is to replay everything */ 6008 if (wc.stage < LOG_WALK_REPLAY_ALL) { 6009 wc.stage++; 6010 goto again; 6011 } 6012 6013 btrfs_free_path(path); 6014 6015 /* step 4: commit the transaction, which also unpins the blocks */ 6016 ret = btrfs_commit_transaction(trans); 6017 if (ret) 6018 return ret; 6019 6020 free_extent_buffer(log_root_tree->node); 6021 log_root_tree->log_root = NULL; 6022 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6023 kfree(log_root_tree); 6024 6025 return 0; 6026 error: 6027 if (wc.trans) 6028 btrfs_end_transaction(wc.trans); 6029 btrfs_free_path(path); 6030 return ret; 6031 } 6032 6033 /* 6034 * there are some corner cases where we want to force a full 6035 * commit instead of allowing a directory to be logged. 6036 * 6037 * They revolve around files there were unlinked from the directory, and 6038 * this function updates the parent directory so that a full commit is 6039 * properly done if it is fsync'd later after the unlinks are done. 6040 * 6041 * Must be called before the unlink operations (updates to the subvolume tree, 6042 * inodes, etc) are done. 6043 */ 6044 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 6045 struct btrfs_inode *dir, struct btrfs_inode *inode, 6046 int for_rename) 6047 { 6048 /* 6049 * when we're logging a file, if it hasn't been renamed 6050 * or unlinked, and its inode is fully committed on disk, 6051 * we don't have to worry about walking up the directory chain 6052 * to log its parents. 6053 * 6054 * So, we use the last_unlink_trans field to put this transid 6055 * into the file. When the file is logged we check it and 6056 * don't log the parents if the file is fully on disk. 6057 */ 6058 mutex_lock(&inode->log_mutex); 6059 inode->last_unlink_trans = trans->transid; 6060 mutex_unlock(&inode->log_mutex); 6061 6062 /* 6063 * if this directory was already logged any new 6064 * names for this file/dir will get recorded 6065 */ 6066 smp_mb(); 6067 if (dir->logged_trans == trans->transid) 6068 return; 6069 6070 /* 6071 * if the inode we're about to unlink was logged, 6072 * the log will be properly updated for any new names 6073 */ 6074 if (inode->logged_trans == trans->transid) 6075 return; 6076 6077 /* 6078 * when renaming files across directories, if the directory 6079 * there we're unlinking from gets fsync'd later on, there's 6080 * no way to find the destination directory later and fsync it 6081 * properly. So, we have to be conservative and force commits 6082 * so the new name gets discovered. 6083 */ 6084 if (for_rename) 6085 goto record; 6086 6087 /* we can safely do the unlink without any special recording */ 6088 return; 6089 6090 record: 6091 mutex_lock(&dir->log_mutex); 6092 dir->last_unlink_trans = trans->transid; 6093 mutex_unlock(&dir->log_mutex); 6094 } 6095 6096 /* 6097 * Make sure that if someone attempts to fsync the parent directory of a deleted 6098 * snapshot, it ends up triggering a transaction commit. This is to guarantee 6099 * that after replaying the log tree of the parent directory's root we will not 6100 * see the snapshot anymore and at log replay time we will not see any log tree 6101 * corresponding to the deleted snapshot's root, which could lead to replaying 6102 * it after replaying the log tree of the parent directory (which would replay 6103 * the snapshot delete operation). 6104 * 6105 * Must be called before the actual snapshot destroy operation (updates to the 6106 * parent root and tree of tree roots trees, etc) are done. 6107 */ 6108 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 6109 struct btrfs_inode *dir) 6110 { 6111 mutex_lock(&dir->log_mutex); 6112 dir->last_unlink_trans = trans->transid; 6113 mutex_unlock(&dir->log_mutex); 6114 } 6115 6116 /* 6117 * Call this after adding a new name for a file and it will properly 6118 * update the log to reflect the new name. 6119 * 6120 * It will return zero if all goes well, and it will return 1 if a 6121 * full transaction commit is required. 6122 */ 6123 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 6124 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 6125 struct dentry *parent) 6126 { 6127 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6128 6129 /* 6130 * this will force the logging code to walk the dentry chain 6131 * up for the file 6132 */ 6133 if (!S_ISDIR(inode->vfs_inode.i_mode)) 6134 inode->last_unlink_trans = trans->transid; 6135 6136 /* 6137 * if this inode hasn't been logged and directory we're renaming it 6138 * from hasn't been logged, we don't need to log it 6139 */ 6140 if (inode->logged_trans <= fs_info->last_trans_committed && 6141 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) 6142 return 0; 6143 6144 return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, 6145 LOG_INODE_EXISTS, NULL); 6146 } 6147 6148