1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/blkdev.h> 9 #include <linux/list_sort.h> 10 #include <linux/iversion.h> 11 #include "ctree.h" 12 #include "tree-log.h" 13 #include "disk-io.h" 14 #include "locking.h" 15 #include "print-tree.h" 16 #include "backref.h" 17 #include "compression.h" 18 #include "qgroup.h" 19 #include "inode-map.h" 20 21 /* magic values for the inode_only field in btrfs_log_inode: 22 * 23 * LOG_INODE_ALL means to log everything 24 * LOG_INODE_EXISTS means to log just enough to recreate the inode 25 * during log replay 26 */ 27 #define LOG_INODE_ALL 0 28 #define LOG_INODE_EXISTS 1 29 #define LOG_OTHER_INODE 2 30 31 /* 32 * directory trouble cases 33 * 34 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 35 * log, we must force a full commit before doing an fsync of the directory 36 * where the unlink was done. 37 * ---> record transid of last unlink/rename per directory 38 * 39 * mkdir foo/some_dir 40 * normal commit 41 * rename foo/some_dir foo2/some_dir 42 * mkdir foo/some_dir 43 * fsync foo/some_dir/some_file 44 * 45 * The fsync above will unlink the original some_dir without recording 46 * it in its new location (foo2). After a crash, some_dir will be gone 47 * unless the fsync of some_file forces a full commit 48 * 49 * 2) we must log any new names for any file or dir that is in the fsync 50 * log. ---> check inode while renaming/linking. 51 * 52 * 2a) we must log any new names for any file or dir during rename 53 * when the directory they are being removed from was logged. 54 * ---> check inode and old parent dir during rename 55 * 56 * 2a is actually the more important variant. With the extra logging 57 * a crash might unlink the old name without recreating the new one 58 * 59 * 3) after a crash, we must go through any directories with a link count 60 * of zero and redo the rm -rf 61 * 62 * mkdir f1/foo 63 * normal commit 64 * rm -rf f1/foo 65 * fsync(f1) 66 * 67 * The directory f1 was fully removed from the FS, but fsync was never 68 * called on f1, only its parent dir. After a crash the rm -rf must 69 * be replayed. This must be able to recurse down the entire 70 * directory tree. The inode link count fixup code takes care of the 71 * ugly details. 72 */ 73 74 /* 75 * stages for the tree walking. The first 76 * stage (0) is to only pin down the blocks we find 77 * the second stage (1) is to make sure that all the inodes 78 * we find in the log are created in the subvolume. 79 * 80 * The last stage is to deal with directories and links and extents 81 * and all the other fun semantics 82 */ 83 #define LOG_WALK_PIN_ONLY 0 84 #define LOG_WALK_REPLAY_INODES 1 85 #define LOG_WALK_REPLAY_DIR_INDEX 2 86 #define LOG_WALK_REPLAY_ALL 3 87 88 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 89 struct btrfs_root *root, struct btrfs_inode *inode, 90 int inode_only, 91 const loff_t start, 92 const loff_t end, 93 struct btrfs_log_ctx *ctx); 94 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 95 struct btrfs_root *root, 96 struct btrfs_path *path, u64 objectid); 97 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 98 struct btrfs_root *root, 99 struct btrfs_root *log, 100 struct btrfs_path *path, 101 u64 dirid, int del_all); 102 103 /* 104 * tree logging is a special write ahead log used to make sure that 105 * fsyncs and O_SYNCs can happen without doing full tree commits. 106 * 107 * Full tree commits are expensive because they require commonly 108 * modified blocks to be recowed, creating many dirty pages in the 109 * extent tree an 4x-6x higher write load than ext3. 110 * 111 * Instead of doing a tree commit on every fsync, we use the 112 * key ranges and transaction ids to find items for a given file or directory 113 * that have changed in this transaction. Those items are copied into 114 * a special tree (one per subvolume root), that tree is written to disk 115 * and then the fsync is considered complete. 116 * 117 * After a crash, items are copied out of the log-tree back into the 118 * subvolume tree. Any file data extents found are recorded in the extent 119 * allocation tree, and the log-tree freed. 120 * 121 * The log tree is read three times, once to pin down all the extents it is 122 * using in ram and once, once to create all the inodes logged in the tree 123 * and once to do all the other items. 124 */ 125 126 /* 127 * start a sub transaction and setup the log tree 128 * this increments the log tree writer count to make the people 129 * syncing the tree wait for us to finish 130 */ 131 static int start_log_trans(struct btrfs_trans_handle *trans, 132 struct btrfs_root *root, 133 struct btrfs_log_ctx *ctx) 134 { 135 struct btrfs_fs_info *fs_info = root->fs_info; 136 int ret = 0; 137 138 mutex_lock(&root->log_mutex); 139 140 if (root->log_root) { 141 if (btrfs_need_log_full_commit(fs_info, trans)) { 142 ret = -EAGAIN; 143 goto out; 144 } 145 146 if (!root->log_start_pid) { 147 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 148 root->log_start_pid = current->pid; 149 } else if (root->log_start_pid != current->pid) { 150 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 151 } 152 } else { 153 mutex_lock(&fs_info->tree_log_mutex); 154 if (!fs_info->log_root_tree) 155 ret = btrfs_init_log_root_tree(trans, fs_info); 156 mutex_unlock(&fs_info->tree_log_mutex); 157 if (ret) 158 goto out; 159 160 ret = btrfs_add_log_tree(trans, root); 161 if (ret) 162 goto out; 163 164 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 165 root->log_start_pid = current->pid; 166 } 167 168 atomic_inc(&root->log_batch); 169 atomic_inc(&root->log_writers); 170 if (ctx) { 171 int index = root->log_transid % 2; 172 list_add_tail(&ctx->list, &root->log_ctxs[index]); 173 ctx->log_transid = root->log_transid; 174 } 175 176 out: 177 mutex_unlock(&root->log_mutex); 178 return ret; 179 } 180 181 /* 182 * returns 0 if there was a log transaction running and we were able 183 * to join, or returns -ENOENT if there were not transactions 184 * in progress 185 */ 186 static int join_running_log_trans(struct btrfs_root *root) 187 { 188 int ret = -ENOENT; 189 190 smp_mb(); 191 if (!root->log_root) 192 return -ENOENT; 193 194 mutex_lock(&root->log_mutex); 195 if (root->log_root) { 196 ret = 0; 197 atomic_inc(&root->log_writers); 198 } 199 mutex_unlock(&root->log_mutex); 200 return ret; 201 } 202 203 /* 204 * This either makes the current running log transaction wait 205 * until you call btrfs_end_log_trans() or it makes any future 206 * log transactions wait until you call btrfs_end_log_trans() 207 */ 208 int btrfs_pin_log_trans(struct btrfs_root *root) 209 { 210 int ret = -ENOENT; 211 212 mutex_lock(&root->log_mutex); 213 atomic_inc(&root->log_writers); 214 mutex_unlock(&root->log_mutex); 215 return ret; 216 } 217 218 /* 219 * indicate we're done making changes to the log tree 220 * and wake up anyone waiting to do a sync 221 */ 222 void btrfs_end_log_trans(struct btrfs_root *root) 223 { 224 if (atomic_dec_and_test(&root->log_writers)) { 225 /* atomic_dec_and_test implies a barrier */ 226 cond_wake_up_nomb(&root->log_writer_wait); 227 } 228 } 229 230 231 /* 232 * the walk control struct is used to pass state down the chain when 233 * processing the log tree. The stage field tells us which part 234 * of the log tree processing we are currently doing. The others 235 * are state fields used for that specific part 236 */ 237 struct walk_control { 238 /* should we free the extent on disk when done? This is used 239 * at transaction commit time while freeing a log tree 240 */ 241 int free; 242 243 /* should we write out the extent buffer? This is used 244 * while flushing the log tree to disk during a sync 245 */ 246 int write; 247 248 /* should we wait for the extent buffer io to finish? Also used 249 * while flushing the log tree to disk for a sync 250 */ 251 int wait; 252 253 /* pin only walk, we record which extents on disk belong to the 254 * log trees 255 */ 256 int pin; 257 258 /* what stage of the replay code we're currently in */ 259 int stage; 260 261 /* the root we are currently replaying */ 262 struct btrfs_root *replay_dest; 263 264 /* the trans handle for the current replay */ 265 struct btrfs_trans_handle *trans; 266 267 /* the function that gets used to process blocks we find in the 268 * tree. Note the extent_buffer might not be up to date when it is 269 * passed in, and it must be checked or read if you need the data 270 * inside it 271 */ 272 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 273 struct walk_control *wc, u64 gen, int level); 274 }; 275 276 /* 277 * process_func used to pin down extents, write them or wait on them 278 */ 279 static int process_one_buffer(struct btrfs_root *log, 280 struct extent_buffer *eb, 281 struct walk_control *wc, u64 gen, int level) 282 { 283 struct btrfs_fs_info *fs_info = log->fs_info; 284 int ret = 0; 285 286 /* 287 * If this fs is mixed then we need to be able to process the leaves to 288 * pin down any logged extents, so we have to read the block. 289 */ 290 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 291 ret = btrfs_read_buffer(eb, gen, level, NULL); 292 if (ret) 293 return ret; 294 } 295 296 if (wc->pin) 297 ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start, 298 eb->len); 299 300 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 301 if (wc->pin && btrfs_header_level(eb) == 0) 302 ret = btrfs_exclude_logged_extents(fs_info, eb); 303 if (wc->write) 304 btrfs_write_tree_block(eb); 305 if (wc->wait) 306 btrfs_wait_tree_block_writeback(eb); 307 } 308 return ret; 309 } 310 311 /* 312 * Item overwrite used by replay and tree logging. eb, slot and key all refer 313 * to the src data we are copying out. 314 * 315 * root is the tree we are copying into, and path is a scratch 316 * path for use in this function (it should be released on entry and 317 * will be released on exit). 318 * 319 * If the key is already in the destination tree the existing item is 320 * overwritten. If the existing item isn't big enough, it is extended. 321 * If it is too large, it is truncated. 322 * 323 * If the key isn't in the destination yet, a new item is inserted. 324 */ 325 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 326 struct btrfs_root *root, 327 struct btrfs_path *path, 328 struct extent_buffer *eb, int slot, 329 struct btrfs_key *key) 330 { 331 struct btrfs_fs_info *fs_info = root->fs_info; 332 int ret; 333 u32 item_size; 334 u64 saved_i_size = 0; 335 int save_old_i_size = 0; 336 unsigned long src_ptr; 337 unsigned long dst_ptr; 338 int overwrite_root = 0; 339 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 340 341 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 342 overwrite_root = 1; 343 344 item_size = btrfs_item_size_nr(eb, slot); 345 src_ptr = btrfs_item_ptr_offset(eb, slot); 346 347 /* look for the key in the destination tree */ 348 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 349 if (ret < 0) 350 return ret; 351 352 if (ret == 0) { 353 char *src_copy; 354 char *dst_copy; 355 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 356 path->slots[0]); 357 if (dst_size != item_size) 358 goto insert; 359 360 if (item_size == 0) { 361 btrfs_release_path(path); 362 return 0; 363 } 364 dst_copy = kmalloc(item_size, GFP_NOFS); 365 src_copy = kmalloc(item_size, GFP_NOFS); 366 if (!dst_copy || !src_copy) { 367 btrfs_release_path(path); 368 kfree(dst_copy); 369 kfree(src_copy); 370 return -ENOMEM; 371 } 372 373 read_extent_buffer(eb, src_copy, src_ptr, item_size); 374 375 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 376 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 377 item_size); 378 ret = memcmp(dst_copy, src_copy, item_size); 379 380 kfree(dst_copy); 381 kfree(src_copy); 382 /* 383 * they have the same contents, just return, this saves 384 * us from cowing blocks in the destination tree and doing 385 * extra writes that may not have been done by a previous 386 * sync 387 */ 388 if (ret == 0) { 389 btrfs_release_path(path); 390 return 0; 391 } 392 393 /* 394 * We need to load the old nbytes into the inode so when we 395 * replay the extents we've logged we get the right nbytes. 396 */ 397 if (inode_item) { 398 struct btrfs_inode_item *item; 399 u64 nbytes; 400 u32 mode; 401 402 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 403 struct btrfs_inode_item); 404 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 405 item = btrfs_item_ptr(eb, slot, 406 struct btrfs_inode_item); 407 btrfs_set_inode_nbytes(eb, item, nbytes); 408 409 /* 410 * If this is a directory we need to reset the i_size to 411 * 0 so that we can set it up properly when replaying 412 * the rest of the items in this log. 413 */ 414 mode = btrfs_inode_mode(eb, item); 415 if (S_ISDIR(mode)) 416 btrfs_set_inode_size(eb, item, 0); 417 } 418 } else if (inode_item) { 419 struct btrfs_inode_item *item; 420 u32 mode; 421 422 /* 423 * New inode, set nbytes to 0 so that the nbytes comes out 424 * properly when we replay the extents. 425 */ 426 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 427 btrfs_set_inode_nbytes(eb, item, 0); 428 429 /* 430 * If this is a directory we need to reset the i_size to 0 so 431 * that we can set it up properly when replaying the rest of 432 * the items in this log. 433 */ 434 mode = btrfs_inode_mode(eb, item); 435 if (S_ISDIR(mode)) 436 btrfs_set_inode_size(eb, item, 0); 437 } 438 insert: 439 btrfs_release_path(path); 440 /* try to insert the key into the destination tree */ 441 path->skip_release_on_error = 1; 442 ret = btrfs_insert_empty_item(trans, root, path, 443 key, item_size); 444 path->skip_release_on_error = 0; 445 446 /* make sure any existing item is the correct size */ 447 if (ret == -EEXIST || ret == -EOVERFLOW) { 448 u32 found_size; 449 found_size = btrfs_item_size_nr(path->nodes[0], 450 path->slots[0]); 451 if (found_size > item_size) 452 btrfs_truncate_item(fs_info, path, item_size, 1); 453 else if (found_size < item_size) 454 btrfs_extend_item(fs_info, path, 455 item_size - found_size); 456 } else if (ret) { 457 return ret; 458 } 459 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 460 path->slots[0]); 461 462 /* don't overwrite an existing inode if the generation number 463 * was logged as zero. This is done when the tree logging code 464 * is just logging an inode to make sure it exists after recovery. 465 * 466 * Also, don't overwrite i_size on directories during replay. 467 * log replay inserts and removes directory items based on the 468 * state of the tree found in the subvolume, and i_size is modified 469 * as it goes 470 */ 471 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 472 struct btrfs_inode_item *src_item; 473 struct btrfs_inode_item *dst_item; 474 475 src_item = (struct btrfs_inode_item *)src_ptr; 476 dst_item = (struct btrfs_inode_item *)dst_ptr; 477 478 if (btrfs_inode_generation(eb, src_item) == 0) { 479 struct extent_buffer *dst_eb = path->nodes[0]; 480 const u64 ino_size = btrfs_inode_size(eb, src_item); 481 482 /* 483 * For regular files an ino_size == 0 is used only when 484 * logging that an inode exists, as part of a directory 485 * fsync, and the inode wasn't fsynced before. In this 486 * case don't set the size of the inode in the fs/subvol 487 * tree, otherwise we would be throwing valid data away. 488 */ 489 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 490 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 491 ino_size != 0) { 492 struct btrfs_map_token token; 493 494 btrfs_init_map_token(&token); 495 btrfs_set_token_inode_size(dst_eb, dst_item, 496 ino_size, &token); 497 } 498 goto no_copy; 499 } 500 501 if (overwrite_root && 502 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 503 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 504 save_old_i_size = 1; 505 saved_i_size = btrfs_inode_size(path->nodes[0], 506 dst_item); 507 } 508 } 509 510 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 511 src_ptr, item_size); 512 513 if (save_old_i_size) { 514 struct btrfs_inode_item *dst_item; 515 dst_item = (struct btrfs_inode_item *)dst_ptr; 516 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 517 } 518 519 /* make sure the generation is filled in */ 520 if (key->type == BTRFS_INODE_ITEM_KEY) { 521 struct btrfs_inode_item *dst_item; 522 dst_item = (struct btrfs_inode_item *)dst_ptr; 523 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 524 btrfs_set_inode_generation(path->nodes[0], dst_item, 525 trans->transid); 526 } 527 } 528 no_copy: 529 btrfs_mark_buffer_dirty(path->nodes[0]); 530 btrfs_release_path(path); 531 return 0; 532 } 533 534 /* 535 * simple helper to read an inode off the disk from a given root 536 * This can only be called for subvolume roots and not for the log 537 */ 538 static noinline struct inode *read_one_inode(struct btrfs_root *root, 539 u64 objectid) 540 { 541 struct btrfs_key key; 542 struct inode *inode; 543 544 key.objectid = objectid; 545 key.type = BTRFS_INODE_ITEM_KEY; 546 key.offset = 0; 547 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); 548 if (IS_ERR(inode)) 549 inode = NULL; 550 return inode; 551 } 552 553 /* replays a single extent in 'eb' at 'slot' with 'key' into the 554 * subvolume 'root'. path is released on entry and should be released 555 * on exit. 556 * 557 * extents in the log tree have not been allocated out of the extent 558 * tree yet. So, this completes the allocation, taking a reference 559 * as required if the extent already exists or creating a new extent 560 * if it isn't in the extent allocation tree yet. 561 * 562 * The extent is inserted into the file, dropping any existing extents 563 * from the file that overlap the new one. 564 */ 565 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 566 struct btrfs_root *root, 567 struct btrfs_path *path, 568 struct extent_buffer *eb, int slot, 569 struct btrfs_key *key) 570 { 571 struct btrfs_fs_info *fs_info = root->fs_info; 572 int found_type; 573 u64 extent_end; 574 u64 start = key->offset; 575 u64 nbytes = 0; 576 struct btrfs_file_extent_item *item; 577 struct inode *inode = NULL; 578 unsigned long size; 579 int ret = 0; 580 581 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 582 found_type = btrfs_file_extent_type(eb, item); 583 584 if (found_type == BTRFS_FILE_EXTENT_REG || 585 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 586 nbytes = btrfs_file_extent_num_bytes(eb, item); 587 extent_end = start + nbytes; 588 589 /* 590 * We don't add to the inodes nbytes if we are prealloc or a 591 * hole. 592 */ 593 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 594 nbytes = 0; 595 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 596 size = btrfs_file_extent_ram_bytes(eb, item); 597 nbytes = btrfs_file_extent_ram_bytes(eb, item); 598 extent_end = ALIGN(start + size, 599 fs_info->sectorsize); 600 } else { 601 ret = 0; 602 goto out; 603 } 604 605 inode = read_one_inode(root, key->objectid); 606 if (!inode) { 607 ret = -EIO; 608 goto out; 609 } 610 611 /* 612 * first check to see if we already have this extent in the 613 * file. This must be done before the btrfs_drop_extents run 614 * so we don't try to drop this extent. 615 */ 616 ret = btrfs_lookup_file_extent(trans, root, path, 617 btrfs_ino(BTRFS_I(inode)), start, 0); 618 619 if (ret == 0 && 620 (found_type == BTRFS_FILE_EXTENT_REG || 621 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 622 struct btrfs_file_extent_item cmp1; 623 struct btrfs_file_extent_item cmp2; 624 struct btrfs_file_extent_item *existing; 625 struct extent_buffer *leaf; 626 627 leaf = path->nodes[0]; 628 existing = btrfs_item_ptr(leaf, path->slots[0], 629 struct btrfs_file_extent_item); 630 631 read_extent_buffer(eb, &cmp1, (unsigned long)item, 632 sizeof(cmp1)); 633 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 634 sizeof(cmp2)); 635 636 /* 637 * we already have a pointer to this exact extent, 638 * we don't have to do anything 639 */ 640 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 641 btrfs_release_path(path); 642 goto out; 643 } 644 } 645 btrfs_release_path(path); 646 647 /* drop any overlapping extents */ 648 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); 649 if (ret) 650 goto out; 651 652 if (found_type == BTRFS_FILE_EXTENT_REG || 653 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 654 u64 offset; 655 unsigned long dest_offset; 656 struct btrfs_key ins; 657 658 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && 659 btrfs_fs_incompat(fs_info, NO_HOLES)) 660 goto update_inode; 661 662 ret = btrfs_insert_empty_item(trans, root, path, key, 663 sizeof(*item)); 664 if (ret) 665 goto out; 666 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 667 path->slots[0]); 668 copy_extent_buffer(path->nodes[0], eb, dest_offset, 669 (unsigned long)item, sizeof(*item)); 670 671 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 672 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 673 ins.type = BTRFS_EXTENT_ITEM_KEY; 674 offset = key->offset - btrfs_file_extent_offset(eb, item); 675 676 /* 677 * Manually record dirty extent, as here we did a shallow 678 * file extent item copy and skip normal backref update, 679 * but modifying extent tree all by ourselves. 680 * So need to manually record dirty extent for qgroup, 681 * as the owner of the file extent changed from log tree 682 * (doesn't affect qgroup) to fs/file tree(affects qgroup) 683 */ 684 ret = btrfs_qgroup_trace_extent(trans, 685 btrfs_file_extent_disk_bytenr(eb, item), 686 btrfs_file_extent_disk_num_bytes(eb, item), 687 GFP_NOFS); 688 if (ret < 0) 689 goto out; 690 691 if (ins.objectid > 0) { 692 u64 csum_start; 693 u64 csum_end; 694 LIST_HEAD(ordered_sums); 695 /* 696 * is this extent already allocated in the extent 697 * allocation tree? If so, just add a reference 698 */ 699 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, 700 ins.offset); 701 if (ret == 0) { 702 ret = btrfs_inc_extent_ref(trans, root, 703 ins.objectid, ins.offset, 704 0, root->root_key.objectid, 705 key->objectid, offset); 706 if (ret) 707 goto out; 708 } else { 709 /* 710 * insert the extent pointer in the extent 711 * allocation tree 712 */ 713 ret = btrfs_alloc_logged_file_extent(trans, 714 root->root_key.objectid, 715 key->objectid, offset, &ins); 716 if (ret) 717 goto out; 718 } 719 btrfs_release_path(path); 720 721 if (btrfs_file_extent_compression(eb, item)) { 722 csum_start = ins.objectid; 723 csum_end = csum_start + ins.offset; 724 } else { 725 csum_start = ins.objectid + 726 btrfs_file_extent_offset(eb, item); 727 csum_end = csum_start + 728 btrfs_file_extent_num_bytes(eb, item); 729 } 730 731 ret = btrfs_lookup_csums_range(root->log_root, 732 csum_start, csum_end - 1, 733 &ordered_sums, 0); 734 if (ret) 735 goto out; 736 /* 737 * Now delete all existing cums in the csum root that 738 * cover our range. We do this because we can have an 739 * extent that is completely referenced by one file 740 * extent item and partially referenced by another 741 * file extent item (like after using the clone or 742 * extent_same ioctls). In this case if we end up doing 743 * the replay of the one that partially references the 744 * extent first, and we do not do the csum deletion 745 * below, we can get 2 csum items in the csum tree that 746 * overlap each other. For example, imagine our log has 747 * the two following file extent items: 748 * 749 * key (257 EXTENT_DATA 409600) 750 * extent data disk byte 12845056 nr 102400 751 * extent data offset 20480 nr 20480 ram 102400 752 * 753 * key (257 EXTENT_DATA 819200) 754 * extent data disk byte 12845056 nr 102400 755 * extent data offset 0 nr 102400 ram 102400 756 * 757 * Where the second one fully references the 100K extent 758 * that starts at disk byte 12845056, and the log tree 759 * has a single csum item that covers the entire range 760 * of the extent: 761 * 762 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 763 * 764 * After the first file extent item is replayed, the 765 * csum tree gets the following csum item: 766 * 767 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 768 * 769 * Which covers the 20K sub-range starting at offset 20K 770 * of our extent. Now when we replay the second file 771 * extent item, if we do not delete existing csum items 772 * that cover any of its blocks, we end up getting two 773 * csum items in our csum tree that overlap each other: 774 * 775 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 776 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 777 * 778 * Which is a problem, because after this anyone trying 779 * to lookup up for the checksum of any block of our 780 * extent starting at an offset of 40K or higher, will 781 * end up looking at the second csum item only, which 782 * does not contain the checksum for any block starting 783 * at offset 40K or higher of our extent. 784 */ 785 while (!list_empty(&ordered_sums)) { 786 struct btrfs_ordered_sum *sums; 787 sums = list_entry(ordered_sums.next, 788 struct btrfs_ordered_sum, 789 list); 790 if (!ret) 791 ret = btrfs_del_csums(trans, fs_info, 792 sums->bytenr, 793 sums->len); 794 if (!ret) 795 ret = btrfs_csum_file_blocks(trans, 796 fs_info->csum_root, sums); 797 list_del(&sums->list); 798 kfree(sums); 799 } 800 if (ret) 801 goto out; 802 } else { 803 btrfs_release_path(path); 804 } 805 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 806 /* inline extents are easy, we just overwrite them */ 807 ret = overwrite_item(trans, root, path, eb, slot, key); 808 if (ret) 809 goto out; 810 } 811 812 inode_add_bytes(inode, nbytes); 813 update_inode: 814 ret = btrfs_update_inode(trans, root, inode); 815 out: 816 if (inode) 817 iput(inode); 818 return ret; 819 } 820 821 /* 822 * when cleaning up conflicts between the directory names in the 823 * subvolume, directory names in the log and directory names in the 824 * inode back references, we may have to unlink inodes from directories. 825 * 826 * This is a helper function to do the unlink of a specific directory 827 * item 828 */ 829 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 830 struct btrfs_root *root, 831 struct btrfs_path *path, 832 struct btrfs_inode *dir, 833 struct btrfs_dir_item *di) 834 { 835 struct inode *inode; 836 char *name; 837 int name_len; 838 struct extent_buffer *leaf; 839 struct btrfs_key location; 840 int ret; 841 842 leaf = path->nodes[0]; 843 844 btrfs_dir_item_key_to_cpu(leaf, di, &location); 845 name_len = btrfs_dir_name_len(leaf, di); 846 name = kmalloc(name_len, GFP_NOFS); 847 if (!name) 848 return -ENOMEM; 849 850 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 851 btrfs_release_path(path); 852 853 inode = read_one_inode(root, location.objectid); 854 if (!inode) { 855 ret = -EIO; 856 goto out; 857 } 858 859 ret = link_to_fixup_dir(trans, root, path, location.objectid); 860 if (ret) 861 goto out; 862 863 ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name, 864 name_len); 865 if (ret) 866 goto out; 867 else 868 ret = btrfs_run_delayed_items(trans); 869 out: 870 kfree(name); 871 iput(inode); 872 return ret; 873 } 874 875 /* 876 * helper function to see if a given name and sequence number found 877 * in an inode back reference are already in a directory and correctly 878 * point to this inode 879 */ 880 static noinline int inode_in_dir(struct btrfs_root *root, 881 struct btrfs_path *path, 882 u64 dirid, u64 objectid, u64 index, 883 const char *name, int name_len) 884 { 885 struct btrfs_dir_item *di; 886 struct btrfs_key location; 887 int match = 0; 888 889 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 890 index, name, name_len, 0); 891 if (di && !IS_ERR(di)) { 892 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 893 if (location.objectid != objectid) 894 goto out; 895 } else 896 goto out; 897 btrfs_release_path(path); 898 899 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 900 if (di && !IS_ERR(di)) { 901 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 902 if (location.objectid != objectid) 903 goto out; 904 } else 905 goto out; 906 match = 1; 907 out: 908 btrfs_release_path(path); 909 return match; 910 } 911 912 /* 913 * helper function to check a log tree for a named back reference in 914 * an inode. This is used to decide if a back reference that is 915 * found in the subvolume conflicts with what we find in the log. 916 * 917 * inode backreferences may have multiple refs in a single item, 918 * during replay we process one reference at a time, and we don't 919 * want to delete valid links to a file from the subvolume if that 920 * link is also in the log. 921 */ 922 static noinline int backref_in_log(struct btrfs_root *log, 923 struct btrfs_key *key, 924 u64 ref_objectid, 925 const char *name, int namelen) 926 { 927 struct btrfs_path *path; 928 struct btrfs_inode_ref *ref; 929 unsigned long ptr; 930 unsigned long ptr_end; 931 unsigned long name_ptr; 932 int found_name_len; 933 int item_size; 934 int ret; 935 int match = 0; 936 937 path = btrfs_alloc_path(); 938 if (!path) 939 return -ENOMEM; 940 941 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 942 if (ret != 0) 943 goto out; 944 945 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 946 947 if (key->type == BTRFS_INODE_EXTREF_KEY) { 948 if (btrfs_find_name_in_ext_backref(path->nodes[0], 949 path->slots[0], 950 ref_objectid, 951 name, namelen, NULL)) 952 match = 1; 953 954 goto out; 955 } 956 957 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 958 ptr_end = ptr + item_size; 959 while (ptr < ptr_end) { 960 ref = (struct btrfs_inode_ref *)ptr; 961 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 962 if (found_name_len == namelen) { 963 name_ptr = (unsigned long)(ref + 1); 964 ret = memcmp_extent_buffer(path->nodes[0], name, 965 name_ptr, namelen); 966 if (ret == 0) { 967 match = 1; 968 goto out; 969 } 970 } 971 ptr = (unsigned long)(ref + 1) + found_name_len; 972 } 973 out: 974 btrfs_free_path(path); 975 return match; 976 } 977 978 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 979 struct btrfs_root *root, 980 struct btrfs_path *path, 981 struct btrfs_root *log_root, 982 struct btrfs_inode *dir, 983 struct btrfs_inode *inode, 984 u64 inode_objectid, u64 parent_objectid, 985 u64 ref_index, char *name, int namelen, 986 int *search_done) 987 { 988 int ret; 989 char *victim_name; 990 int victim_name_len; 991 struct extent_buffer *leaf; 992 struct btrfs_dir_item *di; 993 struct btrfs_key search_key; 994 struct btrfs_inode_extref *extref; 995 996 again: 997 /* Search old style refs */ 998 search_key.objectid = inode_objectid; 999 search_key.type = BTRFS_INODE_REF_KEY; 1000 search_key.offset = parent_objectid; 1001 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 1002 if (ret == 0) { 1003 struct btrfs_inode_ref *victim_ref; 1004 unsigned long ptr; 1005 unsigned long ptr_end; 1006 1007 leaf = path->nodes[0]; 1008 1009 /* are we trying to overwrite a back ref for the root directory 1010 * if so, just jump out, we're done 1011 */ 1012 if (search_key.objectid == search_key.offset) 1013 return 1; 1014 1015 /* check all the names in this back reference to see 1016 * if they are in the log. if so, we allow them to stay 1017 * otherwise they must be unlinked as a conflict 1018 */ 1019 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1020 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 1021 while (ptr < ptr_end) { 1022 victim_ref = (struct btrfs_inode_ref *)ptr; 1023 victim_name_len = btrfs_inode_ref_name_len(leaf, 1024 victim_ref); 1025 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1026 if (!victim_name) 1027 return -ENOMEM; 1028 1029 read_extent_buffer(leaf, victim_name, 1030 (unsigned long)(victim_ref + 1), 1031 victim_name_len); 1032 1033 if (!backref_in_log(log_root, &search_key, 1034 parent_objectid, 1035 victim_name, 1036 victim_name_len)) { 1037 inc_nlink(&inode->vfs_inode); 1038 btrfs_release_path(path); 1039 1040 ret = btrfs_unlink_inode(trans, root, dir, inode, 1041 victim_name, victim_name_len); 1042 kfree(victim_name); 1043 if (ret) 1044 return ret; 1045 ret = btrfs_run_delayed_items(trans); 1046 if (ret) 1047 return ret; 1048 *search_done = 1; 1049 goto again; 1050 } 1051 kfree(victim_name); 1052 1053 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 1054 } 1055 1056 /* 1057 * NOTE: we have searched root tree and checked the 1058 * corresponding ref, it does not need to check again. 1059 */ 1060 *search_done = 1; 1061 } 1062 btrfs_release_path(path); 1063 1064 /* Same search but for extended refs */ 1065 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 1066 inode_objectid, parent_objectid, 0, 1067 0); 1068 if (!IS_ERR_OR_NULL(extref)) { 1069 u32 item_size; 1070 u32 cur_offset = 0; 1071 unsigned long base; 1072 struct inode *victim_parent; 1073 1074 leaf = path->nodes[0]; 1075 1076 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1077 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1078 1079 while (cur_offset < item_size) { 1080 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1081 1082 victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 1083 1084 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1085 goto next; 1086 1087 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1088 if (!victim_name) 1089 return -ENOMEM; 1090 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 1091 victim_name_len); 1092 1093 search_key.objectid = inode_objectid; 1094 search_key.type = BTRFS_INODE_EXTREF_KEY; 1095 search_key.offset = btrfs_extref_hash(parent_objectid, 1096 victim_name, 1097 victim_name_len); 1098 ret = 0; 1099 if (!backref_in_log(log_root, &search_key, 1100 parent_objectid, victim_name, 1101 victim_name_len)) { 1102 ret = -ENOENT; 1103 victim_parent = read_one_inode(root, 1104 parent_objectid); 1105 if (victim_parent) { 1106 inc_nlink(&inode->vfs_inode); 1107 btrfs_release_path(path); 1108 1109 ret = btrfs_unlink_inode(trans, root, 1110 BTRFS_I(victim_parent), 1111 inode, 1112 victim_name, 1113 victim_name_len); 1114 if (!ret) 1115 ret = btrfs_run_delayed_items( 1116 trans); 1117 } 1118 iput(victim_parent); 1119 kfree(victim_name); 1120 if (ret) 1121 return ret; 1122 *search_done = 1; 1123 goto again; 1124 } 1125 kfree(victim_name); 1126 next: 1127 cur_offset += victim_name_len + sizeof(*extref); 1128 } 1129 *search_done = 1; 1130 } 1131 btrfs_release_path(path); 1132 1133 /* look for a conflicting sequence number */ 1134 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1135 ref_index, name, namelen, 0); 1136 if (di && !IS_ERR(di)) { 1137 ret = drop_one_dir_item(trans, root, path, dir, di); 1138 if (ret) 1139 return ret; 1140 } 1141 btrfs_release_path(path); 1142 1143 /* look for a conflicing name */ 1144 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), 1145 name, namelen, 0); 1146 if (di && !IS_ERR(di)) { 1147 ret = drop_one_dir_item(trans, root, path, dir, di); 1148 if (ret) 1149 return ret; 1150 } 1151 btrfs_release_path(path); 1152 1153 return 0; 1154 } 1155 1156 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1157 u32 *namelen, char **name, u64 *index, 1158 u64 *parent_objectid) 1159 { 1160 struct btrfs_inode_extref *extref; 1161 1162 extref = (struct btrfs_inode_extref *)ref_ptr; 1163 1164 *namelen = btrfs_inode_extref_name_len(eb, extref); 1165 *name = kmalloc(*namelen, GFP_NOFS); 1166 if (*name == NULL) 1167 return -ENOMEM; 1168 1169 read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1170 *namelen); 1171 1172 if (index) 1173 *index = btrfs_inode_extref_index(eb, extref); 1174 if (parent_objectid) 1175 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1176 1177 return 0; 1178 } 1179 1180 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1181 u32 *namelen, char **name, u64 *index) 1182 { 1183 struct btrfs_inode_ref *ref; 1184 1185 ref = (struct btrfs_inode_ref *)ref_ptr; 1186 1187 *namelen = btrfs_inode_ref_name_len(eb, ref); 1188 *name = kmalloc(*namelen, GFP_NOFS); 1189 if (*name == NULL) 1190 return -ENOMEM; 1191 1192 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1193 1194 if (index) 1195 *index = btrfs_inode_ref_index(eb, ref); 1196 1197 return 0; 1198 } 1199 1200 /* 1201 * Take an inode reference item from the log tree and iterate all names from the 1202 * inode reference item in the subvolume tree with the same key (if it exists). 1203 * For any name that is not in the inode reference item from the log tree, do a 1204 * proper unlink of that name (that is, remove its entry from the inode 1205 * reference item and both dir index keys). 1206 */ 1207 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, 1208 struct btrfs_root *root, 1209 struct btrfs_path *path, 1210 struct btrfs_inode *inode, 1211 struct extent_buffer *log_eb, 1212 int log_slot, 1213 struct btrfs_key *key) 1214 { 1215 int ret; 1216 unsigned long ref_ptr; 1217 unsigned long ref_end; 1218 struct extent_buffer *eb; 1219 1220 again: 1221 btrfs_release_path(path); 1222 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 1223 if (ret > 0) { 1224 ret = 0; 1225 goto out; 1226 } 1227 if (ret < 0) 1228 goto out; 1229 1230 eb = path->nodes[0]; 1231 ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 1232 ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); 1233 while (ref_ptr < ref_end) { 1234 char *name = NULL; 1235 int namelen; 1236 u64 parent_id; 1237 1238 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1239 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1240 NULL, &parent_id); 1241 } else { 1242 parent_id = key->offset; 1243 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1244 NULL); 1245 } 1246 if (ret) 1247 goto out; 1248 1249 if (key->type == BTRFS_INODE_EXTREF_KEY) 1250 ret = btrfs_find_name_in_ext_backref(log_eb, log_slot, 1251 parent_id, name, 1252 namelen, NULL); 1253 else 1254 ret = btrfs_find_name_in_backref(log_eb, log_slot, name, 1255 namelen, NULL); 1256 1257 if (!ret) { 1258 struct inode *dir; 1259 1260 btrfs_release_path(path); 1261 dir = read_one_inode(root, parent_id); 1262 if (!dir) { 1263 ret = -ENOENT; 1264 kfree(name); 1265 goto out; 1266 } 1267 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 1268 inode, name, namelen); 1269 kfree(name); 1270 iput(dir); 1271 if (ret) 1272 goto out; 1273 goto again; 1274 } 1275 1276 kfree(name); 1277 ref_ptr += namelen; 1278 if (key->type == BTRFS_INODE_EXTREF_KEY) 1279 ref_ptr += sizeof(struct btrfs_inode_extref); 1280 else 1281 ref_ptr += sizeof(struct btrfs_inode_ref); 1282 } 1283 ret = 0; 1284 out: 1285 btrfs_release_path(path); 1286 return ret; 1287 } 1288 1289 static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, 1290 const u8 ref_type, const char *name, 1291 const int namelen) 1292 { 1293 struct btrfs_key key; 1294 struct btrfs_path *path; 1295 const u64 parent_id = btrfs_ino(BTRFS_I(dir)); 1296 int ret; 1297 1298 path = btrfs_alloc_path(); 1299 if (!path) 1300 return -ENOMEM; 1301 1302 key.objectid = btrfs_ino(BTRFS_I(inode)); 1303 key.type = ref_type; 1304 if (key.type == BTRFS_INODE_REF_KEY) 1305 key.offset = parent_id; 1306 else 1307 key.offset = btrfs_extref_hash(parent_id, name, namelen); 1308 1309 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0); 1310 if (ret < 0) 1311 goto out; 1312 if (ret > 0) { 1313 ret = 0; 1314 goto out; 1315 } 1316 if (key.type == BTRFS_INODE_EXTREF_KEY) 1317 ret = btrfs_find_name_in_ext_backref(path->nodes[0], 1318 path->slots[0], parent_id, 1319 name, namelen, NULL); 1320 else 1321 ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], 1322 name, namelen, NULL); 1323 1324 out: 1325 btrfs_free_path(path); 1326 return ret; 1327 } 1328 1329 /* 1330 * replay one inode back reference item found in the log tree. 1331 * eb, slot and key refer to the buffer and key found in the log tree. 1332 * root is the destination we are replaying into, and path is for temp 1333 * use by this function. (it should be released on return). 1334 */ 1335 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1336 struct btrfs_root *root, 1337 struct btrfs_root *log, 1338 struct btrfs_path *path, 1339 struct extent_buffer *eb, int slot, 1340 struct btrfs_key *key) 1341 { 1342 struct inode *dir = NULL; 1343 struct inode *inode = NULL; 1344 unsigned long ref_ptr; 1345 unsigned long ref_end; 1346 char *name = NULL; 1347 int namelen; 1348 int ret; 1349 int search_done = 0; 1350 int log_ref_ver = 0; 1351 u64 parent_objectid; 1352 u64 inode_objectid; 1353 u64 ref_index = 0; 1354 int ref_struct_size; 1355 1356 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1357 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 1358 1359 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1360 struct btrfs_inode_extref *r; 1361 1362 ref_struct_size = sizeof(struct btrfs_inode_extref); 1363 log_ref_ver = 1; 1364 r = (struct btrfs_inode_extref *)ref_ptr; 1365 parent_objectid = btrfs_inode_extref_parent(eb, r); 1366 } else { 1367 ref_struct_size = sizeof(struct btrfs_inode_ref); 1368 parent_objectid = key->offset; 1369 } 1370 inode_objectid = key->objectid; 1371 1372 /* 1373 * it is possible that we didn't log all the parent directories 1374 * for a given inode. If we don't find the dir, just don't 1375 * copy the back ref in. The link count fixup code will take 1376 * care of the rest 1377 */ 1378 dir = read_one_inode(root, parent_objectid); 1379 if (!dir) { 1380 ret = -ENOENT; 1381 goto out; 1382 } 1383 1384 inode = read_one_inode(root, inode_objectid); 1385 if (!inode) { 1386 ret = -EIO; 1387 goto out; 1388 } 1389 1390 while (ref_ptr < ref_end) { 1391 if (log_ref_ver) { 1392 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1393 &ref_index, &parent_objectid); 1394 /* 1395 * parent object can change from one array 1396 * item to another. 1397 */ 1398 if (!dir) 1399 dir = read_one_inode(root, parent_objectid); 1400 if (!dir) { 1401 ret = -ENOENT; 1402 goto out; 1403 } 1404 } else { 1405 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1406 &ref_index); 1407 } 1408 if (ret) 1409 goto out; 1410 1411 /* if we already have a perfect match, we're done */ 1412 if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), 1413 btrfs_ino(BTRFS_I(inode)), ref_index, 1414 name, namelen)) { 1415 /* 1416 * look for a conflicting back reference in the 1417 * metadata. if we find one we have to unlink that name 1418 * of the file before we add our new link. Later on, we 1419 * overwrite any existing back reference, and we don't 1420 * want to create dangling pointers in the directory. 1421 */ 1422 1423 if (!search_done) { 1424 ret = __add_inode_ref(trans, root, path, log, 1425 BTRFS_I(dir), 1426 BTRFS_I(inode), 1427 inode_objectid, 1428 parent_objectid, 1429 ref_index, name, namelen, 1430 &search_done); 1431 if (ret) { 1432 if (ret == 1) 1433 ret = 0; 1434 goto out; 1435 } 1436 } 1437 1438 /* 1439 * If a reference item already exists for this inode 1440 * with the same parent and name, but different index, 1441 * drop it and the corresponding directory index entries 1442 * from the parent before adding the new reference item 1443 * and dir index entries, otherwise we would fail with 1444 * -EEXIST returned from btrfs_add_link() below. 1445 */ 1446 ret = btrfs_inode_ref_exists(inode, dir, key->type, 1447 name, namelen); 1448 if (ret > 0) { 1449 ret = btrfs_unlink_inode(trans, root, 1450 BTRFS_I(dir), 1451 BTRFS_I(inode), 1452 name, namelen); 1453 /* 1454 * If we dropped the link count to 0, bump it so 1455 * that later the iput() on the inode will not 1456 * free it. We will fixup the link count later. 1457 */ 1458 if (!ret && inode->i_nlink == 0) 1459 inc_nlink(inode); 1460 } 1461 if (ret < 0) 1462 goto out; 1463 1464 /* insert our name */ 1465 ret = btrfs_add_link(trans, BTRFS_I(dir), 1466 BTRFS_I(inode), 1467 name, namelen, 0, ref_index); 1468 if (ret) 1469 goto out; 1470 1471 btrfs_update_inode(trans, root, inode); 1472 } 1473 1474 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1475 kfree(name); 1476 name = NULL; 1477 if (log_ref_ver) { 1478 iput(dir); 1479 dir = NULL; 1480 } 1481 } 1482 1483 /* 1484 * Before we overwrite the inode reference item in the subvolume tree 1485 * with the item from the log tree, we must unlink all names from the 1486 * parent directory that are in the subvolume's tree inode reference 1487 * item, otherwise we end up with an inconsistent subvolume tree where 1488 * dir index entries exist for a name but there is no inode reference 1489 * item with the same name. 1490 */ 1491 ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, 1492 key); 1493 if (ret) 1494 goto out; 1495 1496 /* finally write the back reference in the inode */ 1497 ret = overwrite_item(trans, root, path, eb, slot, key); 1498 out: 1499 btrfs_release_path(path); 1500 kfree(name); 1501 iput(dir); 1502 iput(inode); 1503 return ret; 1504 } 1505 1506 static int insert_orphan_item(struct btrfs_trans_handle *trans, 1507 struct btrfs_root *root, u64 ino) 1508 { 1509 int ret; 1510 1511 ret = btrfs_insert_orphan_item(trans, root, ino); 1512 if (ret == -EEXIST) 1513 ret = 0; 1514 1515 return ret; 1516 } 1517 1518 static int count_inode_extrefs(struct btrfs_root *root, 1519 struct btrfs_inode *inode, struct btrfs_path *path) 1520 { 1521 int ret = 0; 1522 int name_len; 1523 unsigned int nlink = 0; 1524 u32 item_size; 1525 u32 cur_offset = 0; 1526 u64 inode_objectid = btrfs_ino(inode); 1527 u64 offset = 0; 1528 unsigned long ptr; 1529 struct btrfs_inode_extref *extref; 1530 struct extent_buffer *leaf; 1531 1532 while (1) { 1533 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1534 &extref, &offset); 1535 if (ret) 1536 break; 1537 1538 leaf = path->nodes[0]; 1539 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1540 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1541 cur_offset = 0; 1542 1543 while (cur_offset < item_size) { 1544 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1545 name_len = btrfs_inode_extref_name_len(leaf, extref); 1546 1547 nlink++; 1548 1549 cur_offset += name_len + sizeof(*extref); 1550 } 1551 1552 offset++; 1553 btrfs_release_path(path); 1554 } 1555 btrfs_release_path(path); 1556 1557 if (ret < 0 && ret != -ENOENT) 1558 return ret; 1559 return nlink; 1560 } 1561 1562 static int count_inode_refs(struct btrfs_root *root, 1563 struct btrfs_inode *inode, struct btrfs_path *path) 1564 { 1565 int ret; 1566 struct btrfs_key key; 1567 unsigned int nlink = 0; 1568 unsigned long ptr; 1569 unsigned long ptr_end; 1570 int name_len; 1571 u64 ino = btrfs_ino(inode); 1572 1573 key.objectid = ino; 1574 key.type = BTRFS_INODE_REF_KEY; 1575 key.offset = (u64)-1; 1576 1577 while (1) { 1578 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1579 if (ret < 0) 1580 break; 1581 if (ret > 0) { 1582 if (path->slots[0] == 0) 1583 break; 1584 path->slots[0]--; 1585 } 1586 process_slot: 1587 btrfs_item_key_to_cpu(path->nodes[0], &key, 1588 path->slots[0]); 1589 if (key.objectid != ino || 1590 key.type != BTRFS_INODE_REF_KEY) 1591 break; 1592 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1593 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 1594 path->slots[0]); 1595 while (ptr < ptr_end) { 1596 struct btrfs_inode_ref *ref; 1597 1598 ref = (struct btrfs_inode_ref *)ptr; 1599 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1600 ref); 1601 ptr = (unsigned long)(ref + 1) + name_len; 1602 nlink++; 1603 } 1604 1605 if (key.offset == 0) 1606 break; 1607 if (path->slots[0] > 0) { 1608 path->slots[0]--; 1609 goto process_slot; 1610 } 1611 key.offset--; 1612 btrfs_release_path(path); 1613 } 1614 btrfs_release_path(path); 1615 1616 return nlink; 1617 } 1618 1619 /* 1620 * There are a few corners where the link count of the file can't 1621 * be properly maintained during replay. So, instead of adding 1622 * lots of complexity to the log code, we just scan the backrefs 1623 * for any file that has been through replay. 1624 * 1625 * The scan will update the link count on the inode to reflect the 1626 * number of back refs found. If it goes down to zero, the iput 1627 * will free the inode. 1628 */ 1629 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1630 struct btrfs_root *root, 1631 struct inode *inode) 1632 { 1633 struct btrfs_path *path; 1634 int ret; 1635 u64 nlink = 0; 1636 u64 ino = btrfs_ino(BTRFS_I(inode)); 1637 1638 path = btrfs_alloc_path(); 1639 if (!path) 1640 return -ENOMEM; 1641 1642 ret = count_inode_refs(root, BTRFS_I(inode), path); 1643 if (ret < 0) 1644 goto out; 1645 1646 nlink = ret; 1647 1648 ret = count_inode_extrefs(root, BTRFS_I(inode), path); 1649 if (ret < 0) 1650 goto out; 1651 1652 nlink += ret; 1653 1654 ret = 0; 1655 1656 if (nlink != inode->i_nlink) { 1657 set_nlink(inode, nlink); 1658 btrfs_update_inode(trans, root, inode); 1659 } 1660 BTRFS_I(inode)->index_cnt = (u64)-1; 1661 1662 if (inode->i_nlink == 0) { 1663 if (S_ISDIR(inode->i_mode)) { 1664 ret = replay_dir_deletes(trans, root, NULL, path, 1665 ino, 1); 1666 if (ret) 1667 goto out; 1668 } 1669 ret = insert_orphan_item(trans, root, ino); 1670 } 1671 1672 out: 1673 btrfs_free_path(path); 1674 return ret; 1675 } 1676 1677 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1678 struct btrfs_root *root, 1679 struct btrfs_path *path) 1680 { 1681 int ret; 1682 struct btrfs_key key; 1683 struct inode *inode; 1684 1685 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1686 key.type = BTRFS_ORPHAN_ITEM_KEY; 1687 key.offset = (u64)-1; 1688 while (1) { 1689 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1690 if (ret < 0) 1691 break; 1692 1693 if (ret == 1) { 1694 if (path->slots[0] == 0) 1695 break; 1696 path->slots[0]--; 1697 } 1698 1699 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1700 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1701 key.type != BTRFS_ORPHAN_ITEM_KEY) 1702 break; 1703 1704 ret = btrfs_del_item(trans, root, path); 1705 if (ret) 1706 goto out; 1707 1708 btrfs_release_path(path); 1709 inode = read_one_inode(root, key.offset); 1710 if (!inode) 1711 return -EIO; 1712 1713 ret = fixup_inode_link_count(trans, root, inode); 1714 iput(inode); 1715 if (ret) 1716 goto out; 1717 1718 /* 1719 * fixup on a directory may create new entries, 1720 * make sure we always look for the highset possible 1721 * offset 1722 */ 1723 key.offset = (u64)-1; 1724 } 1725 ret = 0; 1726 out: 1727 btrfs_release_path(path); 1728 return ret; 1729 } 1730 1731 1732 /* 1733 * record a given inode in the fixup dir so we can check its link 1734 * count when replay is done. The link count is incremented here 1735 * so the inode won't go away until we check it 1736 */ 1737 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1738 struct btrfs_root *root, 1739 struct btrfs_path *path, 1740 u64 objectid) 1741 { 1742 struct btrfs_key key; 1743 int ret = 0; 1744 struct inode *inode; 1745 1746 inode = read_one_inode(root, objectid); 1747 if (!inode) 1748 return -EIO; 1749 1750 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1751 key.type = BTRFS_ORPHAN_ITEM_KEY; 1752 key.offset = objectid; 1753 1754 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1755 1756 btrfs_release_path(path); 1757 if (ret == 0) { 1758 if (!inode->i_nlink) 1759 set_nlink(inode, 1); 1760 else 1761 inc_nlink(inode); 1762 ret = btrfs_update_inode(trans, root, inode); 1763 } else if (ret == -EEXIST) { 1764 ret = 0; 1765 } else { 1766 BUG(); /* Logic Error */ 1767 } 1768 iput(inode); 1769 1770 return ret; 1771 } 1772 1773 /* 1774 * when replaying the log for a directory, we only insert names 1775 * for inodes that actually exist. This means an fsync on a directory 1776 * does not implicitly fsync all the new files in it 1777 */ 1778 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1779 struct btrfs_root *root, 1780 u64 dirid, u64 index, 1781 char *name, int name_len, 1782 struct btrfs_key *location) 1783 { 1784 struct inode *inode; 1785 struct inode *dir; 1786 int ret; 1787 1788 inode = read_one_inode(root, location->objectid); 1789 if (!inode) 1790 return -ENOENT; 1791 1792 dir = read_one_inode(root, dirid); 1793 if (!dir) { 1794 iput(inode); 1795 return -EIO; 1796 } 1797 1798 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 1799 name_len, 1, index); 1800 1801 /* FIXME, put inode into FIXUP list */ 1802 1803 iput(inode); 1804 iput(dir); 1805 return ret; 1806 } 1807 1808 /* 1809 * Return true if an inode reference exists in the log for the given name, 1810 * inode and parent inode. 1811 */ 1812 static bool name_in_log_ref(struct btrfs_root *log_root, 1813 const char *name, const int name_len, 1814 const u64 dirid, const u64 ino) 1815 { 1816 struct btrfs_key search_key; 1817 1818 search_key.objectid = ino; 1819 search_key.type = BTRFS_INODE_REF_KEY; 1820 search_key.offset = dirid; 1821 if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1822 return true; 1823 1824 search_key.type = BTRFS_INODE_EXTREF_KEY; 1825 search_key.offset = btrfs_extref_hash(dirid, name, name_len); 1826 if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1827 return true; 1828 1829 return false; 1830 } 1831 1832 /* 1833 * take a single entry in a log directory item and replay it into 1834 * the subvolume. 1835 * 1836 * if a conflicting item exists in the subdirectory already, 1837 * the inode it points to is unlinked and put into the link count 1838 * fix up tree. 1839 * 1840 * If a name from the log points to a file or directory that does 1841 * not exist in the FS, it is skipped. fsyncs on directories 1842 * do not force down inodes inside that directory, just changes to the 1843 * names or unlinks in a directory. 1844 * 1845 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1846 * non-existing inode) and 1 if the name was replayed. 1847 */ 1848 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1849 struct btrfs_root *root, 1850 struct btrfs_path *path, 1851 struct extent_buffer *eb, 1852 struct btrfs_dir_item *di, 1853 struct btrfs_key *key) 1854 { 1855 char *name; 1856 int name_len; 1857 struct btrfs_dir_item *dst_di; 1858 struct btrfs_key found_key; 1859 struct btrfs_key log_key; 1860 struct inode *dir; 1861 u8 log_type; 1862 int exists; 1863 int ret = 0; 1864 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); 1865 bool name_added = false; 1866 1867 dir = read_one_inode(root, key->objectid); 1868 if (!dir) 1869 return -EIO; 1870 1871 name_len = btrfs_dir_name_len(eb, di); 1872 name = kmalloc(name_len, GFP_NOFS); 1873 if (!name) { 1874 ret = -ENOMEM; 1875 goto out; 1876 } 1877 1878 log_type = btrfs_dir_type(eb, di); 1879 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1880 name_len); 1881 1882 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1883 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1884 if (exists == 0) 1885 exists = 1; 1886 else 1887 exists = 0; 1888 btrfs_release_path(path); 1889 1890 if (key->type == BTRFS_DIR_ITEM_KEY) { 1891 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1892 name, name_len, 1); 1893 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1894 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1895 key->objectid, 1896 key->offset, name, 1897 name_len, 1); 1898 } else { 1899 /* Corruption */ 1900 ret = -EINVAL; 1901 goto out; 1902 } 1903 if (IS_ERR_OR_NULL(dst_di)) { 1904 /* we need a sequence number to insert, so we only 1905 * do inserts for the BTRFS_DIR_INDEX_KEY types 1906 */ 1907 if (key->type != BTRFS_DIR_INDEX_KEY) 1908 goto out; 1909 goto insert; 1910 } 1911 1912 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1913 /* the existing item matches the logged item */ 1914 if (found_key.objectid == log_key.objectid && 1915 found_key.type == log_key.type && 1916 found_key.offset == log_key.offset && 1917 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1918 update_size = false; 1919 goto out; 1920 } 1921 1922 /* 1923 * don't drop the conflicting directory entry if the inode 1924 * for the new entry doesn't exist 1925 */ 1926 if (!exists) 1927 goto out; 1928 1929 ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di); 1930 if (ret) 1931 goto out; 1932 1933 if (key->type == BTRFS_DIR_INDEX_KEY) 1934 goto insert; 1935 out: 1936 btrfs_release_path(path); 1937 if (!ret && update_size) { 1938 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); 1939 ret = btrfs_update_inode(trans, root, dir); 1940 } 1941 kfree(name); 1942 iput(dir); 1943 if (!ret && name_added) 1944 ret = 1; 1945 return ret; 1946 1947 insert: 1948 if (name_in_log_ref(root->log_root, name, name_len, 1949 key->objectid, log_key.objectid)) { 1950 /* The dentry will be added later. */ 1951 ret = 0; 1952 update_size = false; 1953 goto out; 1954 } 1955 btrfs_release_path(path); 1956 ret = insert_one_name(trans, root, key->objectid, key->offset, 1957 name, name_len, &log_key); 1958 if (ret && ret != -ENOENT && ret != -EEXIST) 1959 goto out; 1960 if (!ret) 1961 name_added = true; 1962 update_size = false; 1963 ret = 0; 1964 goto out; 1965 } 1966 1967 /* 1968 * find all the names in a directory item and reconcile them into 1969 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1970 * one name in a directory item, but the same code gets used for 1971 * both directory index types 1972 */ 1973 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1974 struct btrfs_root *root, 1975 struct btrfs_path *path, 1976 struct extent_buffer *eb, int slot, 1977 struct btrfs_key *key) 1978 { 1979 int ret = 0; 1980 u32 item_size = btrfs_item_size_nr(eb, slot); 1981 struct btrfs_dir_item *di; 1982 int name_len; 1983 unsigned long ptr; 1984 unsigned long ptr_end; 1985 struct btrfs_path *fixup_path = NULL; 1986 1987 ptr = btrfs_item_ptr_offset(eb, slot); 1988 ptr_end = ptr + item_size; 1989 while (ptr < ptr_end) { 1990 di = (struct btrfs_dir_item *)ptr; 1991 name_len = btrfs_dir_name_len(eb, di); 1992 ret = replay_one_name(trans, root, path, eb, di, key); 1993 if (ret < 0) 1994 break; 1995 ptr = (unsigned long)(di + 1); 1996 ptr += name_len; 1997 1998 /* 1999 * If this entry refers to a non-directory (directories can not 2000 * have a link count > 1) and it was added in the transaction 2001 * that was not committed, make sure we fixup the link count of 2002 * the inode it the entry points to. Otherwise something like 2003 * the following would result in a directory pointing to an 2004 * inode with a wrong link that does not account for this dir 2005 * entry: 2006 * 2007 * mkdir testdir 2008 * touch testdir/foo 2009 * touch testdir/bar 2010 * sync 2011 * 2012 * ln testdir/bar testdir/bar_link 2013 * ln testdir/foo testdir/foo_link 2014 * xfs_io -c "fsync" testdir/bar 2015 * 2016 * <power failure> 2017 * 2018 * mount fs, log replay happens 2019 * 2020 * File foo would remain with a link count of 1 when it has two 2021 * entries pointing to it in the directory testdir. This would 2022 * make it impossible to ever delete the parent directory has 2023 * it would result in stale dentries that can never be deleted. 2024 */ 2025 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 2026 struct btrfs_key di_key; 2027 2028 if (!fixup_path) { 2029 fixup_path = btrfs_alloc_path(); 2030 if (!fixup_path) { 2031 ret = -ENOMEM; 2032 break; 2033 } 2034 } 2035 2036 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2037 ret = link_to_fixup_dir(trans, root, fixup_path, 2038 di_key.objectid); 2039 if (ret) 2040 break; 2041 } 2042 ret = 0; 2043 } 2044 btrfs_free_path(fixup_path); 2045 return ret; 2046 } 2047 2048 /* 2049 * directory replay has two parts. There are the standard directory 2050 * items in the log copied from the subvolume, and range items 2051 * created in the log while the subvolume was logged. 2052 * 2053 * The range items tell us which parts of the key space the log 2054 * is authoritative for. During replay, if a key in the subvolume 2055 * directory is in a logged range item, but not actually in the log 2056 * that means it was deleted from the directory before the fsync 2057 * and should be removed. 2058 */ 2059 static noinline int find_dir_range(struct btrfs_root *root, 2060 struct btrfs_path *path, 2061 u64 dirid, int key_type, 2062 u64 *start_ret, u64 *end_ret) 2063 { 2064 struct btrfs_key key; 2065 u64 found_end; 2066 struct btrfs_dir_log_item *item; 2067 int ret; 2068 int nritems; 2069 2070 if (*start_ret == (u64)-1) 2071 return 1; 2072 2073 key.objectid = dirid; 2074 key.type = key_type; 2075 key.offset = *start_ret; 2076 2077 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2078 if (ret < 0) 2079 goto out; 2080 if (ret > 0) { 2081 if (path->slots[0] == 0) 2082 goto out; 2083 path->slots[0]--; 2084 } 2085 if (ret != 0) 2086 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2087 2088 if (key.type != key_type || key.objectid != dirid) { 2089 ret = 1; 2090 goto next; 2091 } 2092 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2093 struct btrfs_dir_log_item); 2094 found_end = btrfs_dir_log_end(path->nodes[0], item); 2095 2096 if (*start_ret >= key.offset && *start_ret <= found_end) { 2097 ret = 0; 2098 *start_ret = key.offset; 2099 *end_ret = found_end; 2100 goto out; 2101 } 2102 ret = 1; 2103 next: 2104 /* check the next slot in the tree to see if it is a valid item */ 2105 nritems = btrfs_header_nritems(path->nodes[0]); 2106 path->slots[0]++; 2107 if (path->slots[0] >= nritems) { 2108 ret = btrfs_next_leaf(root, path); 2109 if (ret) 2110 goto out; 2111 } 2112 2113 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2114 2115 if (key.type != key_type || key.objectid != dirid) { 2116 ret = 1; 2117 goto out; 2118 } 2119 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2120 struct btrfs_dir_log_item); 2121 found_end = btrfs_dir_log_end(path->nodes[0], item); 2122 *start_ret = key.offset; 2123 *end_ret = found_end; 2124 ret = 0; 2125 out: 2126 btrfs_release_path(path); 2127 return ret; 2128 } 2129 2130 /* 2131 * this looks for a given directory item in the log. If the directory 2132 * item is not in the log, the item is removed and the inode it points 2133 * to is unlinked 2134 */ 2135 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2136 struct btrfs_root *root, 2137 struct btrfs_root *log, 2138 struct btrfs_path *path, 2139 struct btrfs_path *log_path, 2140 struct inode *dir, 2141 struct btrfs_key *dir_key) 2142 { 2143 int ret; 2144 struct extent_buffer *eb; 2145 int slot; 2146 u32 item_size; 2147 struct btrfs_dir_item *di; 2148 struct btrfs_dir_item *log_di; 2149 int name_len; 2150 unsigned long ptr; 2151 unsigned long ptr_end; 2152 char *name; 2153 struct inode *inode; 2154 struct btrfs_key location; 2155 2156 again: 2157 eb = path->nodes[0]; 2158 slot = path->slots[0]; 2159 item_size = btrfs_item_size_nr(eb, slot); 2160 ptr = btrfs_item_ptr_offset(eb, slot); 2161 ptr_end = ptr + item_size; 2162 while (ptr < ptr_end) { 2163 di = (struct btrfs_dir_item *)ptr; 2164 name_len = btrfs_dir_name_len(eb, di); 2165 name = kmalloc(name_len, GFP_NOFS); 2166 if (!name) { 2167 ret = -ENOMEM; 2168 goto out; 2169 } 2170 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2171 name_len); 2172 log_di = NULL; 2173 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2174 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2175 dir_key->objectid, 2176 name, name_len, 0); 2177 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2178 log_di = btrfs_lookup_dir_index_item(trans, log, 2179 log_path, 2180 dir_key->objectid, 2181 dir_key->offset, 2182 name, name_len, 0); 2183 } 2184 if (!log_di || log_di == ERR_PTR(-ENOENT)) { 2185 btrfs_dir_item_key_to_cpu(eb, di, &location); 2186 btrfs_release_path(path); 2187 btrfs_release_path(log_path); 2188 inode = read_one_inode(root, location.objectid); 2189 if (!inode) { 2190 kfree(name); 2191 return -EIO; 2192 } 2193 2194 ret = link_to_fixup_dir(trans, root, 2195 path, location.objectid); 2196 if (ret) { 2197 kfree(name); 2198 iput(inode); 2199 goto out; 2200 } 2201 2202 inc_nlink(inode); 2203 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2204 BTRFS_I(inode), name, name_len); 2205 if (!ret) 2206 ret = btrfs_run_delayed_items(trans); 2207 kfree(name); 2208 iput(inode); 2209 if (ret) 2210 goto out; 2211 2212 /* there might still be more names under this key 2213 * check and repeat if required 2214 */ 2215 ret = btrfs_search_slot(NULL, root, dir_key, path, 2216 0, 0); 2217 if (ret == 0) 2218 goto again; 2219 ret = 0; 2220 goto out; 2221 } else if (IS_ERR(log_di)) { 2222 kfree(name); 2223 return PTR_ERR(log_di); 2224 } 2225 btrfs_release_path(log_path); 2226 kfree(name); 2227 2228 ptr = (unsigned long)(di + 1); 2229 ptr += name_len; 2230 } 2231 ret = 0; 2232 out: 2233 btrfs_release_path(path); 2234 btrfs_release_path(log_path); 2235 return ret; 2236 } 2237 2238 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2239 struct btrfs_root *root, 2240 struct btrfs_root *log, 2241 struct btrfs_path *path, 2242 const u64 ino) 2243 { 2244 struct btrfs_key search_key; 2245 struct btrfs_path *log_path; 2246 int i; 2247 int nritems; 2248 int ret; 2249 2250 log_path = btrfs_alloc_path(); 2251 if (!log_path) 2252 return -ENOMEM; 2253 2254 search_key.objectid = ino; 2255 search_key.type = BTRFS_XATTR_ITEM_KEY; 2256 search_key.offset = 0; 2257 again: 2258 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2259 if (ret < 0) 2260 goto out; 2261 process_leaf: 2262 nritems = btrfs_header_nritems(path->nodes[0]); 2263 for (i = path->slots[0]; i < nritems; i++) { 2264 struct btrfs_key key; 2265 struct btrfs_dir_item *di; 2266 struct btrfs_dir_item *log_di; 2267 u32 total_size; 2268 u32 cur; 2269 2270 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2271 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2272 ret = 0; 2273 goto out; 2274 } 2275 2276 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2277 total_size = btrfs_item_size_nr(path->nodes[0], i); 2278 cur = 0; 2279 while (cur < total_size) { 2280 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2281 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2282 u32 this_len = sizeof(*di) + name_len + data_len; 2283 char *name; 2284 2285 name = kmalloc(name_len, GFP_NOFS); 2286 if (!name) { 2287 ret = -ENOMEM; 2288 goto out; 2289 } 2290 read_extent_buffer(path->nodes[0], name, 2291 (unsigned long)(di + 1), name_len); 2292 2293 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2294 name, name_len, 0); 2295 btrfs_release_path(log_path); 2296 if (!log_di) { 2297 /* Doesn't exist in log tree, so delete it. */ 2298 btrfs_release_path(path); 2299 di = btrfs_lookup_xattr(trans, root, path, ino, 2300 name, name_len, -1); 2301 kfree(name); 2302 if (IS_ERR(di)) { 2303 ret = PTR_ERR(di); 2304 goto out; 2305 } 2306 ASSERT(di); 2307 ret = btrfs_delete_one_dir_name(trans, root, 2308 path, di); 2309 if (ret) 2310 goto out; 2311 btrfs_release_path(path); 2312 search_key = key; 2313 goto again; 2314 } 2315 kfree(name); 2316 if (IS_ERR(log_di)) { 2317 ret = PTR_ERR(log_di); 2318 goto out; 2319 } 2320 cur += this_len; 2321 di = (struct btrfs_dir_item *)((char *)di + this_len); 2322 } 2323 } 2324 ret = btrfs_next_leaf(root, path); 2325 if (ret > 0) 2326 ret = 0; 2327 else if (ret == 0) 2328 goto process_leaf; 2329 out: 2330 btrfs_free_path(log_path); 2331 btrfs_release_path(path); 2332 return ret; 2333 } 2334 2335 2336 /* 2337 * deletion replay happens before we copy any new directory items 2338 * out of the log or out of backreferences from inodes. It 2339 * scans the log to find ranges of keys that log is authoritative for, 2340 * and then scans the directory to find items in those ranges that are 2341 * not present in the log. 2342 * 2343 * Anything we don't find in the log is unlinked and removed from the 2344 * directory. 2345 */ 2346 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2347 struct btrfs_root *root, 2348 struct btrfs_root *log, 2349 struct btrfs_path *path, 2350 u64 dirid, int del_all) 2351 { 2352 u64 range_start; 2353 u64 range_end; 2354 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2355 int ret = 0; 2356 struct btrfs_key dir_key; 2357 struct btrfs_key found_key; 2358 struct btrfs_path *log_path; 2359 struct inode *dir; 2360 2361 dir_key.objectid = dirid; 2362 dir_key.type = BTRFS_DIR_ITEM_KEY; 2363 log_path = btrfs_alloc_path(); 2364 if (!log_path) 2365 return -ENOMEM; 2366 2367 dir = read_one_inode(root, dirid); 2368 /* it isn't an error if the inode isn't there, that can happen 2369 * because we replay the deletes before we copy in the inode item 2370 * from the log 2371 */ 2372 if (!dir) { 2373 btrfs_free_path(log_path); 2374 return 0; 2375 } 2376 again: 2377 range_start = 0; 2378 range_end = 0; 2379 while (1) { 2380 if (del_all) 2381 range_end = (u64)-1; 2382 else { 2383 ret = find_dir_range(log, path, dirid, key_type, 2384 &range_start, &range_end); 2385 if (ret != 0) 2386 break; 2387 } 2388 2389 dir_key.offset = range_start; 2390 while (1) { 2391 int nritems; 2392 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2393 0, 0); 2394 if (ret < 0) 2395 goto out; 2396 2397 nritems = btrfs_header_nritems(path->nodes[0]); 2398 if (path->slots[0] >= nritems) { 2399 ret = btrfs_next_leaf(root, path); 2400 if (ret == 1) 2401 break; 2402 else if (ret < 0) 2403 goto out; 2404 } 2405 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2406 path->slots[0]); 2407 if (found_key.objectid != dirid || 2408 found_key.type != dir_key.type) 2409 goto next_type; 2410 2411 if (found_key.offset > range_end) 2412 break; 2413 2414 ret = check_item_in_log(trans, root, log, path, 2415 log_path, dir, 2416 &found_key); 2417 if (ret) 2418 goto out; 2419 if (found_key.offset == (u64)-1) 2420 break; 2421 dir_key.offset = found_key.offset + 1; 2422 } 2423 btrfs_release_path(path); 2424 if (range_end == (u64)-1) 2425 break; 2426 range_start = range_end + 1; 2427 } 2428 2429 next_type: 2430 ret = 0; 2431 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2432 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2433 dir_key.type = BTRFS_DIR_INDEX_KEY; 2434 btrfs_release_path(path); 2435 goto again; 2436 } 2437 out: 2438 btrfs_release_path(path); 2439 btrfs_free_path(log_path); 2440 iput(dir); 2441 return ret; 2442 } 2443 2444 /* 2445 * the process_func used to replay items from the log tree. This 2446 * gets called in two different stages. The first stage just looks 2447 * for inodes and makes sure they are all copied into the subvolume. 2448 * 2449 * The second stage copies all the other item types from the log into 2450 * the subvolume. The two stage approach is slower, but gets rid of 2451 * lots of complexity around inodes referencing other inodes that exist 2452 * only in the log (references come from either directory items or inode 2453 * back refs). 2454 */ 2455 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2456 struct walk_control *wc, u64 gen, int level) 2457 { 2458 int nritems; 2459 struct btrfs_path *path; 2460 struct btrfs_root *root = wc->replay_dest; 2461 struct btrfs_key key; 2462 int i; 2463 int ret; 2464 2465 ret = btrfs_read_buffer(eb, gen, level, NULL); 2466 if (ret) 2467 return ret; 2468 2469 level = btrfs_header_level(eb); 2470 2471 if (level != 0) 2472 return 0; 2473 2474 path = btrfs_alloc_path(); 2475 if (!path) 2476 return -ENOMEM; 2477 2478 nritems = btrfs_header_nritems(eb); 2479 for (i = 0; i < nritems; i++) { 2480 btrfs_item_key_to_cpu(eb, &key, i); 2481 2482 /* inode keys are done during the first stage */ 2483 if (key.type == BTRFS_INODE_ITEM_KEY && 2484 wc->stage == LOG_WALK_REPLAY_INODES) { 2485 struct btrfs_inode_item *inode_item; 2486 u32 mode; 2487 2488 inode_item = btrfs_item_ptr(eb, i, 2489 struct btrfs_inode_item); 2490 ret = replay_xattr_deletes(wc->trans, root, log, 2491 path, key.objectid); 2492 if (ret) 2493 break; 2494 mode = btrfs_inode_mode(eb, inode_item); 2495 if (S_ISDIR(mode)) { 2496 ret = replay_dir_deletes(wc->trans, 2497 root, log, path, key.objectid, 0); 2498 if (ret) 2499 break; 2500 } 2501 ret = overwrite_item(wc->trans, root, path, 2502 eb, i, &key); 2503 if (ret) 2504 break; 2505 2506 /* 2507 * Before replaying extents, truncate the inode to its 2508 * size. We need to do it now and not after log replay 2509 * because before an fsync we can have prealloc extents 2510 * added beyond the inode's i_size. If we did it after, 2511 * through orphan cleanup for example, we would drop 2512 * those prealloc extents just after replaying them. 2513 */ 2514 if (S_ISREG(mode)) { 2515 struct inode *inode; 2516 u64 from; 2517 2518 inode = read_one_inode(root, key.objectid); 2519 if (!inode) { 2520 ret = -EIO; 2521 break; 2522 } 2523 from = ALIGN(i_size_read(inode), 2524 root->fs_info->sectorsize); 2525 ret = btrfs_drop_extents(wc->trans, root, inode, 2526 from, (u64)-1, 1); 2527 /* 2528 * If the nlink count is zero here, the iput 2529 * will free the inode. We bump it to make 2530 * sure it doesn't get freed until the link 2531 * count fixup is done. 2532 */ 2533 if (!ret) { 2534 if (inode->i_nlink == 0) 2535 inc_nlink(inode); 2536 /* Update link count and nbytes. */ 2537 ret = btrfs_update_inode(wc->trans, 2538 root, inode); 2539 } 2540 iput(inode); 2541 if (ret) 2542 break; 2543 } 2544 2545 ret = link_to_fixup_dir(wc->trans, root, 2546 path, key.objectid); 2547 if (ret) 2548 break; 2549 } 2550 2551 if (key.type == BTRFS_DIR_INDEX_KEY && 2552 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2553 ret = replay_one_dir_item(wc->trans, root, path, 2554 eb, i, &key); 2555 if (ret) 2556 break; 2557 } 2558 2559 if (wc->stage < LOG_WALK_REPLAY_ALL) 2560 continue; 2561 2562 /* these keys are simply copied */ 2563 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2564 ret = overwrite_item(wc->trans, root, path, 2565 eb, i, &key); 2566 if (ret) 2567 break; 2568 } else if (key.type == BTRFS_INODE_REF_KEY || 2569 key.type == BTRFS_INODE_EXTREF_KEY) { 2570 ret = add_inode_ref(wc->trans, root, log, path, 2571 eb, i, &key); 2572 if (ret && ret != -ENOENT) 2573 break; 2574 ret = 0; 2575 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2576 ret = replay_one_extent(wc->trans, root, path, 2577 eb, i, &key); 2578 if (ret) 2579 break; 2580 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2581 ret = replay_one_dir_item(wc->trans, root, path, 2582 eb, i, &key); 2583 if (ret) 2584 break; 2585 } 2586 } 2587 btrfs_free_path(path); 2588 return ret; 2589 } 2590 2591 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2592 struct btrfs_root *root, 2593 struct btrfs_path *path, int *level, 2594 struct walk_control *wc) 2595 { 2596 struct btrfs_fs_info *fs_info = root->fs_info; 2597 u64 root_owner; 2598 u64 bytenr; 2599 u64 ptr_gen; 2600 struct extent_buffer *next; 2601 struct extent_buffer *cur; 2602 struct extent_buffer *parent; 2603 u32 blocksize; 2604 int ret = 0; 2605 2606 WARN_ON(*level < 0); 2607 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2608 2609 while (*level > 0) { 2610 struct btrfs_key first_key; 2611 2612 WARN_ON(*level < 0); 2613 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2614 cur = path->nodes[*level]; 2615 2616 WARN_ON(btrfs_header_level(cur) != *level); 2617 2618 if (path->slots[*level] >= 2619 btrfs_header_nritems(cur)) 2620 break; 2621 2622 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2623 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2624 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2625 blocksize = fs_info->nodesize; 2626 2627 parent = path->nodes[*level]; 2628 root_owner = btrfs_header_owner(parent); 2629 2630 next = btrfs_find_create_tree_block(fs_info, bytenr); 2631 if (IS_ERR(next)) 2632 return PTR_ERR(next); 2633 2634 if (*level == 1) { 2635 ret = wc->process_func(root, next, wc, ptr_gen, 2636 *level - 1); 2637 if (ret) { 2638 free_extent_buffer(next); 2639 return ret; 2640 } 2641 2642 path->slots[*level]++; 2643 if (wc->free) { 2644 ret = btrfs_read_buffer(next, ptr_gen, 2645 *level - 1, &first_key); 2646 if (ret) { 2647 free_extent_buffer(next); 2648 return ret; 2649 } 2650 2651 if (trans) { 2652 btrfs_tree_lock(next); 2653 btrfs_set_lock_blocking(next); 2654 clean_tree_block(fs_info, next); 2655 btrfs_wait_tree_block_writeback(next); 2656 btrfs_tree_unlock(next); 2657 } else { 2658 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2659 clear_extent_buffer_dirty(next); 2660 } 2661 2662 WARN_ON(root_owner != 2663 BTRFS_TREE_LOG_OBJECTID); 2664 ret = btrfs_free_and_pin_reserved_extent( 2665 fs_info, bytenr, 2666 blocksize); 2667 if (ret) { 2668 free_extent_buffer(next); 2669 return ret; 2670 } 2671 } 2672 free_extent_buffer(next); 2673 continue; 2674 } 2675 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2676 if (ret) { 2677 free_extent_buffer(next); 2678 return ret; 2679 } 2680 2681 WARN_ON(*level <= 0); 2682 if (path->nodes[*level-1]) 2683 free_extent_buffer(path->nodes[*level-1]); 2684 path->nodes[*level-1] = next; 2685 *level = btrfs_header_level(next); 2686 path->slots[*level] = 0; 2687 cond_resched(); 2688 } 2689 WARN_ON(*level < 0); 2690 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2691 2692 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2693 2694 cond_resched(); 2695 return 0; 2696 } 2697 2698 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2699 struct btrfs_root *root, 2700 struct btrfs_path *path, int *level, 2701 struct walk_control *wc) 2702 { 2703 struct btrfs_fs_info *fs_info = root->fs_info; 2704 u64 root_owner; 2705 int i; 2706 int slot; 2707 int ret; 2708 2709 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2710 slot = path->slots[i]; 2711 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2712 path->slots[i]++; 2713 *level = i; 2714 WARN_ON(*level == 0); 2715 return 0; 2716 } else { 2717 struct extent_buffer *parent; 2718 if (path->nodes[*level] == root->node) 2719 parent = path->nodes[*level]; 2720 else 2721 parent = path->nodes[*level + 1]; 2722 2723 root_owner = btrfs_header_owner(parent); 2724 ret = wc->process_func(root, path->nodes[*level], wc, 2725 btrfs_header_generation(path->nodes[*level]), 2726 *level); 2727 if (ret) 2728 return ret; 2729 2730 if (wc->free) { 2731 struct extent_buffer *next; 2732 2733 next = path->nodes[*level]; 2734 2735 if (trans) { 2736 btrfs_tree_lock(next); 2737 btrfs_set_lock_blocking(next); 2738 clean_tree_block(fs_info, next); 2739 btrfs_wait_tree_block_writeback(next); 2740 btrfs_tree_unlock(next); 2741 } else { 2742 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2743 clear_extent_buffer_dirty(next); 2744 } 2745 2746 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2747 ret = btrfs_free_and_pin_reserved_extent( 2748 fs_info, 2749 path->nodes[*level]->start, 2750 path->nodes[*level]->len); 2751 if (ret) 2752 return ret; 2753 } 2754 free_extent_buffer(path->nodes[*level]); 2755 path->nodes[*level] = NULL; 2756 *level = i + 1; 2757 } 2758 } 2759 return 1; 2760 } 2761 2762 /* 2763 * drop the reference count on the tree rooted at 'snap'. This traverses 2764 * the tree freeing any blocks that have a ref count of zero after being 2765 * decremented. 2766 */ 2767 static int walk_log_tree(struct btrfs_trans_handle *trans, 2768 struct btrfs_root *log, struct walk_control *wc) 2769 { 2770 struct btrfs_fs_info *fs_info = log->fs_info; 2771 int ret = 0; 2772 int wret; 2773 int level; 2774 struct btrfs_path *path; 2775 int orig_level; 2776 2777 path = btrfs_alloc_path(); 2778 if (!path) 2779 return -ENOMEM; 2780 2781 level = btrfs_header_level(log->node); 2782 orig_level = level; 2783 path->nodes[level] = log->node; 2784 extent_buffer_get(log->node); 2785 path->slots[level] = 0; 2786 2787 while (1) { 2788 wret = walk_down_log_tree(trans, log, path, &level, wc); 2789 if (wret > 0) 2790 break; 2791 if (wret < 0) { 2792 ret = wret; 2793 goto out; 2794 } 2795 2796 wret = walk_up_log_tree(trans, log, path, &level, wc); 2797 if (wret > 0) 2798 break; 2799 if (wret < 0) { 2800 ret = wret; 2801 goto out; 2802 } 2803 } 2804 2805 /* was the root node processed? if not, catch it here */ 2806 if (path->nodes[orig_level]) { 2807 ret = wc->process_func(log, path->nodes[orig_level], wc, 2808 btrfs_header_generation(path->nodes[orig_level]), 2809 orig_level); 2810 if (ret) 2811 goto out; 2812 if (wc->free) { 2813 struct extent_buffer *next; 2814 2815 next = path->nodes[orig_level]; 2816 2817 if (trans) { 2818 btrfs_tree_lock(next); 2819 btrfs_set_lock_blocking(next); 2820 clean_tree_block(fs_info, next); 2821 btrfs_wait_tree_block_writeback(next); 2822 btrfs_tree_unlock(next); 2823 } else { 2824 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2825 clear_extent_buffer_dirty(next); 2826 } 2827 2828 WARN_ON(log->root_key.objectid != 2829 BTRFS_TREE_LOG_OBJECTID); 2830 ret = btrfs_free_and_pin_reserved_extent(fs_info, 2831 next->start, next->len); 2832 if (ret) 2833 goto out; 2834 } 2835 } 2836 2837 out: 2838 btrfs_free_path(path); 2839 return ret; 2840 } 2841 2842 /* 2843 * helper function to update the item for a given subvolumes log root 2844 * in the tree of log roots 2845 */ 2846 static int update_log_root(struct btrfs_trans_handle *trans, 2847 struct btrfs_root *log) 2848 { 2849 struct btrfs_fs_info *fs_info = log->fs_info; 2850 int ret; 2851 2852 if (log->log_transid == 1) { 2853 /* insert root item on the first sync */ 2854 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2855 &log->root_key, &log->root_item); 2856 } else { 2857 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2858 &log->root_key, &log->root_item); 2859 } 2860 return ret; 2861 } 2862 2863 static void wait_log_commit(struct btrfs_root *root, int transid) 2864 { 2865 DEFINE_WAIT(wait); 2866 int index = transid % 2; 2867 2868 /* 2869 * we only allow two pending log transactions at a time, 2870 * so we know that if ours is more than 2 older than the 2871 * current transaction, we're done 2872 */ 2873 for (;;) { 2874 prepare_to_wait(&root->log_commit_wait[index], 2875 &wait, TASK_UNINTERRUPTIBLE); 2876 2877 if (!(root->log_transid_committed < transid && 2878 atomic_read(&root->log_commit[index]))) 2879 break; 2880 2881 mutex_unlock(&root->log_mutex); 2882 schedule(); 2883 mutex_lock(&root->log_mutex); 2884 } 2885 finish_wait(&root->log_commit_wait[index], &wait); 2886 } 2887 2888 static void wait_for_writer(struct btrfs_root *root) 2889 { 2890 DEFINE_WAIT(wait); 2891 2892 for (;;) { 2893 prepare_to_wait(&root->log_writer_wait, &wait, 2894 TASK_UNINTERRUPTIBLE); 2895 if (!atomic_read(&root->log_writers)) 2896 break; 2897 2898 mutex_unlock(&root->log_mutex); 2899 schedule(); 2900 mutex_lock(&root->log_mutex); 2901 } 2902 finish_wait(&root->log_writer_wait, &wait); 2903 } 2904 2905 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2906 struct btrfs_log_ctx *ctx) 2907 { 2908 if (!ctx) 2909 return; 2910 2911 mutex_lock(&root->log_mutex); 2912 list_del_init(&ctx->list); 2913 mutex_unlock(&root->log_mutex); 2914 } 2915 2916 /* 2917 * Invoked in log mutex context, or be sure there is no other task which 2918 * can access the list. 2919 */ 2920 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2921 int index, int error) 2922 { 2923 struct btrfs_log_ctx *ctx; 2924 struct btrfs_log_ctx *safe; 2925 2926 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2927 list_del_init(&ctx->list); 2928 ctx->log_ret = error; 2929 } 2930 2931 INIT_LIST_HEAD(&root->log_ctxs[index]); 2932 } 2933 2934 /* 2935 * btrfs_sync_log does sends a given tree log down to the disk and 2936 * updates the super blocks to record it. When this call is done, 2937 * you know that any inodes previously logged are safely on disk only 2938 * if it returns 0. 2939 * 2940 * Any other return value means you need to call btrfs_commit_transaction. 2941 * Some of the edge cases for fsyncing directories that have had unlinks 2942 * or renames done in the past mean that sometimes the only safe 2943 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2944 * that has happened. 2945 */ 2946 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2947 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2948 { 2949 int index1; 2950 int index2; 2951 int mark; 2952 int ret; 2953 struct btrfs_fs_info *fs_info = root->fs_info; 2954 struct btrfs_root *log = root->log_root; 2955 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2956 int log_transid = 0; 2957 struct btrfs_log_ctx root_log_ctx; 2958 struct blk_plug plug; 2959 2960 mutex_lock(&root->log_mutex); 2961 log_transid = ctx->log_transid; 2962 if (root->log_transid_committed >= log_transid) { 2963 mutex_unlock(&root->log_mutex); 2964 return ctx->log_ret; 2965 } 2966 2967 index1 = log_transid % 2; 2968 if (atomic_read(&root->log_commit[index1])) { 2969 wait_log_commit(root, log_transid); 2970 mutex_unlock(&root->log_mutex); 2971 return ctx->log_ret; 2972 } 2973 ASSERT(log_transid == root->log_transid); 2974 atomic_set(&root->log_commit[index1], 1); 2975 2976 /* wait for previous tree log sync to complete */ 2977 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2978 wait_log_commit(root, log_transid - 1); 2979 2980 while (1) { 2981 int batch = atomic_read(&root->log_batch); 2982 /* when we're on an ssd, just kick the log commit out */ 2983 if (!btrfs_test_opt(fs_info, SSD) && 2984 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2985 mutex_unlock(&root->log_mutex); 2986 schedule_timeout_uninterruptible(1); 2987 mutex_lock(&root->log_mutex); 2988 } 2989 wait_for_writer(root); 2990 if (batch == atomic_read(&root->log_batch)) 2991 break; 2992 } 2993 2994 /* bail out if we need to do a full commit */ 2995 if (btrfs_need_log_full_commit(fs_info, trans)) { 2996 ret = -EAGAIN; 2997 mutex_unlock(&root->log_mutex); 2998 goto out; 2999 } 3000 3001 if (log_transid % 2 == 0) 3002 mark = EXTENT_DIRTY; 3003 else 3004 mark = EXTENT_NEW; 3005 3006 /* we start IO on all the marked extents here, but we don't actually 3007 * wait for them until later. 3008 */ 3009 blk_start_plug(&plug); 3010 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3011 if (ret) { 3012 blk_finish_plug(&plug); 3013 btrfs_abort_transaction(trans, ret); 3014 btrfs_set_log_full_commit(fs_info, trans); 3015 mutex_unlock(&root->log_mutex); 3016 goto out; 3017 } 3018 3019 btrfs_set_root_node(&log->root_item, log->node); 3020 3021 root->log_transid++; 3022 log->log_transid = root->log_transid; 3023 root->log_start_pid = 0; 3024 /* 3025 * IO has been started, blocks of the log tree have WRITTEN flag set 3026 * in their headers. new modifications of the log will be written to 3027 * new positions. so it's safe to allow log writers to go in. 3028 */ 3029 mutex_unlock(&root->log_mutex); 3030 3031 btrfs_init_log_ctx(&root_log_ctx, NULL); 3032 3033 mutex_lock(&log_root_tree->log_mutex); 3034 atomic_inc(&log_root_tree->log_batch); 3035 atomic_inc(&log_root_tree->log_writers); 3036 3037 index2 = log_root_tree->log_transid % 2; 3038 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3039 root_log_ctx.log_transid = log_root_tree->log_transid; 3040 3041 mutex_unlock(&log_root_tree->log_mutex); 3042 3043 ret = update_log_root(trans, log); 3044 3045 mutex_lock(&log_root_tree->log_mutex); 3046 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 3047 /* atomic_dec_and_test implies a barrier */ 3048 cond_wake_up_nomb(&log_root_tree->log_writer_wait); 3049 } 3050 3051 if (ret) { 3052 if (!list_empty(&root_log_ctx.list)) 3053 list_del_init(&root_log_ctx.list); 3054 3055 blk_finish_plug(&plug); 3056 btrfs_set_log_full_commit(fs_info, trans); 3057 3058 if (ret != -ENOSPC) { 3059 btrfs_abort_transaction(trans, ret); 3060 mutex_unlock(&log_root_tree->log_mutex); 3061 goto out; 3062 } 3063 btrfs_wait_tree_log_extents(log, mark); 3064 mutex_unlock(&log_root_tree->log_mutex); 3065 ret = -EAGAIN; 3066 goto out; 3067 } 3068 3069 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3070 blk_finish_plug(&plug); 3071 list_del_init(&root_log_ctx.list); 3072 mutex_unlock(&log_root_tree->log_mutex); 3073 ret = root_log_ctx.log_ret; 3074 goto out; 3075 } 3076 3077 index2 = root_log_ctx.log_transid % 2; 3078 if (atomic_read(&log_root_tree->log_commit[index2])) { 3079 blk_finish_plug(&plug); 3080 ret = btrfs_wait_tree_log_extents(log, mark); 3081 wait_log_commit(log_root_tree, 3082 root_log_ctx.log_transid); 3083 mutex_unlock(&log_root_tree->log_mutex); 3084 if (!ret) 3085 ret = root_log_ctx.log_ret; 3086 goto out; 3087 } 3088 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3089 atomic_set(&log_root_tree->log_commit[index2], 1); 3090 3091 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3092 wait_log_commit(log_root_tree, 3093 root_log_ctx.log_transid - 1); 3094 } 3095 3096 wait_for_writer(log_root_tree); 3097 3098 /* 3099 * now that we've moved on to the tree of log tree roots, 3100 * check the full commit flag again 3101 */ 3102 if (btrfs_need_log_full_commit(fs_info, trans)) { 3103 blk_finish_plug(&plug); 3104 btrfs_wait_tree_log_extents(log, mark); 3105 mutex_unlock(&log_root_tree->log_mutex); 3106 ret = -EAGAIN; 3107 goto out_wake_log_root; 3108 } 3109 3110 ret = btrfs_write_marked_extents(fs_info, 3111 &log_root_tree->dirty_log_pages, 3112 EXTENT_DIRTY | EXTENT_NEW); 3113 blk_finish_plug(&plug); 3114 if (ret) { 3115 btrfs_set_log_full_commit(fs_info, trans); 3116 btrfs_abort_transaction(trans, ret); 3117 mutex_unlock(&log_root_tree->log_mutex); 3118 goto out_wake_log_root; 3119 } 3120 ret = btrfs_wait_tree_log_extents(log, mark); 3121 if (!ret) 3122 ret = btrfs_wait_tree_log_extents(log_root_tree, 3123 EXTENT_NEW | EXTENT_DIRTY); 3124 if (ret) { 3125 btrfs_set_log_full_commit(fs_info, trans); 3126 mutex_unlock(&log_root_tree->log_mutex); 3127 goto out_wake_log_root; 3128 } 3129 3130 btrfs_set_super_log_root(fs_info->super_for_commit, 3131 log_root_tree->node->start); 3132 btrfs_set_super_log_root_level(fs_info->super_for_commit, 3133 btrfs_header_level(log_root_tree->node)); 3134 3135 log_root_tree->log_transid++; 3136 mutex_unlock(&log_root_tree->log_mutex); 3137 3138 /* 3139 * nobody else is going to jump in and write the the ctree 3140 * super here because the log_commit atomic below is protecting 3141 * us. We must be called with a transaction handle pinning 3142 * the running transaction open, so a full commit can't hop 3143 * in and cause problems either. 3144 */ 3145 ret = write_all_supers(fs_info, 1); 3146 if (ret) { 3147 btrfs_set_log_full_commit(fs_info, trans); 3148 btrfs_abort_transaction(trans, ret); 3149 goto out_wake_log_root; 3150 } 3151 3152 mutex_lock(&root->log_mutex); 3153 if (root->last_log_commit < log_transid) 3154 root->last_log_commit = log_transid; 3155 mutex_unlock(&root->log_mutex); 3156 3157 out_wake_log_root: 3158 mutex_lock(&log_root_tree->log_mutex); 3159 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3160 3161 log_root_tree->log_transid_committed++; 3162 atomic_set(&log_root_tree->log_commit[index2], 0); 3163 mutex_unlock(&log_root_tree->log_mutex); 3164 3165 /* 3166 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3167 * all the updates above are seen by the woken threads. It might not be 3168 * necessary, but proving that seems to be hard. 3169 */ 3170 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3171 out: 3172 mutex_lock(&root->log_mutex); 3173 btrfs_remove_all_log_ctxs(root, index1, ret); 3174 root->log_transid_committed++; 3175 atomic_set(&root->log_commit[index1], 0); 3176 mutex_unlock(&root->log_mutex); 3177 3178 /* 3179 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3180 * all the updates above are seen by the woken threads. It might not be 3181 * necessary, but proving that seems to be hard. 3182 */ 3183 cond_wake_up(&root->log_commit_wait[index1]); 3184 return ret; 3185 } 3186 3187 static void free_log_tree(struct btrfs_trans_handle *trans, 3188 struct btrfs_root *log) 3189 { 3190 int ret; 3191 u64 start; 3192 u64 end; 3193 struct walk_control wc = { 3194 .free = 1, 3195 .process_func = process_one_buffer 3196 }; 3197 3198 ret = walk_log_tree(trans, log, &wc); 3199 /* I don't think this can happen but just in case */ 3200 if (ret) 3201 btrfs_abort_transaction(trans, ret); 3202 3203 while (1) { 3204 ret = find_first_extent_bit(&log->dirty_log_pages, 3205 0, &start, &end, 3206 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, 3207 NULL); 3208 if (ret) 3209 break; 3210 3211 clear_extent_bits(&log->dirty_log_pages, start, end, 3212 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3213 } 3214 3215 free_extent_buffer(log->node); 3216 kfree(log); 3217 } 3218 3219 /* 3220 * free all the extents used by the tree log. This should be called 3221 * at commit time of the full transaction 3222 */ 3223 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3224 { 3225 if (root->log_root) { 3226 free_log_tree(trans, root->log_root); 3227 root->log_root = NULL; 3228 } 3229 return 0; 3230 } 3231 3232 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3233 struct btrfs_fs_info *fs_info) 3234 { 3235 if (fs_info->log_root_tree) { 3236 free_log_tree(trans, fs_info->log_root_tree); 3237 fs_info->log_root_tree = NULL; 3238 } 3239 return 0; 3240 } 3241 3242 /* 3243 * If both a file and directory are logged, and unlinks or renames are 3244 * mixed in, we have a few interesting corners: 3245 * 3246 * create file X in dir Y 3247 * link file X to X.link in dir Y 3248 * fsync file X 3249 * unlink file X but leave X.link 3250 * fsync dir Y 3251 * 3252 * After a crash we would expect only X.link to exist. But file X 3253 * didn't get fsync'd again so the log has back refs for X and X.link. 3254 * 3255 * We solve this by removing directory entries and inode backrefs from the 3256 * log when a file that was logged in the current transaction is 3257 * unlinked. Any later fsync will include the updated log entries, and 3258 * we'll be able to reconstruct the proper directory items from backrefs. 3259 * 3260 * This optimizations allows us to avoid relogging the entire inode 3261 * or the entire directory. 3262 */ 3263 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3264 struct btrfs_root *root, 3265 const char *name, int name_len, 3266 struct btrfs_inode *dir, u64 index) 3267 { 3268 struct btrfs_root *log; 3269 struct btrfs_dir_item *di; 3270 struct btrfs_path *path; 3271 int ret; 3272 int err = 0; 3273 int bytes_del = 0; 3274 u64 dir_ino = btrfs_ino(dir); 3275 3276 if (dir->logged_trans < trans->transid) 3277 return 0; 3278 3279 ret = join_running_log_trans(root); 3280 if (ret) 3281 return 0; 3282 3283 mutex_lock(&dir->log_mutex); 3284 3285 log = root->log_root; 3286 path = btrfs_alloc_path(); 3287 if (!path) { 3288 err = -ENOMEM; 3289 goto out_unlock; 3290 } 3291 3292 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3293 name, name_len, -1); 3294 if (IS_ERR(di)) { 3295 err = PTR_ERR(di); 3296 goto fail; 3297 } 3298 if (di) { 3299 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3300 bytes_del += name_len; 3301 if (ret) { 3302 err = ret; 3303 goto fail; 3304 } 3305 } 3306 btrfs_release_path(path); 3307 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3308 index, name, name_len, -1); 3309 if (IS_ERR(di)) { 3310 err = PTR_ERR(di); 3311 goto fail; 3312 } 3313 if (di) { 3314 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3315 bytes_del += name_len; 3316 if (ret) { 3317 err = ret; 3318 goto fail; 3319 } 3320 } 3321 3322 /* update the directory size in the log to reflect the names 3323 * we have removed 3324 */ 3325 if (bytes_del) { 3326 struct btrfs_key key; 3327 3328 key.objectid = dir_ino; 3329 key.offset = 0; 3330 key.type = BTRFS_INODE_ITEM_KEY; 3331 btrfs_release_path(path); 3332 3333 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3334 if (ret < 0) { 3335 err = ret; 3336 goto fail; 3337 } 3338 if (ret == 0) { 3339 struct btrfs_inode_item *item; 3340 u64 i_size; 3341 3342 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3343 struct btrfs_inode_item); 3344 i_size = btrfs_inode_size(path->nodes[0], item); 3345 if (i_size > bytes_del) 3346 i_size -= bytes_del; 3347 else 3348 i_size = 0; 3349 btrfs_set_inode_size(path->nodes[0], item, i_size); 3350 btrfs_mark_buffer_dirty(path->nodes[0]); 3351 } else 3352 ret = 0; 3353 btrfs_release_path(path); 3354 } 3355 fail: 3356 btrfs_free_path(path); 3357 out_unlock: 3358 mutex_unlock(&dir->log_mutex); 3359 if (ret == -ENOSPC) { 3360 btrfs_set_log_full_commit(root->fs_info, trans); 3361 ret = 0; 3362 } else if (ret < 0) 3363 btrfs_abort_transaction(trans, ret); 3364 3365 btrfs_end_log_trans(root); 3366 3367 return err; 3368 } 3369 3370 /* see comments for btrfs_del_dir_entries_in_log */ 3371 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3372 struct btrfs_root *root, 3373 const char *name, int name_len, 3374 struct btrfs_inode *inode, u64 dirid) 3375 { 3376 struct btrfs_fs_info *fs_info = root->fs_info; 3377 struct btrfs_root *log; 3378 u64 index; 3379 int ret; 3380 3381 if (inode->logged_trans < trans->transid) 3382 return 0; 3383 3384 ret = join_running_log_trans(root); 3385 if (ret) 3386 return 0; 3387 log = root->log_root; 3388 mutex_lock(&inode->log_mutex); 3389 3390 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3391 dirid, &index); 3392 mutex_unlock(&inode->log_mutex); 3393 if (ret == -ENOSPC) { 3394 btrfs_set_log_full_commit(fs_info, trans); 3395 ret = 0; 3396 } else if (ret < 0 && ret != -ENOENT) 3397 btrfs_abort_transaction(trans, ret); 3398 btrfs_end_log_trans(root); 3399 3400 return ret; 3401 } 3402 3403 /* 3404 * creates a range item in the log for 'dirid'. first_offset and 3405 * last_offset tell us which parts of the key space the log should 3406 * be considered authoritative for. 3407 */ 3408 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3409 struct btrfs_root *log, 3410 struct btrfs_path *path, 3411 int key_type, u64 dirid, 3412 u64 first_offset, u64 last_offset) 3413 { 3414 int ret; 3415 struct btrfs_key key; 3416 struct btrfs_dir_log_item *item; 3417 3418 key.objectid = dirid; 3419 key.offset = first_offset; 3420 if (key_type == BTRFS_DIR_ITEM_KEY) 3421 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3422 else 3423 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3424 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3425 if (ret) 3426 return ret; 3427 3428 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3429 struct btrfs_dir_log_item); 3430 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3431 btrfs_mark_buffer_dirty(path->nodes[0]); 3432 btrfs_release_path(path); 3433 return 0; 3434 } 3435 3436 /* 3437 * log all the items included in the current transaction for a given 3438 * directory. This also creates the range items in the log tree required 3439 * to replay anything deleted before the fsync 3440 */ 3441 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3442 struct btrfs_root *root, struct btrfs_inode *inode, 3443 struct btrfs_path *path, 3444 struct btrfs_path *dst_path, int key_type, 3445 struct btrfs_log_ctx *ctx, 3446 u64 min_offset, u64 *last_offset_ret) 3447 { 3448 struct btrfs_key min_key; 3449 struct btrfs_root *log = root->log_root; 3450 struct extent_buffer *src; 3451 int err = 0; 3452 int ret; 3453 int i; 3454 int nritems; 3455 u64 first_offset = min_offset; 3456 u64 last_offset = (u64)-1; 3457 u64 ino = btrfs_ino(inode); 3458 3459 log = root->log_root; 3460 3461 min_key.objectid = ino; 3462 min_key.type = key_type; 3463 min_key.offset = min_offset; 3464 3465 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3466 3467 /* 3468 * we didn't find anything from this transaction, see if there 3469 * is anything at all 3470 */ 3471 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3472 min_key.objectid = ino; 3473 min_key.type = key_type; 3474 min_key.offset = (u64)-1; 3475 btrfs_release_path(path); 3476 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3477 if (ret < 0) { 3478 btrfs_release_path(path); 3479 return ret; 3480 } 3481 ret = btrfs_previous_item(root, path, ino, key_type); 3482 3483 /* if ret == 0 there are items for this type, 3484 * create a range to tell us the last key of this type. 3485 * otherwise, there are no items in this directory after 3486 * *min_offset, and we create a range to indicate that. 3487 */ 3488 if (ret == 0) { 3489 struct btrfs_key tmp; 3490 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3491 path->slots[0]); 3492 if (key_type == tmp.type) 3493 first_offset = max(min_offset, tmp.offset) + 1; 3494 } 3495 goto done; 3496 } 3497 3498 /* go backward to find any previous key */ 3499 ret = btrfs_previous_item(root, path, ino, key_type); 3500 if (ret == 0) { 3501 struct btrfs_key tmp; 3502 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3503 if (key_type == tmp.type) { 3504 first_offset = tmp.offset; 3505 ret = overwrite_item(trans, log, dst_path, 3506 path->nodes[0], path->slots[0], 3507 &tmp); 3508 if (ret) { 3509 err = ret; 3510 goto done; 3511 } 3512 } 3513 } 3514 btrfs_release_path(path); 3515 3516 /* find the first key from this transaction again */ 3517 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3518 if (WARN_ON(ret != 0)) 3519 goto done; 3520 3521 /* 3522 * we have a block from this transaction, log every item in it 3523 * from our directory 3524 */ 3525 while (1) { 3526 struct btrfs_key tmp; 3527 src = path->nodes[0]; 3528 nritems = btrfs_header_nritems(src); 3529 for (i = path->slots[0]; i < nritems; i++) { 3530 struct btrfs_dir_item *di; 3531 3532 btrfs_item_key_to_cpu(src, &min_key, i); 3533 3534 if (min_key.objectid != ino || min_key.type != key_type) 3535 goto done; 3536 ret = overwrite_item(trans, log, dst_path, src, i, 3537 &min_key); 3538 if (ret) { 3539 err = ret; 3540 goto done; 3541 } 3542 3543 /* 3544 * We must make sure that when we log a directory entry, 3545 * the corresponding inode, after log replay, has a 3546 * matching link count. For example: 3547 * 3548 * touch foo 3549 * mkdir mydir 3550 * sync 3551 * ln foo mydir/bar 3552 * xfs_io -c "fsync" mydir 3553 * <crash> 3554 * <mount fs and log replay> 3555 * 3556 * Would result in a fsync log that when replayed, our 3557 * file inode would have a link count of 1, but we get 3558 * two directory entries pointing to the same inode. 3559 * After removing one of the names, it would not be 3560 * possible to remove the other name, which resulted 3561 * always in stale file handle errors, and would not 3562 * be possible to rmdir the parent directory, since 3563 * its i_size could never decrement to the value 3564 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 3565 */ 3566 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3567 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3568 if (ctx && 3569 (btrfs_dir_transid(src, di) == trans->transid || 3570 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3571 tmp.type != BTRFS_ROOT_ITEM_KEY) 3572 ctx->log_new_dentries = true; 3573 } 3574 path->slots[0] = nritems; 3575 3576 /* 3577 * look ahead to the next item and see if it is also 3578 * from this directory and from this transaction 3579 */ 3580 ret = btrfs_next_leaf(root, path); 3581 if (ret) { 3582 if (ret == 1) 3583 last_offset = (u64)-1; 3584 else 3585 err = ret; 3586 goto done; 3587 } 3588 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3589 if (tmp.objectid != ino || tmp.type != key_type) { 3590 last_offset = (u64)-1; 3591 goto done; 3592 } 3593 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3594 ret = overwrite_item(trans, log, dst_path, 3595 path->nodes[0], path->slots[0], 3596 &tmp); 3597 if (ret) 3598 err = ret; 3599 else 3600 last_offset = tmp.offset; 3601 goto done; 3602 } 3603 } 3604 done: 3605 btrfs_release_path(path); 3606 btrfs_release_path(dst_path); 3607 3608 if (err == 0) { 3609 *last_offset_ret = last_offset; 3610 /* 3611 * insert the log range keys to indicate where the log 3612 * is valid 3613 */ 3614 ret = insert_dir_log_key(trans, log, path, key_type, 3615 ino, first_offset, last_offset); 3616 if (ret) 3617 err = ret; 3618 } 3619 return err; 3620 } 3621 3622 /* 3623 * logging directories is very similar to logging inodes, We find all the items 3624 * from the current transaction and write them to the log. 3625 * 3626 * The recovery code scans the directory in the subvolume, and if it finds a 3627 * key in the range logged that is not present in the log tree, then it means 3628 * that dir entry was unlinked during the transaction. 3629 * 3630 * In order for that scan to work, we must include one key smaller than 3631 * the smallest logged by this transaction and one key larger than the largest 3632 * key logged by this transaction. 3633 */ 3634 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3635 struct btrfs_root *root, struct btrfs_inode *inode, 3636 struct btrfs_path *path, 3637 struct btrfs_path *dst_path, 3638 struct btrfs_log_ctx *ctx) 3639 { 3640 u64 min_key; 3641 u64 max_key; 3642 int ret; 3643 int key_type = BTRFS_DIR_ITEM_KEY; 3644 3645 again: 3646 min_key = 0; 3647 max_key = 0; 3648 while (1) { 3649 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3650 ctx, min_key, &max_key); 3651 if (ret) 3652 return ret; 3653 if (max_key == (u64)-1) 3654 break; 3655 min_key = max_key + 1; 3656 } 3657 3658 if (key_type == BTRFS_DIR_ITEM_KEY) { 3659 key_type = BTRFS_DIR_INDEX_KEY; 3660 goto again; 3661 } 3662 return 0; 3663 } 3664 3665 /* 3666 * a helper function to drop items from the log before we relog an 3667 * inode. max_key_type indicates the highest item type to remove. 3668 * This cannot be run for file data extents because it does not 3669 * free the extents they point to. 3670 */ 3671 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3672 struct btrfs_root *log, 3673 struct btrfs_path *path, 3674 u64 objectid, int max_key_type) 3675 { 3676 int ret; 3677 struct btrfs_key key; 3678 struct btrfs_key found_key; 3679 int start_slot; 3680 3681 key.objectid = objectid; 3682 key.type = max_key_type; 3683 key.offset = (u64)-1; 3684 3685 while (1) { 3686 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3687 BUG_ON(ret == 0); /* Logic error */ 3688 if (ret < 0) 3689 break; 3690 3691 if (path->slots[0] == 0) 3692 break; 3693 3694 path->slots[0]--; 3695 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3696 path->slots[0]); 3697 3698 if (found_key.objectid != objectid) 3699 break; 3700 3701 found_key.offset = 0; 3702 found_key.type = 0; 3703 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3704 &start_slot); 3705 3706 ret = btrfs_del_items(trans, log, path, start_slot, 3707 path->slots[0] - start_slot + 1); 3708 /* 3709 * If start slot isn't 0 then we don't need to re-search, we've 3710 * found the last guy with the objectid in this tree. 3711 */ 3712 if (ret || start_slot != 0) 3713 break; 3714 btrfs_release_path(path); 3715 } 3716 btrfs_release_path(path); 3717 if (ret > 0) 3718 ret = 0; 3719 return ret; 3720 } 3721 3722 static void fill_inode_item(struct btrfs_trans_handle *trans, 3723 struct extent_buffer *leaf, 3724 struct btrfs_inode_item *item, 3725 struct inode *inode, int log_inode_only, 3726 u64 logged_isize) 3727 { 3728 struct btrfs_map_token token; 3729 3730 btrfs_init_map_token(&token); 3731 3732 if (log_inode_only) { 3733 /* set the generation to zero so the recover code 3734 * can tell the difference between an logging 3735 * just to say 'this inode exists' and a logging 3736 * to say 'update this inode with these values' 3737 */ 3738 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3739 btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 3740 } else { 3741 btrfs_set_token_inode_generation(leaf, item, 3742 BTRFS_I(inode)->generation, 3743 &token); 3744 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3745 } 3746 3747 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3748 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3749 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3750 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3751 3752 btrfs_set_token_timespec_sec(leaf, &item->atime, 3753 inode->i_atime.tv_sec, &token); 3754 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3755 inode->i_atime.tv_nsec, &token); 3756 3757 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3758 inode->i_mtime.tv_sec, &token); 3759 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3760 inode->i_mtime.tv_nsec, &token); 3761 3762 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3763 inode->i_ctime.tv_sec, &token); 3764 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3765 inode->i_ctime.tv_nsec, &token); 3766 3767 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3768 &token); 3769 3770 btrfs_set_token_inode_sequence(leaf, item, 3771 inode_peek_iversion(inode), &token); 3772 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3773 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3774 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3775 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3776 } 3777 3778 static int log_inode_item(struct btrfs_trans_handle *trans, 3779 struct btrfs_root *log, struct btrfs_path *path, 3780 struct btrfs_inode *inode) 3781 { 3782 struct btrfs_inode_item *inode_item; 3783 int ret; 3784 3785 ret = btrfs_insert_empty_item(trans, log, path, 3786 &inode->location, sizeof(*inode_item)); 3787 if (ret && ret != -EEXIST) 3788 return ret; 3789 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3790 struct btrfs_inode_item); 3791 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3792 0, 0); 3793 btrfs_release_path(path); 3794 return 0; 3795 } 3796 3797 static noinline int copy_items(struct btrfs_trans_handle *trans, 3798 struct btrfs_inode *inode, 3799 struct btrfs_path *dst_path, 3800 struct btrfs_path *src_path, u64 *last_extent, 3801 int start_slot, int nr, int inode_only, 3802 u64 logged_isize) 3803 { 3804 struct btrfs_fs_info *fs_info = trans->fs_info; 3805 unsigned long src_offset; 3806 unsigned long dst_offset; 3807 struct btrfs_root *log = inode->root->log_root; 3808 struct btrfs_file_extent_item *extent; 3809 struct btrfs_inode_item *inode_item; 3810 struct extent_buffer *src = src_path->nodes[0]; 3811 struct btrfs_key first_key, last_key, key; 3812 int ret; 3813 struct btrfs_key *ins_keys; 3814 u32 *ins_sizes; 3815 char *ins_data; 3816 int i; 3817 struct list_head ordered_sums; 3818 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3819 bool has_extents = false; 3820 bool need_find_last_extent = true; 3821 bool done = false; 3822 3823 INIT_LIST_HEAD(&ordered_sums); 3824 3825 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3826 nr * sizeof(u32), GFP_NOFS); 3827 if (!ins_data) 3828 return -ENOMEM; 3829 3830 first_key.objectid = (u64)-1; 3831 3832 ins_sizes = (u32 *)ins_data; 3833 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3834 3835 for (i = 0; i < nr; i++) { 3836 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3837 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3838 } 3839 ret = btrfs_insert_empty_items(trans, log, dst_path, 3840 ins_keys, ins_sizes, nr); 3841 if (ret) { 3842 kfree(ins_data); 3843 return ret; 3844 } 3845 3846 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3847 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3848 dst_path->slots[0]); 3849 3850 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3851 3852 if (i == nr - 1) 3853 last_key = ins_keys[i]; 3854 3855 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3856 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3857 dst_path->slots[0], 3858 struct btrfs_inode_item); 3859 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3860 &inode->vfs_inode, 3861 inode_only == LOG_INODE_EXISTS, 3862 logged_isize); 3863 } else { 3864 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3865 src_offset, ins_sizes[i]); 3866 } 3867 3868 /* 3869 * We set need_find_last_extent here in case we know we were 3870 * processing other items and then walk into the first extent in 3871 * the inode. If we don't hit an extent then nothing changes, 3872 * we'll do the last search the next time around. 3873 */ 3874 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3875 has_extents = true; 3876 if (first_key.objectid == (u64)-1) 3877 first_key = ins_keys[i]; 3878 } else { 3879 need_find_last_extent = false; 3880 } 3881 3882 /* take a reference on file data extents so that truncates 3883 * or deletes of this inode don't have to relog the inode 3884 * again 3885 */ 3886 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3887 !skip_csum) { 3888 int found_type; 3889 extent = btrfs_item_ptr(src, start_slot + i, 3890 struct btrfs_file_extent_item); 3891 3892 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3893 continue; 3894 3895 found_type = btrfs_file_extent_type(src, extent); 3896 if (found_type == BTRFS_FILE_EXTENT_REG) { 3897 u64 ds, dl, cs, cl; 3898 ds = btrfs_file_extent_disk_bytenr(src, 3899 extent); 3900 /* ds == 0 is a hole */ 3901 if (ds == 0) 3902 continue; 3903 3904 dl = btrfs_file_extent_disk_num_bytes(src, 3905 extent); 3906 cs = btrfs_file_extent_offset(src, extent); 3907 cl = btrfs_file_extent_num_bytes(src, 3908 extent); 3909 if (btrfs_file_extent_compression(src, 3910 extent)) { 3911 cs = 0; 3912 cl = dl; 3913 } 3914 3915 ret = btrfs_lookup_csums_range( 3916 fs_info->csum_root, 3917 ds + cs, ds + cs + cl - 1, 3918 &ordered_sums, 0); 3919 if (ret) { 3920 btrfs_release_path(dst_path); 3921 kfree(ins_data); 3922 return ret; 3923 } 3924 } 3925 } 3926 } 3927 3928 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3929 btrfs_release_path(dst_path); 3930 kfree(ins_data); 3931 3932 /* 3933 * we have to do this after the loop above to avoid changing the 3934 * log tree while trying to change the log tree. 3935 */ 3936 ret = 0; 3937 while (!list_empty(&ordered_sums)) { 3938 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3939 struct btrfs_ordered_sum, 3940 list); 3941 if (!ret) 3942 ret = btrfs_csum_file_blocks(trans, log, sums); 3943 list_del(&sums->list); 3944 kfree(sums); 3945 } 3946 3947 if (!has_extents) 3948 return ret; 3949 3950 if (need_find_last_extent && *last_extent == first_key.offset) { 3951 /* 3952 * We don't have any leafs between our current one and the one 3953 * we processed before that can have file extent items for our 3954 * inode (and have a generation number smaller than our current 3955 * transaction id). 3956 */ 3957 need_find_last_extent = false; 3958 } 3959 3960 /* 3961 * Because we use btrfs_search_forward we could skip leaves that were 3962 * not modified and then assume *last_extent is valid when it really 3963 * isn't. So back up to the previous leaf and read the end of the last 3964 * extent before we go and fill in holes. 3965 */ 3966 if (need_find_last_extent) { 3967 u64 len; 3968 3969 ret = btrfs_prev_leaf(inode->root, src_path); 3970 if (ret < 0) 3971 return ret; 3972 if (ret) 3973 goto fill_holes; 3974 if (src_path->slots[0]) 3975 src_path->slots[0]--; 3976 src = src_path->nodes[0]; 3977 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3978 if (key.objectid != btrfs_ino(inode) || 3979 key.type != BTRFS_EXTENT_DATA_KEY) 3980 goto fill_holes; 3981 extent = btrfs_item_ptr(src, src_path->slots[0], 3982 struct btrfs_file_extent_item); 3983 if (btrfs_file_extent_type(src, extent) == 3984 BTRFS_FILE_EXTENT_INLINE) { 3985 len = btrfs_file_extent_ram_bytes(src, extent); 3986 *last_extent = ALIGN(key.offset + len, 3987 fs_info->sectorsize); 3988 } else { 3989 len = btrfs_file_extent_num_bytes(src, extent); 3990 *last_extent = key.offset + len; 3991 } 3992 } 3993 fill_holes: 3994 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3995 * things could have happened 3996 * 3997 * 1) A merge could have happened, so we could currently be on a leaf 3998 * that holds what we were copying in the first place. 3999 * 2) A split could have happened, and now not all of the items we want 4000 * are on the same leaf. 4001 * 4002 * So we need to adjust how we search for holes, we need to drop the 4003 * path and re-search for the first extent key we found, and then walk 4004 * forward until we hit the last one we copied. 4005 */ 4006 if (need_find_last_extent) { 4007 /* btrfs_prev_leaf could return 1 without releasing the path */ 4008 btrfs_release_path(src_path); 4009 ret = btrfs_search_slot(NULL, inode->root, &first_key, 4010 src_path, 0, 0); 4011 if (ret < 0) 4012 return ret; 4013 ASSERT(ret == 0); 4014 src = src_path->nodes[0]; 4015 i = src_path->slots[0]; 4016 } else { 4017 i = start_slot; 4018 } 4019 4020 /* 4021 * Ok so here we need to go through and fill in any holes we may have 4022 * to make sure that holes are punched for those areas in case they had 4023 * extents previously. 4024 */ 4025 while (!done) { 4026 u64 offset, len; 4027 u64 extent_end; 4028 4029 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 4030 ret = btrfs_next_leaf(inode->root, src_path); 4031 if (ret < 0) 4032 return ret; 4033 ASSERT(ret == 0); 4034 src = src_path->nodes[0]; 4035 i = 0; 4036 need_find_last_extent = true; 4037 } 4038 4039 btrfs_item_key_to_cpu(src, &key, i); 4040 if (!btrfs_comp_cpu_keys(&key, &last_key)) 4041 done = true; 4042 if (key.objectid != btrfs_ino(inode) || 4043 key.type != BTRFS_EXTENT_DATA_KEY) { 4044 i++; 4045 continue; 4046 } 4047 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 4048 if (btrfs_file_extent_type(src, extent) == 4049 BTRFS_FILE_EXTENT_INLINE) { 4050 len = btrfs_file_extent_ram_bytes(src, extent); 4051 extent_end = ALIGN(key.offset + len, 4052 fs_info->sectorsize); 4053 } else { 4054 len = btrfs_file_extent_num_bytes(src, extent); 4055 extent_end = key.offset + len; 4056 } 4057 i++; 4058 4059 if (*last_extent == key.offset) { 4060 *last_extent = extent_end; 4061 continue; 4062 } 4063 offset = *last_extent; 4064 len = key.offset - *last_extent; 4065 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 4066 offset, 0, 0, len, 0, len, 0, 0, 0); 4067 if (ret) 4068 break; 4069 *last_extent = extent_end; 4070 } 4071 4072 /* 4073 * Check if there is a hole between the last extent found in our leaf 4074 * and the first extent in the next leaf. If there is one, we need to 4075 * log an explicit hole so that at replay time we can punch the hole. 4076 */ 4077 if (ret == 0 && 4078 key.objectid == btrfs_ino(inode) && 4079 key.type == BTRFS_EXTENT_DATA_KEY && 4080 i == btrfs_header_nritems(src_path->nodes[0])) { 4081 ret = btrfs_next_leaf(inode->root, src_path); 4082 need_find_last_extent = true; 4083 if (ret > 0) { 4084 ret = 0; 4085 } else if (ret == 0) { 4086 btrfs_item_key_to_cpu(src_path->nodes[0], &key, 4087 src_path->slots[0]); 4088 if (key.objectid == btrfs_ino(inode) && 4089 key.type == BTRFS_EXTENT_DATA_KEY && 4090 *last_extent < key.offset) { 4091 const u64 len = key.offset - *last_extent; 4092 4093 ret = btrfs_insert_file_extent(trans, log, 4094 btrfs_ino(inode), 4095 *last_extent, 0, 4096 0, len, 0, len, 4097 0, 0, 0); 4098 } 4099 } 4100 } 4101 /* 4102 * Need to let the callers know we dropped the path so they should 4103 * re-search. 4104 */ 4105 if (!ret && need_find_last_extent) 4106 ret = 1; 4107 return ret; 4108 } 4109 4110 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 4111 { 4112 struct extent_map *em1, *em2; 4113 4114 em1 = list_entry(a, struct extent_map, list); 4115 em2 = list_entry(b, struct extent_map, list); 4116 4117 if (em1->start < em2->start) 4118 return -1; 4119 else if (em1->start > em2->start) 4120 return 1; 4121 return 0; 4122 } 4123 4124 static int log_extent_csums(struct btrfs_trans_handle *trans, 4125 struct btrfs_inode *inode, 4126 struct btrfs_root *log_root, 4127 const struct extent_map *em) 4128 { 4129 u64 csum_offset; 4130 u64 csum_len; 4131 LIST_HEAD(ordered_sums); 4132 int ret = 0; 4133 4134 if (inode->flags & BTRFS_INODE_NODATASUM || 4135 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4136 em->block_start == EXTENT_MAP_HOLE) 4137 return 0; 4138 4139 /* If we're compressed we have to save the entire range of csums. */ 4140 if (em->compress_type) { 4141 csum_offset = 0; 4142 csum_len = max(em->block_len, em->orig_block_len); 4143 } else { 4144 csum_offset = em->mod_start - em->start; 4145 csum_len = em->mod_len; 4146 } 4147 4148 /* block start is already adjusted for the file extent offset. */ 4149 ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, 4150 em->block_start + csum_offset, 4151 em->block_start + csum_offset + 4152 csum_len - 1, &ordered_sums, 0); 4153 if (ret) 4154 return ret; 4155 4156 while (!list_empty(&ordered_sums)) { 4157 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4158 struct btrfs_ordered_sum, 4159 list); 4160 if (!ret) 4161 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4162 list_del(&sums->list); 4163 kfree(sums); 4164 } 4165 4166 return ret; 4167 } 4168 4169 static int log_one_extent(struct btrfs_trans_handle *trans, 4170 struct btrfs_inode *inode, struct btrfs_root *root, 4171 const struct extent_map *em, 4172 struct btrfs_path *path, 4173 struct btrfs_log_ctx *ctx) 4174 { 4175 struct btrfs_root *log = root->log_root; 4176 struct btrfs_file_extent_item *fi; 4177 struct extent_buffer *leaf; 4178 struct btrfs_map_token token; 4179 struct btrfs_key key; 4180 u64 extent_offset = em->start - em->orig_start; 4181 u64 block_len; 4182 int ret; 4183 int extent_inserted = 0; 4184 4185 ret = log_extent_csums(trans, inode, log, em); 4186 if (ret) 4187 return ret; 4188 4189 btrfs_init_map_token(&token); 4190 4191 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4192 em->start + em->len, NULL, 0, 1, 4193 sizeof(*fi), &extent_inserted); 4194 if (ret) 4195 return ret; 4196 4197 if (!extent_inserted) { 4198 key.objectid = btrfs_ino(inode); 4199 key.type = BTRFS_EXTENT_DATA_KEY; 4200 key.offset = em->start; 4201 4202 ret = btrfs_insert_empty_item(trans, log, path, &key, 4203 sizeof(*fi)); 4204 if (ret) 4205 return ret; 4206 } 4207 leaf = path->nodes[0]; 4208 fi = btrfs_item_ptr(leaf, path->slots[0], 4209 struct btrfs_file_extent_item); 4210 4211 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4212 &token); 4213 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4214 btrfs_set_token_file_extent_type(leaf, fi, 4215 BTRFS_FILE_EXTENT_PREALLOC, 4216 &token); 4217 else 4218 btrfs_set_token_file_extent_type(leaf, fi, 4219 BTRFS_FILE_EXTENT_REG, 4220 &token); 4221 4222 block_len = max(em->block_len, em->orig_block_len); 4223 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4224 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4225 em->block_start, 4226 &token); 4227 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4228 &token); 4229 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4230 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4231 em->block_start - 4232 extent_offset, &token); 4233 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4234 &token); 4235 } else { 4236 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4237 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4238 &token); 4239 } 4240 4241 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4242 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4243 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4244 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4245 &token); 4246 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4247 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4248 btrfs_mark_buffer_dirty(leaf); 4249 4250 btrfs_release_path(path); 4251 4252 return ret; 4253 } 4254 4255 /* 4256 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4257 * lose them after doing a fast fsync and replaying the log. We scan the 4258 * subvolume's root instead of iterating the inode's extent map tree because 4259 * otherwise we can log incorrect extent items based on extent map conversion. 4260 * That can happen due to the fact that extent maps are merged when they 4261 * are not in the extent map tree's list of modified extents. 4262 */ 4263 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4264 struct btrfs_inode *inode, 4265 struct btrfs_path *path) 4266 { 4267 struct btrfs_root *root = inode->root; 4268 struct btrfs_key key; 4269 const u64 i_size = i_size_read(&inode->vfs_inode); 4270 const u64 ino = btrfs_ino(inode); 4271 struct btrfs_path *dst_path = NULL; 4272 u64 last_extent = (u64)-1; 4273 int ins_nr = 0; 4274 int start_slot; 4275 int ret; 4276 4277 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4278 return 0; 4279 4280 key.objectid = ino; 4281 key.type = BTRFS_EXTENT_DATA_KEY; 4282 key.offset = i_size; 4283 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4284 if (ret < 0) 4285 goto out; 4286 4287 while (true) { 4288 struct extent_buffer *leaf = path->nodes[0]; 4289 int slot = path->slots[0]; 4290 4291 if (slot >= btrfs_header_nritems(leaf)) { 4292 if (ins_nr > 0) { 4293 ret = copy_items(trans, inode, dst_path, path, 4294 &last_extent, start_slot, 4295 ins_nr, 1, 0); 4296 if (ret < 0) 4297 goto out; 4298 ins_nr = 0; 4299 } 4300 ret = btrfs_next_leaf(root, path); 4301 if (ret < 0) 4302 goto out; 4303 if (ret > 0) { 4304 ret = 0; 4305 break; 4306 } 4307 continue; 4308 } 4309 4310 btrfs_item_key_to_cpu(leaf, &key, slot); 4311 if (key.objectid > ino) 4312 break; 4313 if (WARN_ON_ONCE(key.objectid < ino) || 4314 key.type < BTRFS_EXTENT_DATA_KEY || 4315 key.offset < i_size) { 4316 path->slots[0]++; 4317 continue; 4318 } 4319 if (last_extent == (u64)-1) { 4320 last_extent = key.offset; 4321 /* 4322 * Avoid logging extent items logged in past fsync calls 4323 * and leading to duplicate keys in the log tree. 4324 */ 4325 do { 4326 ret = btrfs_truncate_inode_items(trans, 4327 root->log_root, 4328 &inode->vfs_inode, 4329 i_size, 4330 BTRFS_EXTENT_DATA_KEY); 4331 } while (ret == -EAGAIN); 4332 if (ret) 4333 goto out; 4334 } 4335 if (ins_nr == 0) 4336 start_slot = slot; 4337 ins_nr++; 4338 path->slots[0]++; 4339 if (!dst_path) { 4340 dst_path = btrfs_alloc_path(); 4341 if (!dst_path) { 4342 ret = -ENOMEM; 4343 goto out; 4344 } 4345 } 4346 } 4347 if (ins_nr > 0) { 4348 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4349 start_slot, ins_nr, 1, 0); 4350 if (ret > 0) 4351 ret = 0; 4352 } 4353 out: 4354 btrfs_release_path(path); 4355 btrfs_free_path(dst_path); 4356 return ret; 4357 } 4358 4359 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4360 struct btrfs_root *root, 4361 struct btrfs_inode *inode, 4362 struct btrfs_path *path, 4363 struct btrfs_log_ctx *ctx, 4364 const u64 start, 4365 const u64 end) 4366 { 4367 struct extent_map *em, *n; 4368 struct list_head extents; 4369 struct extent_map_tree *tree = &inode->extent_tree; 4370 u64 logged_start, logged_end; 4371 u64 test_gen; 4372 int ret = 0; 4373 int num = 0; 4374 4375 INIT_LIST_HEAD(&extents); 4376 4377 down_write(&inode->dio_sem); 4378 write_lock(&tree->lock); 4379 test_gen = root->fs_info->last_trans_committed; 4380 logged_start = start; 4381 logged_end = end; 4382 4383 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4384 list_del_init(&em->list); 4385 /* 4386 * Just an arbitrary number, this can be really CPU intensive 4387 * once we start getting a lot of extents, and really once we 4388 * have a bunch of extents we just want to commit since it will 4389 * be faster. 4390 */ 4391 if (++num > 32768) { 4392 list_del_init(&tree->modified_extents); 4393 ret = -EFBIG; 4394 goto process; 4395 } 4396 4397 if (em->generation <= test_gen) 4398 continue; 4399 4400 /* We log prealloc extents beyond eof later. */ 4401 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4402 em->start >= i_size_read(&inode->vfs_inode)) 4403 continue; 4404 4405 if (em->start < logged_start) 4406 logged_start = em->start; 4407 if ((em->start + em->len - 1) > logged_end) 4408 logged_end = em->start + em->len - 1; 4409 4410 /* Need a ref to keep it from getting evicted from cache */ 4411 refcount_inc(&em->refs); 4412 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4413 list_add_tail(&em->list, &extents); 4414 num++; 4415 } 4416 4417 list_sort(NULL, &extents, extent_cmp); 4418 process: 4419 while (!list_empty(&extents)) { 4420 em = list_entry(extents.next, struct extent_map, list); 4421 4422 list_del_init(&em->list); 4423 4424 /* 4425 * If we had an error we just need to delete everybody from our 4426 * private list. 4427 */ 4428 if (ret) { 4429 clear_em_logging(tree, em); 4430 free_extent_map(em); 4431 continue; 4432 } 4433 4434 write_unlock(&tree->lock); 4435 4436 ret = log_one_extent(trans, inode, root, em, path, ctx); 4437 write_lock(&tree->lock); 4438 clear_em_logging(tree, em); 4439 free_extent_map(em); 4440 } 4441 WARN_ON(!list_empty(&extents)); 4442 write_unlock(&tree->lock); 4443 up_write(&inode->dio_sem); 4444 4445 btrfs_release_path(path); 4446 if (!ret) 4447 ret = btrfs_log_prealloc_extents(trans, inode, path); 4448 4449 return ret; 4450 } 4451 4452 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4453 struct btrfs_path *path, u64 *size_ret) 4454 { 4455 struct btrfs_key key; 4456 int ret; 4457 4458 key.objectid = btrfs_ino(inode); 4459 key.type = BTRFS_INODE_ITEM_KEY; 4460 key.offset = 0; 4461 4462 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4463 if (ret < 0) { 4464 return ret; 4465 } else if (ret > 0) { 4466 *size_ret = 0; 4467 } else { 4468 struct btrfs_inode_item *item; 4469 4470 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4471 struct btrfs_inode_item); 4472 *size_ret = btrfs_inode_size(path->nodes[0], item); 4473 } 4474 4475 btrfs_release_path(path); 4476 return 0; 4477 } 4478 4479 /* 4480 * At the moment we always log all xattrs. This is to figure out at log replay 4481 * time which xattrs must have their deletion replayed. If a xattr is missing 4482 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4483 * because if a xattr is deleted, the inode is fsynced and a power failure 4484 * happens, causing the log to be replayed the next time the fs is mounted, 4485 * we want the xattr to not exist anymore (same behaviour as other filesystems 4486 * with a journal, ext3/4, xfs, f2fs, etc). 4487 */ 4488 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4489 struct btrfs_root *root, 4490 struct btrfs_inode *inode, 4491 struct btrfs_path *path, 4492 struct btrfs_path *dst_path) 4493 { 4494 int ret; 4495 struct btrfs_key key; 4496 const u64 ino = btrfs_ino(inode); 4497 int ins_nr = 0; 4498 int start_slot = 0; 4499 4500 key.objectid = ino; 4501 key.type = BTRFS_XATTR_ITEM_KEY; 4502 key.offset = 0; 4503 4504 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4505 if (ret < 0) 4506 return ret; 4507 4508 while (true) { 4509 int slot = path->slots[0]; 4510 struct extent_buffer *leaf = path->nodes[0]; 4511 int nritems = btrfs_header_nritems(leaf); 4512 4513 if (slot >= nritems) { 4514 if (ins_nr > 0) { 4515 u64 last_extent = 0; 4516 4517 ret = copy_items(trans, inode, dst_path, path, 4518 &last_extent, start_slot, 4519 ins_nr, 1, 0); 4520 /* can't be 1, extent items aren't processed */ 4521 ASSERT(ret <= 0); 4522 if (ret < 0) 4523 return ret; 4524 ins_nr = 0; 4525 } 4526 ret = btrfs_next_leaf(root, path); 4527 if (ret < 0) 4528 return ret; 4529 else if (ret > 0) 4530 break; 4531 continue; 4532 } 4533 4534 btrfs_item_key_to_cpu(leaf, &key, slot); 4535 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4536 break; 4537 4538 if (ins_nr == 0) 4539 start_slot = slot; 4540 ins_nr++; 4541 path->slots[0]++; 4542 cond_resched(); 4543 } 4544 if (ins_nr > 0) { 4545 u64 last_extent = 0; 4546 4547 ret = copy_items(trans, inode, dst_path, path, 4548 &last_extent, start_slot, 4549 ins_nr, 1, 0); 4550 /* can't be 1, extent items aren't processed */ 4551 ASSERT(ret <= 0); 4552 if (ret < 0) 4553 return ret; 4554 } 4555 4556 return 0; 4557 } 4558 4559 /* 4560 * If the no holes feature is enabled we need to make sure any hole between the 4561 * last extent and the i_size of our inode is explicitly marked in the log. This 4562 * is to make sure that doing something like: 4563 * 4564 * 1) create file with 128Kb of data 4565 * 2) truncate file to 64Kb 4566 * 3) truncate file to 256Kb 4567 * 4) fsync file 4568 * 5) <crash/power failure> 4569 * 6) mount fs and trigger log replay 4570 * 4571 * Will give us a file with a size of 256Kb, the first 64Kb of data match what 4572 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4573 * file correspond to a hole. The presence of explicit holes in a log tree is 4574 * what guarantees that log replay will remove/adjust file extent items in the 4575 * fs/subvol tree. 4576 * 4577 * Here we do not need to care about holes between extents, that is already done 4578 * by copy_items(). We also only need to do this in the full sync path, where we 4579 * lookup for extents from the fs/subvol tree only. In the fast path case, we 4580 * lookup the list of modified extent maps and if any represents a hole, we 4581 * insert a corresponding extent representing a hole in the log tree. 4582 */ 4583 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4584 struct btrfs_root *root, 4585 struct btrfs_inode *inode, 4586 struct btrfs_path *path) 4587 { 4588 struct btrfs_fs_info *fs_info = root->fs_info; 4589 int ret; 4590 struct btrfs_key key; 4591 u64 hole_start; 4592 u64 hole_size; 4593 struct extent_buffer *leaf; 4594 struct btrfs_root *log = root->log_root; 4595 const u64 ino = btrfs_ino(inode); 4596 const u64 i_size = i_size_read(&inode->vfs_inode); 4597 4598 if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4599 return 0; 4600 4601 key.objectid = ino; 4602 key.type = BTRFS_EXTENT_DATA_KEY; 4603 key.offset = (u64)-1; 4604 4605 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4606 ASSERT(ret != 0); 4607 if (ret < 0) 4608 return ret; 4609 4610 ASSERT(path->slots[0] > 0); 4611 path->slots[0]--; 4612 leaf = path->nodes[0]; 4613 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4614 4615 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4616 /* inode does not have any extents */ 4617 hole_start = 0; 4618 hole_size = i_size; 4619 } else { 4620 struct btrfs_file_extent_item *extent; 4621 u64 len; 4622 4623 /* 4624 * If there's an extent beyond i_size, an explicit hole was 4625 * already inserted by copy_items(). 4626 */ 4627 if (key.offset >= i_size) 4628 return 0; 4629 4630 extent = btrfs_item_ptr(leaf, path->slots[0], 4631 struct btrfs_file_extent_item); 4632 4633 if (btrfs_file_extent_type(leaf, extent) == 4634 BTRFS_FILE_EXTENT_INLINE) { 4635 len = btrfs_file_extent_ram_bytes(leaf, extent); 4636 ASSERT(len == i_size || 4637 (len == fs_info->sectorsize && 4638 btrfs_file_extent_compression(leaf, extent) != 4639 BTRFS_COMPRESS_NONE)); 4640 return 0; 4641 } 4642 4643 len = btrfs_file_extent_num_bytes(leaf, extent); 4644 /* Last extent goes beyond i_size, no need to log a hole. */ 4645 if (key.offset + len > i_size) 4646 return 0; 4647 hole_start = key.offset + len; 4648 hole_size = i_size - hole_start; 4649 } 4650 btrfs_release_path(path); 4651 4652 /* Last extent ends at i_size. */ 4653 if (hole_size == 0) 4654 return 0; 4655 4656 hole_size = ALIGN(hole_size, fs_info->sectorsize); 4657 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4658 hole_size, 0, hole_size, 0, 0, 0); 4659 return ret; 4660 } 4661 4662 /* 4663 * When we are logging a new inode X, check if it doesn't have a reference that 4664 * matches the reference from some other inode Y created in a past transaction 4665 * and that was renamed in the current transaction. If we don't do this, then at 4666 * log replay time we can lose inode Y (and all its files if it's a directory): 4667 * 4668 * mkdir /mnt/x 4669 * echo "hello world" > /mnt/x/foobar 4670 * sync 4671 * mv /mnt/x /mnt/y 4672 * mkdir /mnt/x # or touch /mnt/x 4673 * xfs_io -c fsync /mnt/x 4674 * <power fail> 4675 * mount fs, trigger log replay 4676 * 4677 * After the log replay procedure, we would lose the first directory and all its 4678 * files (file foobar). 4679 * For the case where inode Y is not a directory we simply end up losing it: 4680 * 4681 * echo "123" > /mnt/foo 4682 * sync 4683 * mv /mnt/foo /mnt/bar 4684 * echo "abc" > /mnt/foo 4685 * xfs_io -c fsync /mnt/foo 4686 * <power fail> 4687 * 4688 * We also need this for cases where a snapshot entry is replaced by some other 4689 * entry (file or directory) otherwise we end up with an unreplayable log due to 4690 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4691 * if it were a regular entry: 4692 * 4693 * mkdir /mnt/x 4694 * btrfs subvolume snapshot /mnt /mnt/x/snap 4695 * btrfs subvolume delete /mnt/x/snap 4696 * rmdir /mnt/x 4697 * mkdir /mnt/x 4698 * fsync /mnt/x or fsync some new file inside it 4699 * <power fail> 4700 * 4701 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4702 * the same transaction. 4703 */ 4704 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4705 const int slot, 4706 const struct btrfs_key *key, 4707 struct btrfs_inode *inode, 4708 u64 *other_ino) 4709 { 4710 int ret; 4711 struct btrfs_path *search_path; 4712 char *name = NULL; 4713 u32 name_len = 0; 4714 u32 item_size = btrfs_item_size_nr(eb, slot); 4715 u32 cur_offset = 0; 4716 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4717 4718 search_path = btrfs_alloc_path(); 4719 if (!search_path) 4720 return -ENOMEM; 4721 search_path->search_commit_root = 1; 4722 search_path->skip_locking = 1; 4723 4724 while (cur_offset < item_size) { 4725 u64 parent; 4726 u32 this_name_len; 4727 u32 this_len; 4728 unsigned long name_ptr; 4729 struct btrfs_dir_item *di; 4730 4731 if (key->type == BTRFS_INODE_REF_KEY) { 4732 struct btrfs_inode_ref *iref; 4733 4734 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4735 parent = key->offset; 4736 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4737 name_ptr = (unsigned long)(iref + 1); 4738 this_len = sizeof(*iref) + this_name_len; 4739 } else { 4740 struct btrfs_inode_extref *extref; 4741 4742 extref = (struct btrfs_inode_extref *)(ptr + 4743 cur_offset); 4744 parent = btrfs_inode_extref_parent(eb, extref); 4745 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4746 name_ptr = (unsigned long)&extref->name; 4747 this_len = sizeof(*extref) + this_name_len; 4748 } 4749 4750 if (this_name_len > name_len) { 4751 char *new_name; 4752 4753 new_name = krealloc(name, this_name_len, GFP_NOFS); 4754 if (!new_name) { 4755 ret = -ENOMEM; 4756 goto out; 4757 } 4758 name_len = this_name_len; 4759 name = new_name; 4760 } 4761 4762 read_extent_buffer(eb, name, name_ptr, this_name_len); 4763 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4764 parent, name, this_name_len, 0); 4765 if (di && !IS_ERR(di)) { 4766 struct btrfs_key di_key; 4767 4768 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4769 di, &di_key); 4770 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4771 ret = 1; 4772 *other_ino = di_key.objectid; 4773 } else { 4774 ret = -EAGAIN; 4775 } 4776 goto out; 4777 } else if (IS_ERR(di)) { 4778 ret = PTR_ERR(di); 4779 goto out; 4780 } 4781 btrfs_release_path(search_path); 4782 4783 cur_offset += this_len; 4784 } 4785 ret = 0; 4786 out: 4787 btrfs_free_path(search_path); 4788 kfree(name); 4789 return ret; 4790 } 4791 4792 /* log a single inode in the tree log. 4793 * At least one parent directory for this inode must exist in the tree 4794 * or be logged already. 4795 * 4796 * Any items from this inode changed by the current transaction are copied 4797 * to the log tree. An extra reference is taken on any extents in this 4798 * file, allowing us to avoid a whole pile of corner cases around logging 4799 * blocks that have been removed from the tree. 4800 * 4801 * See LOG_INODE_ALL and related defines for a description of what inode_only 4802 * does. 4803 * 4804 * This handles both files and directories. 4805 */ 4806 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4807 struct btrfs_root *root, struct btrfs_inode *inode, 4808 int inode_only, 4809 const loff_t start, 4810 const loff_t end, 4811 struct btrfs_log_ctx *ctx) 4812 { 4813 struct btrfs_fs_info *fs_info = root->fs_info; 4814 struct btrfs_path *path; 4815 struct btrfs_path *dst_path; 4816 struct btrfs_key min_key; 4817 struct btrfs_key max_key; 4818 struct btrfs_root *log = root->log_root; 4819 u64 last_extent = 0; 4820 int err = 0; 4821 int ret; 4822 int nritems; 4823 int ins_start_slot = 0; 4824 int ins_nr; 4825 bool fast_search = false; 4826 u64 ino = btrfs_ino(inode); 4827 struct extent_map_tree *em_tree = &inode->extent_tree; 4828 u64 logged_isize = 0; 4829 bool need_log_inode_item = true; 4830 bool xattrs_logged = false; 4831 4832 path = btrfs_alloc_path(); 4833 if (!path) 4834 return -ENOMEM; 4835 dst_path = btrfs_alloc_path(); 4836 if (!dst_path) { 4837 btrfs_free_path(path); 4838 return -ENOMEM; 4839 } 4840 4841 min_key.objectid = ino; 4842 min_key.type = BTRFS_INODE_ITEM_KEY; 4843 min_key.offset = 0; 4844 4845 max_key.objectid = ino; 4846 4847 4848 /* today the code can only do partial logging of directories */ 4849 if (S_ISDIR(inode->vfs_inode.i_mode) || 4850 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4851 &inode->runtime_flags) && 4852 inode_only >= LOG_INODE_EXISTS)) 4853 max_key.type = BTRFS_XATTR_ITEM_KEY; 4854 else 4855 max_key.type = (u8)-1; 4856 max_key.offset = (u64)-1; 4857 4858 /* 4859 * Only run delayed items if we are a dir or a new file. 4860 * Otherwise commit the delayed inode only, which is needed in 4861 * order for the log replay code to mark inodes for link count 4862 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4863 */ 4864 if (S_ISDIR(inode->vfs_inode.i_mode) || 4865 inode->generation > fs_info->last_trans_committed) 4866 ret = btrfs_commit_inode_delayed_items(trans, inode); 4867 else 4868 ret = btrfs_commit_inode_delayed_inode(inode); 4869 4870 if (ret) { 4871 btrfs_free_path(path); 4872 btrfs_free_path(dst_path); 4873 return ret; 4874 } 4875 4876 if (inode_only == LOG_OTHER_INODE) { 4877 inode_only = LOG_INODE_EXISTS; 4878 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4879 } else { 4880 mutex_lock(&inode->log_mutex); 4881 } 4882 4883 /* 4884 * a brute force approach to making sure we get the most uptodate 4885 * copies of everything. 4886 */ 4887 if (S_ISDIR(inode->vfs_inode.i_mode)) { 4888 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4889 4890 if (inode_only == LOG_INODE_EXISTS) 4891 max_key_type = BTRFS_XATTR_ITEM_KEY; 4892 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4893 } else { 4894 if (inode_only == LOG_INODE_EXISTS) { 4895 /* 4896 * Make sure the new inode item we write to the log has 4897 * the same isize as the current one (if it exists). 4898 * This is necessary to prevent data loss after log 4899 * replay, and also to prevent doing a wrong expanding 4900 * truncate - for e.g. create file, write 4K into offset 4901 * 0, fsync, write 4K into offset 4096, add hard link, 4902 * fsync some other file (to sync log), power fail - if 4903 * we use the inode's current i_size, after log replay 4904 * we get a 8Kb file, with the last 4Kb extent as a hole 4905 * (zeroes), as if an expanding truncate happened, 4906 * instead of getting a file of 4Kb only. 4907 */ 4908 err = logged_inode_size(log, inode, path, &logged_isize); 4909 if (err) 4910 goto out_unlock; 4911 } 4912 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4913 &inode->runtime_flags)) { 4914 if (inode_only == LOG_INODE_EXISTS) { 4915 max_key.type = BTRFS_XATTR_ITEM_KEY; 4916 ret = drop_objectid_items(trans, log, path, ino, 4917 max_key.type); 4918 } else { 4919 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4920 &inode->runtime_flags); 4921 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4922 &inode->runtime_flags); 4923 while(1) { 4924 ret = btrfs_truncate_inode_items(trans, 4925 log, &inode->vfs_inode, 0, 0); 4926 if (ret != -EAGAIN) 4927 break; 4928 } 4929 } 4930 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4931 &inode->runtime_flags) || 4932 inode_only == LOG_INODE_EXISTS) { 4933 if (inode_only == LOG_INODE_ALL) 4934 fast_search = true; 4935 max_key.type = BTRFS_XATTR_ITEM_KEY; 4936 ret = drop_objectid_items(trans, log, path, ino, 4937 max_key.type); 4938 } else { 4939 if (inode_only == LOG_INODE_ALL) 4940 fast_search = true; 4941 goto log_extents; 4942 } 4943 4944 } 4945 if (ret) { 4946 err = ret; 4947 goto out_unlock; 4948 } 4949 4950 while (1) { 4951 ins_nr = 0; 4952 ret = btrfs_search_forward(root, &min_key, 4953 path, trans->transid); 4954 if (ret < 0) { 4955 err = ret; 4956 goto out_unlock; 4957 } 4958 if (ret != 0) 4959 break; 4960 again: 4961 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4962 if (min_key.objectid != ino) 4963 break; 4964 if (min_key.type > max_key.type) 4965 break; 4966 4967 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4968 need_log_inode_item = false; 4969 4970 if ((min_key.type == BTRFS_INODE_REF_KEY || 4971 min_key.type == BTRFS_INODE_EXTREF_KEY) && 4972 inode->generation == trans->transid) { 4973 u64 other_ino = 0; 4974 4975 ret = btrfs_check_ref_name_override(path->nodes[0], 4976 path->slots[0], &min_key, inode, 4977 &other_ino); 4978 if (ret < 0) { 4979 err = ret; 4980 goto out_unlock; 4981 } else if (ret > 0 && ctx && 4982 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 4983 struct btrfs_key inode_key; 4984 struct inode *other_inode; 4985 4986 if (ins_nr > 0) { 4987 ins_nr++; 4988 } else { 4989 ins_nr = 1; 4990 ins_start_slot = path->slots[0]; 4991 } 4992 ret = copy_items(trans, inode, dst_path, path, 4993 &last_extent, ins_start_slot, 4994 ins_nr, inode_only, 4995 logged_isize); 4996 if (ret < 0) { 4997 err = ret; 4998 goto out_unlock; 4999 } 5000 ins_nr = 0; 5001 btrfs_release_path(path); 5002 inode_key.objectid = other_ino; 5003 inode_key.type = BTRFS_INODE_ITEM_KEY; 5004 inode_key.offset = 0; 5005 other_inode = btrfs_iget(fs_info->sb, 5006 &inode_key, root, 5007 NULL); 5008 /* 5009 * If the other inode that had a conflicting dir 5010 * entry was deleted in the current transaction, 5011 * we don't need to do more work nor fallback to 5012 * a transaction commit. 5013 */ 5014 if (other_inode == ERR_PTR(-ENOENT)) { 5015 goto next_key; 5016 } else if (IS_ERR(other_inode)) { 5017 err = PTR_ERR(other_inode); 5018 goto out_unlock; 5019 } 5020 /* 5021 * We are safe logging the other inode without 5022 * acquiring its i_mutex as long as we log with 5023 * the LOG_INODE_EXISTS mode. We're safe against 5024 * concurrent renames of the other inode as well 5025 * because during a rename we pin the log and 5026 * update the log with the new name before we 5027 * unpin it. 5028 */ 5029 err = btrfs_log_inode(trans, root, 5030 BTRFS_I(other_inode), 5031 LOG_OTHER_INODE, 0, LLONG_MAX, 5032 ctx); 5033 iput(other_inode); 5034 if (err) 5035 goto out_unlock; 5036 else 5037 goto next_key; 5038 } 5039 } 5040 5041 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 5042 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 5043 if (ins_nr == 0) 5044 goto next_slot; 5045 ret = copy_items(trans, inode, dst_path, path, 5046 &last_extent, ins_start_slot, 5047 ins_nr, inode_only, logged_isize); 5048 if (ret < 0) { 5049 err = ret; 5050 goto out_unlock; 5051 } 5052 ins_nr = 0; 5053 if (ret) { 5054 btrfs_release_path(path); 5055 continue; 5056 } 5057 goto next_slot; 5058 } 5059 5060 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5061 ins_nr++; 5062 goto next_slot; 5063 } else if (!ins_nr) { 5064 ins_start_slot = path->slots[0]; 5065 ins_nr = 1; 5066 goto next_slot; 5067 } 5068 5069 ret = copy_items(trans, inode, dst_path, path, &last_extent, 5070 ins_start_slot, ins_nr, inode_only, 5071 logged_isize); 5072 if (ret < 0) { 5073 err = ret; 5074 goto out_unlock; 5075 } 5076 if (ret) { 5077 ins_nr = 0; 5078 btrfs_release_path(path); 5079 continue; 5080 } 5081 ins_nr = 1; 5082 ins_start_slot = path->slots[0]; 5083 next_slot: 5084 5085 nritems = btrfs_header_nritems(path->nodes[0]); 5086 path->slots[0]++; 5087 if (path->slots[0] < nritems) { 5088 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 5089 path->slots[0]); 5090 goto again; 5091 } 5092 if (ins_nr) { 5093 ret = copy_items(trans, inode, dst_path, path, 5094 &last_extent, ins_start_slot, 5095 ins_nr, inode_only, logged_isize); 5096 if (ret < 0) { 5097 err = ret; 5098 goto out_unlock; 5099 } 5100 ret = 0; 5101 ins_nr = 0; 5102 } 5103 btrfs_release_path(path); 5104 next_key: 5105 if (min_key.offset < (u64)-1) { 5106 min_key.offset++; 5107 } else if (min_key.type < max_key.type) { 5108 min_key.type++; 5109 min_key.offset = 0; 5110 } else { 5111 break; 5112 } 5113 } 5114 if (ins_nr) { 5115 ret = copy_items(trans, inode, dst_path, path, &last_extent, 5116 ins_start_slot, ins_nr, inode_only, 5117 logged_isize); 5118 if (ret < 0) { 5119 err = ret; 5120 goto out_unlock; 5121 } 5122 ret = 0; 5123 ins_nr = 0; 5124 } 5125 5126 btrfs_release_path(path); 5127 btrfs_release_path(dst_path); 5128 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5129 if (err) 5130 goto out_unlock; 5131 xattrs_logged = true; 5132 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5133 btrfs_release_path(path); 5134 btrfs_release_path(dst_path); 5135 err = btrfs_log_trailing_hole(trans, root, inode, path); 5136 if (err) 5137 goto out_unlock; 5138 } 5139 log_extents: 5140 btrfs_release_path(path); 5141 btrfs_release_path(dst_path); 5142 if (need_log_inode_item) { 5143 err = log_inode_item(trans, log, dst_path, inode); 5144 if (!err && !xattrs_logged) { 5145 err = btrfs_log_all_xattrs(trans, root, inode, path, 5146 dst_path); 5147 btrfs_release_path(path); 5148 } 5149 if (err) 5150 goto out_unlock; 5151 } 5152 if (fast_search) { 5153 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5154 ctx, start, end); 5155 if (ret) { 5156 err = ret; 5157 goto out_unlock; 5158 } 5159 } else if (inode_only == LOG_INODE_ALL) { 5160 struct extent_map *em, *n; 5161 5162 write_lock(&em_tree->lock); 5163 /* 5164 * We can't just remove every em if we're called for a ranged 5165 * fsync - that is, one that doesn't cover the whole possible 5166 * file range (0 to LLONG_MAX). This is because we can have 5167 * em's that fall outside the range we're logging and therefore 5168 * their ordered operations haven't completed yet 5169 * (btrfs_finish_ordered_io() not invoked yet). This means we 5170 * didn't get their respective file extent item in the fs/subvol 5171 * tree yet, and need to let the next fast fsync (one which 5172 * consults the list of modified extent maps) find the em so 5173 * that it logs a matching file extent item and waits for the 5174 * respective ordered operation to complete (if it's still 5175 * running). 5176 * 5177 * Removing every em outside the range we're logging would make 5178 * the next fast fsync not log their matching file extent items, 5179 * therefore making us lose data after a log replay. 5180 */ 5181 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5182 list) { 5183 const u64 mod_end = em->mod_start + em->mod_len - 1; 5184 5185 if (em->mod_start >= start && mod_end <= end) 5186 list_del_init(&em->list); 5187 } 5188 write_unlock(&em_tree->lock); 5189 } 5190 5191 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5192 ret = log_directory_changes(trans, root, inode, path, dst_path, 5193 ctx); 5194 if (ret) { 5195 err = ret; 5196 goto out_unlock; 5197 } 5198 } 5199 5200 spin_lock(&inode->lock); 5201 inode->logged_trans = trans->transid; 5202 inode->last_log_commit = inode->last_sub_trans; 5203 spin_unlock(&inode->lock); 5204 out_unlock: 5205 mutex_unlock(&inode->log_mutex); 5206 5207 btrfs_free_path(path); 5208 btrfs_free_path(dst_path); 5209 return err; 5210 } 5211 5212 /* 5213 * Check if we must fallback to a transaction commit when logging an inode. 5214 * This must be called after logging the inode and is used only in the context 5215 * when fsyncing an inode requires the need to log some other inode - in which 5216 * case we can't lock the i_mutex of each other inode we need to log as that 5217 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5218 * log inodes up or down in the hierarchy) or rename operations for example. So 5219 * we take the log_mutex of the inode after we have logged it and then check for 5220 * its last_unlink_trans value - this is safe because any task setting 5221 * last_unlink_trans must take the log_mutex and it must do this before it does 5222 * the actual unlink operation, so if we do this check before a concurrent task 5223 * sets last_unlink_trans it means we've logged a consistent version/state of 5224 * all the inode items, otherwise we are not sure and must do a transaction 5225 * commit (the concurrent task might have only updated last_unlink_trans before 5226 * we logged the inode or it might have also done the unlink). 5227 */ 5228 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5229 struct btrfs_inode *inode) 5230 { 5231 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5232 bool ret = false; 5233 5234 mutex_lock(&inode->log_mutex); 5235 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5236 /* 5237 * Make sure any commits to the log are forced to be full 5238 * commits. 5239 */ 5240 btrfs_set_log_full_commit(fs_info, trans); 5241 ret = true; 5242 } 5243 mutex_unlock(&inode->log_mutex); 5244 5245 return ret; 5246 } 5247 5248 /* 5249 * follow the dentry parent pointers up the chain and see if any 5250 * of the directories in it require a full commit before they can 5251 * be logged. Returns zero if nothing special needs to be done or 1 if 5252 * a full commit is required. 5253 */ 5254 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5255 struct btrfs_inode *inode, 5256 struct dentry *parent, 5257 struct super_block *sb, 5258 u64 last_committed) 5259 { 5260 int ret = 0; 5261 struct dentry *old_parent = NULL; 5262 struct btrfs_inode *orig_inode = inode; 5263 5264 /* 5265 * for regular files, if its inode is already on disk, we don't 5266 * have to worry about the parents at all. This is because 5267 * we can use the last_unlink_trans field to record renames 5268 * and other fun in this file. 5269 */ 5270 if (S_ISREG(inode->vfs_inode.i_mode) && 5271 inode->generation <= last_committed && 5272 inode->last_unlink_trans <= last_committed) 5273 goto out; 5274 5275 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5276 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5277 goto out; 5278 inode = BTRFS_I(d_inode(parent)); 5279 } 5280 5281 while (1) { 5282 /* 5283 * If we are logging a directory then we start with our inode, 5284 * not our parent's inode, so we need to skip setting the 5285 * logged_trans so that further down in the log code we don't 5286 * think this inode has already been logged. 5287 */ 5288 if (inode != orig_inode) 5289 inode->logged_trans = trans->transid; 5290 smp_mb(); 5291 5292 if (btrfs_must_commit_transaction(trans, inode)) { 5293 ret = 1; 5294 break; 5295 } 5296 5297 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5298 break; 5299 5300 if (IS_ROOT(parent)) { 5301 inode = BTRFS_I(d_inode(parent)); 5302 if (btrfs_must_commit_transaction(trans, inode)) 5303 ret = 1; 5304 break; 5305 } 5306 5307 parent = dget_parent(parent); 5308 dput(old_parent); 5309 old_parent = parent; 5310 inode = BTRFS_I(d_inode(parent)); 5311 5312 } 5313 dput(old_parent); 5314 out: 5315 return ret; 5316 } 5317 5318 struct btrfs_dir_list { 5319 u64 ino; 5320 struct list_head list; 5321 }; 5322 5323 /* 5324 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5325 * details about the why it is needed. 5326 * This is a recursive operation - if an existing dentry corresponds to a 5327 * directory, that directory's new entries are logged too (same behaviour as 5328 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5329 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5330 * complains about the following circular lock dependency / possible deadlock: 5331 * 5332 * CPU0 CPU1 5333 * ---- ---- 5334 * lock(&type->i_mutex_dir_key#3/2); 5335 * lock(sb_internal#2); 5336 * lock(&type->i_mutex_dir_key#3/2); 5337 * lock(&sb->s_type->i_mutex_key#14); 5338 * 5339 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5340 * sb_start_intwrite() in btrfs_start_transaction(). 5341 * Not locking i_mutex of the inodes is still safe because: 5342 * 5343 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5344 * that while logging the inode new references (names) are added or removed 5345 * from the inode, leaving the logged inode item with a link count that does 5346 * not match the number of logged inode reference items. This is fine because 5347 * at log replay time we compute the real number of links and correct the 5348 * link count in the inode item (see replay_one_buffer() and 5349 * link_to_fixup_dir()); 5350 * 5351 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5352 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5353 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5354 * has a size that doesn't match the sum of the lengths of all the logged 5355 * names. This does not result in a problem because if a dir_item key is 5356 * logged but its matching dir_index key is not logged, at log replay time we 5357 * don't use it to replay the respective name (see replay_one_name()). On the 5358 * other hand if only the dir_index key ends up being logged, the respective 5359 * name is added to the fs/subvol tree with both the dir_item and dir_index 5360 * keys created (see replay_one_name()). 5361 * The directory's inode item with a wrong i_size is not a problem as well, 5362 * since we don't use it at log replay time to set the i_size in the inode 5363 * item of the fs/subvol tree (see overwrite_item()). 5364 */ 5365 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5366 struct btrfs_root *root, 5367 struct btrfs_inode *start_inode, 5368 struct btrfs_log_ctx *ctx) 5369 { 5370 struct btrfs_fs_info *fs_info = root->fs_info; 5371 struct btrfs_root *log = root->log_root; 5372 struct btrfs_path *path; 5373 LIST_HEAD(dir_list); 5374 struct btrfs_dir_list *dir_elem; 5375 int ret = 0; 5376 5377 path = btrfs_alloc_path(); 5378 if (!path) 5379 return -ENOMEM; 5380 5381 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5382 if (!dir_elem) { 5383 btrfs_free_path(path); 5384 return -ENOMEM; 5385 } 5386 dir_elem->ino = btrfs_ino(start_inode); 5387 list_add_tail(&dir_elem->list, &dir_list); 5388 5389 while (!list_empty(&dir_list)) { 5390 struct extent_buffer *leaf; 5391 struct btrfs_key min_key; 5392 int nritems; 5393 int i; 5394 5395 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5396 list); 5397 if (ret) 5398 goto next_dir_inode; 5399 5400 min_key.objectid = dir_elem->ino; 5401 min_key.type = BTRFS_DIR_ITEM_KEY; 5402 min_key.offset = 0; 5403 again: 5404 btrfs_release_path(path); 5405 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5406 if (ret < 0) { 5407 goto next_dir_inode; 5408 } else if (ret > 0) { 5409 ret = 0; 5410 goto next_dir_inode; 5411 } 5412 5413 process_leaf: 5414 leaf = path->nodes[0]; 5415 nritems = btrfs_header_nritems(leaf); 5416 for (i = path->slots[0]; i < nritems; i++) { 5417 struct btrfs_dir_item *di; 5418 struct btrfs_key di_key; 5419 struct inode *di_inode; 5420 struct btrfs_dir_list *new_dir_elem; 5421 int log_mode = LOG_INODE_EXISTS; 5422 int type; 5423 5424 btrfs_item_key_to_cpu(leaf, &min_key, i); 5425 if (min_key.objectid != dir_elem->ino || 5426 min_key.type != BTRFS_DIR_ITEM_KEY) 5427 goto next_dir_inode; 5428 5429 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5430 type = btrfs_dir_type(leaf, di); 5431 if (btrfs_dir_transid(leaf, di) < trans->transid && 5432 type != BTRFS_FT_DIR) 5433 continue; 5434 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5435 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5436 continue; 5437 5438 btrfs_release_path(path); 5439 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5440 if (IS_ERR(di_inode)) { 5441 ret = PTR_ERR(di_inode); 5442 goto next_dir_inode; 5443 } 5444 5445 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5446 iput(di_inode); 5447 break; 5448 } 5449 5450 ctx->log_new_dentries = false; 5451 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5452 log_mode = LOG_INODE_ALL; 5453 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5454 log_mode, 0, LLONG_MAX, ctx); 5455 if (!ret && 5456 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5457 ret = 1; 5458 iput(di_inode); 5459 if (ret) 5460 goto next_dir_inode; 5461 if (ctx->log_new_dentries) { 5462 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5463 GFP_NOFS); 5464 if (!new_dir_elem) { 5465 ret = -ENOMEM; 5466 goto next_dir_inode; 5467 } 5468 new_dir_elem->ino = di_key.objectid; 5469 list_add_tail(&new_dir_elem->list, &dir_list); 5470 } 5471 break; 5472 } 5473 if (i == nritems) { 5474 ret = btrfs_next_leaf(log, path); 5475 if (ret < 0) { 5476 goto next_dir_inode; 5477 } else if (ret > 0) { 5478 ret = 0; 5479 goto next_dir_inode; 5480 } 5481 goto process_leaf; 5482 } 5483 if (min_key.offset < (u64)-1) { 5484 min_key.offset++; 5485 goto again; 5486 } 5487 next_dir_inode: 5488 list_del(&dir_elem->list); 5489 kfree(dir_elem); 5490 } 5491 5492 btrfs_free_path(path); 5493 return ret; 5494 } 5495 5496 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5497 struct btrfs_inode *inode, 5498 struct btrfs_log_ctx *ctx) 5499 { 5500 struct btrfs_fs_info *fs_info = trans->fs_info; 5501 int ret; 5502 struct btrfs_path *path; 5503 struct btrfs_key key; 5504 struct btrfs_root *root = inode->root; 5505 const u64 ino = btrfs_ino(inode); 5506 5507 path = btrfs_alloc_path(); 5508 if (!path) 5509 return -ENOMEM; 5510 path->skip_locking = 1; 5511 path->search_commit_root = 1; 5512 5513 key.objectid = ino; 5514 key.type = BTRFS_INODE_REF_KEY; 5515 key.offset = 0; 5516 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5517 if (ret < 0) 5518 goto out; 5519 5520 while (true) { 5521 struct extent_buffer *leaf = path->nodes[0]; 5522 int slot = path->slots[0]; 5523 u32 cur_offset = 0; 5524 u32 item_size; 5525 unsigned long ptr; 5526 5527 if (slot >= btrfs_header_nritems(leaf)) { 5528 ret = btrfs_next_leaf(root, path); 5529 if (ret < 0) 5530 goto out; 5531 else if (ret > 0) 5532 break; 5533 continue; 5534 } 5535 5536 btrfs_item_key_to_cpu(leaf, &key, slot); 5537 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5538 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5539 break; 5540 5541 item_size = btrfs_item_size_nr(leaf, slot); 5542 ptr = btrfs_item_ptr_offset(leaf, slot); 5543 while (cur_offset < item_size) { 5544 struct btrfs_key inode_key; 5545 struct inode *dir_inode; 5546 5547 inode_key.type = BTRFS_INODE_ITEM_KEY; 5548 inode_key.offset = 0; 5549 5550 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5551 struct btrfs_inode_extref *extref; 5552 5553 extref = (struct btrfs_inode_extref *) 5554 (ptr + cur_offset); 5555 inode_key.objectid = btrfs_inode_extref_parent( 5556 leaf, extref); 5557 cur_offset += sizeof(*extref); 5558 cur_offset += btrfs_inode_extref_name_len(leaf, 5559 extref); 5560 } else { 5561 inode_key.objectid = key.offset; 5562 cur_offset = item_size; 5563 } 5564 5565 dir_inode = btrfs_iget(fs_info->sb, &inode_key, 5566 root, NULL); 5567 /* If parent inode was deleted, skip it. */ 5568 if (IS_ERR(dir_inode)) 5569 continue; 5570 5571 if (ctx) 5572 ctx->log_new_dentries = false; 5573 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5574 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5575 if (!ret && 5576 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5577 ret = 1; 5578 if (!ret && ctx && ctx->log_new_dentries) 5579 ret = log_new_dir_dentries(trans, root, 5580 BTRFS_I(dir_inode), ctx); 5581 iput(dir_inode); 5582 if (ret) 5583 goto out; 5584 } 5585 path->slots[0]++; 5586 } 5587 ret = 0; 5588 out: 5589 btrfs_free_path(path); 5590 return ret; 5591 } 5592 5593 /* 5594 * helper function around btrfs_log_inode to make sure newly created 5595 * parent directories also end up in the log. A minimal inode and backref 5596 * only logging is done of any parent directories that are older than 5597 * the last committed transaction 5598 */ 5599 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5600 struct btrfs_inode *inode, 5601 struct dentry *parent, 5602 const loff_t start, 5603 const loff_t end, 5604 int inode_only, 5605 struct btrfs_log_ctx *ctx) 5606 { 5607 struct btrfs_root *root = inode->root; 5608 struct btrfs_fs_info *fs_info = root->fs_info; 5609 struct super_block *sb; 5610 struct dentry *old_parent = NULL; 5611 int ret = 0; 5612 u64 last_committed = fs_info->last_trans_committed; 5613 bool log_dentries = false; 5614 struct btrfs_inode *orig_inode = inode; 5615 5616 sb = inode->vfs_inode.i_sb; 5617 5618 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5619 ret = 1; 5620 goto end_no_trans; 5621 } 5622 5623 /* 5624 * The prev transaction commit doesn't complete, we need do 5625 * full commit by ourselves. 5626 */ 5627 if (fs_info->last_trans_log_full_commit > 5628 fs_info->last_trans_committed) { 5629 ret = 1; 5630 goto end_no_trans; 5631 } 5632 5633 if (btrfs_root_refs(&root->root_item) == 0) { 5634 ret = 1; 5635 goto end_no_trans; 5636 } 5637 5638 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5639 last_committed); 5640 if (ret) 5641 goto end_no_trans; 5642 5643 if (btrfs_inode_in_log(inode, trans->transid)) { 5644 ret = BTRFS_NO_LOG_SYNC; 5645 goto end_no_trans; 5646 } 5647 5648 ret = start_log_trans(trans, root, ctx); 5649 if (ret) 5650 goto end_no_trans; 5651 5652 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5653 if (ret) 5654 goto end_trans; 5655 5656 /* 5657 * for regular files, if its inode is already on disk, we don't 5658 * have to worry about the parents at all. This is because 5659 * we can use the last_unlink_trans field to record renames 5660 * and other fun in this file. 5661 */ 5662 if (S_ISREG(inode->vfs_inode.i_mode) && 5663 inode->generation <= last_committed && 5664 inode->last_unlink_trans <= last_committed) { 5665 ret = 0; 5666 goto end_trans; 5667 } 5668 5669 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 5670 log_dentries = true; 5671 5672 /* 5673 * On unlink we must make sure all our current and old parent directory 5674 * inodes are fully logged. This is to prevent leaving dangling 5675 * directory index entries in directories that were our parents but are 5676 * not anymore. Not doing this results in old parent directory being 5677 * impossible to delete after log replay (rmdir will always fail with 5678 * error -ENOTEMPTY). 5679 * 5680 * Example 1: 5681 * 5682 * mkdir testdir 5683 * touch testdir/foo 5684 * ln testdir/foo testdir/bar 5685 * sync 5686 * unlink testdir/bar 5687 * xfs_io -c fsync testdir/foo 5688 * <power failure> 5689 * mount fs, triggers log replay 5690 * 5691 * If we don't log the parent directory (testdir), after log replay the 5692 * directory still has an entry pointing to the file inode using the bar 5693 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5694 * the file inode has a link count of 1. 5695 * 5696 * Example 2: 5697 * 5698 * mkdir testdir 5699 * touch foo 5700 * ln foo testdir/foo2 5701 * ln foo testdir/foo3 5702 * sync 5703 * unlink testdir/foo3 5704 * xfs_io -c fsync foo 5705 * <power failure> 5706 * mount fs, triggers log replay 5707 * 5708 * Similar as the first example, after log replay the parent directory 5709 * testdir still has an entry pointing to the inode file with name foo3 5710 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5711 * and has a link count of 2. 5712 */ 5713 if (inode->last_unlink_trans > last_committed) { 5714 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5715 if (ret) 5716 goto end_trans; 5717 } 5718 5719 while (1) { 5720 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5721 break; 5722 5723 inode = BTRFS_I(d_inode(parent)); 5724 if (root != inode->root) 5725 break; 5726 5727 if (inode->generation > last_committed) { 5728 ret = btrfs_log_inode(trans, root, inode, 5729 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5730 if (ret) 5731 goto end_trans; 5732 } 5733 if (IS_ROOT(parent)) 5734 break; 5735 5736 parent = dget_parent(parent); 5737 dput(old_parent); 5738 old_parent = parent; 5739 } 5740 if (log_dentries) 5741 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5742 else 5743 ret = 0; 5744 end_trans: 5745 dput(old_parent); 5746 if (ret < 0) { 5747 btrfs_set_log_full_commit(fs_info, trans); 5748 ret = 1; 5749 } 5750 5751 if (ret) 5752 btrfs_remove_log_ctx(root, ctx); 5753 btrfs_end_log_trans(root); 5754 end_no_trans: 5755 return ret; 5756 } 5757 5758 /* 5759 * it is not safe to log dentry if the chunk root has added new 5760 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5761 * If this returns 1, you must commit the transaction to safely get your 5762 * data on disk. 5763 */ 5764 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5765 struct dentry *dentry, 5766 const loff_t start, 5767 const loff_t end, 5768 struct btrfs_log_ctx *ctx) 5769 { 5770 struct dentry *parent = dget_parent(dentry); 5771 int ret; 5772 5773 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 5774 start, end, LOG_INODE_ALL, ctx); 5775 dput(parent); 5776 5777 return ret; 5778 } 5779 5780 /* 5781 * should be called during mount to recover any replay any log trees 5782 * from the FS 5783 */ 5784 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5785 { 5786 int ret; 5787 struct btrfs_path *path; 5788 struct btrfs_trans_handle *trans; 5789 struct btrfs_key key; 5790 struct btrfs_key found_key; 5791 struct btrfs_key tmp_key; 5792 struct btrfs_root *log; 5793 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5794 struct walk_control wc = { 5795 .process_func = process_one_buffer, 5796 .stage = 0, 5797 }; 5798 5799 path = btrfs_alloc_path(); 5800 if (!path) 5801 return -ENOMEM; 5802 5803 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5804 5805 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5806 if (IS_ERR(trans)) { 5807 ret = PTR_ERR(trans); 5808 goto error; 5809 } 5810 5811 wc.trans = trans; 5812 wc.pin = 1; 5813 5814 ret = walk_log_tree(trans, log_root_tree, &wc); 5815 if (ret) { 5816 btrfs_handle_fs_error(fs_info, ret, 5817 "Failed to pin buffers while recovering log root tree."); 5818 goto error; 5819 } 5820 5821 again: 5822 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5823 key.offset = (u64)-1; 5824 key.type = BTRFS_ROOT_ITEM_KEY; 5825 5826 while (1) { 5827 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5828 5829 if (ret < 0) { 5830 btrfs_handle_fs_error(fs_info, ret, 5831 "Couldn't find tree log root."); 5832 goto error; 5833 } 5834 if (ret > 0) { 5835 if (path->slots[0] == 0) 5836 break; 5837 path->slots[0]--; 5838 } 5839 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5840 path->slots[0]); 5841 btrfs_release_path(path); 5842 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5843 break; 5844 5845 log = btrfs_read_fs_root(log_root_tree, &found_key); 5846 if (IS_ERR(log)) { 5847 ret = PTR_ERR(log); 5848 btrfs_handle_fs_error(fs_info, ret, 5849 "Couldn't read tree log root."); 5850 goto error; 5851 } 5852 5853 tmp_key.objectid = found_key.offset; 5854 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5855 tmp_key.offset = (u64)-1; 5856 5857 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5858 if (IS_ERR(wc.replay_dest)) { 5859 ret = PTR_ERR(wc.replay_dest); 5860 free_extent_buffer(log->node); 5861 free_extent_buffer(log->commit_root); 5862 kfree(log); 5863 btrfs_handle_fs_error(fs_info, ret, 5864 "Couldn't read target root for tree log recovery."); 5865 goto error; 5866 } 5867 5868 wc.replay_dest->log_root = log; 5869 btrfs_record_root_in_trans(trans, wc.replay_dest); 5870 ret = walk_log_tree(trans, log, &wc); 5871 5872 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5873 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5874 path); 5875 } 5876 5877 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5878 struct btrfs_root *root = wc.replay_dest; 5879 5880 btrfs_release_path(path); 5881 5882 /* 5883 * We have just replayed everything, and the highest 5884 * objectid of fs roots probably has changed in case 5885 * some inode_item's got replayed. 5886 * 5887 * root->objectid_mutex is not acquired as log replay 5888 * could only happen during mount. 5889 */ 5890 ret = btrfs_find_highest_objectid(root, 5891 &root->highest_objectid); 5892 } 5893 5894 key.offset = found_key.offset - 1; 5895 wc.replay_dest->log_root = NULL; 5896 free_extent_buffer(log->node); 5897 free_extent_buffer(log->commit_root); 5898 kfree(log); 5899 5900 if (ret) 5901 goto error; 5902 5903 if (found_key.offset == 0) 5904 break; 5905 } 5906 btrfs_release_path(path); 5907 5908 /* step one is to pin it all, step two is to replay just inodes */ 5909 if (wc.pin) { 5910 wc.pin = 0; 5911 wc.process_func = replay_one_buffer; 5912 wc.stage = LOG_WALK_REPLAY_INODES; 5913 goto again; 5914 } 5915 /* step three is to replay everything */ 5916 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5917 wc.stage++; 5918 goto again; 5919 } 5920 5921 btrfs_free_path(path); 5922 5923 /* step 4: commit the transaction, which also unpins the blocks */ 5924 ret = btrfs_commit_transaction(trans); 5925 if (ret) 5926 return ret; 5927 5928 free_extent_buffer(log_root_tree->node); 5929 log_root_tree->log_root = NULL; 5930 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5931 kfree(log_root_tree); 5932 5933 return 0; 5934 error: 5935 if (wc.trans) 5936 btrfs_end_transaction(wc.trans); 5937 btrfs_free_path(path); 5938 return ret; 5939 } 5940 5941 /* 5942 * there are some corner cases where we want to force a full 5943 * commit instead of allowing a directory to be logged. 5944 * 5945 * They revolve around files there were unlinked from the directory, and 5946 * this function updates the parent directory so that a full commit is 5947 * properly done if it is fsync'd later after the unlinks are done. 5948 * 5949 * Must be called before the unlink operations (updates to the subvolume tree, 5950 * inodes, etc) are done. 5951 */ 5952 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5953 struct btrfs_inode *dir, struct btrfs_inode *inode, 5954 int for_rename) 5955 { 5956 /* 5957 * when we're logging a file, if it hasn't been renamed 5958 * or unlinked, and its inode is fully committed on disk, 5959 * we don't have to worry about walking up the directory chain 5960 * to log its parents. 5961 * 5962 * So, we use the last_unlink_trans field to put this transid 5963 * into the file. When the file is logged we check it and 5964 * don't log the parents if the file is fully on disk. 5965 */ 5966 mutex_lock(&inode->log_mutex); 5967 inode->last_unlink_trans = trans->transid; 5968 mutex_unlock(&inode->log_mutex); 5969 5970 /* 5971 * if this directory was already logged any new 5972 * names for this file/dir will get recorded 5973 */ 5974 smp_mb(); 5975 if (dir->logged_trans == trans->transid) 5976 return; 5977 5978 /* 5979 * if the inode we're about to unlink was logged, 5980 * the log will be properly updated for any new names 5981 */ 5982 if (inode->logged_trans == trans->transid) 5983 return; 5984 5985 /* 5986 * when renaming files across directories, if the directory 5987 * there we're unlinking from gets fsync'd later on, there's 5988 * no way to find the destination directory later and fsync it 5989 * properly. So, we have to be conservative and force commits 5990 * so the new name gets discovered. 5991 */ 5992 if (for_rename) 5993 goto record; 5994 5995 /* we can safely do the unlink without any special recording */ 5996 return; 5997 5998 record: 5999 mutex_lock(&dir->log_mutex); 6000 dir->last_unlink_trans = trans->transid; 6001 mutex_unlock(&dir->log_mutex); 6002 } 6003 6004 /* 6005 * Make sure that if someone attempts to fsync the parent directory of a deleted 6006 * snapshot, it ends up triggering a transaction commit. This is to guarantee 6007 * that after replaying the log tree of the parent directory's root we will not 6008 * see the snapshot anymore and at log replay time we will not see any log tree 6009 * corresponding to the deleted snapshot's root, which could lead to replaying 6010 * it after replaying the log tree of the parent directory (which would replay 6011 * the snapshot delete operation). 6012 * 6013 * Must be called before the actual snapshot destroy operation (updates to the 6014 * parent root and tree of tree roots trees, etc) are done. 6015 */ 6016 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 6017 struct btrfs_inode *dir) 6018 { 6019 mutex_lock(&dir->log_mutex); 6020 dir->last_unlink_trans = trans->transid; 6021 mutex_unlock(&dir->log_mutex); 6022 } 6023 6024 /* 6025 * Call this after adding a new name for a file and it will properly 6026 * update the log to reflect the new name. 6027 * 6028 * It will return zero if all goes well, and it will return 1 if a 6029 * full transaction commit is required. 6030 */ 6031 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 6032 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 6033 struct dentry *parent) 6034 { 6035 struct btrfs_fs_info *fs_info = trans->fs_info; 6036 6037 /* 6038 * this will force the logging code to walk the dentry chain 6039 * up for the file 6040 */ 6041 if (!S_ISDIR(inode->vfs_inode.i_mode)) 6042 inode->last_unlink_trans = trans->transid; 6043 6044 /* 6045 * if this inode hasn't been logged and directory we're renaming it 6046 * from hasn't been logged, we don't need to log it 6047 */ 6048 if (inode->logged_trans <= fs_info->last_trans_committed && 6049 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) 6050 return 0; 6051 6052 return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, 6053 LOG_INODE_EXISTS, NULL); 6054 } 6055 6056