// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first stage (0) is to only pin down
 * the blocks we find, the second stage (1) is to make sure that all the
 * inodes we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3
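
/*
 * A minimal sketch (assumed driver shape, based on btrfs_recover_log_trees(),
 * which is not part of this excerpt) of how the stages above are walked
 * during mount-time replay: the log trees are walked once per stage,
 * pinning extents first and then replaying progressively more item types:
 *
 *	struct walk_control wc = {
 *		.process_func = process_one_buffer, .stage = 0,
 *	};
 *	wc.pin = 1;
 *	... walk every log tree with walk_log_tree(trans, log, &wc) ...
 *	if (wc.pin) {
 *		wc.pin = 0;
 *		wc.process_func = replay_one_buffer;
 *		wc.stage = LOG_WALK_REPLAY_INODES;
 *		... walk again ...
 *	}
 *	if (wc.stage < LOG_WALK_REPLAY_ALL) {
 *		wc.stage++;
 *		... walk again ...
 *	}
 */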

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree and once
 * to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}
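
/*
 * A minimal usage sketch (assumed caller shape; the real fsync path goes
 * through btrfs_sync_file()/btrfs_log_inode_parent(), not shown in this
 * excerpt): every successful start_log_trans() must be paired with a
 * btrfs_end_log_trans() so the writer count stays balanced, roughly:
 *
 *	ret = start_log_trans(trans, root, ctx);
 *	if (ret)
 *		return ret;
 *	ret = btrfs_log_inode(trans, root, inode, LOG_INODE_ALL,
 *			      start, end, ctx);
 *	btrfs_end_log_trans(root);
 */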

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}


/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(fs_info, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
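
/*
 * Illustrative only: besides replay, process_one_buffer() is also the
 * callback used when a log tree is torn down at transaction commit time.
 * free_log_tree() (later in this file, not part of this excerpt) is
 * expected to drive it roughly like:
 *
 *	struct walk_control wc = {
 *		.free = 1,
 *		.process_func = process_one_buffer
 *	};
 *
 *	ret = walk_log_tree(trans, log, &wc);
 */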

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(fs_info, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(fs_info, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}
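
/*
 * Illustrative only: during the LOG_WALK_REPLAY_INODES stage the replay
 * callback (replay_one_buffer(), later in this file and not part of this
 * excerpt) is expected to feed each inode item of a log leaf through
 * overwrite_item(), roughly:
 *
 *	btrfs_item_key_to_cpu(eb, &key, i);
 *	if (key.type == BTRFS_INODE_ITEM_KEY) {
 *		ret = overwrite_item(wc->trans, root, path, eb, i, &key);
 *		if (ret)
 *			break;
 *	}
 */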

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans, fs_info,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						fs_info,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range.  We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls).  In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other.  For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.  Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher, will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans, fs_info,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
			name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}
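
/*
 * For reference (btrfs on-disk layout, summarized here as background): a
 * single link "name" to inode 'objectid' in directory 'dirid' normally has
 * two directory items in the subvolume tree, and inode_in_dir() above
 * checks both of them:
 *
 *	(dirid, BTRFS_DIR_INDEX_KEY, index)                     -> location
 *	(dirid, BTRFS_DIR_ITEM_KEY, btrfs_name_hash(name, len)) -> location
 */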

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path->nodes[0],
						   path->slots[0],
						   ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
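
/*
 * A minimal sketch (illustrative, not called from this file) of walking
 * every name packed into one INODE_REF item with ref_get_fields(), the
 * same pattern unlink_old_inode_refs() and add_inode_ref() below use:
 *
 *	ptr = btrfs_item_ptr_offset(eb, slot);
 *	ptr_end = ptr + btrfs_item_size_nr(eb, slot);
 *	while (ptr < ptr_end) {
 *		ret = ref_get_fields(eb, ptr, &namelen, &name, &index);
 *		if (ret)
 *			break;
 *		... use name/namelen/index, then kfree(name) ...
 *		ptr += sizeof(struct btrfs_inode_ref) + namelen;
 *	}
 */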

/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists).  For any name that is not in the inode reference item from the
 * log tree, do a proper unlink of that name (that is, remove its entry from
 * the inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
							     parent_id, name,
							     namelen, NULL);
		else
			ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
							 namelen, NULL);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
					btrfs_ino(BTRFS_I(inode)), ref_index,
					name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, BTRFS_I(dir),
					BTRFS_I(inode),
					name, namelen, 0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
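
/*
 * For reference: the fixup entries consumed above are plain orphan items
 * keyed in the subvolume tree as
 *
 *	(BTRFS_TREE_LOG_FIXUP_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, inode-objectid)
 *
 * link_to_fixup_dir() below inserts one per inode touched by replay, and
 * fixup_inode_link_counts() deletes it once the link count is recomputed.
 */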
The link count is incremented here 1674 * so the inode won't go away until we check it 1675 */ 1676 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1677 struct btrfs_root *root, 1678 struct btrfs_path *path, 1679 u64 objectid) 1680 { 1681 struct btrfs_key key; 1682 int ret = 0; 1683 struct inode *inode; 1684 1685 inode = read_one_inode(root, objectid); 1686 if (!inode) 1687 return -EIO; 1688 1689 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1690 key.type = BTRFS_ORPHAN_ITEM_KEY; 1691 key.offset = objectid; 1692 1693 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1694 1695 btrfs_release_path(path); 1696 if (ret == 0) { 1697 if (!inode->i_nlink) 1698 set_nlink(inode, 1); 1699 else 1700 inc_nlink(inode); 1701 ret = btrfs_update_inode(trans, root, inode); 1702 } else if (ret == -EEXIST) { 1703 ret = 0; 1704 } else { 1705 BUG(); /* Logic Error */ 1706 } 1707 iput(inode); 1708 1709 return ret; 1710 } 1711 1712 /* 1713 * when replaying the log for a directory, we only insert names 1714 * for inodes that actually exist. This means an fsync on a directory 1715 * does not implicitly fsync all the new files in it 1716 */ 1717 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1718 struct btrfs_root *root, 1719 u64 dirid, u64 index, 1720 char *name, int name_len, 1721 struct btrfs_key *location) 1722 { 1723 struct inode *inode; 1724 struct inode *dir; 1725 int ret; 1726 1727 inode = read_one_inode(root, location->objectid); 1728 if (!inode) 1729 return -ENOENT; 1730 1731 dir = read_one_inode(root, dirid); 1732 if (!dir) { 1733 iput(inode); 1734 return -EIO; 1735 } 1736 1737 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 1738 name_len, 1, index); 1739 1740 /* FIXME, put inode into FIXUP list */ 1741 1742 iput(inode); 1743 iput(dir); 1744 return ret; 1745 } 1746 1747 /* 1748 * Return true if an inode reference exists in the log for the given name, 1749 * inode and parent inode. 1750 */ 1751 static bool name_in_log_ref(struct btrfs_root *log_root, 1752 const char *name, const int name_len, 1753 const u64 dirid, const u64 ino) 1754 { 1755 struct btrfs_key search_key; 1756 1757 search_key.objectid = ino; 1758 search_key.type = BTRFS_INODE_REF_KEY; 1759 search_key.offset = dirid; 1760 if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1761 return true; 1762 1763 search_key.type = BTRFS_INODE_EXTREF_KEY; 1764 search_key.offset = btrfs_extref_hash(dirid, name, name_len); 1765 if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1766 return true; 1767 1768 return false; 1769 } 1770 1771 /* 1772 * take a single entry in a log directory item and replay it into 1773 * the subvolume. 1774 * 1775 * if a conflicting item exists in the subdirectory already, 1776 * the inode it points to is unlinked and put into the link count 1777 * fix up tree. 1778 * 1779 * If a name from the log points to a file or directory that does 1780 * not exist in the FS, it is skipped. fsyncs on directories 1781 * do not force down inodes inside that directory, just changes to the 1782 * names or unlinks in a directory. 1783 * 1784 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1785 * non-existing inode) and 1 if the name was replayed. 

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fixup the link count of
		 * the inode the entry points to.  Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for this
		 * dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir.  This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
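
/*
 * Example of the range items find_dir_range() below looks up: while a
 * directory is logged, the log tree gets items such as
 *
 *	(dirid, BTRFS_DIR_LOG_INDEX_KEY, first_offset)
 *		with btrfs_dir_log_end() == last_offset
 *
 * meaning the log is authoritative for every dir index key of 'dirid' in
 * [first_offset, last_offset]; a key in that range that exists in the
 * subvolume but not in the log was deleted before the fsync.
 * BTRFS_DIR_LOG_ITEM_KEY plays the same role for the name-hash keyed
 * directory items.
 */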
1997 */ 1998 static noinline int find_dir_range(struct btrfs_root *root, 1999 struct btrfs_path *path, 2000 u64 dirid, int key_type, 2001 u64 *start_ret, u64 *end_ret) 2002 { 2003 struct btrfs_key key; 2004 u64 found_end; 2005 struct btrfs_dir_log_item *item; 2006 int ret; 2007 int nritems; 2008 2009 if (*start_ret == (u64)-1) 2010 return 1; 2011 2012 key.objectid = dirid; 2013 key.type = key_type; 2014 key.offset = *start_ret; 2015 2016 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2017 if (ret < 0) 2018 goto out; 2019 if (ret > 0) { 2020 if (path->slots[0] == 0) 2021 goto out; 2022 path->slots[0]--; 2023 } 2024 if (ret != 0) 2025 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2026 2027 if (key.type != key_type || key.objectid != dirid) { 2028 ret = 1; 2029 goto next; 2030 } 2031 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2032 struct btrfs_dir_log_item); 2033 found_end = btrfs_dir_log_end(path->nodes[0], item); 2034 2035 if (*start_ret >= key.offset && *start_ret <= found_end) { 2036 ret = 0; 2037 *start_ret = key.offset; 2038 *end_ret = found_end; 2039 goto out; 2040 } 2041 ret = 1; 2042 next: 2043 /* check the next slot in the tree to see if it is a valid item */ 2044 nritems = btrfs_header_nritems(path->nodes[0]); 2045 path->slots[0]++; 2046 if (path->slots[0] >= nritems) { 2047 ret = btrfs_next_leaf(root, path); 2048 if (ret) 2049 goto out; 2050 } 2051 2052 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2053 2054 if (key.type != key_type || key.objectid != dirid) { 2055 ret = 1; 2056 goto out; 2057 } 2058 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2059 struct btrfs_dir_log_item); 2060 found_end = btrfs_dir_log_end(path->nodes[0], item); 2061 *start_ret = key.offset; 2062 *end_ret = found_end; 2063 ret = 0; 2064 out: 2065 btrfs_release_path(path); 2066 return ret; 2067 } 2068 2069 /* 2070 * this looks for a given directory item in the log. 
If the directory 2071 * item is not in the log, the item is removed and the inode it points 2072 * to is unlinked 2073 */ 2074 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2075 struct btrfs_root *root, 2076 struct btrfs_root *log, 2077 struct btrfs_path *path, 2078 struct btrfs_path *log_path, 2079 struct inode *dir, 2080 struct btrfs_key *dir_key) 2081 { 2082 int ret; 2083 struct extent_buffer *eb; 2084 int slot; 2085 u32 item_size; 2086 struct btrfs_dir_item *di; 2087 struct btrfs_dir_item *log_di; 2088 int name_len; 2089 unsigned long ptr; 2090 unsigned long ptr_end; 2091 char *name; 2092 struct inode *inode; 2093 struct btrfs_key location; 2094 2095 again: 2096 eb = path->nodes[0]; 2097 slot = path->slots[0]; 2098 item_size = btrfs_item_size_nr(eb, slot); 2099 ptr = btrfs_item_ptr_offset(eb, slot); 2100 ptr_end = ptr + item_size; 2101 while (ptr < ptr_end) { 2102 di = (struct btrfs_dir_item *)ptr; 2103 name_len = btrfs_dir_name_len(eb, di); 2104 name = kmalloc(name_len, GFP_NOFS); 2105 if (!name) { 2106 ret = -ENOMEM; 2107 goto out; 2108 } 2109 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2110 name_len); 2111 log_di = NULL; 2112 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2113 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2114 dir_key->objectid, 2115 name, name_len, 0); 2116 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2117 log_di = btrfs_lookup_dir_index_item(trans, log, 2118 log_path, 2119 dir_key->objectid, 2120 dir_key->offset, 2121 name, name_len, 0); 2122 } 2123 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2124 btrfs_dir_item_key_to_cpu(eb, di, &location); 2125 btrfs_release_path(path); 2126 btrfs_release_path(log_path); 2127 inode = read_one_inode(root, location.objectid); 2128 if (!inode) { 2129 kfree(name); 2130 return -EIO; 2131 } 2132 2133 ret = link_to_fixup_dir(trans, root, 2134 path, location.objectid); 2135 if (ret) { 2136 kfree(name); 2137 iput(inode); 2138 goto out; 2139 } 2140 2141 inc_nlink(inode); 2142 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2143 BTRFS_I(inode), name, name_len); 2144 if (!ret) 2145 ret = btrfs_run_delayed_items(trans); 2146 kfree(name); 2147 iput(inode); 2148 if (ret) 2149 goto out; 2150 2151 /* there might still be more names under this key 2152 * check and repeat if required 2153 */ 2154 ret = btrfs_search_slot(NULL, root, dir_key, path, 2155 0, 0); 2156 if (ret == 0) 2157 goto again; 2158 ret = 0; 2159 goto out; 2160 } else if (IS_ERR(log_di)) { 2161 kfree(name); 2162 return PTR_ERR(log_di); 2163 } 2164 btrfs_release_path(log_path); 2165 kfree(name); 2166 2167 ptr = (unsigned long)(di + 1); 2168 ptr += name_len; 2169 } 2170 ret = 0; 2171 out: 2172 btrfs_release_path(path); 2173 btrfs_release_path(log_path); 2174 return ret; 2175 } 2176 2177 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2178 struct btrfs_root *root, 2179 struct btrfs_root *log, 2180 struct btrfs_path *path, 2181 const u64 ino) 2182 { 2183 struct btrfs_key search_key; 2184 struct btrfs_path *log_path; 2185 int i; 2186 int nritems; 2187 int ret; 2188 2189 log_path = btrfs_alloc_path(); 2190 if (!log_path) 2191 return -ENOMEM; 2192 2193 search_key.objectid = ino; 2194 search_key.type = BTRFS_XATTR_ITEM_KEY; 2195 search_key.offset = 0; 2196 again: 2197 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2198 if (ret < 0) 2199 goto out; 2200 process_leaf: 2201 nritems = btrfs_header_nritems(path->nodes[0]); 2202 for (i = path->slots[0]; i < nritems; i++) { 2203 
struct btrfs_key key; 2204 struct btrfs_dir_item *di; 2205 struct btrfs_dir_item *log_di; 2206 u32 total_size; 2207 u32 cur; 2208 2209 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2210 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2211 ret = 0; 2212 goto out; 2213 } 2214 2215 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2216 total_size = btrfs_item_size_nr(path->nodes[0], i); 2217 cur = 0; 2218 while (cur < total_size) { 2219 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2220 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2221 u32 this_len = sizeof(*di) + name_len + data_len; 2222 char *name; 2223 2224 name = kmalloc(name_len, GFP_NOFS); 2225 if (!name) { 2226 ret = -ENOMEM; 2227 goto out; 2228 } 2229 read_extent_buffer(path->nodes[0], name, 2230 (unsigned long)(di + 1), name_len); 2231 2232 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2233 name, name_len, 0); 2234 btrfs_release_path(log_path); 2235 if (!log_di) { 2236 /* Doesn't exist in log tree, so delete it. */ 2237 btrfs_release_path(path); 2238 di = btrfs_lookup_xattr(trans, root, path, ino, 2239 name, name_len, -1); 2240 kfree(name); 2241 if (IS_ERR(di)) { 2242 ret = PTR_ERR(di); 2243 goto out; 2244 } 2245 ASSERT(di); 2246 ret = btrfs_delete_one_dir_name(trans, root, 2247 path, di); 2248 if (ret) 2249 goto out; 2250 btrfs_release_path(path); 2251 search_key = key; 2252 goto again; 2253 } 2254 kfree(name); 2255 if (IS_ERR(log_di)) { 2256 ret = PTR_ERR(log_di); 2257 goto out; 2258 } 2259 cur += this_len; 2260 di = (struct btrfs_dir_item *)((char *)di + this_len); 2261 } 2262 } 2263 ret = btrfs_next_leaf(root, path); 2264 if (ret > 0) 2265 ret = 0; 2266 else if (ret == 0) 2267 goto process_leaf; 2268 out: 2269 btrfs_free_path(log_path); 2270 btrfs_release_path(path); 2271 return ret; 2272 } 2273 2274 2275 /* 2276 * deletion replay happens before we copy any new directory items 2277 * out of the log or out of backreferences from inodes. It 2278 * scans the log to find ranges of keys that log is authoritative for, 2279 * and then scans the directory to find items in those ranges that are 2280 * not present in the log. 2281 * 2282 * Anything we don't find in the log is unlinked and removed from the 2283 * directory. 
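 *
 * A sketch of the case this handles (illustrative only, in the style of
 * the examples above):
 *
 *   touch dir/a dir/b
 *   sync
 *   rm dir/a
 *   xfs_io -c "fsync" dir
 *   <power failure>
 *
 * The committed subvolume tree still contains "a", the logged range for
 * dir covers its offset, and the log has no entry for it, so replay must
 * unlink "a" here.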
2284 */ 2285 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2286 struct btrfs_root *root, 2287 struct btrfs_root *log, 2288 struct btrfs_path *path, 2289 u64 dirid, int del_all) 2290 { 2291 u64 range_start; 2292 u64 range_end; 2293 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2294 int ret = 0; 2295 struct btrfs_key dir_key; 2296 struct btrfs_key found_key; 2297 struct btrfs_path *log_path; 2298 struct inode *dir; 2299 2300 dir_key.objectid = dirid; 2301 dir_key.type = BTRFS_DIR_ITEM_KEY; 2302 log_path = btrfs_alloc_path(); 2303 if (!log_path) 2304 return -ENOMEM; 2305 2306 dir = read_one_inode(root, dirid); 2307 /* it isn't an error if the inode isn't there, that can happen 2308 * because we replay the deletes before we copy in the inode item 2309 * from the log 2310 */ 2311 if (!dir) { 2312 btrfs_free_path(log_path); 2313 return 0; 2314 } 2315 again: 2316 range_start = 0; 2317 range_end = 0; 2318 while (1) { 2319 if (del_all) 2320 range_end = (u64)-1; 2321 else { 2322 ret = find_dir_range(log, path, dirid, key_type, 2323 &range_start, &range_end); 2324 if (ret != 0) 2325 break; 2326 } 2327 2328 dir_key.offset = range_start; 2329 while (1) { 2330 int nritems; 2331 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2332 0, 0); 2333 if (ret < 0) 2334 goto out; 2335 2336 nritems = btrfs_header_nritems(path->nodes[0]); 2337 if (path->slots[0] >= nritems) { 2338 ret = btrfs_next_leaf(root, path); 2339 if (ret == 1) 2340 break; 2341 else if (ret < 0) 2342 goto out; 2343 } 2344 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2345 path->slots[0]); 2346 if (found_key.objectid != dirid || 2347 found_key.type != dir_key.type) 2348 goto next_type; 2349 2350 if (found_key.offset > range_end) 2351 break; 2352 2353 ret = check_item_in_log(trans, root, log, path, 2354 log_path, dir, 2355 &found_key); 2356 if (ret) 2357 goto out; 2358 if (found_key.offset == (u64)-1) 2359 break; 2360 dir_key.offset = found_key.offset + 1; 2361 } 2362 btrfs_release_path(path); 2363 if (range_end == (u64)-1) 2364 break; 2365 range_start = range_end + 1; 2366 } 2367 2368 next_type: 2369 ret = 0; 2370 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2371 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2372 dir_key.type = BTRFS_DIR_INDEX_KEY; 2373 btrfs_release_path(path); 2374 goto again; 2375 } 2376 out: 2377 btrfs_release_path(path); 2378 btrfs_free_path(log_path); 2379 iput(dir); 2380 return ret; 2381 } 2382 2383 /* 2384 * the process_func used to replay items from the log tree. This 2385 * gets called in two different stages. The first stage just looks 2386 * for inodes and makes sure they are all copied into the subvolume. 2387 * 2388 * The second stage copies all the other item types from the log into 2389 * the subvolume. The two stage approach is slower, but gets rid of 2390 * lots of complexity around inodes referencing other inodes that exist 2391 * only in the log (references come from either directory items or inode 2392 * back refs). 
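 *
 * Roughly (an illustrative summary): while wc->stage ==
 * LOG_WALK_REPLAY_INODES only BTRFS_INODE_ITEM_KEY items are acted on,
 * so every inode exists before anything can reference it; the later
 * LOG_WALK_REPLAY_DIR_INDEX and LOG_WALK_REPLAY_ALL passes then replay
 * directory entries, refs and extents against those inodes.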
2393 */ 2394 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2395 struct walk_control *wc, u64 gen, int level) 2396 { 2397 int nritems; 2398 struct btrfs_path *path; 2399 struct btrfs_root *root = wc->replay_dest; 2400 struct btrfs_key key; 2401 int i; 2402 int ret; 2403 2404 ret = btrfs_read_buffer(eb, gen, level, NULL); 2405 if (ret) 2406 return ret; 2407 2408 level = btrfs_header_level(eb); 2409 2410 if (level != 0) 2411 return 0; 2412 2413 path = btrfs_alloc_path(); 2414 if (!path) 2415 return -ENOMEM; 2416 2417 nritems = btrfs_header_nritems(eb); 2418 for (i = 0; i < nritems; i++) { 2419 btrfs_item_key_to_cpu(eb, &key, i); 2420 2421 /* inode keys are done during the first stage */ 2422 if (key.type == BTRFS_INODE_ITEM_KEY && 2423 wc->stage == LOG_WALK_REPLAY_INODES) { 2424 struct btrfs_inode_item *inode_item; 2425 u32 mode; 2426 2427 inode_item = btrfs_item_ptr(eb, i, 2428 struct btrfs_inode_item); 2429 ret = replay_xattr_deletes(wc->trans, root, log, 2430 path, key.objectid); 2431 if (ret) 2432 break; 2433 mode = btrfs_inode_mode(eb, inode_item); 2434 if (S_ISDIR(mode)) { 2435 ret = replay_dir_deletes(wc->trans, 2436 root, log, path, key.objectid, 0); 2437 if (ret) 2438 break; 2439 } 2440 ret = overwrite_item(wc->trans, root, path, 2441 eb, i, &key); 2442 if (ret) 2443 break; 2444 2445 /* 2446 * Before replaying extents, truncate the inode to its 2447 * size. We need to do it now and not after log replay 2448 * because before an fsync we can have prealloc extents 2449 * added beyond the inode's i_size. If we did it after, 2450 * through orphan cleanup for example, we would drop 2451 * those prealloc extents just after replaying them. 2452 */ 2453 if (S_ISREG(mode)) { 2454 struct inode *inode; 2455 u64 from; 2456 2457 inode = read_one_inode(root, key.objectid); 2458 if (!inode) { 2459 ret = -EIO; 2460 break; 2461 } 2462 from = ALIGN(i_size_read(inode), 2463 root->fs_info->sectorsize); 2464 ret = btrfs_drop_extents(wc->trans, root, inode, 2465 from, (u64)-1, 1); 2466 /* 2467 * If the nlink count is zero here, the iput 2468 * will free the inode. We bump it to make 2469 * sure it doesn't get freed until the link 2470 * count fixup is done. 2471 */ 2472 if (!ret) { 2473 if (inode->i_nlink == 0) 2474 inc_nlink(inode); 2475 /* Update link count and nbytes. 
*/ 2476 ret = btrfs_update_inode(wc->trans, 2477 root, inode); 2478 } 2479 iput(inode); 2480 if (ret) 2481 break; 2482 } 2483 2484 ret = link_to_fixup_dir(wc->trans, root, 2485 path, key.objectid); 2486 if (ret) 2487 break; 2488 } 2489 2490 if (key.type == BTRFS_DIR_INDEX_KEY && 2491 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2492 ret = replay_one_dir_item(wc->trans, root, path, 2493 eb, i, &key); 2494 if (ret) 2495 break; 2496 } 2497 2498 if (wc->stage < LOG_WALK_REPLAY_ALL) 2499 continue; 2500 2501 /* these keys are simply copied */ 2502 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2503 ret = overwrite_item(wc->trans, root, path, 2504 eb, i, &key); 2505 if (ret) 2506 break; 2507 } else if (key.type == BTRFS_INODE_REF_KEY || 2508 key.type == BTRFS_INODE_EXTREF_KEY) { 2509 ret = add_inode_ref(wc->trans, root, log, path, 2510 eb, i, &key); 2511 if (ret && ret != -ENOENT) 2512 break; 2513 ret = 0; 2514 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2515 ret = replay_one_extent(wc->trans, root, path, 2516 eb, i, &key); 2517 if (ret) 2518 break; 2519 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2520 ret = replay_one_dir_item(wc->trans, root, path, 2521 eb, i, &key); 2522 if (ret) 2523 break; 2524 } 2525 } 2526 btrfs_free_path(path); 2527 return ret; 2528 } 2529 2530 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2531 struct btrfs_root *root, 2532 struct btrfs_path *path, int *level, 2533 struct walk_control *wc) 2534 { 2535 struct btrfs_fs_info *fs_info = root->fs_info; 2536 u64 root_owner; 2537 u64 bytenr; 2538 u64 ptr_gen; 2539 struct extent_buffer *next; 2540 struct extent_buffer *cur; 2541 struct extent_buffer *parent; 2542 u32 blocksize; 2543 int ret = 0; 2544 2545 WARN_ON(*level < 0); 2546 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2547 2548 while (*level > 0) { 2549 struct btrfs_key first_key; 2550 2551 WARN_ON(*level < 0); 2552 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2553 cur = path->nodes[*level]; 2554 2555 WARN_ON(btrfs_header_level(cur) != *level); 2556 2557 if (path->slots[*level] >= 2558 btrfs_header_nritems(cur)) 2559 break; 2560 2561 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2562 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2563 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2564 blocksize = fs_info->nodesize; 2565 2566 parent = path->nodes[*level]; 2567 root_owner = btrfs_header_owner(parent); 2568 2569 next = btrfs_find_create_tree_block(fs_info, bytenr); 2570 if (IS_ERR(next)) 2571 return PTR_ERR(next); 2572 2573 if (*level == 1) { 2574 ret = wc->process_func(root, next, wc, ptr_gen, 2575 *level - 1); 2576 if (ret) { 2577 free_extent_buffer(next); 2578 return ret; 2579 } 2580 2581 path->slots[*level]++; 2582 if (wc->free) { 2583 ret = btrfs_read_buffer(next, ptr_gen, 2584 *level - 1, &first_key); 2585 if (ret) { 2586 free_extent_buffer(next); 2587 return ret; 2588 } 2589 2590 if (trans) { 2591 btrfs_tree_lock(next); 2592 btrfs_set_lock_blocking(next); 2593 clean_tree_block(fs_info, next); 2594 btrfs_wait_tree_block_writeback(next); 2595 btrfs_tree_unlock(next); 2596 } else { 2597 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2598 clear_extent_buffer_dirty(next); 2599 } 2600 2601 WARN_ON(root_owner != 2602 BTRFS_TREE_LOG_OBJECTID); 2603 ret = btrfs_free_and_pin_reserved_extent( 2604 fs_info, bytenr, 2605 blocksize); 2606 if (ret) { 2607 free_extent_buffer(next); 2608 return ret; 2609 } 2610 } 2611 free_extent_buffer(next); 2612 continue; 2613 } 2614 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, 
&first_key); 2615 if (ret) { 2616 free_extent_buffer(next); 2617 return ret; 2618 } 2619 2620 WARN_ON(*level <= 0); 2621 if (path->nodes[*level-1]) 2622 free_extent_buffer(path->nodes[*level-1]); 2623 path->nodes[*level-1] = next; 2624 *level = btrfs_header_level(next); 2625 path->slots[*level] = 0; 2626 cond_resched(); 2627 } 2628 WARN_ON(*level < 0); 2629 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2630 2631 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2632 2633 cond_resched(); 2634 return 0; 2635 } 2636 2637 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2638 struct btrfs_root *root, 2639 struct btrfs_path *path, int *level, 2640 struct walk_control *wc) 2641 { 2642 struct btrfs_fs_info *fs_info = root->fs_info; 2643 u64 root_owner; 2644 int i; 2645 int slot; 2646 int ret; 2647 2648 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2649 slot = path->slots[i]; 2650 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2651 path->slots[i]++; 2652 *level = i; 2653 WARN_ON(*level == 0); 2654 return 0; 2655 } else { 2656 struct extent_buffer *parent; 2657 if (path->nodes[*level] == root->node) 2658 parent = path->nodes[*level]; 2659 else 2660 parent = path->nodes[*level + 1]; 2661 2662 root_owner = btrfs_header_owner(parent); 2663 ret = wc->process_func(root, path->nodes[*level], wc, 2664 btrfs_header_generation(path->nodes[*level]), 2665 *level); 2666 if (ret) 2667 return ret; 2668 2669 if (wc->free) { 2670 struct extent_buffer *next; 2671 2672 next = path->nodes[*level]; 2673 2674 if (trans) { 2675 btrfs_tree_lock(next); 2676 btrfs_set_lock_blocking(next); 2677 clean_tree_block(fs_info, next); 2678 btrfs_wait_tree_block_writeback(next); 2679 btrfs_tree_unlock(next); 2680 } else { 2681 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2682 clear_extent_buffer_dirty(next); 2683 } 2684 2685 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2686 ret = btrfs_free_and_pin_reserved_extent( 2687 fs_info, 2688 path->nodes[*level]->start, 2689 path->nodes[*level]->len); 2690 if (ret) 2691 return ret; 2692 } 2693 free_extent_buffer(path->nodes[*level]); 2694 path->nodes[*level] = NULL; 2695 *level = i + 1; 2696 } 2697 } 2698 return 1; 2699 } 2700 2701 /* 2702 * drop the reference count on the tree rooted at 'snap'. This traverses 2703 * the tree freeing any blocks that have a ref count of zero after being 2704 * decremented. 2705 */ 2706 static int walk_log_tree(struct btrfs_trans_handle *trans, 2707 struct btrfs_root *log, struct walk_control *wc) 2708 { 2709 struct btrfs_fs_info *fs_info = log->fs_info; 2710 int ret = 0; 2711 int wret; 2712 int level; 2713 struct btrfs_path *path; 2714 int orig_level; 2715 2716 path = btrfs_alloc_path(); 2717 if (!path) 2718 return -ENOMEM; 2719 2720 level = btrfs_header_level(log->node); 2721 orig_level = level; 2722 path->nodes[level] = log->node; 2723 extent_buffer_get(log->node); 2724 path->slots[level] = 0; 2725 2726 while (1) { 2727 wret = walk_down_log_tree(trans, log, path, &level, wc); 2728 if (wret > 0) 2729 break; 2730 if (wret < 0) { 2731 ret = wret; 2732 goto out; 2733 } 2734 2735 wret = walk_up_log_tree(trans, log, path, &level, wc); 2736 if (wret > 0) 2737 break; 2738 if (wret < 0) { 2739 ret = wret; 2740 goto out; 2741 } 2742 } 2743 2744 /* was the root node processed? 
if not, catch it here */
	if (path->nodes[orig_level]) {
		ret = wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]),
			 orig_level);
		if (ret)
			goto out;
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			if (trans) {
				btrfs_tree_lock(next);
				btrfs_set_lock_blocking(next);
				clean_tree_block(fs_info, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);
			} else {
				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
					clear_extent_buffer_dirty(next);
			}

			WARN_ON(log->root_key.objectid !=
				BTRFS_TREE_LOG_OBJECTID);
			ret = btrfs_free_and_pin_reserved_extent(fs_info,
							next->start, next->len);
			if (ret)
				goto out;
		}
	}

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to update the item for a given subvolume's log root
 * in the tree of log roots
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret;

	if (log->log_transid == 1) {
		/* insert root item on the first sync */
		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
				&log->root_key, &log->root_item);
	} else {
		ret = btrfs_update_root(trans, fs_info->log_root_tree,
				&log->root_key, &log->root_item);
	}
	return ret;
}

static void wait_log_commit(struct btrfs_root *root, int transid)
{
	DEFINE_WAIT(wait);
	int index = transid % 2;

	/*
	 * we only allow two pending log transactions at a time,
	 * so we know that if ours is more than 2 older than the
	 * current transaction, we're done
	 */
	for (;;) {
		prepare_to_wait(&root->log_commit_wait[index],
				&wait, TASK_UNINTERRUPTIBLE);

		if (!(root->log_transid_committed < transid &&
		      atomic_read(&root->log_commit[index])))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_commit_wait[index], &wait);
}

static void wait_for_writer(struct btrfs_root *root)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&root->log_writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&root->log_writers))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_writer_wait, &wait);
}

static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
					struct btrfs_log_ctx *ctx)
{
	if (!ctx)
		return;

	mutex_lock(&root->log_mutex);
	list_del_init(&ctx->list);
	mutex_unlock(&root->log_mutex);
}

/*
 * Invoked under the log mutex, or otherwise with a guarantee that no
 * other task can access the list.
 */
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
					     int index, int error)
{
	struct btrfs_log_ctx *ctx;
	struct btrfs_log_ctx *safe;

	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
		list_del_init(&ctx->list);
		ctx->log_ret = error;
	}

	INIT_LIST_HEAD(&root->log_ctxs[index]);
}

/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it.
When this call is done, 2876 * you know that any inodes previously logged are safely on disk only 2877 * if it returns 0. 2878 * 2879 * Any other return value means you need to call btrfs_commit_transaction. 2880 * Some of the edge cases for fsyncing directories that have had unlinks 2881 * or renames done in the past mean that sometimes the only safe 2882 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2883 * that has happened. 2884 */ 2885 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2886 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2887 { 2888 int index1; 2889 int index2; 2890 int mark; 2891 int ret; 2892 struct btrfs_fs_info *fs_info = root->fs_info; 2893 struct btrfs_root *log = root->log_root; 2894 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2895 int log_transid = 0; 2896 struct btrfs_log_ctx root_log_ctx; 2897 struct blk_plug plug; 2898 2899 mutex_lock(&root->log_mutex); 2900 log_transid = ctx->log_transid; 2901 if (root->log_transid_committed >= log_transid) { 2902 mutex_unlock(&root->log_mutex); 2903 return ctx->log_ret; 2904 } 2905 2906 index1 = log_transid % 2; 2907 if (atomic_read(&root->log_commit[index1])) { 2908 wait_log_commit(root, log_transid); 2909 mutex_unlock(&root->log_mutex); 2910 return ctx->log_ret; 2911 } 2912 ASSERT(log_transid == root->log_transid); 2913 atomic_set(&root->log_commit[index1], 1); 2914 2915 /* wait for previous tree log sync to complete */ 2916 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2917 wait_log_commit(root, log_transid - 1); 2918 2919 while (1) { 2920 int batch = atomic_read(&root->log_batch); 2921 /* when we're on an ssd, just kick the log commit out */ 2922 if (!btrfs_test_opt(fs_info, SSD) && 2923 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2924 mutex_unlock(&root->log_mutex); 2925 schedule_timeout_uninterruptible(1); 2926 mutex_lock(&root->log_mutex); 2927 } 2928 wait_for_writer(root); 2929 if (batch == atomic_read(&root->log_batch)) 2930 break; 2931 } 2932 2933 /* bail out if we need to do a full commit */ 2934 if (btrfs_need_log_full_commit(fs_info, trans)) { 2935 ret = -EAGAIN; 2936 btrfs_free_logged_extents(log, log_transid); 2937 mutex_unlock(&root->log_mutex); 2938 goto out; 2939 } 2940 2941 if (log_transid % 2 == 0) 2942 mark = EXTENT_DIRTY; 2943 else 2944 mark = EXTENT_NEW; 2945 2946 /* we start IO on all the marked extents here, but we don't actually 2947 * wait for them until later. 2948 */ 2949 blk_start_plug(&plug); 2950 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2951 if (ret) { 2952 blk_finish_plug(&plug); 2953 btrfs_abort_transaction(trans, ret); 2954 btrfs_free_logged_extents(log, log_transid); 2955 btrfs_set_log_full_commit(fs_info, trans); 2956 mutex_unlock(&root->log_mutex); 2957 goto out; 2958 } 2959 2960 btrfs_set_root_node(&log->root_item, log->node); 2961 2962 root->log_transid++; 2963 log->log_transid = root->log_transid; 2964 root->log_start_pid = 0; 2965 /* 2966 * IO has been started, blocks of the log tree have WRITTEN flag set 2967 * in their headers. new modifications of the log will be written to 2968 * new positions. so it's safe to allow log writers to go in. 
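	 *
	 * (for example: a commit of log_transid 4 tracks its pages with
	 * EXTENT_DIRTY while a still-running commit of log_transid 3 uses
	 * EXTENT_NEW, which is why the mark is derived from log_transid % 2
	 * above and two in-flight log commits never share dirty page
	 * tracking)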
2969 */ 2970 mutex_unlock(&root->log_mutex); 2971 2972 btrfs_init_log_ctx(&root_log_ctx, NULL); 2973 2974 mutex_lock(&log_root_tree->log_mutex); 2975 atomic_inc(&log_root_tree->log_batch); 2976 atomic_inc(&log_root_tree->log_writers); 2977 2978 index2 = log_root_tree->log_transid % 2; 2979 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2980 root_log_ctx.log_transid = log_root_tree->log_transid; 2981 2982 mutex_unlock(&log_root_tree->log_mutex); 2983 2984 ret = update_log_root(trans, log); 2985 2986 mutex_lock(&log_root_tree->log_mutex); 2987 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2988 /* atomic_dec_and_test implies a barrier */ 2989 cond_wake_up_nomb(&log_root_tree->log_writer_wait); 2990 } 2991 2992 if (ret) { 2993 if (!list_empty(&root_log_ctx.list)) 2994 list_del_init(&root_log_ctx.list); 2995 2996 blk_finish_plug(&plug); 2997 btrfs_set_log_full_commit(fs_info, trans); 2998 2999 if (ret != -ENOSPC) { 3000 btrfs_abort_transaction(trans, ret); 3001 mutex_unlock(&log_root_tree->log_mutex); 3002 goto out; 3003 } 3004 btrfs_wait_tree_log_extents(log, mark); 3005 btrfs_free_logged_extents(log, log_transid); 3006 mutex_unlock(&log_root_tree->log_mutex); 3007 ret = -EAGAIN; 3008 goto out; 3009 } 3010 3011 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3012 blk_finish_plug(&plug); 3013 list_del_init(&root_log_ctx.list); 3014 mutex_unlock(&log_root_tree->log_mutex); 3015 ret = root_log_ctx.log_ret; 3016 goto out; 3017 } 3018 3019 index2 = root_log_ctx.log_transid % 2; 3020 if (atomic_read(&log_root_tree->log_commit[index2])) { 3021 blk_finish_plug(&plug); 3022 ret = btrfs_wait_tree_log_extents(log, mark); 3023 btrfs_wait_logged_extents(trans, log, log_transid); 3024 wait_log_commit(log_root_tree, 3025 root_log_ctx.log_transid); 3026 mutex_unlock(&log_root_tree->log_mutex); 3027 if (!ret) 3028 ret = root_log_ctx.log_ret; 3029 goto out; 3030 } 3031 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3032 atomic_set(&log_root_tree->log_commit[index2], 1); 3033 3034 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3035 wait_log_commit(log_root_tree, 3036 root_log_ctx.log_transid - 1); 3037 } 3038 3039 wait_for_writer(log_root_tree); 3040 3041 /* 3042 * now that we've moved on to the tree of log tree roots, 3043 * check the full commit flag again 3044 */ 3045 if (btrfs_need_log_full_commit(fs_info, trans)) { 3046 blk_finish_plug(&plug); 3047 btrfs_wait_tree_log_extents(log, mark); 3048 btrfs_free_logged_extents(log, log_transid); 3049 mutex_unlock(&log_root_tree->log_mutex); 3050 ret = -EAGAIN; 3051 goto out_wake_log_root; 3052 } 3053 3054 ret = btrfs_write_marked_extents(fs_info, 3055 &log_root_tree->dirty_log_pages, 3056 EXTENT_DIRTY | EXTENT_NEW); 3057 blk_finish_plug(&plug); 3058 if (ret) { 3059 btrfs_set_log_full_commit(fs_info, trans); 3060 btrfs_abort_transaction(trans, ret); 3061 btrfs_free_logged_extents(log, log_transid); 3062 mutex_unlock(&log_root_tree->log_mutex); 3063 goto out_wake_log_root; 3064 } 3065 ret = btrfs_wait_tree_log_extents(log, mark); 3066 if (!ret) 3067 ret = btrfs_wait_tree_log_extents(log_root_tree, 3068 EXTENT_NEW | EXTENT_DIRTY); 3069 if (ret) { 3070 btrfs_set_log_full_commit(fs_info, trans); 3071 btrfs_free_logged_extents(log, log_transid); 3072 mutex_unlock(&log_root_tree->log_mutex); 3073 goto out_wake_log_root; 3074 } 3075 btrfs_wait_logged_extents(trans, log, log_transid); 3076 3077 btrfs_set_super_log_root(fs_info->super_for_commit, 3078 log_root_tree->node->start); 3079 
btrfs_set_super_log_root_level(fs_info->super_for_commit,
			       btrfs_header_level(log_root_tree->node));

	log_root_tree->log_transid++;
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * nobody else is going to jump in and write the ctree
	 * super here because the log_commit atomic below is protecting
	 * us. We must be called with a transaction handle pinning
	 * the running transaction open, so a full commit can't hop
	 * in and cause problems either.
	 */
	ret = write_all_supers(fs_info, 1);
	if (ret) {
		btrfs_set_log_full_commit(fs_info, trans);
		btrfs_abort_transaction(trans, ret);
		goto out_wake_log_root;
	}

	mutex_lock(&root->log_mutex);
	if (root->last_log_commit < log_transid)
		root->last_log_commit = log_transid;
	mutex_unlock(&root->log_mutex);

out_wake_log_root:
	mutex_lock(&log_root_tree->log_mutex);
	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);

	log_root_tree->log_transid_committed++;
	atomic_set(&log_root_tree->log_commit[index2], 0);
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
	 * all the updates above are seen by the woken threads. It might not be
	 * necessary, but proving that seems to be hard.
	 */
	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
	mutex_lock(&root->log_mutex);
	btrfs_remove_all_log_ctxs(root, index1, ret);
	root->log_transid_committed++;
	atomic_set(&root->log_commit[index1], 0);
	mutex_unlock(&root->log_mutex);

	/*
	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
	 * all the updates above are seen by the woken threads. It might not be
	 * necessary, but proving that seems to be hard.
	 */
	cond_wake_up(&root->log_commit_wait[index1]);
	return ret;
}

static void free_log_tree(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log)
{
	int ret;
	u64 start;
	u64 end;
	struct walk_control wc = {
		.free = 1,
		.process_func = process_one_buffer
	};

	ret = walk_log_tree(trans, log, &wc);
	/* I don't think this can happen but just in case */
	if (ret)
		btrfs_abort_transaction(trans, ret);

	while (1) {
		ret = find_first_extent_bit(&log->dirty_log_pages,
				0, &start, &end,
				EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
				NULL);
		if (ret)
			break;

		clear_extent_bits(&log->dirty_log_pages, start, end,
				  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
	}

	/*
	 * We may have short-circuited the log tree with the full commit logic
	 * and left ordered extents on our list, so clear these out to keep us
	 * from leaking inodes and memory.
	 */
	btrfs_free_logged_extents(log, 0);
	btrfs_free_logged_extents(log, 1);

	free_extent_buffer(log->node);
	kfree(log);
}

/*
 * free all the extents used by the tree log.
This should be called
 * at commit time of the full transaction
 */
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
	if (root->log_root) {
		free_log_tree(trans, root->log_root);
		root->log_root = NULL;
	}
	return 0;
}

int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	if (fs_info->log_root_tree) {
		free_log_tree(trans, fs_info->log_root_tree);
		fs_info->log_root_tree = NULL;
	}
	return 0;
}

/*
 * If both a file and directory are logged, and unlinks or renames are
 * mixed in, we have a few interesting corners:
 *
 * create file X in dir Y
 * link file X to X.link in dir Y
 * fsync file X
 * unlink file X but leave X.link
 * fsync dir Y
 *
 * After a crash we would expect only X.link to exist. But file X
 * didn't get fsync'd again so the log has back refs for X and X.link.
 *
 * We solve this by removing directory entries and inode backrefs from the
 * log when a file that was logged in the current transaction is
 * unlinked. Any later fsync will include the updated log entries, and
 * we'll be able to reconstruct the proper directory items from backrefs.
 *
 * This optimization allows us to avoid relogging the entire inode
 * or the entire directory.
 */
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 const char *name, int name_len,
				 struct btrfs_inode *dir, u64 index)
{
	struct btrfs_root *log;
	struct btrfs_dir_item *di;
	struct btrfs_path *path;
	int ret;
	int err = 0;
	int bytes_del = 0;
	u64 dir_ino = btrfs_ino(dir);

	if (dir->logged_trans < trans->transid)
		return 0;

	ret = join_running_log_trans(root);
	if (ret)
		return 0;

	mutex_lock(&dir->log_mutex);

	log = root->log_root;
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_unlock;
	}

	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
				   name, name_len, -1);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto fail;
	}
	if (di) {
		ret = btrfs_delete_one_dir_name(trans, log, path, di);
		bytes_del += name_len;
		if (ret) {
			err = ret;
			goto fail;
		}
	}
	btrfs_release_path(path);
	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
					 index, name, name_len, -1);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto fail;
	}
	if (di) {
		ret = btrfs_delete_one_dir_name(trans, log, path, di);
		bytes_del += name_len;
		if (ret) {
			err = ret;
			goto fail;
		}
	}

	/* update the directory size in the log to reflect the names
	 * we have removed
	 */
	if (bytes_del) {
		struct btrfs_key key;

		key.objectid = dir_ino;
		key.offset = 0;
		key.type = BTRFS_INODE_ITEM_KEY;
		btrfs_release_path(path);

		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (ret == 0) {
			struct btrfs_inode_item *item;
			u64 i_size;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			i_size = btrfs_inode_size(path->nodes[0], item);
			if (i_size > bytes_del)
				i_size -= bytes_del;
			else
				i_size = 0;
			btrfs_set_inode_size(path->nodes[0], item,
i_size); 3305 btrfs_mark_buffer_dirty(path->nodes[0]); 3306 } else 3307 ret = 0; 3308 btrfs_release_path(path); 3309 } 3310 fail: 3311 btrfs_free_path(path); 3312 out_unlock: 3313 mutex_unlock(&dir->log_mutex); 3314 if (ret == -ENOSPC) { 3315 btrfs_set_log_full_commit(root->fs_info, trans); 3316 ret = 0; 3317 } else if (ret < 0) 3318 btrfs_abort_transaction(trans, ret); 3319 3320 btrfs_end_log_trans(root); 3321 3322 return err; 3323 } 3324 3325 /* see comments for btrfs_del_dir_entries_in_log */ 3326 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3327 struct btrfs_root *root, 3328 const char *name, int name_len, 3329 struct btrfs_inode *inode, u64 dirid) 3330 { 3331 struct btrfs_fs_info *fs_info = root->fs_info; 3332 struct btrfs_root *log; 3333 u64 index; 3334 int ret; 3335 3336 if (inode->logged_trans < trans->transid) 3337 return 0; 3338 3339 ret = join_running_log_trans(root); 3340 if (ret) 3341 return 0; 3342 log = root->log_root; 3343 mutex_lock(&inode->log_mutex); 3344 3345 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3346 dirid, &index); 3347 mutex_unlock(&inode->log_mutex); 3348 if (ret == -ENOSPC) { 3349 btrfs_set_log_full_commit(fs_info, trans); 3350 ret = 0; 3351 } else if (ret < 0 && ret != -ENOENT) 3352 btrfs_abort_transaction(trans, ret); 3353 btrfs_end_log_trans(root); 3354 3355 return ret; 3356 } 3357 3358 /* 3359 * creates a range item in the log for 'dirid'. first_offset and 3360 * last_offset tell us which parts of the key space the log should 3361 * be considered authoritative for. 3362 */ 3363 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3364 struct btrfs_root *log, 3365 struct btrfs_path *path, 3366 int key_type, u64 dirid, 3367 u64 first_offset, u64 last_offset) 3368 { 3369 int ret; 3370 struct btrfs_key key; 3371 struct btrfs_dir_log_item *item; 3372 3373 key.objectid = dirid; 3374 key.offset = first_offset; 3375 if (key_type == BTRFS_DIR_ITEM_KEY) 3376 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3377 else 3378 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3379 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3380 if (ret) 3381 return ret; 3382 3383 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3384 struct btrfs_dir_log_item); 3385 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3386 btrfs_mark_buffer_dirty(path->nodes[0]); 3387 btrfs_release_path(path); 3388 return 0; 3389 } 3390 3391 /* 3392 * log all the items included in the current transaction for a given 3393 * directory. 
This also creates the range items in the log tree required 3394 * to replay anything deleted before the fsync 3395 */ 3396 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3397 struct btrfs_root *root, struct btrfs_inode *inode, 3398 struct btrfs_path *path, 3399 struct btrfs_path *dst_path, int key_type, 3400 struct btrfs_log_ctx *ctx, 3401 u64 min_offset, u64 *last_offset_ret) 3402 { 3403 struct btrfs_key min_key; 3404 struct btrfs_root *log = root->log_root; 3405 struct extent_buffer *src; 3406 int err = 0; 3407 int ret; 3408 int i; 3409 int nritems; 3410 u64 first_offset = min_offset; 3411 u64 last_offset = (u64)-1; 3412 u64 ino = btrfs_ino(inode); 3413 3414 log = root->log_root; 3415 3416 min_key.objectid = ino; 3417 min_key.type = key_type; 3418 min_key.offset = min_offset; 3419 3420 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3421 3422 /* 3423 * we didn't find anything from this transaction, see if there 3424 * is anything at all 3425 */ 3426 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3427 min_key.objectid = ino; 3428 min_key.type = key_type; 3429 min_key.offset = (u64)-1; 3430 btrfs_release_path(path); 3431 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3432 if (ret < 0) { 3433 btrfs_release_path(path); 3434 return ret; 3435 } 3436 ret = btrfs_previous_item(root, path, ino, key_type); 3437 3438 /* if ret == 0 there are items for this type, 3439 * create a range to tell us the last key of this type. 3440 * otherwise, there are no items in this directory after 3441 * *min_offset, and we create a range to indicate that. 3442 */ 3443 if (ret == 0) { 3444 struct btrfs_key tmp; 3445 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3446 path->slots[0]); 3447 if (key_type == tmp.type) 3448 first_offset = max(min_offset, tmp.offset) + 1; 3449 } 3450 goto done; 3451 } 3452 3453 /* go backward to find any previous key */ 3454 ret = btrfs_previous_item(root, path, ino, key_type); 3455 if (ret == 0) { 3456 struct btrfs_key tmp; 3457 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3458 if (key_type == tmp.type) { 3459 first_offset = tmp.offset; 3460 ret = overwrite_item(trans, log, dst_path, 3461 path->nodes[0], path->slots[0], 3462 &tmp); 3463 if (ret) { 3464 err = ret; 3465 goto done; 3466 } 3467 } 3468 } 3469 btrfs_release_path(path); 3470 3471 /* find the first key from this transaction again */ 3472 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3473 if (WARN_ON(ret != 0)) 3474 goto done; 3475 3476 /* 3477 * we have a block from this transaction, log every item in it 3478 * from our directory 3479 */ 3480 while (1) { 3481 struct btrfs_key tmp; 3482 src = path->nodes[0]; 3483 nritems = btrfs_header_nritems(src); 3484 for (i = path->slots[0]; i < nritems; i++) { 3485 struct btrfs_dir_item *di; 3486 3487 btrfs_item_key_to_cpu(src, &min_key, i); 3488 3489 if (min_key.objectid != ino || min_key.type != key_type) 3490 goto done; 3491 ret = overwrite_item(trans, log, dst_path, src, i, 3492 &min_key); 3493 if (ret) { 3494 err = ret; 3495 goto done; 3496 } 3497 3498 /* 3499 * We must make sure that when we log a directory entry, 3500 * the corresponding inode, after log replay, has a 3501 * matching link count. 
For example:
			 *
			 * touch foo
			 * mkdir mydir
			 * sync
			 * ln foo mydir/bar
			 * xfs_io -c "fsync" mydir
			 * <crash>
			 * <mount fs and log replay>
			 *
			 * Would result in an fsync log that, when replayed,
			 * would leave our file inode with a link count of 1,
			 * but with two directory entries pointing to the same
			 * inode. After removing one of the names, it would not
			 * be possible to remove the other name, which always
			 * resulted in stale file handle errors, and it would
			 * not be possible to rmdir the parent directory, since
			 * its i_size could never decrement to the value
			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY
			 * errors.
			 */
			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
			btrfs_dir_item_key_to_cpu(src, di, &tmp);
			if (ctx &&
			    (btrfs_dir_transid(src, di) == trans->transid ||
			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
			    tmp.type != BTRFS_ROOT_ITEM_KEY)
				ctx->log_new_dentries = true;
		}
		path->slots[0] = nritems;

		/*
		 * look ahead to the next item and see if it is also
		 * from this directory and from this transaction
		 */
		ret = btrfs_next_leaf(root, path);
		if (ret) {
			if (ret == 1)
				last_offset = (u64)-1;
			else
				err = ret;
			goto done;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (tmp.objectid != ino || tmp.type != key_type) {
			last_offset = (u64)-1;
			goto done;
		}
		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret)
				err = ret;
			else
				last_offset = tmp.offset;
			goto done;
		}
	}
done:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);

	if (err == 0) {
		*last_offset_ret = last_offset;
		/*
		 * insert the log range keys to indicate where the log
		 * is valid
		 */
		ret = insert_dir_log_key(trans, log, path, key_type,
					 ino, first_offset, last_offset);
		if (ret)
			err = ret;
	}
	return err;
}

/*
 * Logging directories is very similar to logging inodes. We find all the items
 * from the current transaction and write them to the log.
 *
 * The recovery code scans the directory in the subvolume, and if it finds a
 * key in the range logged that is not present in the log tree, then it means
 * that dir entry was unlinked during the transaction.
 *
 * In order for that scan to work, we must include one key smaller than
 * the smallest logged by this transaction and one key larger than the largest
 * key logged by this transaction.
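 *
 * For example (illustrative): if this transaction touched dir index keys
 * 10 through 20, the log gets a range item spanning those offsets plus a
 * boundary key on each side, so at replay time a subvolume key at offset
 * 15 that is absent from the log is provably a deletion, while offsets
 * outside the range are left alone.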
3588 */ 3589 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3590 struct btrfs_root *root, struct btrfs_inode *inode, 3591 struct btrfs_path *path, 3592 struct btrfs_path *dst_path, 3593 struct btrfs_log_ctx *ctx) 3594 { 3595 u64 min_key; 3596 u64 max_key; 3597 int ret; 3598 int key_type = BTRFS_DIR_ITEM_KEY; 3599 3600 again: 3601 min_key = 0; 3602 max_key = 0; 3603 while (1) { 3604 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3605 ctx, min_key, &max_key); 3606 if (ret) 3607 return ret; 3608 if (max_key == (u64)-1) 3609 break; 3610 min_key = max_key + 1; 3611 } 3612 3613 if (key_type == BTRFS_DIR_ITEM_KEY) { 3614 key_type = BTRFS_DIR_INDEX_KEY; 3615 goto again; 3616 } 3617 return 0; 3618 } 3619 3620 /* 3621 * a helper function to drop items from the log before we relog an 3622 * inode. max_key_type indicates the highest item type to remove. 3623 * This cannot be run for file data extents because it does not 3624 * free the extents they point to. 3625 */ 3626 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3627 struct btrfs_root *log, 3628 struct btrfs_path *path, 3629 u64 objectid, int max_key_type) 3630 { 3631 int ret; 3632 struct btrfs_key key; 3633 struct btrfs_key found_key; 3634 int start_slot; 3635 3636 key.objectid = objectid; 3637 key.type = max_key_type; 3638 key.offset = (u64)-1; 3639 3640 while (1) { 3641 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3642 BUG_ON(ret == 0); /* Logic error */ 3643 if (ret < 0) 3644 break; 3645 3646 if (path->slots[0] == 0) 3647 break; 3648 3649 path->slots[0]--; 3650 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3651 path->slots[0]); 3652 3653 if (found_key.objectid != objectid) 3654 break; 3655 3656 found_key.offset = 0; 3657 found_key.type = 0; 3658 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3659 &start_slot); 3660 3661 ret = btrfs_del_items(trans, log, path, start_slot, 3662 path->slots[0] - start_slot + 1); 3663 /* 3664 * If start slot isn't 0 then we don't need to re-search, we've 3665 * found the last guy with the objectid in this tree. 
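		 *
		 * For example (illustrative): if the leaf holds keys for
		 * objectids 5 5 5 7 7 and we are dropping objectid 7, the
		 * bin search puts start_slot on the first key for 7, which
		 * is non-zero, so every item for 7 fit in this leaf and the
		 * loop can stop.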
		 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	if (ret > 0)
		ret = 0;
	return ret;
}

static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only,
			    u64 logged_isize)
{
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	if (log_inode_only) {
		/* set the generation to zero so the recovery code
		 * can tell the difference between logging done just
		 * to say 'this inode exists' and logging done to say
		 * 'update this inode with these values'
		 */
		btrfs_set_token_inode_generation(leaf, item, 0, &token);
		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
	} else {
		btrfs_set_token_inode_generation(leaf, item,
						 BTRFS_I(inode)->generation,
						 &token);
		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
	}

	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);

	btrfs_set_token_timespec_sec(leaf, &item->atime,
				     inode->i_atime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->atime,
				      inode->i_atime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->mtime,
				     inode->i_mtime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
				      inode->i_mtime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->ctime,
				     inode->i_ctime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
				      inode->i_ctime.tv_nsec, &token);

	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
				     &token);

	btrfs_set_token_inode_sequence(leaf, item,
				       inode_peek_iversion(inode), &token);
	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
}

static int log_inode_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log, struct btrfs_path *path,
			  struct btrfs_inode *inode)
{
	struct btrfs_inode_item *inode_item;
	int ret;

	ret = btrfs_insert_empty_item(trans, log, path,
				      &inode->location, sizeof(*inode_item));
	if (ret && ret != -EEXIST)
		return ret;
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
			0, 0);
	btrfs_release_path(path);
	return 0;
}

static noinline int copy_items(struct btrfs_trans_handle *trans,
			       struct btrfs_inode *inode,
			       struct btrfs_path *dst_path,
			       struct btrfs_path *src_path, u64 *last_extent,
			       int start_slot, int nr, int inode_only,
			       u64 logged_isize)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
	unsigned long src_offset;
	unsigned long dst_offset;
	struct btrfs_root *log = inode->root->log_root;
	struct btrfs_file_extent_item *extent;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *src =
src_path->nodes[0]; 3766 struct btrfs_key first_key, last_key, key; 3767 int ret; 3768 struct btrfs_key *ins_keys; 3769 u32 *ins_sizes; 3770 char *ins_data; 3771 int i; 3772 struct list_head ordered_sums; 3773 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3774 bool has_extents = false; 3775 bool need_find_last_extent = true; 3776 bool done = false; 3777 3778 INIT_LIST_HEAD(&ordered_sums); 3779 3780 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3781 nr * sizeof(u32), GFP_NOFS); 3782 if (!ins_data) 3783 return -ENOMEM; 3784 3785 first_key.objectid = (u64)-1; 3786 3787 ins_sizes = (u32 *)ins_data; 3788 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3789 3790 for (i = 0; i < nr; i++) { 3791 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3792 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3793 } 3794 ret = btrfs_insert_empty_items(trans, log, dst_path, 3795 ins_keys, ins_sizes, nr); 3796 if (ret) { 3797 kfree(ins_data); 3798 return ret; 3799 } 3800 3801 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3802 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3803 dst_path->slots[0]); 3804 3805 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3806 3807 if (i == nr - 1) 3808 last_key = ins_keys[i]; 3809 3810 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3811 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3812 dst_path->slots[0], 3813 struct btrfs_inode_item); 3814 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3815 &inode->vfs_inode, 3816 inode_only == LOG_INODE_EXISTS, 3817 logged_isize); 3818 } else { 3819 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3820 src_offset, ins_sizes[i]); 3821 } 3822 3823 /* 3824 * We set need_find_last_extent here in case we know we were 3825 * processing other items and then walk into the first extent in 3826 * the inode. If we don't hit an extent then nothing changes, 3827 * we'll do the last search the next time around. 
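		 *
		 * For example (illustrative): a batch holding only xattr or
		 * ref items never sets has_extents, so the hole-filling pass
		 * below is skipped entirely; a mixed batch clears
		 * need_find_last_extent on the first non-extent item, while
		 * a batch made up purely of extent items leaves it set and
		 * forces the walk back to the previous leaf.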
3828 */ 3829 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3830 has_extents = true; 3831 if (first_key.objectid == (u64)-1) 3832 first_key = ins_keys[i]; 3833 } else { 3834 need_find_last_extent = false; 3835 } 3836 3837 /* take a reference on file data extents so that truncates 3838 * or deletes of this inode don't have to relog the inode 3839 * again 3840 */ 3841 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3842 !skip_csum) { 3843 int found_type; 3844 extent = btrfs_item_ptr(src, start_slot + i, 3845 struct btrfs_file_extent_item); 3846 3847 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3848 continue; 3849 3850 found_type = btrfs_file_extent_type(src, extent); 3851 if (found_type == BTRFS_FILE_EXTENT_REG) { 3852 u64 ds, dl, cs, cl; 3853 ds = btrfs_file_extent_disk_bytenr(src, 3854 extent); 3855 /* ds == 0 is a hole */ 3856 if (ds == 0) 3857 continue; 3858 3859 dl = btrfs_file_extent_disk_num_bytes(src, 3860 extent); 3861 cs = btrfs_file_extent_offset(src, extent); 3862 cl = btrfs_file_extent_num_bytes(src, 3863 extent); 3864 if (btrfs_file_extent_compression(src, 3865 extent)) { 3866 cs = 0; 3867 cl = dl; 3868 } 3869 3870 ret = btrfs_lookup_csums_range( 3871 fs_info->csum_root, 3872 ds + cs, ds + cs + cl - 1, 3873 &ordered_sums, 0); 3874 if (ret) { 3875 btrfs_release_path(dst_path); 3876 kfree(ins_data); 3877 return ret; 3878 } 3879 } 3880 } 3881 } 3882 3883 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3884 btrfs_release_path(dst_path); 3885 kfree(ins_data); 3886 3887 /* 3888 * we have to do this after the loop above to avoid changing the 3889 * log tree while trying to change the log tree. 3890 */ 3891 ret = 0; 3892 while (!list_empty(&ordered_sums)) { 3893 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3894 struct btrfs_ordered_sum, 3895 list); 3896 if (!ret) 3897 ret = btrfs_csum_file_blocks(trans, log, sums); 3898 list_del(&sums->list); 3899 kfree(sums); 3900 } 3901 3902 if (!has_extents) 3903 return ret; 3904 3905 if (need_find_last_extent && *last_extent == first_key.offset) { 3906 /* 3907 * We don't have any leafs between our current one and the one 3908 * we processed before that can have file extent items for our 3909 * inode (and have a generation number smaller than our current 3910 * transaction id). 3911 */ 3912 need_find_last_extent = false; 3913 } 3914 3915 /* 3916 * Because we use btrfs_search_forward we could skip leaves that were 3917 * not modified and then assume *last_extent is valid when it really 3918 * isn't. So back up to the previous leaf and read the end of the last 3919 * extent before we go and fill in holes. 
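	 *
	 * For example (illustrative): an untouched leaf may end with an
	 * extent covering [0, 64K) while the first modified leaf begins
	 * with an extent at 96K. Without reading that previous leaf we
	 * could not tell whether [64K, 96K) is a real hole that needs an
	 * explicit hole extent logged or simply space we never examined.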
3920 */ 3921 if (need_find_last_extent) { 3922 u64 len; 3923 3924 ret = btrfs_prev_leaf(inode->root, src_path); 3925 if (ret < 0) 3926 return ret; 3927 if (ret) 3928 goto fill_holes; 3929 if (src_path->slots[0]) 3930 src_path->slots[0]--; 3931 src = src_path->nodes[0]; 3932 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3933 if (key.objectid != btrfs_ino(inode) || 3934 key.type != BTRFS_EXTENT_DATA_KEY) 3935 goto fill_holes; 3936 extent = btrfs_item_ptr(src, src_path->slots[0], 3937 struct btrfs_file_extent_item); 3938 if (btrfs_file_extent_type(src, extent) == 3939 BTRFS_FILE_EXTENT_INLINE) { 3940 len = btrfs_file_extent_inline_len(src, 3941 src_path->slots[0], 3942 extent); 3943 *last_extent = ALIGN(key.offset + len, 3944 fs_info->sectorsize); 3945 } else { 3946 len = btrfs_file_extent_num_bytes(src, extent); 3947 *last_extent = key.offset + len; 3948 } 3949 } 3950 fill_holes: 3951 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3952 * things could have happened 3953 * 3954 * 1) A merge could have happened, so we could currently be on a leaf 3955 * that holds what we were copying in the first place. 3956 * 2) A split could have happened, and now not all of the items we want 3957 * are on the same leaf. 3958 * 3959 * So we need to adjust how we search for holes, we need to drop the 3960 * path and re-search for the first extent key we found, and then walk 3961 * forward until we hit the last one we copied. 3962 */ 3963 if (need_find_last_extent) { 3964 /* btrfs_prev_leaf could return 1 without releasing the path */ 3965 btrfs_release_path(src_path); 3966 ret = btrfs_search_slot(NULL, inode->root, &first_key, 3967 src_path, 0, 0); 3968 if (ret < 0) 3969 return ret; 3970 ASSERT(ret == 0); 3971 src = src_path->nodes[0]; 3972 i = src_path->slots[0]; 3973 } else { 3974 i = start_slot; 3975 } 3976 3977 /* 3978 * Ok so here we need to go through and fill in any holes we may have 3979 * to make sure that holes are punched for those areas in case they had 3980 * extents previously. 3981 */ 3982 while (!done) { 3983 u64 offset, len; 3984 u64 extent_end; 3985 3986 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3987 ret = btrfs_next_leaf(inode->root, src_path); 3988 if (ret < 0) 3989 return ret; 3990 ASSERT(ret == 0); 3991 src = src_path->nodes[0]; 3992 i = 0; 3993 need_find_last_extent = true; 3994 } 3995 3996 btrfs_item_key_to_cpu(src, &key, i); 3997 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3998 done = true; 3999 if (key.objectid != btrfs_ino(inode) || 4000 key.type != BTRFS_EXTENT_DATA_KEY) { 4001 i++; 4002 continue; 4003 } 4004 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 4005 if (btrfs_file_extent_type(src, extent) == 4006 BTRFS_FILE_EXTENT_INLINE) { 4007 len = btrfs_file_extent_inline_len(src, i, extent); 4008 extent_end = ALIGN(key.offset + len, 4009 fs_info->sectorsize); 4010 } else { 4011 len = btrfs_file_extent_num_bytes(src, extent); 4012 extent_end = key.offset + len; 4013 } 4014 i++; 4015 4016 if (*last_extent == key.offset) { 4017 *last_extent = extent_end; 4018 continue; 4019 } 4020 offset = *last_extent; 4021 len = key.offset - *last_extent; 4022 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 4023 offset, 0, 0, len, 0, len, 0, 0, 0); 4024 if (ret) 4025 break; 4026 *last_extent = extent_end; 4027 } 4028 4029 /* 4030 * Check if there is a hole between the last extent found in our leaf 4031 * and the first extent in the next leaf. 
If there is one, we need to 4032 * log an explicit hole so that at replay time we can punch the hole. 4033 */ 4034 if (ret == 0 && 4035 key.objectid == btrfs_ino(inode) && 4036 key.type == BTRFS_EXTENT_DATA_KEY && 4037 i == btrfs_header_nritems(src_path->nodes[0])) { 4038 ret = btrfs_next_leaf(inode->root, src_path); 4039 need_find_last_extent = true; 4040 if (ret > 0) { 4041 ret = 0; 4042 } else if (ret == 0) { 4043 btrfs_item_key_to_cpu(src_path->nodes[0], &key, 4044 src_path->slots[0]); 4045 if (key.objectid == btrfs_ino(inode) && 4046 key.type == BTRFS_EXTENT_DATA_KEY && 4047 *last_extent < key.offset) { 4048 const u64 len = key.offset - *last_extent; 4049 4050 ret = btrfs_insert_file_extent(trans, log, 4051 btrfs_ino(inode), 4052 *last_extent, 0, 4053 0, len, 0, len, 4054 0, 0, 0); 4055 } 4056 } 4057 } 4058 /* 4059 * Need to let the callers know we dropped the path so they should 4060 * re-search. 4061 */ 4062 if (!ret && need_find_last_extent) 4063 ret = 1; 4064 return ret; 4065 } 4066 4067 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 4068 { 4069 struct extent_map *em1, *em2; 4070 4071 em1 = list_entry(a, struct extent_map, list); 4072 em2 = list_entry(b, struct extent_map, list); 4073 4074 if (em1->start < em2->start) 4075 return -1; 4076 else if (em1->start > em2->start) 4077 return 1; 4078 return 0; 4079 } 4080 4081 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 4082 struct inode *inode, 4083 struct btrfs_root *root, 4084 const struct extent_map *em, 4085 const struct list_head *logged_list, 4086 bool *ordered_io_error) 4087 { 4088 struct btrfs_fs_info *fs_info = root->fs_info; 4089 struct btrfs_ordered_extent *ordered; 4090 struct btrfs_root *log = root->log_root; 4091 u64 mod_start = em->mod_start; 4092 u64 mod_len = em->mod_len; 4093 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 4094 u64 csum_offset; 4095 u64 csum_len; 4096 LIST_HEAD(ordered_sums); 4097 int ret = 0; 4098 4099 *ordered_io_error = false; 4100 4101 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4102 em->block_start == EXTENT_MAP_HOLE) 4103 return 0; 4104 4105 /* 4106 * Wait for any ordered extent that covers our extent map. If it 4107 * finishes without an error, first check and see if our csums are on 4108 * our outstanding ordered extents. 4109 */ 4110 list_for_each_entry(ordered, logged_list, log_list) { 4111 struct btrfs_ordered_sum *sum; 4112 4113 if (!mod_len) 4114 break; 4115 4116 if (ordered->file_offset + ordered->len <= mod_start || 4117 mod_start + mod_len <= ordered->file_offset) 4118 continue; 4119 4120 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 4121 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 4122 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 4123 const u64 start = ordered->file_offset; 4124 const u64 end = ordered->file_offset + ordered->len - 1; 4125 4126 WARN_ON(ordered->inode != inode); 4127 filemap_fdatawrite_range(inode->i_mapping, start, end); 4128 } 4129 4130 wait_event(ordered->wait, 4131 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 4132 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 4133 4134 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 4135 /* 4136 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 4137 * i_mapping flags, so that the next fsync won't get 4138 * an outdated io error too.
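 * (filemap_check_errors() below both tests and clears those flags, so
 * the error is reported through *ordered_io_error exactly once.)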
4139 */ 4140 filemap_check_errors(inode->i_mapping); 4141 *ordered_io_error = true; 4142 break; 4143 } 4144 /* 4145 * We are going to copy all the csums on this ordered extent, so 4146 * go ahead and adjust mod_start and mod_len in case this 4147 * ordered extent has already been logged. 4148 */ 4149 if (ordered->file_offset > mod_start) { 4150 if (ordered->file_offset + ordered->len >= 4151 mod_start + mod_len) 4152 mod_len = ordered->file_offset - mod_start; 4153 /* 4154 * If we have this case 4155 * 4156 * |--------- logged extent ---------| 4157 * |----- ordered extent ----| 4158 * 4159 * Just don't mess with mod_start and mod_len, we'll 4160 * just end up logging more csums than we need and it 4161 * will be ok. 4162 */ 4163 } else { 4164 if (ordered->file_offset + ordered->len < 4165 mod_start + mod_len) { 4166 mod_len = (mod_start + mod_len) - 4167 (ordered->file_offset + ordered->len); 4168 mod_start = ordered->file_offset + 4169 ordered->len; 4170 } else { 4171 mod_len = 0; 4172 } 4173 } 4174 4175 if (skip_csum) 4176 continue; 4177 4178 /* 4179 * To keep us from looping for the above case of an ordered 4180 * extent that falls inside of the logged extent. 4181 */ 4182 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 4183 &ordered->flags)) 4184 continue; 4185 4186 list_for_each_entry(sum, &ordered->list, list) { 4187 ret = btrfs_csum_file_blocks(trans, log, sum); 4188 if (ret) 4189 break; 4190 } 4191 } 4192 4193 if (*ordered_io_error || !mod_len || ret || skip_csum) 4194 return ret; 4195 4196 if (em->compress_type) { 4197 csum_offset = 0; 4198 csum_len = max(em->block_len, em->orig_block_len); 4199 } else { 4200 csum_offset = mod_start - em->start; 4201 csum_len = mod_len; 4202 } 4203 4204 /* block start is already adjusted for the file extent offset. 
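 * For compressed extents the checksums cover the on-disk (compressed)
 * bytes, which is why csum_offset was set to 0 and csum_len to the full
 * on-disk size of the extent in the branch above.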
*/ 4205 ret = btrfs_lookup_csums_range(fs_info->csum_root, 4206 em->block_start + csum_offset, 4207 em->block_start + csum_offset + 4208 csum_len - 1, &ordered_sums, 0); 4209 if (ret) 4210 return ret; 4211 4212 while (!list_empty(&ordered_sums)) { 4213 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4214 struct btrfs_ordered_sum, 4215 list); 4216 if (!ret) 4217 ret = btrfs_csum_file_blocks(trans, log, sums); 4218 list_del(&sums->list); 4219 kfree(sums); 4220 } 4221 4222 return ret; 4223 } 4224 4225 static int log_one_extent(struct btrfs_trans_handle *trans, 4226 struct btrfs_inode *inode, struct btrfs_root *root, 4227 const struct extent_map *em, 4228 struct btrfs_path *path, 4229 const struct list_head *logged_list, 4230 struct btrfs_log_ctx *ctx) 4231 { 4232 struct btrfs_root *log = root->log_root; 4233 struct btrfs_file_extent_item *fi; 4234 struct extent_buffer *leaf; 4235 struct btrfs_map_token token; 4236 struct btrfs_key key; 4237 u64 extent_offset = em->start - em->orig_start; 4238 u64 block_len; 4239 int ret; 4240 int extent_inserted = 0; 4241 bool ordered_io_err = false; 4242 4243 ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, 4244 logged_list, &ordered_io_err); 4245 if (ret) 4246 return ret; 4247 4248 if (ordered_io_err) { 4249 ctx->io_err = -EIO; 4250 return ctx->io_err; 4251 } 4252 4253 btrfs_init_map_token(&token); 4254 4255 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4256 em->start + em->len, NULL, 0, 1, 4257 sizeof(*fi), &extent_inserted); 4258 if (ret) 4259 return ret; 4260 4261 if (!extent_inserted) { 4262 key.objectid = btrfs_ino(inode); 4263 key.type = BTRFS_EXTENT_DATA_KEY; 4264 key.offset = em->start; 4265 4266 ret = btrfs_insert_empty_item(trans, log, path, &key, 4267 sizeof(*fi)); 4268 if (ret) 4269 return ret; 4270 } 4271 leaf = path->nodes[0]; 4272 fi = btrfs_item_ptr(leaf, path->slots[0], 4273 struct btrfs_file_extent_item); 4274 4275 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4276 &token); 4277 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4278 btrfs_set_token_file_extent_type(leaf, fi, 4279 BTRFS_FILE_EXTENT_PREALLOC, 4280 &token); 4281 else 4282 btrfs_set_token_file_extent_type(leaf, fi, 4283 BTRFS_FILE_EXTENT_REG, 4284 &token); 4285 4286 block_len = max(em->block_len, em->orig_block_len); 4287 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4288 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4289 em->block_start, 4290 &token); 4291 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4292 &token); 4293 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4294 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4295 em->block_start - 4296 extent_offset, &token); 4297 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4298 &token); 4299 } else { 4300 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4301 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4302 &token); 4303 } 4304 4305 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4306 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4307 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4308 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4309 &token); 4310 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4311 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4312 btrfs_mark_buffer_dirty(leaf); 4313 4314 btrfs_release_path(path); 4315 4316 return ret; 4317 } 4318 4319 /* 4320 * 
Log all prealloc extents beyond the inode's i_size to make sure we do not 4321 * lose them after doing a fast fsync and replaying the log. We scan the 4322 * subvolume's root instead of iterating the inode's extent map tree because 4323 * otherwise we can log incorrect extent items based on extent map conversion. 4324 * That can happen due to the fact that extent maps are merged when they 4325 * are not in the extent map tree's list of modified extents. 4326 */ 4327 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4328 struct btrfs_inode *inode, 4329 struct btrfs_path *path) 4330 { 4331 struct btrfs_root *root = inode->root; 4332 struct btrfs_key key; 4333 const u64 i_size = i_size_read(&inode->vfs_inode); 4334 const u64 ino = btrfs_ino(inode); 4335 struct btrfs_path *dst_path = NULL; 4336 u64 last_extent = (u64)-1; 4337 int ins_nr = 0; 4338 int start_slot; 4339 int ret; 4340 4341 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4342 return 0; 4343 4344 key.objectid = ino; 4345 key.type = BTRFS_EXTENT_DATA_KEY; 4346 key.offset = i_size; 4347 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4348 if (ret < 0) 4349 goto out; 4350 4351 while (true) { 4352 struct extent_buffer *leaf = path->nodes[0]; 4353 int slot = path->slots[0]; 4354 4355 if (slot >= btrfs_header_nritems(leaf)) { 4356 if (ins_nr > 0) { 4357 ret = copy_items(trans, inode, dst_path, path, 4358 &last_extent, start_slot, 4359 ins_nr, 1, 0); 4360 if (ret < 0) 4361 goto out; 4362 ins_nr = 0; 4363 } 4364 ret = btrfs_next_leaf(root, path); 4365 if (ret < 0) 4366 goto out; 4367 if (ret > 0) { 4368 ret = 0; 4369 break; 4370 } 4371 continue; 4372 } 4373 4374 btrfs_item_key_to_cpu(leaf, &key, slot); 4375 if (key.objectid > ino) 4376 break; 4377 if (WARN_ON_ONCE(key.objectid < ino) || 4378 key.type < BTRFS_EXTENT_DATA_KEY || 4379 key.offset < i_size) { 4380 path->slots[0]++; 4381 continue; 4382 } 4383 if (last_extent == (u64)-1) { 4384 last_extent = key.offset; 4385 /* 4386 * Avoid logging extent items that were already logged in past 4387 * fsync calls, which would lead to duplicate keys in the log tree.
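 * A fast fsync earlier in this transaction may have logged some of
 * these prealloc extents already, so drop everything from i_size
 * onwards from the log tree before copying them again.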
4388 */ 4389 do { 4390 ret = btrfs_truncate_inode_items(trans, 4391 root->log_root, 4392 &inode->vfs_inode, 4393 i_size, 4394 BTRFS_EXTENT_DATA_KEY); 4395 } while (ret == -EAGAIN); 4396 if (ret) 4397 goto out; 4398 } 4399 if (ins_nr == 0) 4400 start_slot = slot; 4401 ins_nr++; 4402 path->slots[0]++; 4403 if (!dst_path) { 4404 dst_path = btrfs_alloc_path(); 4405 if (!dst_path) { 4406 ret = -ENOMEM; 4407 goto out; 4408 } 4409 } 4410 } 4411 if (ins_nr > 0) { 4412 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4413 start_slot, ins_nr, 1, 0); 4414 if (ret > 0) 4415 ret = 0; 4416 } 4417 out: 4418 btrfs_release_path(path); 4419 btrfs_free_path(dst_path); 4420 return ret; 4421 } 4422 4423 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4424 struct btrfs_root *root, 4425 struct btrfs_inode *inode, 4426 struct btrfs_path *path, 4427 struct list_head *logged_list, 4428 struct btrfs_log_ctx *ctx, 4429 const u64 start, 4430 const u64 end) 4431 { 4432 struct extent_map *em, *n; 4433 struct list_head extents; 4434 struct extent_map_tree *tree = &inode->extent_tree; 4435 u64 logged_start, logged_end; 4436 u64 test_gen; 4437 int ret = 0; 4438 int num = 0; 4439 4440 INIT_LIST_HEAD(&extents); 4441 4442 down_write(&inode->dio_sem); 4443 write_lock(&tree->lock); 4444 test_gen = root->fs_info->last_trans_committed; 4445 logged_start = start; 4446 logged_end = end; 4447 4448 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4449 list_del_init(&em->list); 4450 /* 4451 * Just an arbitrary number, this can be really CPU intensive 4452 * once we start getting a lot of extents, and really once we 4453 * have a bunch of extents we just want to commit since it will 4454 * be faster. 4455 */ 4456 if (++num > 32768) { 4457 list_del_init(&tree->modified_extents); 4458 ret = -EFBIG; 4459 goto process; 4460 } 4461 4462 if (em->generation <= test_gen) 4463 continue; 4464 4465 /* We log prealloc extents beyond eof later. */ 4466 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4467 em->start >= i_size_read(&inode->vfs_inode)) 4468 continue; 4469 4470 if (em->start < logged_start) 4471 logged_start = em->start; 4472 if ((em->start + em->len - 1) > logged_end) 4473 logged_end = em->start + em->len - 1; 4474 4475 /* Need a ref to keep it from getting evicted from cache */ 4476 refcount_inc(&em->refs); 4477 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4478 list_add_tail(&em->list, &extents); 4479 num++; 4480 } 4481 4482 list_sort(NULL, &extents, extent_cmp); 4483 btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); 4484 /* 4485 * Some ordered extents started by fsync might have completed 4486 * before we could collect them into the list logged_list, which 4487 * means they're gone, not in our logged_list nor in the inode's 4488 * ordered tree. We want the application/user space to know an 4489 * error happened while attempting to persist file data so that 4490 * it can take proper action. If such error happened, we leave 4491 * without writing to the log tree and the fsync must report the 4492 * file data write error and not commit the current transaction. 4493 */ 4494 ret = filemap_check_errors(inode->vfs_inode.i_mapping); 4495 if (ret) 4496 ctx->io_err = ret; 4497 process: 4498 while (!list_empty(&extents)) { 4499 em = list_entry(extents.next, struct extent_map, list); 4500 4501 list_del_init(&em->list); 4502 4503 /* 4504 * If we had an error we just need to delete everybody from our 4505 * private list. 
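 * clear_em_logging() drops EXTENT_FLAG_LOGGING again so that the
 * extent map can be merged or evicted once we release our reference.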
4506 */ 4507 if (ret) { 4508 clear_em_logging(tree, em); 4509 free_extent_map(em); 4510 continue; 4511 } 4512 4513 write_unlock(&tree->lock); 4514 4515 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4516 ctx); 4517 write_lock(&tree->lock); 4518 clear_em_logging(tree, em); 4519 free_extent_map(em); 4520 } 4521 WARN_ON(!list_empty(&extents)); 4522 write_unlock(&tree->lock); 4523 up_write(&inode->dio_sem); 4524 4525 btrfs_release_path(path); 4526 if (!ret) 4527 ret = btrfs_log_prealloc_extents(trans, inode, path); 4528 4529 return ret; 4530 } 4531 4532 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4533 struct btrfs_path *path, u64 *size_ret) 4534 { 4535 struct btrfs_key key; 4536 int ret; 4537 4538 key.objectid = btrfs_ino(inode); 4539 key.type = BTRFS_INODE_ITEM_KEY; 4540 key.offset = 0; 4541 4542 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4543 if (ret < 0) { 4544 return ret; 4545 } else if (ret > 0) { 4546 *size_ret = 0; 4547 } else { 4548 struct btrfs_inode_item *item; 4549 4550 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4551 struct btrfs_inode_item); 4552 *size_ret = btrfs_inode_size(path->nodes[0], item); 4553 } 4554 4555 btrfs_release_path(path); 4556 return 0; 4557 } 4558 4559 /* 4560 * At the moment we always log all xattrs. This is to figure out at log replay 4561 * time which xattrs must have their deletion replayed. If an xattr is missing 4562 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4563 * because if an xattr is deleted, the inode is fsynced and a power failure 4564 * happens, the log is replayed the next time the fs is mounted and we want 4565 * the xattr to not exist anymore (same behaviour as other filesystems 4566 * with a journal, ext3/4, xfs, f2fs, etc).
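 *
 * For example:
 *
 * setfattr -n user.foo -v bar file
 * sync
 * setfattr -x user.foo file
 * xfs_io -c fsync file
 * <power failure>
 *
 * After replaying the log on the next mount, user.foo must be gone.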
4567 */ 4568 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4569 struct btrfs_root *root, 4570 struct btrfs_inode *inode, 4571 struct btrfs_path *path, 4572 struct btrfs_path *dst_path) 4573 { 4574 int ret; 4575 struct btrfs_key key; 4576 const u64 ino = btrfs_ino(inode); 4577 int ins_nr = 0; 4578 int start_slot = 0; 4579 4580 key.objectid = ino; 4581 key.type = BTRFS_XATTR_ITEM_KEY; 4582 key.offset = 0; 4583 4584 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4585 if (ret < 0) 4586 return ret; 4587 4588 while (true) { 4589 int slot = path->slots[0]; 4590 struct extent_buffer *leaf = path->nodes[0]; 4591 int nritems = btrfs_header_nritems(leaf); 4592 4593 if (slot >= nritems) { 4594 if (ins_nr > 0) { 4595 u64 last_extent = 0; 4596 4597 ret = copy_items(trans, inode, dst_path, path, 4598 &last_extent, start_slot, 4599 ins_nr, 1, 0); 4600 /* can't be 1, extent items aren't processed */ 4601 ASSERT(ret <= 0); 4602 if (ret < 0) 4603 return ret; 4604 ins_nr = 0; 4605 } 4606 ret = btrfs_next_leaf(root, path); 4607 if (ret < 0) 4608 return ret; 4609 else if (ret > 0) 4610 break; 4611 continue; 4612 } 4613 4614 btrfs_item_key_to_cpu(leaf, &key, slot); 4615 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4616 break; 4617 4618 if (ins_nr == 0) 4619 start_slot = slot; 4620 ins_nr++; 4621 path->slots[0]++; 4622 cond_resched(); 4623 } 4624 if (ins_nr > 0) { 4625 u64 last_extent = 0; 4626 4627 ret = copy_items(trans, inode, dst_path, path, 4628 &last_extent, start_slot, 4629 ins_nr, 1, 0); 4630 /* can't be 1, extent items aren't processed */ 4631 ASSERT(ret <= 0); 4632 if (ret < 0) 4633 return ret; 4634 } 4635 4636 return 0; 4637 } 4638 4639 /* 4640 * If the no holes feature is enabled we need to make sure any hole between the 4641 * last extent and the i_size of our inode is explicitly marked in the log. This 4642 * is to make sure that doing something like: 4643 * 4644 * 1) create file with 128Kb of data 4645 * 2) truncate file to 64Kb 4646 * 3) truncate file to 256Kb 4647 * 4) fsync file 4648 * 5) <crash/power failure> 4649 * 6) mount fs and trigger log replay 4650 * 4651 * Will give us a file with a size of 256Kb, the first 64Kb of data match what 4652 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4653 * file correspond to a hole. The presence of explicit holes in a log tree is 4654 * what guarantees that log replay will remove/adjust file extent items in the 4655 * fs/subvol tree. 4656 * 4657 * Here we do not need to care about holes between extents, that is already done 4658 * by copy_items(). We also only need to do this in the full sync path, where we 4659 * lookup for extents from the fs/subvol tree only. In the fast path case, we 4660 * lookup the list of modified extent maps and if any represents a hole, we 4661 * insert a corresponding extent representing a hole in the log tree. 
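 * The hole itself is logged as a regular file extent item with a disk
 * bytenr of 0 (see the btrfs_insert_file_extent() call below).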
4662 */ 4663 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4664 struct btrfs_root *root, 4665 struct btrfs_inode *inode, 4666 struct btrfs_path *path) 4667 { 4668 struct btrfs_fs_info *fs_info = root->fs_info; 4669 int ret; 4670 struct btrfs_key key; 4671 u64 hole_start; 4672 u64 hole_size; 4673 struct extent_buffer *leaf; 4674 struct btrfs_root *log = root->log_root; 4675 const u64 ino = btrfs_ino(inode); 4676 const u64 i_size = i_size_read(&inode->vfs_inode); 4677 4678 if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4679 return 0; 4680 4681 key.objectid = ino; 4682 key.type = BTRFS_EXTENT_DATA_KEY; 4683 key.offset = (u64)-1; 4684 4685 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4686 ASSERT(ret != 0); 4687 if (ret < 0) 4688 return ret; 4689 4690 ASSERT(path->slots[0] > 0); 4691 path->slots[0]--; 4692 leaf = path->nodes[0]; 4693 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4694 4695 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4696 /* inode does not have any extents */ 4697 hole_start = 0; 4698 hole_size = i_size; 4699 } else { 4700 struct btrfs_file_extent_item *extent; 4701 u64 len; 4702 4703 /* 4704 * If there's an extent beyond i_size, an explicit hole was 4705 * already inserted by copy_items(). 4706 */ 4707 if (key.offset >= i_size) 4708 return 0; 4709 4710 extent = btrfs_item_ptr(leaf, path->slots[0], 4711 struct btrfs_file_extent_item); 4712 4713 if (btrfs_file_extent_type(leaf, extent) == 4714 BTRFS_FILE_EXTENT_INLINE) { 4715 len = btrfs_file_extent_inline_len(leaf, 4716 path->slots[0], 4717 extent); 4718 ASSERT(len == i_size || 4719 (len == fs_info->sectorsize && 4720 btrfs_file_extent_compression(leaf, extent) != 4721 BTRFS_COMPRESS_NONE)); 4722 return 0; 4723 } 4724 4725 len = btrfs_file_extent_num_bytes(leaf, extent); 4726 /* Last extent goes beyond i_size, no need to log a hole. */ 4727 if (key.offset + len > i_size) 4728 return 0; 4729 hole_start = key.offset + len; 4730 hole_size = i_size - hole_start; 4731 } 4732 btrfs_release_path(path); 4733 4734 /* Last extent ends at i_size. */ 4735 if (hole_size == 0) 4736 return 0; 4737 4738 hole_size = ALIGN(hole_size, fs_info->sectorsize); 4739 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4740 hole_size, 0, hole_size, 0, 0, 0); 4741 return ret; 4742 } 4743 4744 /* 4745 * When we are logging a new inode X, check if it doesn't have a reference that 4746 * matches the reference from some other inode Y created in a past transaction 4747 * and that was renamed in the current transaction. If we don't do this, then at 4748 * log replay time we can lose inode Y (and all its files if it's a directory): 4749 * 4750 * mkdir /mnt/x 4751 * echo "hello world" > /mnt/x/foobar 4752 * sync 4753 * mv /mnt/x /mnt/y 4754 * mkdir /mnt/x # or touch /mnt/x 4755 * xfs_io -c fsync /mnt/x 4756 * <power fail> 4757 * mount fs, trigger log replay 4758 * 4759 * After the log replay procedure, we would lose the first directory and all its 4760 * files (file foobar). 
4761 * For the case where inode Y is not a directory we simply end up losing it: 4762 * 4763 * echo "123" > /mnt/foo 4764 * sync 4765 * mv /mnt/foo /mnt/bar 4766 * echo "abc" > /mnt/foo 4767 * xfs_io -c fsync /mnt/foo 4768 * <power fail> 4769 * 4770 * We also need this for cases where a snapshot entry is replaced by some other 4771 * entry (file or directory) otherwise we end up with an unreplayable log due to 4772 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4773 * if it were a regular entry: 4774 * 4775 * mkdir /mnt/x 4776 * btrfs subvolume snapshot /mnt /mnt/x/snap 4777 * btrfs subvolume delete /mnt/x/snap 4778 * rmdir /mnt/x 4779 * mkdir /mnt/x 4780 * fsync /mnt/x or fsync some new file inside it 4781 * <power fail> 4782 * 4783 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4784 * the same transaction. 4785 */ 4786 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4787 const int slot, 4788 const struct btrfs_key *key, 4789 struct btrfs_inode *inode, 4790 u64 *other_ino) 4791 { 4792 int ret; 4793 struct btrfs_path *search_path; 4794 char *name = NULL; 4795 u32 name_len = 0; 4796 u32 item_size = btrfs_item_size_nr(eb, slot); 4797 u32 cur_offset = 0; 4798 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4799 4800 search_path = btrfs_alloc_path(); 4801 if (!search_path) 4802 return -ENOMEM; 4803 search_path->search_commit_root = 1; 4804 search_path->skip_locking = 1; 4805 4806 while (cur_offset < item_size) { 4807 u64 parent; 4808 u32 this_name_len; 4809 u32 this_len; 4810 unsigned long name_ptr; 4811 struct btrfs_dir_item *di; 4812 4813 if (key->type == BTRFS_INODE_REF_KEY) { 4814 struct btrfs_inode_ref *iref; 4815 4816 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4817 parent = key->offset; 4818 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4819 name_ptr = (unsigned long)(iref + 1); 4820 this_len = sizeof(*iref) + this_name_len; 4821 } else { 4822 struct btrfs_inode_extref *extref; 4823 4824 extref = (struct btrfs_inode_extref *)(ptr + 4825 cur_offset); 4826 parent = btrfs_inode_extref_parent(eb, extref); 4827 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4828 name_ptr = (unsigned long)&extref->name; 4829 this_len = sizeof(*extref) + this_name_len; 4830 } 4831 4832 if (this_name_len > name_len) { 4833 char *new_name; 4834 4835 new_name = krealloc(name, this_name_len, GFP_NOFS); 4836 if (!new_name) { 4837 ret = -ENOMEM; 4838 goto out; 4839 } 4840 name_len = this_name_len; 4841 name = new_name; 4842 } 4843 4844 read_extent_buffer(eb, name, name_ptr, this_name_len); 4845 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4846 parent, name, this_name_len, 0); 4847 if (di && !IS_ERR(di)) { 4848 struct btrfs_key di_key; 4849 4850 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4851 di, &di_key); 4852 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4853 ret = 1; 4854 *other_ino = di_key.objectid; 4855 } else { 4856 ret = -EAGAIN; 4857 } 4858 goto out; 4859 } else if (IS_ERR(di)) { 4860 ret = PTR_ERR(di); 4861 goto out; 4862 } 4863 btrfs_release_path(search_path); 4864 4865 cur_offset += this_len; 4866 } 4867 ret = 0; 4868 out: 4869 btrfs_free_path(search_path); 4870 kfree(name); 4871 return ret; 4872 } 4873 4874 /* log a single inode in the tree log. 4875 * At least one parent directory for this inode must exist in the tree 4876 * or be logged already. 4877 * 4878 * Any items from this inode changed by the current transaction are copied 4879 * to the log tree. 
An extra reference is taken on any extents in this 4880 * file, allowing us to avoid a whole pile of corner cases around logging 4881 * blocks that have been removed from the tree. 4882 * 4883 * See LOG_INODE_ALL and related defines for a description of what inode_only 4884 * does. 4885 * 4886 * This handles both files and directories. 4887 */ 4888 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4889 struct btrfs_root *root, struct btrfs_inode *inode, 4890 int inode_only, 4891 const loff_t start, 4892 const loff_t end, 4893 struct btrfs_log_ctx *ctx) 4894 { 4895 struct btrfs_fs_info *fs_info = root->fs_info; 4896 struct btrfs_path *path; 4897 struct btrfs_path *dst_path; 4898 struct btrfs_key min_key; 4899 struct btrfs_key max_key; 4900 struct btrfs_root *log = root->log_root; 4901 LIST_HEAD(logged_list); 4902 u64 last_extent = 0; 4903 int err = 0; 4904 int ret; 4905 int nritems; 4906 int ins_start_slot = 0; 4907 int ins_nr; 4908 bool fast_search = false; 4909 u64 ino = btrfs_ino(inode); 4910 struct extent_map_tree *em_tree = &inode->extent_tree; 4911 u64 logged_isize = 0; 4912 bool need_log_inode_item = true; 4913 bool xattrs_logged = false; 4914 4915 path = btrfs_alloc_path(); 4916 if (!path) 4917 return -ENOMEM; 4918 dst_path = btrfs_alloc_path(); 4919 if (!dst_path) { 4920 btrfs_free_path(path); 4921 return -ENOMEM; 4922 } 4923 4924 min_key.objectid = ino; 4925 min_key.type = BTRFS_INODE_ITEM_KEY; 4926 min_key.offset = 0; 4927 4928 max_key.objectid = ino; 4929 4930 4931 /* today the code can only do partial logging of directories */ 4932 if (S_ISDIR(inode->vfs_inode.i_mode) || 4933 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4934 &inode->runtime_flags) && 4935 inode_only >= LOG_INODE_EXISTS)) 4936 max_key.type = BTRFS_XATTR_ITEM_KEY; 4937 else 4938 max_key.type = (u8)-1; 4939 max_key.offset = (u64)-1; 4940 4941 /* 4942 * Only run delayed items if we are a dir or a new file. 4943 * Otherwise commit the delayed inode only, which is needed in 4944 * order for the log replay code to mark inodes for link count 4945 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4946 */ 4947 if (S_ISDIR(inode->vfs_inode.i_mode) || 4948 inode->generation > fs_info->last_trans_committed) 4949 ret = btrfs_commit_inode_delayed_items(trans, inode); 4950 else 4951 ret = btrfs_commit_inode_delayed_inode(inode); 4952 4953 if (ret) { 4954 btrfs_free_path(path); 4955 btrfs_free_path(dst_path); 4956 return ret; 4957 } 4958 4959 if (inode_only == LOG_OTHER_INODE) { 4960 inode_only = LOG_INODE_EXISTS; 4961 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4962 } else { 4963 mutex_lock(&inode->log_mutex); 4964 } 4965 4966 /* 4967 * a brute force approach to making sure we get the most uptodate 4968 * copies of everything. 4969 */ 4970 if (S_ISDIR(inode->vfs_inode.i_mode)) { 4971 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4972 4973 if (inode_only == LOG_INODE_EXISTS) 4974 max_key_type = BTRFS_XATTR_ITEM_KEY; 4975 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4976 } else { 4977 if (inode_only == LOG_INODE_EXISTS) { 4978 /* 4979 * Make sure the new inode item we write to the log has 4980 * the same isize as the current one (if it exists). 4981 * This is necessary to prevent data loss after log 4982 * replay, and also to prevent doing a wrong expanding 4983 * truncate - for e.g. 
create file, write 4K into offset 4984 * 0, fsync, write 4K into offset 4096, add hard link, 4985 * fsync some other file (to sync log), power fail - if 4986 * we use the inode's current i_size, after log replay 4987 * we get a 8Kb file, with the last 4Kb extent as a hole 4988 * (zeroes), as if an expanding truncate happened, 4989 * instead of getting a file of 4Kb only. 4990 */ 4991 err = logged_inode_size(log, inode, path, &logged_isize); 4992 if (err) 4993 goto out_unlock; 4994 } 4995 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4996 &inode->runtime_flags)) { 4997 if (inode_only == LOG_INODE_EXISTS) { 4998 max_key.type = BTRFS_XATTR_ITEM_KEY; 4999 ret = drop_objectid_items(trans, log, path, ino, 5000 max_key.type); 5001 } else { 5002 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5003 &inode->runtime_flags); 5004 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5005 &inode->runtime_flags); 5006 while(1) { 5007 ret = btrfs_truncate_inode_items(trans, 5008 log, &inode->vfs_inode, 0, 0); 5009 if (ret != -EAGAIN) 5010 break; 5011 } 5012 } 5013 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5014 &inode->runtime_flags) || 5015 inode_only == LOG_INODE_EXISTS) { 5016 if (inode_only == LOG_INODE_ALL) 5017 fast_search = true; 5018 max_key.type = BTRFS_XATTR_ITEM_KEY; 5019 ret = drop_objectid_items(trans, log, path, ino, 5020 max_key.type); 5021 } else { 5022 if (inode_only == LOG_INODE_ALL) 5023 fast_search = true; 5024 goto log_extents; 5025 } 5026 5027 } 5028 if (ret) { 5029 err = ret; 5030 goto out_unlock; 5031 } 5032 5033 while (1) { 5034 ins_nr = 0; 5035 ret = btrfs_search_forward(root, &min_key, 5036 path, trans->transid); 5037 if (ret < 0) { 5038 err = ret; 5039 goto out_unlock; 5040 } 5041 if (ret != 0) 5042 break; 5043 again: 5044 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 5045 if (min_key.objectid != ino) 5046 break; 5047 if (min_key.type > max_key.type) 5048 break; 5049 5050 if (min_key.type == BTRFS_INODE_ITEM_KEY) 5051 need_log_inode_item = false; 5052 5053 if ((min_key.type == BTRFS_INODE_REF_KEY || 5054 min_key.type == BTRFS_INODE_EXTREF_KEY) && 5055 inode->generation == trans->transid) { 5056 u64 other_ino = 0; 5057 5058 ret = btrfs_check_ref_name_override(path->nodes[0], 5059 path->slots[0], &min_key, inode, 5060 &other_ino); 5061 if (ret < 0) { 5062 err = ret; 5063 goto out_unlock; 5064 } else if (ret > 0 && ctx && 5065 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5066 struct btrfs_key inode_key; 5067 struct inode *other_inode; 5068 5069 if (ins_nr > 0) { 5070 ins_nr++; 5071 } else { 5072 ins_nr = 1; 5073 ins_start_slot = path->slots[0]; 5074 } 5075 ret = copy_items(trans, inode, dst_path, path, 5076 &last_extent, ins_start_slot, 5077 ins_nr, inode_only, 5078 logged_isize); 5079 if (ret < 0) { 5080 err = ret; 5081 goto out_unlock; 5082 } 5083 ins_nr = 0; 5084 btrfs_release_path(path); 5085 inode_key.objectid = other_ino; 5086 inode_key.type = BTRFS_INODE_ITEM_KEY; 5087 inode_key.offset = 0; 5088 other_inode = btrfs_iget(fs_info->sb, 5089 &inode_key, root, 5090 NULL); 5091 /* 5092 * If the other inode that had a conflicting dir 5093 * entry was deleted in the current transaction, 5094 * we don't need to do more work nor fallback to 5095 * a transaction commit. 
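 * (btrfs_iget() returns -ENOENT in that case, which we handle right
 * below by skipping to the next key.)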
5096 */ 5097 if (IS_ERR(other_inode) && 5098 PTR_ERR(other_inode) == -ENOENT) { 5099 goto next_key; 5100 } else if (IS_ERR(other_inode)) { 5101 err = PTR_ERR(other_inode); 5102 goto out_unlock; 5103 } 5104 /* 5105 * We are safe logging the other inode without 5106 * acquiring its i_mutex as long as we log with 5107 * the LOG_INODE_EXISTS mode. We're safe against 5108 * concurrent renames of the other inode as well 5109 * because during a rename we pin the log and 5110 * update the log with the new name before we 5111 * unpin it. 5112 */ 5113 err = btrfs_log_inode(trans, root, 5114 BTRFS_I(other_inode), 5115 LOG_OTHER_INODE, 0, LLONG_MAX, 5116 ctx); 5117 iput(other_inode); 5118 if (err) 5119 goto out_unlock; 5120 else 5121 goto next_key; 5122 } 5123 } 5124 5125 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 5126 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 5127 if (ins_nr == 0) 5128 goto next_slot; 5129 ret = copy_items(trans, inode, dst_path, path, 5130 &last_extent, ins_start_slot, 5131 ins_nr, inode_only, logged_isize); 5132 if (ret < 0) { 5133 err = ret; 5134 goto out_unlock; 5135 } 5136 ins_nr = 0; 5137 if (ret) { 5138 btrfs_release_path(path); 5139 continue; 5140 } 5141 goto next_slot; 5142 } 5143 5144 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5145 ins_nr++; 5146 goto next_slot; 5147 } else if (!ins_nr) { 5148 ins_start_slot = path->slots[0]; 5149 ins_nr = 1; 5150 goto next_slot; 5151 } 5152 5153 ret = copy_items(trans, inode, dst_path, path, &last_extent, 5154 ins_start_slot, ins_nr, inode_only, 5155 logged_isize); 5156 if (ret < 0) { 5157 err = ret; 5158 goto out_unlock; 5159 } 5160 if (ret) { 5161 ins_nr = 0; 5162 btrfs_release_path(path); 5163 continue; 5164 } 5165 ins_nr = 1; 5166 ins_start_slot = path->slots[0]; 5167 next_slot: 5168 5169 nritems = btrfs_header_nritems(path->nodes[0]); 5170 path->slots[0]++; 5171 if (path->slots[0] < nritems) { 5172 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 5173 path->slots[0]); 5174 goto again; 5175 } 5176 if (ins_nr) { 5177 ret = copy_items(trans, inode, dst_path, path, 5178 &last_extent, ins_start_slot, 5179 ins_nr, inode_only, logged_isize); 5180 if (ret < 0) { 5181 err = ret; 5182 goto out_unlock; 5183 } 5184 ret = 0; 5185 ins_nr = 0; 5186 } 5187 btrfs_release_path(path); 5188 next_key: 5189 if (min_key.offset < (u64)-1) { 5190 min_key.offset++; 5191 } else if (min_key.type < max_key.type) { 5192 min_key.type++; 5193 min_key.offset = 0; 5194 } else { 5195 break; 5196 } 5197 } 5198 if (ins_nr) { 5199 ret = copy_items(trans, inode, dst_path, path, &last_extent, 5200 ins_start_slot, ins_nr, inode_only, 5201 logged_isize); 5202 if (ret < 0) { 5203 err = ret; 5204 goto out_unlock; 5205 } 5206 ret = 0; 5207 ins_nr = 0; 5208 } 5209 5210 btrfs_release_path(path); 5211 btrfs_release_path(dst_path); 5212 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5213 if (err) 5214 goto out_unlock; 5215 xattrs_logged = true; 5216 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5217 btrfs_release_path(path); 5218 btrfs_release_path(dst_path); 5219 err = btrfs_log_trailing_hole(trans, root, inode, path); 5220 if (err) 5221 goto out_unlock; 5222 } 5223 log_extents: 5224 btrfs_release_path(path); 5225 btrfs_release_path(dst_path); 5226 if (need_log_inode_item) { 5227 err = log_inode_item(trans, log, dst_path, inode); 5228 if (!err && !xattrs_logged) { 5229 err = btrfs_log_all_xattrs(trans, root, inode, path, 5230 dst_path); 5231 btrfs_release_path(path); 5232 } 5233 if (err) 5234 goto out_unlock; 
5235 } 5236 if (fast_search) { 5237 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5238 &logged_list, ctx, start, end); 5239 if (ret) { 5240 err = ret; 5241 goto out_unlock; 5242 } 5243 } else if (inode_only == LOG_INODE_ALL) { 5244 struct extent_map *em, *n; 5245 5246 write_lock(&em_tree->lock); 5247 /* 5248 * We can't just remove every em if we're called for a ranged 5249 * fsync - that is, one that doesn't cover the whole possible 5250 * file range (0 to LLONG_MAX). This is because we can have 5251 * em's that fall outside the range we're logging and therefore 5252 * their ordered operations haven't completed yet 5253 * (btrfs_finish_ordered_io() not invoked yet). This means we 5254 * didn't get their respective file extent item in the fs/subvol 5255 * tree yet, and need to let the next fast fsync (one which 5256 * consults the list of modified extent maps) find the em so 5257 * that it logs a matching file extent item and waits for the 5258 * respective ordered operation to complete (if it's still 5259 * running). 5260 * 5261 * Removing every em outside the range we're logging would make 5262 * the next fast fsync not log their matching file extent items, 5263 * therefore making us lose data after a log replay. 5264 */ 5265 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5266 list) { 5267 const u64 mod_end = em->mod_start + em->mod_len - 1; 5268 5269 if (em->mod_start >= start && mod_end <= end) 5270 list_del_init(&em->list); 5271 } 5272 write_unlock(&em_tree->lock); 5273 } 5274 5275 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5276 ret = log_directory_changes(trans, root, inode, path, dst_path, 5277 ctx); 5278 if (ret) { 5279 err = ret; 5280 goto out_unlock; 5281 } 5282 } 5283 5284 spin_lock(&inode->lock); 5285 inode->logged_trans = trans->transid; 5286 inode->last_log_commit = inode->last_sub_trans; 5287 spin_unlock(&inode->lock); 5288 out_unlock: 5289 if (unlikely(err)) 5290 btrfs_put_logged_extents(&logged_list); 5291 else 5292 btrfs_submit_logged_extents(&logged_list, log); 5293 mutex_unlock(&inode->log_mutex); 5294 5295 btrfs_free_path(path); 5296 btrfs_free_path(dst_path); 5297 return err; 5298 } 5299 5300 /* 5301 * Check if we must fallback to a transaction commit when logging an inode. 5302 * This must be called after logging the inode and is used only in the context 5303 * when fsyncing an inode requires the need to log some other inode - in which 5304 * case we can't lock the i_mutex of each other inode we need to log as that 5305 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5306 * log inodes up or down in the hierarchy) or rename operations for example. So 5307 * we take the log_mutex of the inode after we have logged it and then check for 5308 * its last_unlink_trans value - this is safe because any task setting 5309 * last_unlink_trans must take the log_mutex and it must do this before it does 5310 * the actual unlink operation, so if we do this check before a concurrent task 5311 * sets last_unlink_trans it means we've logged a consistent version/state of 5312 * all the inode items, otherwise we are not sure and must do a transaction 5313 * commit (the concurrent task might have only updated last_unlink_trans before 5314 * we logged the inode or it might have also done the unlink). 
5315 */ 5316 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5317 struct btrfs_inode *inode) 5318 { 5319 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5320 bool ret = false; 5321 5322 mutex_lock(&inode->log_mutex); 5323 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5324 /* 5325 * Make sure any commits to the log are forced to be full 5326 * commits. 5327 */ 5328 btrfs_set_log_full_commit(fs_info, trans); 5329 ret = true; 5330 } 5331 mutex_unlock(&inode->log_mutex); 5332 5333 return ret; 5334 } 5335 5336 /* 5337 * follow the dentry parent pointers up the chain and see if any 5338 * of the directories in it require a full commit before they can 5339 * be logged. Returns zero if nothing special needs to be done or 1 if 5340 * a full commit is required. 5341 */ 5342 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5343 struct btrfs_inode *inode, 5344 struct dentry *parent, 5345 struct super_block *sb, 5346 u64 last_committed) 5347 { 5348 int ret = 0; 5349 struct dentry *old_parent = NULL; 5350 struct btrfs_inode *orig_inode = inode; 5351 5352 /* 5353 * for regular files, if the inode is already on disk, we don't 5354 * have to worry about the parents at all. This is because 5355 * we can use the last_unlink_trans field to record renames 5356 * and other fun in this file. 5357 */ 5358 if (S_ISREG(inode->vfs_inode.i_mode) && 5359 inode->generation <= last_committed && 5360 inode->last_unlink_trans <= last_committed) 5361 goto out; 5362 5363 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5364 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5365 goto out; 5366 inode = BTRFS_I(d_inode(parent)); 5367 } 5368 5369 while (1) { 5370 /* 5371 * If we are logging a directory then we start with our inode, 5372 * not our parent's inode, so we need to skip setting the 5373 * logged_trans so that further down in the log code we don't 5374 * think this inode has already been logged. 5375 */ 5376 if (inode != orig_inode) 5377 inode->logged_trans = trans->transid; 5378 smp_mb(); 5379 5380 if (btrfs_must_commit_transaction(trans, inode)) { 5381 ret = 1; 5382 break; 5383 } 5384 5385 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5386 break; 5387 5388 if (IS_ROOT(parent)) { 5389 inode = BTRFS_I(d_inode(parent)); 5390 if (btrfs_must_commit_transaction(trans, inode)) 5391 ret = 1; 5392 break; 5393 } 5394 5395 parent = dget_parent(parent); 5396 dput(old_parent); 5397 old_parent = parent; 5398 inode = BTRFS_I(d_inode(parent)); 5399 5400 } 5401 dput(old_parent); 5402 out: 5403 return ret; 5404 } 5405 5406 struct btrfs_dir_list { 5407 u64 ino; 5408 struct list_head list; 5409 }; 5410 5411 /* 5412 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5413 * details about why it is needed. 5414 * This is a recursive operation - if an existing dentry corresponds to a 5415 * directory, that directory's new entries are logged too (same behaviour as 5416 * ext3/4, xfs, f2fs, reiserfs, nilfs2).
Note that when logging the inodes 5417 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5418 * complains about the following circular lock dependency / possible deadlock: 5419 * 5420 * CPU0 CPU1 5421 * ---- ---- 5422 * lock(&type->i_mutex_dir_key#3/2); 5423 * lock(sb_internal#2); 5424 * lock(&type->i_mutex_dir_key#3/2); 5425 * lock(&sb->s_type->i_mutex_key#14); 5426 * 5427 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5428 * sb_start_intwrite() in btrfs_start_transaction(). 5429 * Not locking i_mutex of the inodes is still safe because: 5430 * 5431 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5432 * that while logging the inode new references (names) are added or removed 5433 * from the inode, leaving the logged inode item with a link count that does 5434 * not match the number of logged inode reference items. This is fine because 5435 * at log replay time we compute the real number of links and correct the 5436 * link count in the inode item (see replay_one_buffer() and 5437 * link_to_fixup_dir()); 5438 * 5439 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5440 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5441 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5442 * has a size that doesn't match the sum of the lengths of all the logged 5443 * names. This does not result in a problem because if a dir_item key is 5444 * logged but its matching dir_index key is not logged, at log replay time we 5445 * don't use it to replay the respective name (see replay_one_name()). On the 5446 * other hand if only the dir_index key ends up being logged, the respective 5447 * name is added to the fs/subvol tree with both the dir_item and dir_index 5448 * keys created (see replay_one_name()). 5449 * The directory's inode item with a wrong i_size is not a problem as well, 5450 * since we don't use it at log replay time to set the i_size in the inode 5451 * item of the fs/subvol tree (see overwrite_item()). 
5452 */ 5453 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5454 struct btrfs_root *root, 5455 struct btrfs_inode *start_inode, 5456 struct btrfs_log_ctx *ctx) 5457 { 5458 struct btrfs_fs_info *fs_info = root->fs_info; 5459 struct btrfs_root *log = root->log_root; 5460 struct btrfs_path *path; 5461 LIST_HEAD(dir_list); 5462 struct btrfs_dir_list *dir_elem; 5463 int ret = 0; 5464 5465 path = btrfs_alloc_path(); 5466 if (!path) 5467 return -ENOMEM; 5468 5469 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5470 if (!dir_elem) { 5471 btrfs_free_path(path); 5472 return -ENOMEM; 5473 } 5474 dir_elem->ino = btrfs_ino(start_inode); 5475 list_add_tail(&dir_elem->list, &dir_list); 5476 5477 while (!list_empty(&dir_list)) { 5478 struct extent_buffer *leaf; 5479 struct btrfs_key min_key; 5480 int nritems; 5481 int i; 5482 5483 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5484 list); 5485 if (ret) 5486 goto next_dir_inode; 5487 5488 min_key.objectid = dir_elem->ino; 5489 min_key.type = BTRFS_DIR_ITEM_KEY; 5490 min_key.offset = 0; 5491 again: 5492 btrfs_release_path(path); 5493 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5494 if (ret < 0) { 5495 goto next_dir_inode; 5496 } else if (ret > 0) { 5497 ret = 0; 5498 goto next_dir_inode; 5499 } 5500 5501 process_leaf: 5502 leaf = path->nodes[0]; 5503 nritems = btrfs_header_nritems(leaf); 5504 for (i = path->slots[0]; i < nritems; i++) { 5505 struct btrfs_dir_item *di; 5506 struct btrfs_key di_key; 5507 struct inode *di_inode; 5508 struct btrfs_dir_list *new_dir_elem; 5509 int log_mode = LOG_INODE_EXISTS; 5510 int type; 5511 5512 btrfs_item_key_to_cpu(leaf, &min_key, i); 5513 if (min_key.objectid != dir_elem->ino || 5514 min_key.type != BTRFS_DIR_ITEM_KEY) 5515 goto next_dir_inode; 5516 5517 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5518 type = btrfs_dir_type(leaf, di); 5519 if (btrfs_dir_transid(leaf, di) < trans->transid && 5520 type != BTRFS_FT_DIR) 5521 continue; 5522 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5523 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5524 continue; 5525 5526 btrfs_release_path(path); 5527 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5528 if (IS_ERR(di_inode)) { 5529 ret = PTR_ERR(di_inode); 5530 goto next_dir_inode; 5531 } 5532 5533 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5534 iput(di_inode); 5535 break; 5536 } 5537 5538 ctx->log_new_dentries = false; 5539 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5540 log_mode = LOG_INODE_ALL; 5541 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5542 log_mode, 0, LLONG_MAX, ctx); 5543 if (!ret && 5544 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5545 ret = 1; 5546 iput(di_inode); 5547 if (ret) 5548 goto next_dir_inode; 5549 if (ctx->log_new_dentries) { 5550 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5551 GFP_NOFS); 5552 if (!new_dir_elem) { 5553 ret = -ENOMEM; 5554 goto next_dir_inode; 5555 } 5556 new_dir_elem->ino = di_key.objectid; 5557 list_add_tail(&new_dir_elem->list, &dir_list); 5558 } 5559 break; 5560 } 5561 if (i == nritems) { 5562 ret = btrfs_next_leaf(log, path); 5563 if (ret < 0) { 5564 goto next_dir_inode; 5565 } else if (ret > 0) { 5566 ret = 0; 5567 goto next_dir_inode; 5568 } 5569 goto process_leaf; 5570 } 5571 if (min_key.offset < (u64)-1) { 5572 min_key.offset++; 5573 goto again; 5574 } 5575 next_dir_inode: 5576 list_del(&dir_elem->list); 5577 kfree(dir_elem); 5578 } 5579 5580 btrfs_free_path(path); 5581 return ret; 5582 } 5583 5584 static int 
btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5585 struct btrfs_inode *inode, 5586 struct btrfs_log_ctx *ctx) 5587 { 5588 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5589 int ret; 5590 struct btrfs_path *path; 5591 struct btrfs_key key; 5592 struct btrfs_root *root = inode->root; 5593 const u64 ino = btrfs_ino(inode); 5594 5595 path = btrfs_alloc_path(); 5596 if (!path) 5597 return -ENOMEM; 5598 path->skip_locking = 1; 5599 path->search_commit_root = 1; 5600 5601 key.objectid = ino; 5602 key.type = BTRFS_INODE_REF_KEY; 5603 key.offset = 0; 5604 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5605 if (ret < 0) 5606 goto out; 5607 5608 while (true) { 5609 struct extent_buffer *leaf = path->nodes[0]; 5610 int slot = path->slots[0]; 5611 u32 cur_offset = 0; 5612 u32 item_size; 5613 unsigned long ptr; 5614 5615 if (slot >= btrfs_header_nritems(leaf)) { 5616 ret = btrfs_next_leaf(root, path); 5617 if (ret < 0) 5618 goto out; 5619 else if (ret > 0) 5620 break; 5621 continue; 5622 } 5623 5624 btrfs_item_key_to_cpu(leaf, &key, slot); 5625 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5626 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5627 break; 5628 5629 item_size = btrfs_item_size_nr(leaf, slot); 5630 ptr = btrfs_item_ptr_offset(leaf, slot); 5631 while (cur_offset < item_size) { 5632 struct btrfs_key inode_key; 5633 struct inode *dir_inode; 5634 5635 inode_key.type = BTRFS_INODE_ITEM_KEY; 5636 inode_key.offset = 0; 5637 5638 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5639 struct btrfs_inode_extref *extref; 5640 5641 extref = (struct btrfs_inode_extref *) 5642 (ptr + cur_offset); 5643 inode_key.objectid = btrfs_inode_extref_parent( 5644 leaf, extref); 5645 cur_offset += sizeof(*extref); 5646 cur_offset += btrfs_inode_extref_name_len(leaf, 5647 extref); 5648 } else { 5649 inode_key.objectid = key.offset; 5650 cur_offset = item_size; 5651 } 5652 5653 dir_inode = btrfs_iget(fs_info->sb, &inode_key, 5654 root, NULL); 5655 /* If parent inode was deleted, skip it. */ 5656 if (IS_ERR(dir_inode)) 5657 continue; 5658 5659 if (ctx) 5660 ctx->log_new_dentries = false; 5661 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5662 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5663 if (!ret && 5664 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5665 ret = 1; 5666 if (!ret && ctx && ctx->log_new_dentries) 5667 ret = log_new_dir_dentries(trans, root, 5668 BTRFS_I(dir_inode), ctx); 5669 iput(dir_inode); 5670 if (ret) 5671 goto out; 5672 } 5673 path->slots[0]++; 5674 } 5675 ret = 0; 5676 out: 5677 btrfs_free_path(path); 5678 return ret; 5679 } 5680 5681 /* 5682 * helper function around btrfs_log_inode to make sure newly created 5683 * parent directories also end up in the log. 
Only a minimal amount of logging 5684 * (the inode item and its backrefs) is done for any parent directories 5685 * that are older than the last committed transaction 5686 */ 5687 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5688 struct btrfs_inode *inode, 5689 struct dentry *parent, 5690 const loff_t start, 5691 const loff_t end, 5692 int inode_only, 5693 struct btrfs_log_ctx *ctx) 5694 { 5695 struct btrfs_root *root = inode->root; 5696 struct btrfs_fs_info *fs_info = root->fs_info; 5697 struct super_block *sb; 5698 struct dentry *old_parent = NULL; 5699 int ret = 0; 5700 u64 last_committed = fs_info->last_trans_committed; 5701 bool log_dentries = false; 5702 struct btrfs_inode *orig_inode = inode; 5703 5704 sb = inode->vfs_inode.i_sb; 5705 5706 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5707 ret = 1; 5708 goto end_no_trans; 5709 } 5710 5711 /* 5712 * If the previous transaction commit didn't complete, we have to do 5713 * a full commit ourselves. 5714 */ 5715 if (fs_info->last_trans_log_full_commit > 5716 fs_info->last_trans_committed) { 5717 ret = 1; 5718 goto end_no_trans; 5719 } 5720 5721 if (btrfs_root_refs(&root->root_item) == 0) { 5722 ret = 1; 5723 goto end_no_trans; 5724 } 5725 5726 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5727 last_committed); 5728 if (ret) 5729 goto end_no_trans; 5730 5731 if (btrfs_inode_in_log(inode, trans->transid)) { 5732 ret = BTRFS_NO_LOG_SYNC; 5733 goto end_no_trans; 5734 } 5735 5736 ret = start_log_trans(trans, root, ctx); 5737 if (ret) 5738 goto end_no_trans; 5739 5740 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5741 if (ret) 5742 goto end_trans; 5743 5744 /* 5745 * for regular files, if the inode is already on disk, we don't 5746 * have to worry about the parents at all. This is because 5747 * we can use the last_unlink_trans field to record renames 5748 * and other fun in this file. 5749 */ 5750 if (S_ISREG(inode->vfs_inode.i_mode) && 5751 inode->generation <= last_committed && 5752 inode->last_unlink_trans <= last_committed) { 5753 ret = 0; 5754 goto end_trans; 5755 } 5756 5757 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 5758 log_dentries = true; 5759 5760 /* 5761 * On unlink we must make sure all our current and old parent directory 5762 * inodes are fully logged. This is to prevent leaving dangling 5763 * directory index entries in directories that were our parents but are 5764 * not anymore. Not doing this results in the old parent directory being 5765 * impossible to delete after log replay (rmdir will always fail with 5766 * error -ENOTEMPTY). 5767 * 5768 * Example 1: 5769 * 5770 * mkdir testdir 5771 * touch testdir/foo 5772 * ln testdir/foo testdir/bar 5773 * sync 5774 * unlink testdir/bar 5775 * xfs_io -c fsync testdir/foo 5776 * <power failure> 5777 * mount fs, triggers log replay 5778 * 5779 * If we don't log the parent directory (testdir), after log replay the 5780 * directory still has an entry pointing to the file inode using the bar 5781 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5782 * the file inode has a link count of 1.
5783 * 5784 * Example 2: 5785 * 5786 * mkdir testdir 5787 * touch foo 5788 * ln foo testdir/foo2 5789 * ln foo testdir/foo3 5790 * sync 5791 * unlink testdir/foo3 5792 * xfs_io -c fsync foo 5793 * <power failure> 5794 * mount fs, triggers log replay 5795 * 5796 * Similar as the first example, after log replay the parent directory 5797 * testdir still has an entry pointing to the inode file with name foo3 5798 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5799 * and has a link count of 2. 5800 */ 5801 if (inode->last_unlink_trans > last_committed) { 5802 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5803 if (ret) 5804 goto end_trans; 5805 } 5806 5807 while (1) { 5808 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5809 break; 5810 5811 inode = BTRFS_I(d_inode(parent)); 5812 if (root != inode->root) 5813 break; 5814 5815 if (inode->generation > last_committed) { 5816 ret = btrfs_log_inode(trans, root, inode, 5817 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5818 if (ret) 5819 goto end_trans; 5820 } 5821 if (IS_ROOT(parent)) 5822 break; 5823 5824 parent = dget_parent(parent); 5825 dput(old_parent); 5826 old_parent = parent; 5827 } 5828 if (log_dentries) 5829 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5830 else 5831 ret = 0; 5832 end_trans: 5833 dput(old_parent); 5834 if (ret < 0) { 5835 btrfs_set_log_full_commit(fs_info, trans); 5836 ret = 1; 5837 } 5838 5839 if (ret) 5840 btrfs_remove_log_ctx(root, ctx); 5841 btrfs_end_log_trans(root); 5842 end_no_trans: 5843 return ret; 5844 } 5845 5846 /* 5847 * it is not safe to log dentry if the chunk root has added new 5848 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5849 * If this returns 1, you must commit the transaction to safely get your 5850 * data on disk. 
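 * The fsync path (btrfs_sync_file()) reacts to a return value of 1 by
 * falling back to a full transaction commit.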
5851 */ 5852 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5853 struct dentry *dentry, 5854 const loff_t start, 5855 const loff_t end, 5856 struct btrfs_log_ctx *ctx) 5857 { 5858 struct dentry *parent = dget_parent(dentry); 5859 int ret; 5860 5861 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 5862 start, end, LOG_INODE_ALL, ctx); 5863 dput(parent); 5864 5865 return ret; 5866 } 5867 5868 /* 5869 * should be called during mount to replay any log trees 5870 * from the FS 5871 */ 5872 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5873 { 5874 int ret; 5875 struct btrfs_path *path; 5876 struct btrfs_trans_handle *trans; 5877 struct btrfs_key key; 5878 struct btrfs_key found_key; 5879 struct btrfs_key tmp_key; 5880 struct btrfs_root *log; 5881 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5882 struct walk_control wc = { 5883 .process_func = process_one_buffer, 5884 .stage = 0, 5885 }; 5886 5887 path = btrfs_alloc_path(); 5888 if (!path) 5889 return -ENOMEM; 5890 5891 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5892 5893 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5894 if (IS_ERR(trans)) { 5895 ret = PTR_ERR(trans); 5896 goto error; 5897 } 5898 5899 wc.trans = trans; 5900 wc.pin = 1; 5901 5902 ret = walk_log_tree(trans, log_root_tree, &wc); 5903 if (ret) { 5904 btrfs_handle_fs_error(fs_info, ret, 5905 "Failed to pin buffers while recovering log root tree."); 5906 goto error; 5907 } 5908 5909 again: 5910 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5911 key.offset = (u64)-1; 5912 key.type = BTRFS_ROOT_ITEM_KEY; 5913 5914 while (1) { 5915 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5916 5917 if (ret < 0) { 5918 btrfs_handle_fs_error(fs_info, ret, 5919 "Couldn't find tree log root."); 5920 goto error; 5921 } 5922 if (ret > 0) { 5923 if (path->slots[0] == 0) 5924 break; 5925 path->slots[0]--; 5926 } 5927 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5928 path->slots[0]); 5929 btrfs_release_path(path); 5930 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5931 break; 5932 5933 log = btrfs_read_fs_root(log_root_tree, &found_key); 5934 if (IS_ERR(log)) { 5935 ret = PTR_ERR(log); 5936 btrfs_handle_fs_error(fs_info, ret, 5937 "Couldn't read tree log root."); 5938 goto error; 5939 } 5940 5941 tmp_key.objectid = found_key.offset; 5942 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5943 tmp_key.offset = (u64)-1; 5944 5945 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5946 if (IS_ERR(wc.replay_dest)) { 5947 ret = PTR_ERR(wc.replay_dest); 5948 free_extent_buffer(log->node); 5949 free_extent_buffer(log->commit_root); 5950 kfree(log); 5951 btrfs_handle_fs_error(fs_info, ret, 5952 "Couldn't read target root for tree log recovery."); 5953 goto error; 5954 } 5955 5956 wc.replay_dest->log_root = log; 5957 btrfs_record_root_in_trans(trans, wc.replay_dest); 5958 ret = walk_log_tree(trans, log, &wc); 5959 5960 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5961 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5962 path); 5963 } 5964 5965 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5966 struct btrfs_root *root = wc.replay_dest; 5967 5968 btrfs_release_path(path); 5969 5970 /* 5971 * We have just replayed everything, and the highest 5972 * objectid of the fs roots may have changed in case 5973 * some inode items got replayed. 5974 * 5975 * root->objectid_mutex is not acquired as log replay 5976 * could only happen during mount.
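 * At that point the filesystem is not yet visible to user space, so
 * nothing else can race with us on this root.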
			 */
			ret = btrfs_find_highest_objectid(root,
						&root->highest_objectid);
		}

		key.offset = found_key.offset - 1;
		wc.replay_dest->log_root = NULL;
		free_extent_buffer(log->node);
		free_extent_buffer(log->commit_root);
		kfree(log);

		if (ret)
			goto error;

		if (found_key.offset == 0)
			break;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* steps three and four: replay dir index items, then everything else */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step five: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	free_extent_buffer(log_root_tree->node);
	log_root_tree->log_root = NULL;
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	kfree(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans);
	btrfs_free_path(path);
	return ret;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	mutex_lock(&inode->log_mutex);
	inode->last_unlink_trans = trans->transid;
	mutex_unlock(&inode->log_mutex);

	/*
	 * if this directory was already logged, any new
	 * names for this file/dir will get recorded
	 */
	smp_mb();
	if (dir->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (inode->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
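	 *
	 * An illustrative sequence, in the style of the examples earlier in
	 * this file (a sketch of the problem, not an exhaustive case):
	 *
	 * mkdir dir1 dir2
	 * touch dir1/foo
	 * sync
	 * mv dir1/foo dir2/foo
	 * fsync dir1
	 * <power failure>
	 *
	 * Replaying dir1's log removes the old name, and without the transid
	 * recorded here nothing would force the full commit that makes the
	 * new name dir2/foo survive the crash.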
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Make sure that if someone attempts to fsync the parent directory of a
 * deleted snapshot, it ends up triggering a transaction commit. This
 * guarantees that after replaying the log tree of the parent directory's
 * root we will not see the snapshot anymore, and that at log replay time
 * there is no log tree left for the deleted snapshot's root. Such a log
 * tree could otherwise be replayed after the log tree of the parent
 * directory (which replays the snapshot delete operation), effectively
 * undoing the delete.
 *
 * Must be called before the actual snapshot destroy operation (updates to
 * the parent root and the tree of tree roots, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 *
 * It will return zero if all goes well, and it will return 1 if a
 * full transaction commit is required.
 */
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		       struct btrfs_inode *inode, struct btrfs_inode *old_dir,
		       struct dentry *parent)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (!S_ISDIR(inode->vfs_inode.i_mode))
		inode->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming
	 * it from hasn't been logged, we don't need to log it
	 */
	if (inode->logged_trans <= fs_info->last_trans_committed &&
	    (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
		return 0;

	return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
				      LOG_INODE_EXISTS, NULL);
}
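
/*
 * An illustrative call-site sketch for btrfs_log_new_name(), hypothetical
 * and not taken from a real caller (the local variable names here are
 * assumptions); it shows what a link/rename path needs to do with the
 * return value, per the contract documented above:
 *
 *	ret = btrfs_log_new_name(trans, BTRFS_I(inode), BTRFS_I(old_dir),
 *				 new_dentry->d_parent);
 *	if (ret == 1)
 *		ret = btrfs_commit_transaction(trans);
 */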