/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 * LOG_OTHER_INODE means we are logging an inode other than the one
 * the fsync was called against
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */
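/*
 * A hypothetical sequence illustrating case 2a above:
 *
 * mkdir foo
 * creat foo/file
 * sync
 * fsync foo
 * mv foo/file bar/file
 * fsync foo
 * <crash>
 *
 * The log of directory foo records the removal of "file", so unless the
 * new name bar/file is logged as well, replay could unlink the old name
 * without recreating the new one, leaving the inode with no name at all.
 */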
/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY	0
#define LOG_WALK_REPLAY_INODES	1
#define LOG_WALK_REPLAY_DIR_INDEX	2
#define LOG_WALK_REPLAY_ALL	3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */
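/*
 * (Replay walks the log once per LOG_WALK_* stage above, so with the
 * separate LOG_WALK_REPLAY_DIR_INDEX pass the tree ends up being read
 * four times in practice.)
 */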
/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/*
		 * Implicit memory barrier after atomic_dec_and_test
		 */
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}
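/*
 * The start/end pairing, roughly (a sketch only; the real call chain
 * runs through btrfs_sync_file() and btrfs_log_inode_parent()):
 *
 *	ret = start_log_trans(trans, root, &ctx);
 *	if (!ret) {
 *		ret = btrfs_log_inode(trans, root, inode, ...);
 *		btrfs_end_log_trans(root);
 *	}
 *	if (ret == -EAGAIN)
 *		... fall back to a full transaction commit ...
 */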
/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(fs_info, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
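/*
 * For illustration, the first replay pass over a log tree runs with a
 * walk_control along the lines of (a sketch; see the recovery code for
 * the real setup):
 *
 *	struct walk_control wc = {
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *		.pin = 1,
 *	};
 *
 * while freeing a log tree at transaction commit time uses .free = 1
 * instead.
 */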
/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(fs_info, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(fs_info, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}
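/*
 * A note on the nbytes accounting done during extent replay below:
 * regular and prealloc file extent items contribute their num_bytes
 * unless they describe a hole (disk_bytenr == 0), inline extents
 * contribute their ram_bytes, and the total is applied to the inode
 * with inode_add_bytes() once the extent is in place.
 */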
/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the dirty extent: here we did a shallow
		 * file extent item copy and skipped the normal backref update,
		 * modifying the extent tree all by ourselves.
		 * So we need to manually record the dirty extent for qgroup,
		 * as the owner of the file extent changed from the log tree
		 * (doesn't affect qgroup) to the fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans, fs_info,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						fs_info,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range.  We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls).  In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other.  For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.  Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_del_csums(trans, fs_info,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
							fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, fs_info);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}
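/*
 * On-disk layout of the two back reference flavours handled below, for
 * reference; several names can be packed into a single item in both
 * cases:
 *
 *   key (inode_objectid INODE_REF parent_objectid)
 *	[struct btrfs_inode_ref + name] [struct btrfs_inode_ref + name] ...
 *
 *   key (inode_objectid INODE_EXTREF hash(parent_objectid, name))
 *	[struct btrfs_inode_extref + name] ...
 */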
/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item;
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, fs_info);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched the root tree and checked the
		 * corresponding ref, so it does not need to be checked again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans,
								fs_info);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
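/*
 * Illustration (hypothetical): if the subvolume contains the name "A"
 * for inode 258 in directory 257, but the log records inode 258 with
 * the single name "B" in 257, then replaying the back reference item
 * must unlink "A" (the conflicting, no longer valid name) before the
 * link for "B" is added; __add_inode_ref() above does that unlinking.
 */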
/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				  btrfs_ino(BTRFS_I(inode)), ref_index,
				  name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, BTRFS_I(dir),
					     BTRFS_I(inode),
					     name, namelen, 0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}
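/*
 * The fixup entries created above all live under the reserved
 * BTRFS_TREE_LOG_FIXUP_OBJECTID in the subvolume tree, keyed as
 * (BTRFS_TREE_LOG_FIXUP_OBJECTID, ORPHAN_ITEM, inode number), and
 * fixup_inode_link_counts() walks them highest key first once the
 * rest of replay is done.
 */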
/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory item and directory index keys
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fixup the link count of
		 * the inode the entry points to.  Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for this
		 * dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir.  This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	path->slots[0]++;
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
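/*
 * A hypothetical example of the range items find_dir_range() looks up:
 * if the log contains
 *
 *	key (257 DIR_LOG_INDEX 0) dir_log_end 34
 *
 * then the log is authoritative for index keys 0..34 of directory 257:
 * any DIR_INDEX key in that range found in the subvolume but not in the
 * log was removed before the fsync and must be deleted during replay.
 */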
If the directory 1984 * item is not in the log, the item is removed and the inode it points 1985 * to is unlinked 1986 */ 1987 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1988 struct btrfs_root *root, 1989 struct btrfs_root *log, 1990 struct btrfs_path *path, 1991 struct btrfs_path *log_path, 1992 struct inode *dir, 1993 struct btrfs_key *dir_key) 1994 { 1995 struct btrfs_fs_info *fs_info = root->fs_info; 1996 int ret; 1997 struct extent_buffer *eb; 1998 int slot; 1999 u32 item_size; 2000 struct btrfs_dir_item *di; 2001 struct btrfs_dir_item *log_di; 2002 int name_len; 2003 unsigned long ptr; 2004 unsigned long ptr_end; 2005 char *name; 2006 struct inode *inode; 2007 struct btrfs_key location; 2008 2009 again: 2010 eb = path->nodes[0]; 2011 slot = path->slots[0]; 2012 item_size = btrfs_item_size_nr(eb, slot); 2013 ptr = btrfs_item_ptr_offset(eb, slot); 2014 ptr_end = ptr + item_size; 2015 while (ptr < ptr_end) { 2016 di = (struct btrfs_dir_item *)ptr; 2017 name_len = btrfs_dir_name_len(eb, di); 2018 name = kmalloc(name_len, GFP_NOFS); 2019 if (!name) { 2020 ret = -ENOMEM; 2021 goto out; 2022 } 2023 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2024 name_len); 2025 log_di = NULL; 2026 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2027 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2028 dir_key->objectid, 2029 name, name_len, 0); 2030 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2031 log_di = btrfs_lookup_dir_index_item(trans, log, 2032 log_path, 2033 dir_key->objectid, 2034 dir_key->offset, 2035 name, name_len, 0); 2036 } 2037 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2038 btrfs_dir_item_key_to_cpu(eb, di, &location); 2039 btrfs_release_path(path); 2040 btrfs_release_path(log_path); 2041 inode = read_one_inode(root, location.objectid); 2042 if (!inode) { 2043 kfree(name); 2044 return -EIO; 2045 } 2046 2047 ret = link_to_fixup_dir(trans, root, 2048 path, location.objectid); 2049 if (ret) { 2050 kfree(name); 2051 iput(inode); 2052 goto out; 2053 } 2054 2055 inc_nlink(inode); 2056 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2057 BTRFS_I(inode), name, name_len); 2058 if (!ret) 2059 ret = btrfs_run_delayed_items(trans, fs_info); 2060 kfree(name); 2061 iput(inode); 2062 if (ret) 2063 goto out; 2064 2065 /* there might still be more names under this key 2066 * check and repeat if required 2067 */ 2068 ret = btrfs_search_slot(NULL, root, dir_key, path, 2069 0, 0); 2070 if (ret == 0) 2071 goto again; 2072 ret = 0; 2073 goto out; 2074 } else if (IS_ERR(log_di)) { 2075 kfree(name); 2076 return PTR_ERR(log_di); 2077 } 2078 btrfs_release_path(log_path); 2079 kfree(name); 2080 2081 ptr = (unsigned long)(di + 1); 2082 ptr += name_len; 2083 } 2084 ret = 0; 2085 out: 2086 btrfs_release_path(path); 2087 btrfs_release_path(log_path); 2088 return ret; 2089 } 2090 2091 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2092 struct btrfs_root *root, 2093 struct btrfs_root *log, 2094 struct btrfs_path *path, 2095 const u64 ino) 2096 { 2097 struct btrfs_key search_key; 2098 struct btrfs_path *log_path; 2099 int i; 2100 int nritems; 2101 int ret; 2102 2103 log_path = btrfs_alloc_path(); 2104 if (!log_path) 2105 return -ENOMEM; 2106 2107 search_key.objectid = ino; 2108 search_key.type = BTRFS_XATTR_ITEM_KEY; 2109 search_key.offset = 0; 2110 again: 2111 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2112 if (ret < 0) 2113 goto out; 2114 process_leaf: 2115 nritems = 
btrfs_header_nritems(path->nodes[0]);
	for (i = path->slots[0]; i < nritems; i++) {
		struct btrfs_key key;
		struct btrfs_dir_item *di;
		struct btrfs_dir_item *log_di;
		u32 total_size;
		u32 cur;

		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
			ret = 0;
			goto out;
		}

		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
		total_size = btrfs_item_size_nr(path->nodes[0], i);
		cur = 0;
		while (cur < total_size) {
			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
			u32 this_len = sizeof(*di) + name_len + data_len;
			char *name;

			name = kmalloc(name_len, GFP_NOFS);
			if (!name) {
				ret = -ENOMEM;
				goto out;
			}
			read_extent_buffer(path->nodes[0], name,
					   (unsigned long)(di + 1), name_len);

			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
						    name, name_len, 0);
			btrfs_release_path(log_path);
			if (!log_di) {
				/* Doesn't exist in log tree, so delete it. */
				btrfs_release_path(path);
				di = btrfs_lookup_xattr(trans, root, path, ino,
							name, name_len, -1);
				kfree(name);
				if (IS_ERR(di)) {
					ret = PTR_ERR(di);
					goto out;
				}
				ASSERT(di);
				ret = btrfs_delete_one_dir_name(trans, root,
								path, di);
				if (ret)
					goto out;
				btrfs_release_path(path);
				search_key = key;
				goto again;
			}
			kfree(name);
			if (IS_ERR(log_di)) {
				ret = PTR_ERR(log_di);
				goto out;
			}
			cur += this_len;
			di = (struct btrfs_dir_item *)((char *)di + this_len);
		}
	}
	ret = btrfs_next_leaf(root, path);
	if (ret > 0)
		ret = 0;
	else if (ret == 0)
		goto process_leaf;
out:
	btrfs_free_path(log_path);
	btrfs_release_path(path);
	return ret;
}


/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes. It
 * scans the log to find ranges of keys that the log is authoritative
 * for, and then scans the directory to find items in those ranges that
 * are not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
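 *
 * Rough example (names invented): rm testdir/foo followed by an fsync
 * of testdir leaves a range item in the log covering foo's key, but no
 * dir item for it. The subvolume tree from the last commit still has
 * the entry for foo, so deletion replay unlinks it here.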
2198 */ 2199 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2200 struct btrfs_root *root, 2201 struct btrfs_root *log, 2202 struct btrfs_path *path, 2203 u64 dirid, int del_all) 2204 { 2205 u64 range_start; 2206 u64 range_end; 2207 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2208 int ret = 0; 2209 struct btrfs_key dir_key; 2210 struct btrfs_key found_key; 2211 struct btrfs_path *log_path; 2212 struct inode *dir; 2213 2214 dir_key.objectid = dirid; 2215 dir_key.type = BTRFS_DIR_ITEM_KEY; 2216 log_path = btrfs_alloc_path(); 2217 if (!log_path) 2218 return -ENOMEM; 2219 2220 dir = read_one_inode(root, dirid); 2221 /* it isn't an error if the inode isn't there, that can happen 2222 * because we replay the deletes before we copy in the inode item 2223 * from the log 2224 */ 2225 if (!dir) { 2226 btrfs_free_path(log_path); 2227 return 0; 2228 } 2229 again: 2230 range_start = 0; 2231 range_end = 0; 2232 while (1) { 2233 if (del_all) 2234 range_end = (u64)-1; 2235 else { 2236 ret = find_dir_range(log, path, dirid, key_type, 2237 &range_start, &range_end); 2238 if (ret != 0) 2239 break; 2240 } 2241 2242 dir_key.offset = range_start; 2243 while (1) { 2244 int nritems; 2245 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2246 0, 0); 2247 if (ret < 0) 2248 goto out; 2249 2250 nritems = btrfs_header_nritems(path->nodes[0]); 2251 if (path->slots[0] >= nritems) { 2252 ret = btrfs_next_leaf(root, path); 2253 if (ret) 2254 break; 2255 } 2256 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2257 path->slots[0]); 2258 if (found_key.objectid != dirid || 2259 found_key.type != dir_key.type) 2260 goto next_type; 2261 2262 if (found_key.offset > range_end) 2263 break; 2264 2265 ret = check_item_in_log(trans, root, log, path, 2266 log_path, dir, 2267 &found_key); 2268 if (ret) 2269 goto out; 2270 if (found_key.offset == (u64)-1) 2271 break; 2272 dir_key.offset = found_key.offset + 1; 2273 } 2274 btrfs_release_path(path); 2275 if (range_end == (u64)-1) 2276 break; 2277 range_start = range_end + 1; 2278 } 2279 2280 next_type: 2281 ret = 0; 2282 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2283 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2284 dir_key.type = BTRFS_DIR_INDEX_KEY; 2285 btrfs_release_path(path); 2286 goto again; 2287 } 2288 out: 2289 btrfs_release_path(path); 2290 btrfs_free_path(log_path); 2291 iput(dir); 2292 return ret; 2293 } 2294 2295 /* 2296 * the process_func used to replay items from the log tree. This 2297 * gets called in two different stages. The first stage just looks 2298 * for inodes and makes sure they are all copied into the subvolume. 2299 * 2300 * The second stage copies all the other item types from the log into 2301 * the subvolume. The two stage approach is slower, but gets rid of 2302 * lots of complexity around inodes referencing other inodes that exist 2303 * only in the log (references come from either directory items or inode 2304 * back refs). 
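 *
 * A rough illustration: an inode back ref replayed early could name a
 * parent directory whose inode item lives in a leaf that has not been
 * walked yet. Replaying every inode item first (stage one) guarantees
 * the later stages never resolve a reference to an inode that does not
 * exist yet in the subvolume.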
*/
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	int level;
	int i;
	int ret;

	ret = btrfs_read_buffer(eb, gen);
	if (ret)
		return ret;

	level = btrfs_header_level(eb);

	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
					    struct btrfs_inode_item);
			ret = replay_xattr_deletes(wc->trans, root, log,
						   path, key.objectid);
			if (ret)
				break;
			mode = btrfs_inode_mode(eb, inode_item);
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid, 0);
				if (ret)
					break;
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;

			/* for regular files, make sure the corresponding
			 * orphan item exists. extents past the new EOF
			 * will be truncated later by orphan cleanup.
			 */
			if (S_ISREG(mode)) {
				ret = insert_orphan_item(wc->trans, root,
							 key.objectid);
				if (ret)
					break;
			}

			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			if (ret)
				break;
		}

		if (key.type == BTRFS_DIR_INDEX_KEY &&
		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}

		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_INODE_REF_KEY ||
			   key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			if (ret && ret != -ENOENT)
				break;
			ret = 0;
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
						eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}
	}
	btrfs_free_path(path);
	return ret;
}

static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_path *path, int *level,
				       struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 root_owner;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		WARN_ON(btrfs_header_level(cur) != *level);

		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

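		/*
		 * We are at an interior node: the current slot gives us the
		 * block pointer and expected generation of the child we
		 * descend into next.
		 */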
bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2445 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2446 blocksize = fs_info->nodesize; 2447 2448 parent = path->nodes[*level]; 2449 root_owner = btrfs_header_owner(parent); 2450 2451 next = btrfs_find_create_tree_block(fs_info, bytenr); 2452 if (IS_ERR(next)) 2453 return PTR_ERR(next); 2454 2455 if (*level == 1) { 2456 ret = wc->process_func(root, next, wc, ptr_gen); 2457 if (ret) { 2458 free_extent_buffer(next); 2459 return ret; 2460 } 2461 2462 path->slots[*level]++; 2463 if (wc->free) { 2464 ret = btrfs_read_buffer(next, ptr_gen); 2465 if (ret) { 2466 free_extent_buffer(next); 2467 return ret; 2468 } 2469 2470 if (trans) { 2471 btrfs_tree_lock(next); 2472 btrfs_set_lock_blocking(next); 2473 clean_tree_block(fs_info, next); 2474 btrfs_wait_tree_block_writeback(next); 2475 btrfs_tree_unlock(next); 2476 } else { 2477 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2478 clear_extent_buffer_dirty(next); 2479 } 2480 2481 WARN_ON(root_owner != 2482 BTRFS_TREE_LOG_OBJECTID); 2483 ret = btrfs_free_and_pin_reserved_extent( 2484 fs_info, bytenr, 2485 blocksize); 2486 if (ret) { 2487 free_extent_buffer(next); 2488 return ret; 2489 } 2490 } 2491 free_extent_buffer(next); 2492 continue; 2493 } 2494 ret = btrfs_read_buffer(next, ptr_gen); 2495 if (ret) { 2496 free_extent_buffer(next); 2497 return ret; 2498 } 2499 2500 WARN_ON(*level <= 0); 2501 if (path->nodes[*level-1]) 2502 free_extent_buffer(path->nodes[*level-1]); 2503 path->nodes[*level-1] = next; 2504 *level = btrfs_header_level(next); 2505 path->slots[*level] = 0; 2506 cond_resched(); 2507 } 2508 WARN_ON(*level < 0); 2509 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2510 2511 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2512 2513 cond_resched(); 2514 return 0; 2515 } 2516 2517 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2518 struct btrfs_root *root, 2519 struct btrfs_path *path, int *level, 2520 struct walk_control *wc) 2521 { 2522 struct btrfs_fs_info *fs_info = root->fs_info; 2523 u64 root_owner; 2524 int i; 2525 int slot; 2526 int ret; 2527 2528 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2529 slot = path->slots[i]; 2530 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2531 path->slots[i]++; 2532 *level = i; 2533 WARN_ON(*level == 0); 2534 return 0; 2535 } else { 2536 struct extent_buffer *parent; 2537 if (path->nodes[*level] == root->node) 2538 parent = path->nodes[*level]; 2539 else 2540 parent = path->nodes[*level + 1]; 2541 2542 root_owner = btrfs_header_owner(parent); 2543 ret = wc->process_func(root, path->nodes[*level], wc, 2544 btrfs_header_generation(path->nodes[*level])); 2545 if (ret) 2546 return ret; 2547 2548 if (wc->free) { 2549 struct extent_buffer *next; 2550 2551 next = path->nodes[*level]; 2552 2553 if (trans) { 2554 btrfs_tree_lock(next); 2555 btrfs_set_lock_blocking(next); 2556 clean_tree_block(fs_info, next); 2557 btrfs_wait_tree_block_writeback(next); 2558 btrfs_tree_unlock(next); 2559 } else { 2560 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2561 clear_extent_buffer_dirty(next); 2562 } 2563 2564 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2565 ret = btrfs_free_and_pin_reserved_extent( 2566 fs_info, 2567 path->nodes[*level]->start, 2568 path->nodes[*level]->len); 2569 if (ret) 2570 return ret; 2571 } 2572 free_extent_buffer(path->nodes[*level]); 2573 path->nodes[*level] = NULL; 2574 *level = i + 1; 2575 } 2576 } 2577 return 1; 2578 } 2579 2580 /* 2581 
* drop the reference count on the tree rooted at 'log'. This traverses
 * the tree freeing any blocks that have a ref count of zero after being
 * decremented.
 */
static int walk_log_tree(struct btrfs_trans_handle *trans,
			 struct btrfs_root *log, struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;
	int wret;
	int level;
	struct btrfs_path *path;
	int orig_level;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	level = btrfs_header_level(log->node);
	orig_level = level;
	path->nodes[level] = log->node;
	extent_buffer_get(log->node);
	path->slots[level] = 0;

	while (1) {
		wret = walk_down_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}

		wret = walk_up_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}
	}

	/* was the root node processed? if not, catch it here */
	if (path->nodes[orig_level]) {
		ret = wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]));
		if (ret)
			goto out;
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			if (trans) {
				btrfs_tree_lock(next);
				btrfs_set_lock_blocking(next);
				clean_tree_block(fs_info, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);
			} else {
				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
					clear_extent_buffer_dirty(next);
			}

			WARN_ON(log->root_key.objectid !=
				BTRFS_TREE_LOG_OBJECTID);
			ret = btrfs_free_and_pin_reserved_extent(fs_info,
							next->start, next->len);
			if (ret)
				goto out;
		}
	}

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to update the item for a given subvolume's log root
 * in the tree of log roots
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret;

	if (log->log_transid == 1) {
		/* insert root item on the first sync */
		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
					&log->root_key, &log->root_item);
	} else {
		ret = btrfs_update_root(trans, fs_info->log_root_tree,
					&log->root_key, &log->root_item);
	}
	return ret;
}

static void wait_log_commit(struct btrfs_root *root, int transid)
{
	DEFINE_WAIT(wait);
	int index = transid % 2;

	/*
	 * we only allow two pending log transactions at a time,
	 * so we know that if ours is more than 2 older than the
	 * current transaction, we're done
	 */
	for (;;) {
		prepare_to_wait(&root->log_commit_wait[index],
				&wait, TASK_UNINTERRUPTIBLE);

		if (!(root->log_transid_committed < transid &&
		      atomic_read(&root->log_commit[index])))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_commit_wait[index], &wait);
}

static void wait_for_writer(struct btrfs_root *root)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&root->log_writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&root->log_writers))
			break;
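		/*
		 * Still writers in flight: drop log_mutex so they can make
		 * progress, sleep, then retake it before re-checking.
		 */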

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_writer_wait, &wait);
}

static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
					struct btrfs_log_ctx *ctx)
{
	if (!ctx)
		return;

	mutex_lock(&root->log_mutex);
	list_del_init(&ctx->list);
	mutex_unlock(&root->log_mutex);
}

/*
 * Invoked with the log mutex held, or from a context where no other
 * task can access the list.
 */
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
					     int index, int error)
{
	struct btrfs_log_ctx *ctx;
	struct btrfs_log_ctx *safe;

	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
		list_del_init(&ctx->list);
		ctx->log_ret = error;
	}

	INIT_LIST_HEAD(&root->log_ctxs[index]);
}

/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it. When this call is done,
 * you know that any inodes previously logged are safely on disk only
 * if it returns 0.
 *
 * Any other return value means you need to call btrfs_commit_transaction.
 * Some of the edge cases for fsyncing directories that have had unlinks
 * or renames done in the past mean that sometimes the only safe
 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
 * that has happened.
 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
	int index1;
	int index2;
	int mark;
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *log = root->log_root;
	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
	int log_transid = 0;
	struct btrfs_log_ctx root_log_ctx;
	struct blk_plug plug;

	mutex_lock(&root->log_mutex);
	log_transid = ctx->log_transid;
	if (root->log_transid_committed >= log_transid) {
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}

	index1 = log_transid % 2;
	if (atomic_read(&root->log_commit[index1])) {
		wait_log_commit(root, log_transid);
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}
	ASSERT(log_transid == root->log_transid);
	atomic_set(&root->log_commit[index1], 1);

	/* wait for previous tree log sync to complete */
	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
		wait_log_commit(root, log_transid - 1);

	while (1) {
		int batch = atomic_read(&root->log_batch);
		/* when we're on an ssd, just kick the log commit out */
		if (!btrfs_test_opt(fs_info, SSD) &&
		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
			mutex_unlock(&root->log_mutex);
			schedule_timeout_uninterruptible(1);
			mutex_lock(&root->log_mutex);
		}
		wait_for_writer(root);
		if (batch == atomic_read(&root->log_batch))
			break;
	}

	/* bail out if we need to do a full commit */
	if (btrfs_need_log_full_commit(fs_info, trans)) {
		ret = -EAGAIN;
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	if (log_transid % 2 == 0)
		mark = EXTENT_DIRTY;
	else
		mark = EXTENT_NEW;

	/* we start IO on all the marked extents here, but we don't actually
	 * wait for them until later.
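	 *
	 * (The mark alternates with the log transid parity, EXTENT_DIRTY
	 * for even and EXTENT_NEW for odd, so the pages of the two log
	 * commits that may be in flight at once stay distinguishable in
	 * dirty_log_pages.)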
2826 */ 2827 blk_start_plug(&plug); 2828 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2829 if (ret) { 2830 blk_finish_plug(&plug); 2831 btrfs_abort_transaction(trans, ret); 2832 btrfs_free_logged_extents(log, log_transid); 2833 btrfs_set_log_full_commit(fs_info, trans); 2834 mutex_unlock(&root->log_mutex); 2835 goto out; 2836 } 2837 2838 btrfs_set_root_node(&log->root_item, log->node); 2839 2840 root->log_transid++; 2841 log->log_transid = root->log_transid; 2842 root->log_start_pid = 0; 2843 /* 2844 * IO has been started, blocks of the log tree have WRITTEN flag set 2845 * in their headers. new modifications of the log will be written to 2846 * new positions. so it's safe to allow log writers to go in. 2847 */ 2848 mutex_unlock(&root->log_mutex); 2849 2850 btrfs_init_log_ctx(&root_log_ctx, NULL); 2851 2852 mutex_lock(&log_root_tree->log_mutex); 2853 atomic_inc(&log_root_tree->log_batch); 2854 atomic_inc(&log_root_tree->log_writers); 2855 2856 index2 = log_root_tree->log_transid % 2; 2857 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2858 root_log_ctx.log_transid = log_root_tree->log_transid; 2859 2860 mutex_unlock(&log_root_tree->log_mutex); 2861 2862 ret = update_log_root(trans, log); 2863 2864 mutex_lock(&log_root_tree->log_mutex); 2865 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2866 /* 2867 * Implicit memory barrier after atomic_dec_and_test 2868 */ 2869 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2870 wake_up(&log_root_tree->log_writer_wait); 2871 } 2872 2873 if (ret) { 2874 if (!list_empty(&root_log_ctx.list)) 2875 list_del_init(&root_log_ctx.list); 2876 2877 blk_finish_plug(&plug); 2878 btrfs_set_log_full_commit(fs_info, trans); 2879 2880 if (ret != -ENOSPC) { 2881 btrfs_abort_transaction(trans, ret); 2882 mutex_unlock(&log_root_tree->log_mutex); 2883 goto out; 2884 } 2885 btrfs_wait_tree_log_extents(log, mark); 2886 btrfs_free_logged_extents(log, log_transid); 2887 mutex_unlock(&log_root_tree->log_mutex); 2888 ret = -EAGAIN; 2889 goto out; 2890 } 2891 2892 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2893 blk_finish_plug(&plug); 2894 list_del_init(&root_log_ctx.list); 2895 mutex_unlock(&log_root_tree->log_mutex); 2896 ret = root_log_ctx.log_ret; 2897 goto out; 2898 } 2899 2900 index2 = root_log_ctx.log_transid % 2; 2901 if (atomic_read(&log_root_tree->log_commit[index2])) { 2902 blk_finish_plug(&plug); 2903 ret = btrfs_wait_tree_log_extents(log, mark); 2904 btrfs_wait_logged_extents(trans, log, log_transid); 2905 wait_log_commit(log_root_tree, 2906 root_log_ctx.log_transid); 2907 mutex_unlock(&log_root_tree->log_mutex); 2908 if (!ret) 2909 ret = root_log_ctx.log_ret; 2910 goto out; 2911 } 2912 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2913 atomic_set(&log_root_tree->log_commit[index2], 1); 2914 2915 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2916 wait_log_commit(log_root_tree, 2917 root_log_ctx.log_transid - 1); 2918 } 2919 2920 wait_for_writer(log_root_tree); 2921 2922 /* 2923 * now that we've moved on to the tree of log tree roots, 2924 * check the full commit flag again 2925 */ 2926 if (btrfs_need_log_full_commit(fs_info, trans)) { 2927 blk_finish_plug(&plug); 2928 btrfs_wait_tree_log_extents(log, mark); 2929 btrfs_free_logged_extents(log, log_transid); 2930 mutex_unlock(&log_root_tree->log_mutex); 2931 ret = -EAGAIN; 2932 goto out_wake_log_root; 2933 } 2934 2935 ret = btrfs_write_marked_extents(fs_info, 2936 
&log_root_tree->dirty_log_pages,
					 EXTENT_DIRTY | EXTENT_NEW);
	blk_finish_plug(&plug);
	if (ret) {
		btrfs_set_log_full_commit(fs_info, trans);
		btrfs_abort_transaction(trans, ret);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	ret = btrfs_wait_tree_log_extents(log, mark);
	if (!ret)
		ret = btrfs_wait_tree_log_extents(log_root_tree,
						  EXTENT_NEW | EXTENT_DIRTY);
	if (ret) {
		btrfs_set_log_full_commit(fs_info, trans);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	btrfs_wait_logged_extents(trans, log, log_transid);

	btrfs_set_super_log_root(fs_info->super_for_commit,
				 log_root_tree->node->start);
	btrfs_set_super_log_root_level(fs_info->super_for_commit,
				       btrfs_header_level(log_root_tree->node));

	log_root_tree->log_transid++;
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * nobody else is going to jump in and write the ctree
	 * super here because the log_commit atomic below is protecting
	 * us. We must be called with a transaction handle pinning
	 * the running transaction open, so a full commit can't hop
	 * in and cause problems either.
	 */
	ret = write_all_supers(fs_info, 1);
	if (ret) {
		btrfs_set_log_full_commit(fs_info, trans);
		btrfs_abort_transaction(trans, ret);
		goto out_wake_log_root;
	}

	mutex_lock(&root->log_mutex);
	if (root->last_log_commit < log_transid)
		root->last_log_commit = log_transid;
	mutex_unlock(&root->log_mutex);

out_wake_log_root:
	mutex_lock(&log_root_tree->log_mutex);
	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);

	log_root_tree->log_transid_committed++;
	atomic_set(&log_root_tree->log_commit[index2], 0);
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * The barrier before waitqueue_active is implied by mutex_unlock
	 */
	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
		wake_up(&log_root_tree->log_commit_wait[index2]);
out:
	mutex_lock(&root->log_mutex);
	btrfs_remove_all_log_ctxs(root, index1, ret);
	root->log_transid_committed++;
	atomic_set(&root->log_commit[index1], 0);
	mutex_unlock(&root->log_mutex);

	/*
	 * The barrier before waitqueue_active is implied by mutex_unlock
	 */
	if (waitqueue_active(&root->log_commit_wait[index1]))
		wake_up(&root->log_commit_wait[index1]);
	return ret;
}

static void free_log_tree(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log)
{
	int ret;
	u64 start;
	u64 end;
	struct walk_control wc = {
		.free = 1,
		.process_func = process_one_buffer
	};

	ret = walk_log_tree(trans, log, &wc);
	/* I don't think this can happen but just in case */
	if (ret)
		btrfs_abort_transaction(trans, ret);

	while (1) {
		ret = find_first_extent_bit(&log->dirty_log_pages,
				0, &start, &end,
				EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
				NULL);
		if (ret)
			break;

		clear_extent_bits(&log->dirty_log_pages, start, end,
				  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
	}

	/*
	 * We may have short-circuited the log tree with the full commit logic
	 * and left ordered extents on our list, so clear these out to keep us
	 * from leaking inodes and memory.
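	 *
	 * (Both in-flight log transid indexes, 0 and 1, are drained below,
	 * since either list may still hold logged extents at this point.)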
*/
	btrfs_free_logged_extents(log, 0);
	btrfs_free_logged_extents(log, 1);

	free_extent_buffer(log->node);
	kfree(log);
}

/*
 * free all the extents used by the tree log. This should be called
 * at commit time of the full transaction
 */
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
	if (root->log_root) {
		free_log_tree(trans, root->log_root);
		root->log_root = NULL;
	}
	return 0;
}

int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	if (fs_info->log_root_tree) {
		free_log_tree(trans, fs_info->log_root_tree);
		fs_info->log_root_tree = NULL;
	}
	return 0;
}

/*
 * If both a file and directory are logged, and unlinks or renames are
 * mixed in, we have a few interesting corners:
 *
 * create file X in dir Y
 * link file X to X.link in dir Y
 * fsync file X
 * unlink file X but leave X.link
 * fsync dir Y
 *
 * After a crash we would expect only X.link to exist. But file X
 * didn't get fsync'd again so the log has back refs for X and X.link.
 *
 * We solve this by removing directory entries and inode backrefs from the
 * log when a file that was logged in the current transaction is
 * unlinked. Any later fsync will include the updated log entries, and
 * we'll be able to reconstruct the proper directory items from backrefs.
 *
 * This optimization allows us to avoid relogging the entire inode
 * or the entire directory.
 */
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 const char *name, int name_len,
				 struct btrfs_inode *dir, u64 index)
{
	struct btrfs_root *log;
	struct btrfs_dir_item *di;
	struct btrfs_path *path;
	int ret;
	int err = 0;
	int bytes_del = 0;
	u64 dir_ino = btrfs_ino(dir);

	if (dir->logged_trans < trans->transid)
		return 0;

	ret = join_running_log_trans(root);
	if (ret)
		return 0;

	mutex_lock(&dir->log_mutex);

	log = root->log_root;
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_unlock;
	}

	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
				   name, name_len, -1);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto fail;
	}
	if (di) {
		ret = btrfs_delete_one_dir_name(trans, log, path, di);
		bytes_del += name_len;
		if (ret) {
			err = ret;
			goto fail;
		}
	}
	btrfs_release_path(path);
	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
					 index, name, name_len, -1);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto fail;
	}
	if (di) {
		ret = btrfs_delete_one_dir_name(trans, log, path, di);
		bytes_del += name_len;
		if (ret) {
			err = ret;
			goto fail;
		}
	}

	/* update the directory size in the log to reflect the names
	 * we have removed
	 */
	if (bytes_del) {
		struct btrfs_key key;

		key.objectid = dir_ino;
		key.offset = 0;
		key.type = BTRFS_INODE_ITEM_KEY;
		btrfs_release_path(path);

		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (ret == 0) {
			struct btrfs_inode_item *item;
			u64 i_size;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3177 struct btrfs_inode_item); 3178 i_size = btrfs_inode_size(path->nodes[0], item); 3179 if (i_size > bytes_del) 3180 i_size -= bytes_del; 3181 else 3182 i_size = 0; 3183 btrfs_set_inode_size(path->nodes[0], item, i_size); 3184 btrfs_mark_buffer_dirty(path->nodes[0]); 3185 } else 3186 ret = 0; 3187 btrfs_release_path(path); 3188 } 3189 fail: 3190 btrfs_free_path(path); 3191 out_unlock: 3192 mutex_unlock(&dir->log_mutex); 3193 if (ret == -ENOSPC) { 3194 btrfs_set_log_full_commit(root->fs_info, trans); 3195 ret = 0; 3196 } else if (ret < 0) 3197 btrfs_abort_transaction(trans, ret); 3198 3199 btrfs_end_log_trans(root); 3200 3201 return err; 3202 } 3203 3204 /* see comments for btrfs_del_dir_entries_in_log */ 3205 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3206 struct btrfs_root *root, 3207 const char *name, int name_len, 3208 struct btrfs_inode *inode, u64 dirid) 3209 { 3210 struct btrfs_fs_info *fs_info = root->fs_info; 3211 struct btrfs_root *log; 3212 u64 index; 3213 int ret; 3214 3215 if (inode->logged_trans < trans->transid) 3216 return 0; 3217 3218 ret = join_running_log_trans(root); 3219 if (ret) 3220 return 0; 3221 log = root->log_root; 3222 mutex_lock(&inode->log_mutex); 3223 3224 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3225 dirid, &index); 3226 mutex_unlock(&inode->log_mutex); 3227 if (ret == -ENOSPC) { 3228 btrfs_set_log_full_commit(fs_info, trans); 3229 ret = 0; 3230 } else if (ret < 0 && ret != -ENOENT) 3231 btrfs_abort_transaction(trans, ret); 3232 btrfs_end_log_trans(root); 3233 3234 return ret; 3235 } 3236 3237 /* 3238 * creates a range item in the log for 'dirid'. first_offset and 3239 * last_offset tell us which parts of the key space the log should 3240 * be considered authoritative for. 3241 */ 3242 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3243 struct btrfs_root *log, 3244 struct btrfs_path *path, 3245 int key_type, u64 dirid, 3246 u64 first_offset, u64 last_offset) 3247 { 3248 int ret; 3249 struct btrfs_key key; 3250 struct btrfs_dir_log_item *item; 3251 3252 key.objectid = dirid; 3253 key.offset = first_offset; 3254 if (key_type == BTRFS_DIR_ITEM_KEY) 3255 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3256 else 3257 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3258 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3259 if (ret) 3260 return ret; 3261 3262 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3263 struct btrfs_dir_log_item); 3264 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3265 btrfs_mark_buffer_dirty(path->nodes[0]); 3266 btrfs_release_path(path); 3267 return 0; 3268 } 3269 3270 /* 3271 * log all the items included in the current transaction for a given 3272 * directory. 
This also creates the range items in the log tree required 3273 * to replay anything deleted before the fsync 3274 */ 3275 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3276 struct btrfs_root *root, struct btrfs_inode *inode, 3277 struct btrfs_path *path, 3278 struct btrfs_path *dst_path, int key_type, 3279 struct btrfs_log_ctx *ctx, 3280 u64 min_offset, u64 *last_offset_ret) 3281 { 3282 struct btrfs_key min_key; 3283 struct btrfs_root *log = root->log_root; 3284 struct extent_buffer *src; 3285 int err = 0; 3286 int ret; 3287 int i; 3288 int nritems; 3289 u64 first_offset = min_offset; 3290 u64 last_offset = (u64)-1; 3291 u64 ino = btrfs_ino(inode); 3292 3293 log = root->log_root; 3294 3295 min_key.objectid = ino; 3296 min_key.type = key_type; 3297 min_key.offset = min_offset; 3298 3299 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3300 3301 /* 3302 * we didn't find anything from this transaction, see if there 3303 * is anything at all 3304 */ 3305 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3306 min_key.objectid = ino; 3307 min_key.type = key_type; 3308 min_key.offset = (u64)-1; 3309 btrfs_release_path(path); 3310 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3311 if (ret < 0) { 3312 btrfs_release_path(path); 3313 return ret; 3314 } 3315 ret = btrfs_previous_item(root, path, ino, key_type); 3316 3317 /* if ret == 0 there are items for this type, 3318 * create a range to tell us the last key of this type. 3319 * otherwise, there are no items in this directory after 3320 * *min_offset, and we create a range to indicate that. 3321 */ 3322 if (ret == 0) { 3323 struct btrfs_key tmp; 3324 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3325 path->slots[0]); 3326 if (key_type == tmp.type) 3327 first_offset = max(min_offset, tmp.offset) + 1; 3328 } 3329 goto done; 3330 } 3331 3332 /* go backward to find any previous key */ 3333 ret = btrfs_previous_item(root, path, ino, key_type); 3334 if (ret == 0) { 3335 struct btrfs_key tmp; 3336 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3337 if (key_type == tmp.type) { 3338 first_offset = tmp.offset; 3339 ret = overwrite_item(trans, log, dst_path, 3340 path->nodes[0], path->slots[0], 3341 &tmp); 3342 if (ret) { 3343 err = ret; 3344 goto done; 3345 } 3346 } 3347 } 3348 btrfs_release_path(path); 3349 3350 /* find the first key from this transaction again */ 3351 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3352 if (WARN_ON(ret != 0)) 3353 goto done; 3354 3355 /* 3356 * we have a block from this transaction, log every item in it 3357 * from our directory 3358 */ 3359 while (1) { 3360 struct btrfs_key tmp; 3361 src = path->nodes[0]; 3362 nritems = btrfs_header_nritems(src); 3363 for (i = path->slots[0]; i < nritems; i++) { 3364 struct btrfs_dir_item *di; 3365 3366 btrfs_item_key_to_cpu(src, &min_key, i); 3367 3368 if (min_key.objectid != ino || min_key.type != key_type) 3369 goto done; 3370 ret = overwrite_item(trans, log, dst_path, src, i, 3371 &min_key); 3372 if (ret) { 3373 err = ret; 3374 goto done; 3375 } 3376 3377 /* 3378 * We must make sure that when we log a directory entry, 3379 * the corresponding inode, after log replay, has a 3380 * matching link count. 
For example:
			 *
			 * touch foo
			 * mkdir mydir
			 * sync
			 * ln foo mydir/bar
			 * xfs_io -c "fsync" mydir
			 * <crash>
			 * <mount fs and log replay>
			 *
			 * Would result in an fsync log that, when replayed,
			 * leaves our file inode with a link count of 1, but
			 * with two directory entries pointing to the same
			 * inode. After removing one of the names, it would
			 * not be possible to remove the other name, which
			 * always resulted in stale file handle errors, and
			 * it would not be possible to rmdir the parent
			 * directory, since its i_size could never decrement
			 * to the value BTRFS_EMPTY_DIR_SIZE, resulting in
			 * -ENOTEMPTY errors.
			 */
			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
			btrfs_dir_item_key_to_cpu(src, di, &tmp);
			if (ctx &&
			    (btrfs_dir_transid(src, di) == trans->transid ||
			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
			    tmp.type != BTRFS_ROOT_ITEM_KEY)
				ctx->log_new_dentries = true;
		}
		path->slots[0] = nritems;

		/*
		 * look ahead to the next item and see if it is also
		 * from this directory and from this transaction
		 */
		ret = btrfs_next_leaf(root, path);
		if (ret == 1) {
			last_offset = (u64)-1;
			goto done;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (tmp.objectid != ino || tmp.type != key_type) {
			last_offset = (u64)-1;
			goto done;
		}
		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret)
				err = ret;
			else
				last_offset = tmp.offset;
			goto done;
		}
	}
done:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);

	if (err == 0) {
		*last_offset_ret = last_offset;
		/*
		 * insert the log range keys to indicate where the log
		 * is valid
		 */
		ret = insert_dir_log_key(trans, log, path, key_type,
					 ino, first_offset, last_offset);
		if (ret)
			err = ret;
	}
	return err;
}

/*
 * logging directories is very similar to logging inodes. We find all the
 * items from the current transaction and write them to the log.
 *
 * The recovery code scans the directory in the subvolume, and if it finds a
 * key in the range logged that is not present in the log tree, then it means
 * that dir entry was unlinked during the transaction.
 *
 * In order for that scan to work, we must include one key smaller than
 * the smallest key logged by this transaction and one key larger than the
 * largest key logged by this transaction.
 */
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_inode *inode,
			  struct btrfs_path *path,
			  struct btrfs_path *dst_path,
			  struct btrfs_log_ctx *ctx)
{
	u64 min_key;
	u64 max_key;
	int ret;
	int key_type = BTRFS_DIR_ITEM_KEY;

again:
	min_key = 0;
	max_key = 0;
	while (1) {
		ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
				    ctx, min_key, &max_key);
		if (ret)
			return ret;
		if (max_key == (u64)-1)
			break;
		min_key = max_key + 1;
	}

	if (key_type == BTRFS_DIR_ITEM_KEY) {
		key_type = BTRFS_DIR_INDEX_KEY;
		goto again;
	}
	return 0;
}

/*
 * a helper function to drop items from the log before we relog an
 * inode.
max_key_type indicates the highest item type to remove.
 * This cannot be run for file data extents because it does not
 * free the extents they point to.
 */
static int drop_objectid_items(struct btrfs_trans_handle *trans,
			       struct btrfs_root *log,
			       struct btrfs_path *path,
			       u64 objectid, int max_key_type)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int start_slot;

	key.objectid = objectid;
	key.type = max_key_type;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
		BUG_ON(ret == 0); /* Logic error */
		if (ret < 0)
			break;

		if (path->slots[0] == 0)
			break;

		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);

		if (found_key.objectid != objectid)
			break;

		found_key.offset = 0;
		found_key.type = 0;
		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
				       &start_slot);

		ret = btrfs_del_items(trans, log, path, start_slot,
				      path->slots[0] - start_slot + 1);
		/*
		 * If start slot isn't 0 then we don't need to re-search, we've
		 * found the last guy with the objectid in this tree.
		 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	if (ret > 0)
		ret = 0;
	return ret;
}

static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only,
			    u64 logged_isize)
{
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	if (log_inode_only) {
		/* set the generation to zero so the recovery code
		 * can tell the difference between logging just to
		 * say 'this inode exists' and logging to say
		 * 'update this inode with these values'
		 */
		btrfs_set_token_inode_generation(leaf, item, 0, &token);
		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
	} else {
		btrfs_set_token_inode_generation(leaf, item,
						 BTRFS_I(inode)->generation,
						 &token);
		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
	}

	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);

	btrfs_set_token_timespec_sec(leaf, &item->atime,
				     inode->i_atime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->atime,
				      inode->i_atime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->mtime,
				     inode->i_mtime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
				      inode->i_mtime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->ctime,
				     inode->i_ctime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
				      inode->i_ctime.tv_nsec, &token);

	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
				     &token);

	btrfs_set_token_inode_sequence(leaf, item,
				       inode_peek_iversion(inode), &token);
	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
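	/*
	 * Editorial note (assumption, not from the original source): the
	 * inode block group field is a legacy value that is no longer
	 * used, so the log copy always records it as zero.
	 */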
3606 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3607 } 3608 3609 static int log_inode_item(struct btrfs_trans_handle *trans, 3610 struct btrfs_root *log, struct btrfs_path *path, 3611 struct btrfs_inode *inode) 3612 { 3613 struct btrfs_inode_item *inode_item; 3614 int ret; 3615 3616 ret = btrfs_insert_empty_item(trans, log, path, 3617 &inode->location, sizeof(*inode_item)); 3618 if (ret && ret != -EEXIST) 3619 return ret; 3620 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3621 struct btrfs_inode_item); 3622 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3623 0, 0); 3624 btrfs_release_path(path); 3625 return 0; 3626 } 3627 3628 static noinline int copy_items(struct btrfs_trans_handle *trans, 3629 struct btrfs_inode *inode, 3630 struct btrfs_path *dst_path, 3631 struct btrfs_path *src_path, u64 *last_extent, 3632 int start_slot, int nr, int inode_only, 3633 u64 logged_isize) 3634 { 3635 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 3636 unsigned long src_offset; 3637 unsigned long dst_offset; 3638 struct btrfs_root *log = inode->root->log_root; 3639 struct btrfs_file_extent_item *extent; 3640 struct btrfs_inode_item *inode_item; 3641 struct extent_buffer *src = src_path->nodes[0]; 3642 struct btrfs_key first_key, last_key, key; 3643 int ret; 3644 struct btrfs_key *ins_keys; 3645 u32 *ins_sizes; 3646 char *ins_data; 3647 int i; 3648 struct list_head ordered_sums; 3649 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3650 bool has_extents = false; 3651 bool need_find_last_extent = true; 3652 bool done = false; 3653 3654 INIT_LIST_HEAD(&ordered_sums); 3655 3656 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3657 nr * sizeof(u32), GFP_NOFS); 3658 if (!ins_data) 3659 return -ENOMEM; 3660 3661 first_key.objectid = (u64)-1; 3662 3663 ins_sizes = (u32 *)ins_data; 3664 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3665 3666 for (i = 0; i < nr; i++) { 3667 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3668 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3669 } 3670 ret = btrfs_insert_empty_items(trans, log, dst_path, 3671 ins_keys, ins_sizes, nr); 3672 if (ret) { 3673 kfree(ins_data); 3674 return ret; 3675 } 3676 3677 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3678 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3679 dst_path->slots[0]); 3680 3681 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3682 3683 if (i == nr - 1) 3684 last_key = ins_keys[i]; 3685 3686 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3687 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3688 dst_path->slots[0], 3689 struct btrfs_inode_item); 3690 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3691 &inode->vfs_inode, 3692 inode_only == LOG_INODE_EXISTS, 3693 logged_isize); 3694 } else { 3695 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3696 src_offset, ins_sizes[i]); 3697 } 3698 3699 /* 3700 * We set need_find_last_extent here in case we know we were 3701 * processing other items and then walk into the first extent in 3702 * the inode. If we don't hit an extent then nothing changes, 3703 * we'll do the last search the next time around. 
3704 */ 3705 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3706 has_extents = true; 3707 if (first_key.objectid == (u64)-1) 3708 first_key = ins_keys[i]; 3709 } else { 3710 need_find_last_extent = false; 3711 } 3712 3713 /* take a reference on file data extents so that truncates 3714 * or deletes of this inode don't have to relog the inode 3715 * again 3716 */ 3717 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3718 !skip_csum) { 3719 int found_type; 3720 extent = btrfs_item_ptr(src, start_slot + i, 3721 struct btrfs_file_extent_item); 3722 3723 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3724 continue; 3725 3726 found_type = btrfs_file_extent_type(src, extent); 3727 if (found_type == BTRFS_FILE_EXTENT_REG) { 3728 u64 ds, dl, cs, cl; 3729 ds = btrfs_file_extent_disk_bytenr(src, 3730 extent); 3731 /* ds == 0 is a hole */ 3732 if (ds == 0) 3733 continue; 3734 3735 dl = btrfs_file_extent_disk_num_bytes(src, 3736 extent); 3737 cs = btrfs_file_extent_offset(src, extent); 3738 cl = btrfs_file_extent_num_bytes(src, 3739 extent); 3740 if (btrfs_file_extent_compression(src, 3741 extent)) { 3742 cs = 0; 3743 cl = dl; 3744 } 3745 3746 ret = btrfs_lookup_csums_range( 3747 fs_info->csum_root, 3748 ds + cs, ds + cs + cl - 1, 3749 &ordered_sums, 0); 3750 if (ret) { 3751 btrfs_release_path(dst_path); 3752 kfree(ins_data); 3753 return ret; 3754 } 3755 } 3756 } 3757 } 3758 3759 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3760 btrfs_release_path(dst_path); 3761 kfree(ins_data); 3762 3763 /* 3764 * we have to do this after the loop above to avoid changing the 3765 * log tree while trying to change the log tree. 3766 */ 3767 ret = 0; 3768 while (!list_empty(&ordered_sums)) { 3769 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3770 struct btrfs_ordered_sum, 3771 list); 3772 if (!ret) 3773 ret = btrfs_csum_file_blocks(trans, log, sums); 3774 list_del(&sums->list); 3775 kfree(sums); 3776 } 3777 3778 if (!has_extents) 3779 return ret; 3780 3781 if (need_find_last_extent && *last_extent == first_key.offset) { 3782 /* 3783 * We don't have any leafs between our current one and the one 3784 * we processed before that can have file extent items for our 3785 * inode (and have a generation number smaller than our current 3786 * transaction id). 3787 */ 3788 need_find_last_extent = false; 3789 } 3790 3791 /* 3792 * Because we use btrfs_search_forward we could skip leaves that were 3793 * not modified and then assume *last_extent is valid when it really 3794 * isn't. So back up to the previous leaf and read the end of the last 3795 * extent before we go and fill in holes. 
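	 *
	 * (Illustration with invented offsets: if the previous leaf's last
	 * extent for this inode ends at 64K and the first key we copied
	 * starts at 128K, the 64K..128K gap gets an explicit hole extent
	 * below so replay cannot leave stale data mapped there.)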
3796 */ 3797 if (need_find_last_extent) { 3798 u64 len; 3799 3800 ret = btrfs_prev_leaf(inode->root, src_path); 3801 if (ret < 0) 3802 return ret; 3803 if (ret) 3804 goto fill_holes; 3805 if (src_path->slots[0]) 3806 src_path->slots[0]--; 3807 src = src_path->nodes[0]; 3808 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3809 if (key.objectid != btrfs_ino(inode) || 3810 key.type != BTRFS_EXTENT_DATA_KEY) 3811 goto fill_holes; 3812 extent = btrfs_item_ptr(src, src_path->slots[0], 3813 struct btrfs_file_extent_item); 3814 if (btrfs_file_extent_type(src, extent) == 3815 BTRFS_FILE_EXTENT_INLINE) { 3816 len = btrfs_file_extent_inline_len(src, 3817 src_path->slots[0], 3818 extent); 3819 *last_extent = ALIGN(key.offset + len, 3820 fs_info->sectorsize); 3821 } else { 3822 len = btrfs_file_extent_num_bytes(src, extent); 3823 *last_extent = key.offset + len; 3824 } 3825 } 3826 fill_holes: 3827 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3828 * things could have happened 3829 * 3830 * 1) A merge could have happened, so we could currently be on a leaf 3831 * that holds what we were copying in the first place. 3832 * 2) A split could have happened, and now not all of the items we want 3833 * are on the same leaf. 3834 * 3835 * So we need to adjust how we search for holes, we need to drop the 3836 * path and re-search for the first extent key we found, and then walk 3837 * forward until we hit the last one we copied. 3838 */ 3839 if (need_find_last_extent) { 3840 /* btrfs_prev_leaf could return 1 without releasing the path */ 3841 btrfs_release_path(src_path); 3842 ret = btrfs_search_slot(NULL, inode->root, &first_key, 3843 src_path, 0, 0); 3844 if (ret < 0) 3845 return ret; 3846 ASSERT(ret == 0); 3847 src = src_path->nodes[0]; 3848 i = src_path->slots[0]; 3849 } else { 3850 i = start_slot; 3851 } 3852 3853 /* 3854 * Ok so here we need to go through and fill in any holes we may have 3855 * to make sure that holes are punched for those areas in case they had 3856 * extents previously. 3857 */ 3858 while (!done) { 3859 u64 offset, len; 3860 u64 extent_end; 3861 3862 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3863 ret = btrfs_next_leaf(inode->root, src_path); 3864 if (ret < 0) 3865 return ret; 3866 ASSERT(ret == 0); 3867 src = src_path->nodes[0]; 3868 i = 0; 3869 } 3870 3871 btrfs_item_key_to_cpu(src, &key, i); 3872 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3873 done = true; 3874 if (key.objectid != btrfs_ino(inode) || 3875 key.type != BTRFS_EXTENT_DATA_KEY) { 3876 i++; 3877 continue; 3878 } 3879 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3880 if (btrfs_file_extent_type(src, extent) == 3881 BTRFS_FILE_EXTENT_INLINE) { 3882 len = btrfs_file_extent_inline_len(src, i, extent); 3883 extent_end = ALIGN(key.offset + len, 3884 fs_info->sectorsize); 3885 } else { 3886 len = btrfs_file_extent_num_bytes(src, extent); 3887 extent_end = key.offset + len; 3888 } 3889 i++; 3890 3891 if (*last_extent == key.offset) { 3892 *last_extent = extent_end; 3893 continue; 3894 } 3895 offset = *last_extent; 3896 len = key.offset - *last_extent; 3897 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3898 offset, 0, 0, len, 0, len, 0, 0, 0); 3899 if (ret) 3900 break; 3901 *last_extent = extent_end; 3902 } 3903 /* 3904 * Need to let the callers know we dropped the path so they should 3905 * re-search. 
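	 * (That is what the ret = 1 below signals; a return of 0 would tell
	 * them the path is still positioned where they left it.)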
*/
	if (!ret && need_find_last_extent)
		ret = 1;
	return ret;
}

static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct extent_map *em1, *em2;

	em1 = list_entry(a, struct extent_map, list);
	em2 = list_entry(b, struct extent_map, list);

	if (em1->start < em2->start)
		return -1;
	else if (em1->start > em2->start)
		return 1;
	return 0;
}

static int wait_ordered_extents(struct btrfs_trans_handle *trans,
				struct inode *inode,
				struct btrfs_root *root,
				const struct extent_map *em,
				const struct list_head *logged_list,
				bool *ordered_io_error)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_root *log = root->log_root;
	u64 mod_start = em->mod_start;
	u64 mod_len = em->mod_len;
	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
	u64 csum_offset;
	u64 csum_len;
	LIST_HEAD(ordered_sums);
	int ret = 0;

	*ordered_io_error = false;

	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    em->block_start == EXTENT_MAP_HOLE)
		return 0;

	/*
	 * Wait for any ordered extent that covers our extent map. If it
	 * finishes without an error, first check and see if our csums are on
	 * our outstanding ordered extents.
	 */
	list_for_each_entry(ordered, logged_list, log_list) {
		struct btrfs_ordered_sum *sum;

		if (!mod_len)
			break;

		if (ordered->file_offset + ordered->len <= mod_start ||
		    mod_start + mod_len <= ordered->file_offset)
			continue;

		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
		    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
			const u64 start = ordered->file_offset;
			const u64 end = ordered->file_offset + ordered->len - 1;

			WARN_ON(ordered->inode != inode);
			filemap_fdatawrite_range(inode->i_mapping, start, end);
		}

		wait_event(ordered->wait,
			   (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));

		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
			/*
			 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
			 * i_mapping flags, so that the next fsync won't get
			 * an outdated io error too.
			 */
			filemap_check_errors(inode->i_mapping);
			*ordered_io_error = true;
			break;
		}
		/*
		 * We are going to copy all the csums on this ordered extent, so
		 * go ahead and adjust mod_start and mod_len in case this
		 * ordered extent has already been logged.
		 */
		if (ordered->file_offset > mod_start) {
			if (ordered->file_offset + ordered->len >=
			    mod_start + mod_len)
				mod_len = ordered->file_offset - mod_start;
			/*
			 * If we have this case
			 *
			 *   |--------- logged extent ---------|
			 *         |----- ordered extent ----|
			 *
			 * Just don't mess with mod_start and mod_len, we'll
			 * just end up logging more csums than we need and it
			 * will be ok.
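			 *
			 * (The mirror case, where the ordered extent overlaps
			 * the front of the logged range:
			 *
			 *         |--------- logged extent ---------|
			 *   |----- ordered extent ----|
			 *
			 * is handled in the else branch below by moving
			 * mod_start forward past the ordered extent.)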
4007 */ 4008 } else { 4009 if (ordered->file_offset + ordered->len < 4010 mod_start + mod_len) { 4011 mod_len = (mod_start + mod_len) - 4012 (ordered->file_offset + ordered->len); 4013 mod_start = ordered->file_offset + 4014 ordered->len; 4015 } else { 4016 mod_len = 0; 4017 } 4018 } 4019 4020 if (skip_csum) 4021 continue; 4022 4023 /* 4024 * To keep us from looping for the above case of an ordered 4025 * extent that falls inside of the logged extent. 4026 */ 4027 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 4028 &ordered->flags)) 4029 continue; 4030 4031 list_for_each_entry(sum, &ordered->list, list) { 4032 ret = btrfs_csum_file_blocks(trans, log, sum); 4033 if (ret) 4034 break; 4035 } 4036 } 4037 4038 if (*ordered_io_error || !mod_len || ret || skip_csum) 4039 return ret; 4040 4041 if (em->compress_type) { 4042 csum_offset = 0; 4043 csum_len = max(em->block_len, em->orig_block_len); 4044 } else { 4045 csum_offset = mod_start - em->start; 4046 csum_len = mod_len; 4047 } 4048 4049 /* block start is already adjusted for the file extent offset. */ 4050 ret = btrfs_lookup_csums_range(fs_info->csum_root, 4051 em->block_start + csum_offset, 4052 em->block_start + csum_offset + 4053 csum_len - 1, &ordered_sums, 0); 4054 if (ret) 4055 return ret; 4056 4057 while (!list_empty(&ordered_sums)) { 4058 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4059 struct btrfs_ordered_sum, 4060 list); 4061 if (!ret) 4062 ret = btrfs_csum_file_blocks(trans, log, sums); 4063 list_del(&sums->list); 4064 kfree(sums); 4065 } 4066 4067 return ret; 4068 } 4069 4070 static int log_one_extent(struct btrfs_trans_handle *trans, 4071 struct btrfs_inode *inode, struct btrfs_root *root, 4072 const struct extent_map *em, 4073 struct btrfs_path *path, 4074 const struct list_head *logged_list, 4075 struct btrfs_log_ctx *ctx) 4076 { 4077 struct btrfs_root *log = root->log_root; 4078 struct btrfs_file_extent_item *fi; 4079 struct extent_buffer *leaf; 4080 struct btrfs_map_token token; 4081 struct btrfs_key key; 4082 u64 extent_offset = em->start - em->orig_start; 4083 u64 block_len; 4084 int ret; 4085 int extent_inserted = 0; 4086 bool ordered_io_err = false; 4087 4088 ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, 4089 logged_list, &ordered_io_err); 4090 if (ret) 4091 return ret; 4092 4093 if (ordered_io_err) { 4094 ctx->io_err = -EIO; 4095 return ctx->io_err; 4096 } 4097 4098 btrfs_init_map_token(&token); 4099 4100 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4101 em->start + em->len, NULL, 0, 1, 4102 sizeof(*fi), &extent_inserted); 4103 if (ret) 4104 return ret; 4105 4106 if (!extent_inserted) { 4107 key.objectid = btrfs_ino(inode); 4108 key.type = BTRFS_EXTENT_DATA_KEY; 4109 key.offset = em->start; 4110 4111 ret = btrfs_insert_empty_item(trans, log, path, &key, 4112 sizeof(*fi)); 4113 if (ret) 4114 return ret; 4115 } 4116 leaf = path->nodes[0]; 4117 fi = btrfs_item_ptr(leaf, path->slots[0], 4118 struct btrfs_file_extent_item); 4119 4120 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4121 &token); 4122 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4123 btrfs_set_token_file_extent_type(leaf, fi, 4124 BTRFS_FILE_EXTENT_PREALLOC, 4125 &token); 4126 else 4127 btrfs_set_token_file_extent_type(leaf, fi, 4128 BTRFS_FILE_EXTENT_REG, 4129 &token); 4130 4131 block_len = max(em->block_len, em->orig_block_len); 4132 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4133 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4134 em->block_start, 4135 &token); 
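/* (Added note: in this compressed case disk_bytenr is em->block_start as is, because the file extent item must reference the whole compressed extent on disk; the regular case below subtracts extent_offset instead.) */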
4136 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4137 &token); 4138 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4139 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4140 em->block_start - 4141 extent_offset, &token); 4142 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4143 &token); 4144 } else { 4145 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4146 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4147 &token); 4148 } 4149 4150 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4151 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4152 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4153 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4154 &token); 4155 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4156 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4157 btrfs_mark_buffer_dirty(leaf); 4158 4159 btrfs_release_path(path); 4160 4161 return ret; 4162 } 4163 4164 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4165 struct btrfs_root *root, 4166 struct btrfs_inode *inode, 4167 struct btrfs_path *path, 4168 struct list_head *logged_list, 4169 struct btrfs_log_ctx *ctx, 4170 const u64 start, 4171 const u64 end) 4172 { 4173 struct extent_map *em, *n; 4174 struct list_head extents; 4175 struct extent_map_tree *tree = &inode->extent_tree; 4176 u64 logged_start, logged_end; 4177 u64 test_gen; 4178 int ret = 0; 4179 int num = 0; 4180 4181 INIT_LIST_HEAD(&extents); 4182 4183 down_write(&inode->dio_sem); 4184 write_lock(&tree->lock); 4185 test_gen = root->fs_info->last_trans_committed; 4186 logged_start = start; 4187 logged_end = end; 4188 4189 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4190 list_del_init(&em->list); 4191 /* 4192 * Just an arbitrary number, this can be really CPU intensive 4193 * once we start getting a lot of extents, and really once we 4194 * have a bunch of extents we just want to commit since it will 4195 * be faster. 4196 */ 4197 if (++num > 32768) { 4198 list_del_init(&tree->modified_extents); 4199 ret = -EFBIG; 4200 goto process; 4201 } 4202 4203 if (em->generation <= test_gen) 4204 continue; 4205 4206 if (em->start < logged_start) 4207 logged_start = em->start; 4208 if ((em->start + em->len - 1) > logged_end) 4209 logged_end = em->start + em->len - 1; 4210 4211 /* Need a ref to keep it from getting evicted from cache */ 4212 refcount_inc(&em->refs); 4213 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4214 list_add_tail(&em->list, &extents); 4215 num++; 4216 } 4217 4218 list_sort(NULL, &extents, extent_cmp); 4219 btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); 4220 /* 4221 * Some ordered extents started by fsync might have completed 4222 * before we could collect them into the list logged_list, which 4223 * means they're gone, not in our logged_list nor in the inode's 4224 * ordered tree. We want the application/user space to know an 4225 * error happened while attempting to persist file data so that 4226 * it can take proper action. If such error happened, we leave 4227 * without writing to the log tree and the fsync must report the 4228 * file data write error and not commit the current transaction. 
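* * (An added illustration: an ordered extent that failed with an I/O error may complete and vanish right before btrfs_get_logged_extents() above could collect it, leaving the failure recorded only in the mapping's error flags - which is exactly what the filemap_check_errors() call below catches.)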
4229 */ 4230 ret = filemap_check_errors(inode->vfs_inode.i_mapping); 4231 if (ret) 4232 ctx->io_err = ret; 4233 process: 4234 while (!list_empty(&extents)) { 4235 em = list_entry(extents.next, struct extent_map, list); 4236 4237 list_del_init(&em->list); 4238 4239 /* 4240 * If we had an error we just need to delete everybody from our 4241 * private list. 4242 */ 4243 if (ret) { 4244 clear_em_logging(tree, em); 4245 free_extent_map(em); 4246 continue; 4247 } 4248 4249 write_unlock(&tree->lock); 4250 4251 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4252 ctx); 4253 write_lock(&tree->lock); 4254 clear_em_logging(tree, em); 4255 free_extent_map(em); 4256 } 4257 WARN_ON(!list_empty(&extents)); 4258 write_unlock(&tree->lock); 4259 up_write(&inode->dio_sem); 4260 4261 btrfs_release_path(path); 4262 return ret; 4263 } 4264 4265 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4266 struct btrfs_path *path, u64 *size_ret) 4267 { 4268 struct btrfs_key key; 4269 int ret; 4270 4271 key.objectid = btrfs_ino(inode); 4272 key.type = BTRFS_INODE_ITEM_KEY; 4273 key.offset = 0; 4274 4275 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4276 if (ret < 0) { 4277 return ret; 4278 } else if (ret > 0) { 4279 *size_ret = 0; 4280 } else { 4281 struct btrfs_inode_item *item; 4282 4283 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4284 struct btrfs_inode_item); 4285 *size_ret = btrfs_inode_size(path->nodes[0], item); 4286 } 4287 4288 btrfs_release_path(path); 4289 return 0; 4290 } 4291 4292 /* 4293 * At the moment we always log all xattrs. This is to figure out at log replay 4294 * time which xattrs must have their deletion replayed. If an xattr is missing 4295 * in the log tree but exists in the fs/subvol tree, we delete it. This is 4296 * because if an xattr is deleted, the inode is fsynced, and then a power failure 4297 * happens, the log is replayed the next time the fs is mounted, 4298 * and we want the xattr to not exist anymore (same behaviour as other filesystems 4299 * with a journal: ext3/4, xfs, f2fs, etc).
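* * An added example of the sequence this protects against (user.test being an arbitrary xattr name): * * setfattr -n user.test -v val file * sync * setfattr -x user.test file * xfs_io -c fsync file * <power failure> * mount fs, trigger log replay * * The logged inode carries its remaining xattrs but no user.test item, so replay deletes user.test from the fs/subvol tree.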
4300 */ 4301 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4302 struct btrfs_root *root, 4303 struct btrfs_inode *inode, 4304 struct btrfs_path *path, 4305 struct btrfs_path *dst_path) 4306 { 4307 int ret; 4308 struct btrfs_key key; 4309 const u64 ino = btrfs_ino(inode); 4310 int ins_nr = 0; 4311 int start_slot = 0; 4312 4313 key.objectid = ino; 4314 key.type = BTRFS_XATTR_ITEM_KEY; 4315 key.offset = 0; 4316 4317 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4318 if (ret < 0) 4319 return ret; 4320 4321 while (true) { 4322 int slot = path->slots[0]; 4323 struct extent_buffer *leaf = path->nodes[0]; 4324 int nritems = btrfs_header_nritems(leaf); 4325 4326 if (slot >= nritems) { 4327 if (ins_nr > 0) { 4328 u64 last_extent = 0; 4329 4330 ret = copy_items(trans, inode, dst_path, path, 4331 &last_extent, start_slot, 4332 ins_nr, 1, 0); 4333 /* can't be 1, extent items aren't processed */ 4334 ASSERT(ret <= 0); 4335 if (ret < 0) 4336 return ret; 4337 ins_nr = 0; 4338 } 4339 ret = btrfs_next_leaf(root, path); 4340 if (ret < 0) 4341 return ret; 4342 else if (ret > 0) 4343 break; 4344 continue; 4345 } 4346 4347 btrfs_item_key_to_cpu(leaf, &key, slot); 4348 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4349 break; 4350 4351 if (ins_nr == 0) 4352 start_slot = slot; 4353 ins_nr++; 4354 path->slots[0]++; 4355 cond_resched(); 4356 } 4357 if (ins_nr > 0) { 4358 u64 last_extent = 0; 4359 4360 ret = copy_items(trans, inode, dst_path, path, 4361 &last_extent, start_slot, 4362 ins_nr, 1, 0); 4363 /* can't be 1, extent items aren't processed */ 4364 ASSERT(ret <= 0); 4365 if (ret < 0) 4366 return ret; 4367 } 4368 4369 return 0; 4370 } 4371 4372 /* 4373 * If the no holes feature is enabled we need to make sure any hole between the 4374 * last extent and the i_size of our inode is explicitly marked in the log. This 4375 * is to make sure that doing something like: 4376 * 4377 * 1) create file with 128Kb of data 4378 * 2) truncate file to 64Kb 4379 * 3) truncate file to 256Kb 4380 * 4) fsync file 4381 * 5) <crash/power failure> 4382 * 6) mount fs and trigger log replay 4383 * 4384 * Will give us a file with a size of 256Kb, the first 64Kb of data match what 4385 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4386 * file correspond to a hole. The presence of explicit holes in a log tree is 4387 * what guarantees that log replay will remove/adjust file extent items in the 4388 * fs/subvol tree. 4389 * 4390 * Here we do not need to care about holes between extents, that is already done 4391 * by copy_items(). We also only need to do this in the full sync path, where we 4392 * lookup for extents from the fs/subvol tree only. In the fast path case, we 4393 * lookup the list of modified extent maps and if any represents a hole, we 4394 * insert a corresponding extent representing a hole in the log tree. 
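* * For the example above, the log would thus end up with two file extent items (an added sketch): one for the first 64Kb copied from the fs/subvol tree, and one for the [64Kb, 256Kb) range with a disk_bytenr of 0, i.e. an explicit hole - which is what the btrfs_insert_file_extent() call below creates.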
4395 */ 4396 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4397 struct btrfs_root *root, 4398 struct btrfs_inode *inode, 4399 struct btrfs_path *path) 4400 { 4401 struct btrfs_fs_info *fs_info = root->fs_info; 4402 int ret; 4403 struct btrfs_key key; 4404 u64 hole_start; 4405 u64 hole_size; 4406 struct extent_buffer *leaf; 4407 struct btrfs_root *log = root->log_root; 4408 const u64 ino = btrfs_ino(inode); 4409 const u64 i_size = i_size_read(&inode->vfs_inode); 4410 4411 if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4412 return 0; 4413 4414 key.objectid = ino; 4415 key.type = BTRFS_EXTENT_DATA_KEY; 4416 key.offset = (u64)-1; 4417 4418 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4419 ASSERT(ret != 0); 4420 if (ret < 0) 4421 return ret; 4422 4423 ASSERT(path->slots[0] > 0); 4424 path->slots[0]--; 4425 leaf = path->nodes[0]; 4426 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4427 4428 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4429 /* inode does not have any extents */ 4430 hole_start = 0; 4431 hole_size = i_size; 4432 } else { 4433 struct btrfs_file_extent_item *extent; 4434 u64 len; 4435 4436 /* 4437 * If there's an extent beyond i_size, an explicit hole was 4438 * already inserted by copy_items(). 4439 */ 4440 if (key.offset >= i_size) 4441 return 0; 4442 4443 extent = btrfs_item_ptr(leaf, path->slots[0], 4444 struct btrfs_file_extent_item); 4445 4446 if (btrfs_file_extent_type(leaf, extent) == 4447 BTRFS_FILE_EXTENT_INLINE) { 4448 len = btrfs_file_extent_inline_len(leaf, 4449 path->slots[0], 4450 extent); 4451 ASSERT(len == i_size || 4452 (len == fs_info->sectorsize && 4453 btrfs_file_extent_compression(leaf, extent) != 4454 BTRFS_COMPRESS_NONE)); 4455 return 0; 4456 } 4457 4458 len = btrfs_file_extent_num_bytes(leaf, extent); 4459 /* Last extent goes beyond i_size, no need to log a hole. */ 4460 if (key.offset + len > i_size) 4461 return 0; 4462 hole_start = key.offset + len; 4463 hole_size = i_size - hole_start; 4464 } 4465 btrfs_release_path(path); 4466 4467 /* Last extent ends at i_size. */ 4468 if (hole_size == 0) 4469 return 0; 4470 4471 hole_size = ALIGN(hole_size, fs_info->sectorsize); 4472 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4473 hole_size, 0, hole_size, 0, 0, 0); 4474 return ret; 4475 } 4476 4477 /* 4478 * When we are logging a new inode X, check if it doesn't have a reference that 4479 * matches the reference from some other inode Y created in a past transaction 4480 * and that was renamed in the current transaction. If we don't do this, then at 4481 * log replay time we can lose inode Y (and all its files if it's a directory): 4482 * 4483 * mkdir /mnt/x 4484 * echo "hello world" > /mnt/x/foobar 4485 * sync 4486 * mv /mnt/x /mnt/y 4487 * mkdir /mnt/x # or touch /mnt/x 4488 * xfs_io -c fsync /mnt/x 4489 * <power fail> 4490 * mount fs, trigger log replay 4491 * 4492 * After the log replay procedure, we would lose the first directory and all its 4493 * files (file foobar). 
4494 * For the case where inode Y is not a directory we simply end up losing it: 4495 * 4496 * echo "123" > /mnt/foo 4497 * sync 4498 * mv /mnt/foo /mnt/bar 4499 * echo "abc" > /mnt/foo 4500 * xfs_io -c fsync /mnt/foo 4501 * <power fail> 4502 * 4503 * We also need this for cases where a snapshot entry is replaced by some other 4504 * entry (file or directory) otherwise we end up with an unreplayable log due to 4505 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4506 * if it were a regular entry: 4507 * 4508 * mkdir /mnt/x 4509 * btrfs subvolume snapshot /mnt /mnt/x/snap 4510 * btrfs subvolume delete /mnt/x/snap 4511 * rmdir /mnt/x 4512 * mkdir /mnt/x 4513 * fsync /mnt/x or fsync some new file inside it 4514 * <power fail> 4515 * 4516 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4517 * the same transaction. 4518 */ 4519 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4520 const int slot, 4521 const struct btrfs_key *key, 4522 struct btrfs_inode *inode, 4523 u64 *other_ino) 4524 { 4525 int ret; 4526 struct btrfs_path *search_path; 4527 char *name = NULL; 4528 u32 name_len = 0; 4529 u32 item_size = btrfs_item_size_nr(eb, slot); 4530 u32 cur_offset = 0; 4531 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4532 4533 search_path = btrfs_alloc_path(); 4534 if (!search_path) 4535 return -ENOMEM; 4536 search_path->search_commit_root = 1; 4537 search_path->skip_locking = 1; 4538 4539 while (cur_offset < item_size) { 4540 u64 parent; 4541 u32 this_name_len; 4542 u32 this_len; 4543 unsigned long name_ptr; 4544 struct btrfs_dir_item *di; 4545 4546 if (key->type == BTRFS_INODE_REF_KEY) { 4547 struct btrfs_inode_ref *iref; 4548 4549 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4550 parent = key->offset; 4551 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4552 name_ptr = (unsigned long)(iref + 1); 4553 this_len = sizeof(*iref) + this_name_len; 4554 } else { 4555 struct btrfs_inode_extref *extref; 4556 4557 extref = (struct btrfs_inode_extref *)(ptr + 4558 cur_offset); 4559 parent = btrfs_inode_extref_parent(eb, extref); 4560 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4561 name_ptr = (unsigned long)&extref->name; 4562 this_len = sizeof(*extref) + this_name_len; 4563 } 4564 4565 if (this_name_len > name_len) { 4566 char *new_name; 4567 4568 new_name = krealloc(name, this_name_len, GFP_NOFS); 4569 if (!new_name) { 4570 ret = -ENOMEM; 4571 goto out; 4572 } 4573 name_len = this_name_len; 4574 name = new_name; 4575 } 4576 4577 read_extent_buffer(eb, name, name_ptr, this_name_len); 4578 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4579 parent, name, this_name_len, 0); 4580 if (di && !IS_ERR(di)) { 4581 struct btrfs_key di_key; 4582 4583 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4584 di, &di_key); 4585 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4586 ret = 1; 4587 *other_ino = di_key.objectid; 4588 } else { 4589 ret = -EAGAIN; 4590 } 4591 goto out; 4592 } else if (IS_ERR(di)) { 4593 ret = PTR_ERR(di); 4594 goto out; 4595 } 4596 btrfs_release_path(search_path); 4597 4598 cur_offset += this_len; 4599 } 4600 ret = 0; 4601 out: 4602 btrfs_free_path(search_path); 4603 kfree(name); 4604 return ret; 4605 } 4606 4607 /* log a single inode in the tree log. 4608 * At least one parent directory for this inode must exist in the tree 4609 * or be logged already. 4610 * 4611 * Any items from this inode changed by the current transaction are copied 4612 * to the log tree. 
An extra reference is taken on any extents in this 4613 * file, allowing us to avoid a whole pile of corner cases around logging 4614 * blocks that have been removed from the tree. 4615 * 4616 * See LOG_INODE_ALL and related defines for a description of what inode_only 4617 * does. 4618 * 4619 * This handles both files and directories. 4620 */ 4621 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4622 struct btrfs_root *root, struct btrfs_inode *inode, 4623 int inode_only, 4624 const loff_t start, 4625 const loff_t end, 4626 struct btrfs_log_ctx *ctx) 4627 { 4628 struct btrfs_fs_info *fs_info = root->fs_info; 4629 struct btrfs_path *path; 4630 struct btrfs_path *dst_path; 4631 struct btrfs_key min_key; 4632 struct btrfs_key max_key; 4633 struct btrfs_root *log = root->log_root; 4634 LIST_HEAD(logged_list); 4635 u64 last_extent = 0; 4636 int err = 0; 4637 int ret; 4638 int nritems; 4639 int ins_start_slot = 0; 4640 int ins_nr; 4641 bool fast_search = false; 4642 u64 ino = btrfs_ino(inode); 4643 struct extent_map_tree *em_tree = &inode->extent_tree; 4644 u64 logged_isize = 0; 4645 bool need_log_inode_item = true; 4646 4647 path = btrfs_alloc_path(); 4648 if (!path) 4649 return -ENOMEM; 4650 dst_path = btrfs_alloc_path(); 4651 if (!dst_path) { 4652 btrfs_free_path(path); 4653 return -ENOMEM; 4654 } 4655 4656 min_key.objectid = ino; 4657 min_key.type = BTRFS_INODE_ITEM_KEY; 4658 min_key.offset = 0; 4659 4660 max_key.objectid = ino; 4661 4662 4663 /* today the code can only do partial logging of directories */ 4664 if (S_ISDIR(inode->vfs_inode.i_mode) || 4665 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4666 &inode->runtime_flags) && 4667 inode_only >= LOG_INODE_EXISTS)) 4668 max_key.type = BTRFS_XATTR_ITEM_KEY; 4669 else 4670 max_key.type = (u8)-1; 4671 max_key.offset = (u64)-1; 4672 4673 /* 4674 * Only run delayed items if we are a dir or a new file. 4675 * Otherwise commit the delayed inode only, which is needed in 4676 * order for the log replay code to mark inodes for link count 4677 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4678 */ 4679 if (S_ISDIR(inode->vfs_inode.i_mode) || 4680 inode->generation > fs_info->last_trans_committed) 4681 ret = btrfs_commit_inode_delayed_items(trans, inode); 4682 else 4683 ret = btrfs_commit_inode_delayed_inode(inode); 4684 4685 if (ret) { 4686 btrfs_free_path(path); 4687 btrfs_free_path(dst_path); 4688 return ret; 4689 } 4690 4691 if (inode_only == LOG_OTHER_INODE) { 4692 inode_only = LOG_INODE_EXISTS; 4693 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4694 } else { 4695 mutex_lock(&inode->log_mutex); 4696 } 4697 4698 /* 4699 * a brute force approach to making sure we get the most uptodate 4700 * copies of everything. 4701 */ 4702 if (S_ISDIR(inode->vfs_inode.i_mode)) { 4703 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4704 4705 if (inode_only == LOG_INODE_EXISTS) 4706 max_key_type = BTRFS_XATTR_ITEM_KEY; 4707 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4708 } else { 4709 if (inode_only == LOG_INODE_EXISTS) { 4710 /* 4711 * Make sure the new inode item we write to the log has 4712 * the same isize as the current one (if it exists). 4713 * This is necessary to prevent data loss after log 4714 * replay, and also to prevent doing a wrong expanding 4715 * truncate - for e.g. 
create a file, write 4K into offset 4716 * 0, fsync, write 4K into offset 4096, add hard link, 4717 * fsync some other file (to sync log), power fail - if 4718 * we use the inode's current i_size, after log replay 4719 * we get an 8Kb file, with the last 4Kb extent as a hole 4720 * (zeroes), as if an expanding truncate happened, 4721 * instead of getting a file of 4Kb only. 4722 */ 4723 err = logged_inode_size(log, inode, path, &logged_isize); 4724 if (err) 4725 goto out_unlock; 4726 } 4727 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4728 &inode->runtime_flags)) { 4729 if (inode_only == LOG_INODE_EXISTS) { 4730 max_key.type = BTRFS_XATTR_ITEM_KEY; 4731 ret = drop_objectid_items(trans, log, path, ino, 4732 max_key.type); 4733 } else { 4734 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4735 &inode->runtime_flags); 4736 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4737 &inode->runtime_flags); 4738 while (1) { 4739 ret = btrfs_truncate_inode_items(trans, 4740 log, &inode->vfs_inode, 0, 0); 4741 if (ret != -EAGAIN) 4742 break; 4743 } 4744 } 4745 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4746 &inode->runtime_flags) || 4747 inode_only == LOG_INODE_EXISTS) { 4748 if (inode_only == LOG_INODE_ALL) 4749 fast_search = true; 4750 max_key.type = BTRFS_XATTR_ITEM_KEY; 4751 ret = drop_objectid_items(trans, log, path, ino, 4752 max_key.type); 4753 } else { 4754 if (inode_only == LOG_INODE_ALL) 4755 fast_search = true; 4756 goto log_extents; 4757 } 4758 4759 } 4760 if (ret) { 4761 err = ret; 4762 goto out_unlock; 4763 } 4764 4765 while (1) { 4766 ins_nr = 0; 4767 ret = btrfs_search_forward(root, &min_key, 4768 path, trans->transid); 4769 if (ret < 0) { 4770 err = ret; 4771 goto out_unlock; 4772 } 4773 if (ret != 0) 4774 break; 4775 again: 4776 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4777 if (min_key.objectid != ino) 4778 break; 4779 if (min_key.type > max_key.type) 4780 break; 4781 4782 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4783 need_log_inode_item = false; 4784 4785 if ((min_key.type == BTRFS_INODE_REF_KEY || 4786 min_key.type == BTRFS_INODE_EXTREF_KEY) && 4787 inode->generation == trans->transid) { 4788 u64 other_ino = 0; 4789 4790 ret = btrfs_check_ref_name_override(path->nodes[0], 4791 path->slots[0], &min_key, inode, 4792 &other_ino); 4793 if (ret < 0) { 4794 err = ret; 4795 goto out_unlock; 4796 } else if (ret > 0 && ctx && 4797 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 4798 struct btrfs_key inode_key; 4799 struct inode *other_inode; 4800 4801 if (ins_nr > 0) { 4802 ins_nr++; 4803 } else { 4804 ins_nr = 1; 4805 ins_start_slot = path->slots[0]; 4806 } 4807 ret = copy_items(trans, inode, dst_path, path, 4808 &last_extent, ins_start_slot, 4809 ins_nr, inode_only, 4810 logged_isize); 4811 if (ret < 0) { 4812 err = ret; 4813 goto out_unlock; 4814 } 4815 ins_nr = 0; 4816 btrfs_release_path(path); 4817 inode_key.objectid = other_ino; 4818 inode_key.type = BTRFS_INODE_ITEM_KEY; 4819 inode_key.offset = 0; 4820 other_inode = btrfs_iget(fs_info->sb, 4821 &inode_key, root, 4822 NULL); 4823 /* 4824 * If the other inode that had a conflicting dir 4825 * entry was deleted in the current transaction, 4826 * we don't need to do more work nor fall back to 4827 * a transaction commit.
4828 */ 4829 if (IS_ERR(other_inode) && 4830 PTR_ERR(other_inode) == -ENOENT) { 4831 goto next_key; 4832 } else if (IS_ERR(other_inode)) { 4833 err = PTR_ERR(other_inode); 4834 goto out_unlock; 4835 } 4836 /* 4837 * We are safe logging the other inode without 4838 * acquiring its i_mutex as long as we log with 4839 * the LOG_INODE_EXISTS mode. We're safe against 4840 * concurrent renames of the other inode as well 4841 * because during a rename we pin the log and 4842 * update the log with the new name before we 4843 * unpin it. 4844 */ 4845 err = btrfs_log_inode(trans, root, 4846 BTRFS_I(other_inode), 4847 LOG_OTHER_INODE, 0, LLONG_MAX, 4848 ctx); 4849 iput(other_inode); 4850 if (err) 4851 goto out_unlock; 4852 else 4853 goto next_key; 4854 } 4855 } 4856 4857 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4858 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4859 if (ins_nr == 0) 4860 goto next_slot; 4861 ret = copy_items(trans, inode, dst_path, path, 4862 &last_extent, ins_start_slot, 4863 ins_nr, inode_only, logged_isize); 4864 if (ret < 0) { 4865 err = ret; 4866 goto out_unlock; 4867 } 4868 ins_nr = 0; 4869 if (ret) { 4870 btrfs_release_path(path); 4871 continue; 4872 } 4873 goto next_slot; 4874 } 4875 4876 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4877 ins_nr++; 4878 goto next_slot; 4879 } else if (!ins_nr) { 4880 ins_start_slot = path->slots[0]; 4881 ins_nr = 1; 4882 goto next_slot; 4883 } 4884 4885 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4886 ins_start_slot, ins_nr, inode_only, 4887 logged_isize); 4888 if (ret < 0) { 4889 err = ret; 4890 goto out_unlock; 4891 } 4892 if (ret) { 4893 ins_nr = 0; 4894 btrfs_release_path(path); 4895 continue; 4896 } 4897 ins_nr = 1; 4898 ins_start_slot = path->slots[0]; 4899 next_slot: 4900 4901 nritems = btrfs_header_nritems(path->nodes[0]); 4902 path->slots[0]++; 4903 if (path->slots[0] < nritems) { 4904 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 4905 path->slots[0]); 4906 goto again; 4907 } 4908 if (ins_nr) { 4909 ret = copy_items(trans, inode, dst_path, path, 4910 &last_extent, ins_start_slot, 4911 ins_nr, inode_only, logged_isize); 4912 if (ret < 0) { 4913 err = ret; 4914 goto out_unlock; 4915 } 4916 ret = 0; 4917 ins_nr = 0; 4918 } 4919 btrfs_release_path(path); 4920 next_key: 4921 if (min_key.offset < (u64)-1) { 4922 min_key.offset++; 4923 } else if (min_key.type < max_key.type) { 4924 min_key.type++; 4925 min_key.offset = 0; 4926 } else { 4927 break; 4928 } 4929 } 4930 if (ins_nr) { 4931 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4932 ins_start_slot, ins_nr, inode_only, 4933 logged_isize); 4934 if (ret < 0) { 4935 err = ret; 4936 goto out_unlock; 4937 } 4938 ret = 0; 4939 ins_nr = 0; 4940 } 4941 4942 btrfs_release_path(path); 4943 btrfs_release_path(dst_path); 4944 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 4945 if (err) 4946 goto out_unlock; 4947 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 4948 btrfs_release_path(path); 4949 btrfs_release_path(dst_path); 4950 err = btrfs_log_trailing_hole(trans, root, inode, path); 4951 if (err) 4952 goto out_unlock; 4953 } 4954 log_extents: 4955 btrfs_release_path(path); 4956 btrfs_release_path(dst_path); 4957 if (need_log_inode_item) { 4958 err = log_inode_item(trans, log, dst_path, inode); 4959 if (err) 4960 goto out_unlock; 4961 } 4962 if (fast_search) { 4963 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4964 &logged_list, ctx, start, end); 4965 if (ret) { 4966 err = ret; 4967 goto 
out_unlock; 4968 } 4969 } else if (inode_only == LOG_INODE_ALL) { 4970 struct extent_map *em, *n; 4971 4972 write_lock(&em_tree->lock); 4973 /* 4974 * We can't just remove every em if we're called for a ranged 4975 * fsync - that is, one that doesn't cover the whole possible 4976 * file range (0 to LLONG_MAX). This is because we can have 4977 * em's that fall outside the range we're logging and therefore 4978 * their ordered operations haven't completed yet 4979 * (btrfs_finish_ordered_io() not invoked yet). This means we 4980 * didn't get their respective file extent item in the fs/subvol 4981 * tree yet, and need to let the next fast fsync (one which 4982 * consults the list of modified extent maps) find the em so 4983 * that it logs a matching file extent item and waits for the 4984 * respective ordered operation to complete (if it's still 4985 * running). 4986 * 4987 * Removing every em outside the range we're logging would make 4988 * the next fast fsync not log their matching file extent items, 4989 * therefore making us lose data after a log replay. 4990 */ 4991 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 4992 list) { 4993 const u64 mod_end = em->mod_start + em->mod_len - 1; 4994 4995 if (em->mod_start >= start && mod_end <= end) 4996 list_del_init(&em->list); 4997 } 4998 write_unlock(&em_tree->lock); 4999 } 5000 5001 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5002 ret = log_directory_changes(trans, root, inode, path, dst_path, 5003 ctx); 5004 if (ret) { 5005 err = ret; 5006 goto out_unlock; 5007 } 5008 } 5009 5010 spin_lock(&inode->lock); 5011 inode->logged_trans = trans->transid; 5012 inode->last_log_commit = inode->last_sub_trans; 5013 spin_unlock(&inode->lock); 5014 out_unlock: 5015 if (unlikely(err)) 5016 btrfs_put_logged_extents(&logged_list); 5017 else 5018 btrfs_submit_logged_extents(&logged_list, log); 5019 mutex_unlock(&inode->log_mutex); 5020 5021 btrfs_free_path(path); 5022 btrfs_free_path(dst_path); 5023 return err; 5024 } 5025 5026 /* 5027 * Check if we must fall back to a transaction commit when logging an inode. 5028 * This must be called after logging the inode and is used only in the context 5029 * when fsyncing an inode requires logging some other inode - in which 5030 * case we can't lock the i_mutex of each other inode we need to log as that 5031 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5032 * log inodes up or down in the hierarchy) or rename operations for example. So 5033 * we take the log_mutex of the inode after we have logged it and then check for 5034 * its last_unlink_trans value - this is safe because any task setting 5035 * last_unlink_trans must take the log_mutex and it must do this before it does 5036 * the actual unlink operation, so if we do this check before a concurrent task 5037 * sets last_unlink_trans it means we've logged a consistent version/state of 5038 * all the inode items, otherwise we are not sure and must do a transaction 5039 * commit (the concurrent task might have only updated last_unlink_trans before 5040 * we logged the inode or it might have also done the unlink).
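* * (Added illustration of the two possible interleavings, both safe: if the unlinking task takes the log_mutex first, then by the time we acquire it here last_unlink_trans is already updated and we force the full commit; if we take it first, the unlink has not yet been performed, so what we logged is a consistent pre-unlink state.)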
5041 */ 5042 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5043 struct btrfs_inode *inode) 5044 { 5045 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5046 bool ret = false; 5047 5048 mutex_lock(&inode->log_mutex); 5049 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5050 /* 5051 * Make sure any commits to the log are forced to be full 5052 * commits. 5053 */ 5054 btrfs_set_log_full_commit(fs_info, trans); 5055 ret = true; 5056 } 5057 mutex_unlock(&inode->log_mutex); 5058 5059 return ret; 5060 } 5061 5062 /* 5063 * follow the dentry parent pointers up the chain and see if any 5064 * of the directories in it require a full commit before they can 5065 * be logged. Returns zero if nothing special needs to be done or 1 if 5066 * a full commit is required. 5067 */ 5068 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5069 struct btrfs_inode *inode, 5070 struct dentry *parent, 5071 struct super_block *sb, 5072 u64 last_committed) 5073 { 5074 int ret = 0; 5075 struct dentry *old_parent = NULL; 5076 struct btrfs_inode *orig_inode = inode; 5077 5078 /* 5079 * for a regular file, if its inode is already on disk, we don't 5080 * have to worry about the parents at all. This is because 5081 * we can use the last_unlink_trans field to record renames 5082 * and other fun in this file. 5083 */ 5084 if (S_ISREG(inode->vfs_inode.i_mode) && 5085 inode->generation <= last_committed && 5086 inode->last_unlink_trans <= last_committed) 5087 goto out; 5088 5089 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5090 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5091 goto out; 5092 inode = BTRFS_I(d_inode(parent)); 5093 } 5094 5095 while (1) { 5096 /* 5097 * If we are logging a directory then we start with our inode, 5098 * not our parent's inode, so we need to skip setting the 5099 * logged_trans so that further down in the log code we don't 5100 * think this inode has already been logged. 5101 */ 5102 if (inode != orig_inode) 5103 inode->logged_trans = trans->transid; 5104 smp_mb(); 5105 5106 if (btrfs_must_commit_transaction(trans, inode)) { 5107 ret = 1; 5108 break; 5109 } 5110 5111 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5112 break; 5113 5114 if (IS_ROOT(parent)) { 5115 inode = BTRFS_I(d_inode(parent)); 5116 if (btrfs_must_commit_transaction(trans, inode)) 5117 ret = 1; 5118 break; 5119 } 5120 5121 parent = dget_parent(parent); 5122 dput(old_parent); 5123 old_parent = parent; 5124 inode = BTRFS_I(d_inode(parent)); 5125 5126 } 5127 dput(old_parent); 5128 out: 5129 return ret; 5130 } 5131 5132 struct btrfs_dir_list { 5133 u64 ino; 5134 struct list_head list; 5135 }; 5136 5137 /* 5138 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5139 * details about why it is needed. 5140 * This is a recursive operation - if an existing dentry corresponds to a 5141 * directory, that directory's new entries are logged too (same behaviour as 5142 * ext3/4, xfs, f2fs, reiserfs, nilfs2).
Note that when logging the inodes 5143 * the dentries point to, we do not lock their i_mutex, otherwise lockdep 5144 * complains about the following circular lock dependency / possible deadlock: 5145 * 5146 * CPU0 CPU1 5147 * ---- ---- 5148 * lock(&type->i_mutex_dir_key#3/2); 5149 * lock(sb_internal#2); 5150 * lock(&type->i_mutex_dir_key#3/2); 5151 * lock(&sb->s_type->i_mutex_key#14); 5152 * 5153 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5154 * sb_start_intwrite() in btrfs_start_transaction(). 5155 * Not locking i_mutex of the inodes is still safe because: 5156 * 5157 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5158 * that while logging the inode new references (names) are added or removed 5159 * from the inode, leaving the logged inode item with a link count that does 5160 * not match the number of logged inode reference items. This is fine because 5161 * at log replay time we compute the real number of links and correct the 5162 * link count in the inode item (see replay_one_buffer() and 5163 * link_to_fixup_dir()); 5164 * 5165 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5166 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5167 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5168 * has a size that doesn't match the sum of the lengths of all the logged 5169 * names. This does not result in a problem because if a dir_item key is 5170 * logged but its matching dir_index key is not logged, at log replay time we 5171 * don't use it to replay the respective name (see replay_one_name()). On the 5172 * other hand if only the dir_index key ends up being logged, the respective 5173 * name is added to the fs/subvol tree with both the dir_item and dir_index 5174 * keys created (see replay_one_name()). 5175 * The directory's inode item with a wrong i_size is not a problem either, 5176 * since we don't use it at log replay time to set the i_size in the inode 5177 * item of the fs/subvol tree (see overwrite_item()).
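* * As an added illustration of the walk below: if the fsynced directory d gained new subdirectories a and b in this transaction, and a gained c, then dir_list starts as [d]; processing d logs and queues a and b; processing a logs and queues c; processing b and c queues nothing and the loop ends - a breadth-first traversal driven by dir_list.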
5178 */ 5179 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5180 struct btrfs_root *root, 5181 struct btrfs_inode *start_inode, 5182 struct btrfs_log_ctx *ctx) 5183 { 5184 struct btrfs_fs_info *fs_info = root->fs_info; 5185 struct btrfs_root *log = root->log_root; 5186 struct btrfs_path *path; 5187 LIST_HEAD(dir_list); 5188 struct btrfs_dir_list *dir_elem; 5189 int ret = 0; 5190 5191 path = btrfs_alloc_path(); 5192 if (!path) 5193 return -ENOMEM; 5194 5195 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5196 if (!dir_elem) { 5197 btrfs_free_path(path); 5198 return -ENOMEM; 5199 } 5200 dir_elem->ino = btrfs_ino(start_inode); 5201 list_add_tail(&dir_elem->list, &dir_list); 5202 5203 while (!list_empty(&dir_list)) { 5204 struct extent_buffer *leaf; 5205 struct btrfs_key min_key; 5206 int nritems; 5207 int i; 5208 5209 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5210 list); 5211 if (ret) 5212 goto next_dir_inode; 5213 5214 min_key.objectid = dir_elem->ino; 5215 min_key.type = BTRFS_DIR_ITEM_KEY; 5216 min_key.offset = 0; 5217 again: 5218 btrfs_release_path(path); 5219 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5220 if (ret < 0) { 5221 goto next_dir_inode; 5222 } else if (ret > 0) { 5223 ret = 0; 5224 goto next_dir_inode; 5225 } 5226 5227 process_leaf: 5228 leaf = path->nodes[0]; 5229 nritems = btrfs_header_nritems(leaf); 5230 for (i = path->slots[0]; i < nritems; i++) { 5231 struct btrfs_dir_item *di; 5232 struct btrfs_key di_key; 5233 struct inode *di_inode; 5234 struct btrfs_dir_list *new_dir_elem; 5235 int log_mode = LOG_INODE_EXISTS; 5236 int type; 5237 5238 btrfs_item_key_to_cpu(leaf, &min_key, i); 5239 if (min_key.objectid != dir_elem->ino || 5240 min_key.type != BTRFS_DIR_ITEM_KEY) 5241 goto next_dir_inode; 5242 5243 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5244 type = btrfs_dir_type(leaf, di); 5245 if (btrfs_dir_transid(leaf, di) < trans->transid && 5246 type != BTRFS_FT_DIR) 5247 continue; 5248 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5249 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5250 continue; 5251 5252 btrfs_release_path(path); 5253 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5254 if (IS_ERR(di_inode)) { 5255 ret = PTR_ERR(di_inode); 5256 goto next_dir_inode; 5257 } 5258 5259 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5260 iput(di_inode); 5261 break; 5262 } 5263 5264 ctx->log_new_dentries = false; 5265 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5266 log_mode = LOG_INODE_ALL; 5267 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5268 log_mode, 0, LLONG_MAX, ctx); 5269 if (!ret && 5270 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5271 ret = 1; 5272 iput(di_inode); 5273 if (ret) 5274 goto next_dir_inode; 5275 if (ctx->log_new_dentries) { 5276 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5277 GFP_NOFS); 5278 if (!new_dir_elem) { 5279 ret = -ENOMEM; 5280 goto next_dir_inode; 5281 } 5282 new_dir_elem->ino = di_key.objectid; 5283 list_add_tail(&new_dir_elem->list, &dir_list); 5284 } 5285 break; 5286 } 5287 if (i == nritems) { 5288 ret = btrfs_next_leaf(log, path); 5289 if (ret < 0) { 5290 goto next_dir_inode; 5291 } else if (ret > 0) { 5292 ret = 0; 5293 goto next_dir_inode; 5294 } 5295 goto process_leaf; 5296 } 5297 if (min_key.offset < (u64)-1) { 5298 min_key.offset++; 5299 goto again; 5300 } 5301 next_dir_inode: 5302 list_del(&dir_elem->list); 5303 kfree(dir_elem); 5304 } 5305 5306 btrfs_free_path(path); 5307 return ret; 5308 } 5309 5310 static int 
btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5311 struct btrfs_inode *inode, 5312 struct btrfs_log_ctx *ctx) 5313 { 5314 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5315 int ret; 5316 struct btrfs_path *path; 5317 struct btrfs_key key; 5318 struct btrfs_root *root = inode->root; 5319 const u64 ino = btrfs_ino(inode); 5320 5321 path = btrfs_alloc_path(); 5322 if (!path) 5323 return -ENOMEM; 5324 path->skip_locking = 1; 5325 path->search_commit_root = 1; 5326 5327 key.objectid = ino; 5328 key.type = BTRFS_INODE_REF_KEY; 5329 key.offset = 0; 5330 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5331 if (ret < 0) 5332 goto out; 5333 5334 while (true) { 5335 struct extent_buffer *leaf = path->nodes[0]; 5336 int slot = path->slots[0]; 5337 u32 cur_offset = 0; 5338 u32 item_size; 5339 unsigned long ptr; 5340 5341 if (slot >= btrfs_header_nritems(leaf)) { 5342 ret = btrfs_next_leaf(root, path); 5343 if (ret < 0) 5344 goto out; 5345 else if (ret > 0) 5346 break; 5347 continue; 5348 } 5349 5350 btrfs_item_key_to_cpu(leaf, &key, slot); 5351 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5352 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5353 break; 5354 5355 item_size = btrfs_item_size_nr(leaf, slot); 5356 ptr = btrfs_item_ptr_offset(leaf, slot); 5357 while (cur_offset < item_size) { 5358 struct btrfs_key inode_key; 5359 struct inode *dir_inode; 5360 5361 inode_key.type = BTRFS_INODE_ITEM_KEY; 5362 inode_key.offset = 0; 5363 5364 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5365 struct btrfs_inode_extref *extref; 5366 5367 extref = (struct btrfs_inode_extref *) 5368 (ptr + cur_offset); 5369 inode_key.objectid = btrfs_inode_extref_parent( 5370 leaf, extref); 5371 cur_offset += sizeof(*extref); 5372 cur_offset += btrfs_inode_extref_name_len(leaf, 5373 extref); 5374 } else { 5375 inode_key.objectid = key.offset; 5376 cur_offset = item_size; 5377 } 5378 5379 dir_inode = btrfs_iget(fs_info->sb, &inode_key, 5380 root, NULL); 5381 /* If parent inode was deleted, skip it. */ 5382 if (IS_ERR(dir_inode)) 5383 continue; 5384 5385 if (ctx) 5386 ctx->log_new_dentries = false; 5387 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5388 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5389 if (!ret && 5390 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5391 ret = 1; 5392 if (!ret && ctx && ctx->log_new_dentries) 5393 ret = log_new_dir_dentries(trans, root, 5394 BTRFS_I(dir_inode), ctx); 5395 iput(dir_inode); 5396 if (ret) 5397 goto out; 5398 } 5399 path->slots[0]++; 5400 } 5401 ret = 0; 5402 out: 5403 btrfs_free_path(path); 5404 return ret; 5405 } 5406 5407 /* 5408 * helper function around btrfs_log_inode to make sure newly created 5409 * parent directories also end up in the log. 
Only minimal inode and backref 5410 * logging is done for any parent directories that are older than 5411 * the last committed transaction 5412 */ 5413 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5414 struct btrfs_root *root, 5415 struct btrfs_inode *inode, 5416 struct dentry *parent, 5417 const loff_t start, 5418 const loff_t end, 5419 int inode_only, 5420 struct btrfs_log_ctx *ctx) 5421 { 5422 struct btrfs_fs_info *fs_info = root->fs_info; 5423 struct super_block *sb; 5424 struct dentry *old_parent = NULL; 5425 int ret = 0; 5426 u64 last_committed = fs_info->last_trans_committed; 5427 bool log_dentries = false; 5428 struct btrfs_inode *orig_inode = inode; 5429 5430 sb = inode->vfs_inode.i_sb; 5431 5432 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5433 ret = 1; 5434 goto end_no_trans; 5435 } 5436 5437 /* 5438 * If the previous transaction commit didn't complete, we have to do a 5439 * full commit ourselves. 5440 */ 5441 if (fs_info->last_trans_log_full_commit > 5442 fs_info->last_trans_committed) { 5443 ret = 1; 5444 goto end_no_trans; 5445 } 5446 5447 if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { 5448 ret = 1; 5449 goto end_no_trans; 5450 } 5451 5452 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5453 last_committed); 5454 if (ret) 5455 goto end_no_trans; 5456 5457 if (btrfs_inode_in_log(inode, trans->transid)) { 5458 ret = BTRFS_NO_LOG_SYNC; 5459 goto end_no_trans; 5460 } 5461 5462 ret = start_log_trans(trans, root, ctx); 5463 if (ret) 5464 goto end_no_trans; 5465 5466 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5467 if (ret) 5468 goto end_trans; 5469 5470 /* 5471 * for a regular file, if its inode is already on disk, we don't 5472 * have to worry about the parents at all. This is because 5473 * we can use the last_unlink_trans field to record renames 5474 * and other fun in this file. 5475 */ 5476 if (S_ISREG(inode->vfs_inode.i_mode) && 5477 inode->generation <= last_committed && 5478 inode->last_unlink_trans <= last_committed) { 5479 ret = 0; 5480 goto end_trans; 5481 } 5482 5483 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 5484 log_dentries = true; 5485 5486 /* 5487 * On unlink we must make sure all our current and old parent directory 5488 * inodes are fully logged. This is to prevent leaving dangling 5489 * directory index entries in directories that were our parents but are 5490 * not anymore. Not doing this results in the old parent directory being 5491 * impossible to delete after log replay (rmdir will always fail with 5492 * error -ENOTEMPTY). 5493 * 5494 * Example 1: 5495 * 5496 * mkdir testdir 5497 * touch testdir/foo 5498 * ln testdir/foo testdir/bar 5499 * sync 5500 * unlink testdir/bar 5501 * xfs_io -c fsync testdir/foo 5502 * <power failure> 5503 * mount fs, triggers log replay 5504 * 5505 * If we don't log the parent directory (testdir), after log replay the 5506 * directory still has an entry pointing to the file inode using the bar 5507 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5508 * the file inode has a link count of 1.
5509 * 5510 * Example 2: 5511 * 5512 * mkdir testdir 5513 * touch foo 5514 * ln foo testdir/foo2 5515 * ln foo testdir/foo3 5516 * sync 5517 * unlink testdir/foo3 5518 * xfs_io -c fsync foo 5519 * <power failure> 5520 * mount fs, triggers log replay 5521 * 5522 * As in the first example, after log replay the parent directory 5523 * testdir still has an entry pointing to the file inode with name foo3 5524 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5525 * and has a link count of 2. 5526 */ 5527 if (inode->last_unlink_trans > last_committed) { 5528 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5529 if (ret) 5530 goto end_trans; 5531 } 5532 5533 while (1) { 5534 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5535 break; 5536 5537 inode = BTRFS_I(d_inode(parent)); 5538 if (root != inode->root) 5539 break; 5540 5541 if (inode->generation > last_committed) { 5542 ret = btrfs_log_inode(trans, root, inode, 5543 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5544 if (ret) 5545 goto end_trans; 5546 } 5547 if (IS_ROOT(parent)) 5548 break; 5549 5550 parent = dget_parent(parent); 5551 dput(old_parent); 5552 old_parent = parent; 5553 } 5554 if (log_dentries) 5555 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5556 else 5557 ret = 0; 5558 end_trans: 5559 dput(old_parent); 5560 if (ret < 0) { 5561 btrfs_set_log_full_commit(fs_info, trans); 5562 ret = 1; 5563 } 5564 5565 if (ret) 5566 btrfs_remove_log_ctx(root, ctx); 5567 btrfs_end_log_trans(root); 5568 end_no_trans: 5569 return ret; 5570 } 5571 5572 /* 5573 * it is not safe to log a dentry if the chunk root has added new 5574 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5575 * If this returns 1, you must commit the transaction to safely get your 5576 * data on disk.
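* * (For context, an added note: the fsync path is the typical caller - btrfs_sync_file() invokes this function and falls back to committing the whole transaction when it returns 1.)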
5577 */ 5578 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5579 struct btrfs_root *root, struct dentry *dentry, 5580 const loff_t start, 5581 const loff_t end, 5582 struct btrfs_log_ctx *ctx) 5583 { 5584 struct dentry *parent = dget_parent(dentry); 5585 int ret; 5586 5587 ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), 5588 parent, start, end, LOG_INODE_ALL, ctx); 5589 dput(parent); 5590 5591 return ret; 5592 } 5593 5594 /* 5595 * should be called during mount to replay any log trees 5596 * from the FS 5597 */ 5598 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5599 { 5600 int ret; 5601 struct btrfs_path *path; 5602 struct btrfs_trans_handle *trans; 5603 struct btrfs_key key; 5604 struct btrfs_key found_key; 5605 struct btrfs_key tmp_key; 5606 struct btrfs_root *log; 5607 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5608 struct walk_control wc = { 5609 .process_func = process_one_buffer, 5610 .stage = 0, 5611 }; 5612 5613 path = btrfs_alloc_path(); 5614 if (!path) 5615 return -ENOMEM; 5616 5617 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5618 5619 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5620 if (IS_ERR(trans)) { 5621 ret = PTR_ERR(trans); 5622 goto error; 5623 } 5624 5625 wc.trans = trans; 5626 wc.pin = 1; 5627 5628 ret = walk_log_tree(trans, log_root_tree, &wc); 5629 if (ret) { 5630 btrfs_handle_fs_error(fs_info, ret, 5631 "Failed to pin buffers while recovering log root tree."); 5632 goto error; 5633 } 5634 5635 again: 5636 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5637 key.offset = (u64)-1; 5638 key.type = BTRFS_ROOT_ITEM_KEY; 5639 5640 while (1) { 5641 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5642 5643 if (ret < 0) { 5644 btrfs_handle_fs_error(fs_info, ret, 5645 "Couldn't find tree log root."); 5646 goto error; 5647 } 5648 if (ret > 0) { 5649 if (path->slots[0] == 0) 5650 break; 5651 path->slots[0]--; 5652 } 5653 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5654 path->slots[0]); 5655 btrfs_release_path(path); 5656 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5657 break; 5658 5659 log = btrfs_read_fs_root(log_root_tree, &found_key); 5660 if (IS_ERR(log)) { 5661 ret = PTR_ERR(log); 5662 btrfs_handle_fs_error(fs_info, ret, 5663 "Couldn't read tree log root."); 5664 goto error; 5665 } 5666 5667 tmp_key.objectid = found_key.offset; 5668 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5669 tmp_key.offset = (u64)-1; 5670 5671 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5672 if (IS_ERR(wc.replay_dest)) { 5673 ret = PTR_ERR(wc.replay_dest); 5674 free_extent_buffer(log->node); 5675 free_extent_buffer(log->commit_root); 5676 kfree(log); 5677 btrfs_handle_fs_error(fs_info, ret, 5678 "Couldn't read target root for tree log recovery."); 5679 goto error; 5680 } 5681 5682 wc.replay_dest->log_root = log; 5683 btrfs_record_root_in_trans(trans, wc.replay_dest); 5684 ret = walk_log_tree(trans, log, &wc); 5685 5686 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5687 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5688 path); 5689 } 5690 5691 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5692 struct btrfs_root *root = wc.replay_dest; 5693 5694 btrfs_release_path(path); 5695 5696 /* 5697 * We have just replayed everything, and the highest 5698 * objectid of fs roots probably has changed in case 5699 * some inode_item's got replayed. 5700 * 5701 * root->objectid_mutex is not acquired as log replay 5702 * could only happen during mount.
5703 */ 5704 ret = btrfs_find_highest_objectid(root, 5705 &root->highest_objectid); 5706 } 5707 5708 key.offset = found_key.offset - 1; 5709 wc.replay_dest->log_root = NULL; 5710 free_extent_buffer(log->node); 5711 free_extent_buffer(log->commit_root); 5712 kfree(log); 5713 5714 if (ret) 5715 goto error; 5716 5717 if (found_key.offset == 0) 5718 break; 5719 } 5720 btrfs_release_path(path); 5721 5722 /* step one is to pin it all, step two is to replay just inodes */ 5723 if (wc.pin) { 5724 wc.pin = 0; 5725 wc.process_func = replay_one_buffer; 5726 wc.stage = LOG_WALK_REPLAY_INODES; 5727 goto again; 5728 } 5729 /* step three is to replay everything */ 5730 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5731 wc.stage++; 5732 goto again; 5733 } 5734 5735 btrfs_free_path(path); 5736 5737 /* step 4: commit the transaction, which also unpins the blocks */ 5738 ret = btrfs_commit_transaction(trans); 5739 if (ret) 5740 return ret; 5741 5742 free_extent_buffer(log_root_tree->node); 5743 log_root_tree->log_root = NULL; 5744 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5745 kfree(log_root_tree); 5746 5747 return 0; 5748 error: 5749 if (wc.trans) 5750 btrfs_end_transaction(wc.trans); 5751 btrfs_free_path(path); 5752 return ret; 5753 } 5754 5755 /* 5756 * there are some corner cases where we want to force a full 5757 * commit instead of allowing a directory to be logged. 5758 * 5759 * They revolve around files that were unlinked from the directory, and 5760 * this function updates the parent directory so that a full commit is 5761 * properly done if it is fsync'd later after the unlinks are done. 5762 * 5763 * Must be called before the unlink operations (updates to the subvolume tree, 5764 * inodes, etc) are done. 5765 */ 5766 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5767 struct btrfs_inode *dir, struct btrfs_inode *inode, 5768 int for_rename) 5769 { 5770 /* 5771 * when we're logging a file, if it hasn't been renamed 5772 * or unlinked, and its inode is fully committed on disk, 5773 * we don't have to worry about walking up the directory chain 5774 * to log its parents. 5775 * 5776 * So, we use the last_unlink_trans field to put this transid 5777 * into the file. When the file is logged we check it and 5778 * don't log the parents if the file is fully on disk. 5779 */ 5780 mutex_lock(&inode->log_mutex); 5781 inode->last_unlink_trans = trans->transid; 5782 mutex_unlock(&inode->log_mutex); 5783 5784 /* 5785 * if this directory was already logged any new 5786 * names for this file/dir will get recorded 5787 */ 5788 smp_mb(); 5789 if (dir->logged_trans == trans->transid) 5790 return; 5791 5792 /* 5793 * if the inode we're about to unlink was logged, 5794 * the log will be properly updated for any new names 5795 */ 5796 if (inode->logged_trans == trans->transid) 5797 return; 5798 5799 /* 5800 * when renaming files across directories, if the directory 5801 * we're unlinking from gets fsync'd later on, there's 5802 * no way to find the destination directory later and fsync it 5803 * properly. So, we have to be conservative and force commits 5804 * so the new name gets discovered.
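* * (Added example: after mv dirA/foo dirB/foo, an fsync of dirA alone would log that foo is gone from dirA while nothing in the log records it under dirB; recording last_unlink_trans below makes that fsync fall back to a full commit so the new name is not lost.)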
5805 */ 5806 if (for_rename) 5807 goto record; 5808 5809 /* we can safely do the unlink without any special recording */ 5810 return; 5811 5812 record: 5813 mutex_lock(&dir->log_mutex); 5814 dir->last_unlink_trans = trans->transid; 5815 mutex_unlock(&dir->log_mutex); 5816 } 5817 5818 /* 5819 * Make sure that if someone attempts to fsync the parent directory of a deleted 5820 * snapshot, it ends up triggering a transaction commit. This is to guarantee 5821 * that after replaying the log tree of the parent directory's root we will not 5822 * see the snapshot anymore and at log replay time we will not see any log tree 5823 * corresponding to the deleted snapshot's root, which could lead to replaying 5824 * it after replaying the log tree of the parent directory (which would replay 5825 * the snapshot delete operation). 5826 * 5827 * Must be called before the actual snapshot destroy operation (updates to the 5828 * parent root and tree of tree roots trees, etc) are done. 5829 */ 5830 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 5831 struct btrfs_inode *dir) 5832 { 5833 mutex_lock(&dir->log_mutex); 5834 dir->last_unlink_trans = trans->transid; 5835 mutex_unlock(&dir->log_mutex); 5836 } 5837 5838 /* 5839 * Call this after adding a new name for a file and it will properly 5840 * update the log to reflect the new name. 5841 * 5842 * It will return zero if all goes well, and it will return 1 if a 5843 * full transaction commit is required. 5844 */ 5845 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5846 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 5847 struct dentry *parent) 5848 { 5849 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5850 struct btrfs_root *root = inode->root; 5851 5852 /* 5853 * this will force the logging code to walk the dentry chain 5854 * up for the file 5855 */ 5856 if (S_ISREG(inode->vfs_inode.i_mode)) 5857 inode->last_unlink_trans = trans->transid; 5858 5859 /* 5860 * if this inode hasn't been logged and the directory we're renaming it 5861 * from hasn't been logged, we don't need to log it 5862 */ 5863 if (inode->logged_trans <= fs_info->last_trans_committed && 5864 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) 5865 return 0; 5866 5867 return btrfs_log_inode_parent(trans, root, inode, parent, 0, 5868 LLONG_MAX, LOG_INODE_EXISTS, NULL); 5869 } 5870 5871
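/* * A hypothetical call site, added to illustrate the return convention * documented above btrfs_log_new_name() (the variable names here are made * up and not part of this file): * * ret = btrfs_log_new_name(trans, BTRFS_I(inode), BTRFS_I(old_dir), * dentry->d_parent); * if (ret == 1) * must_commit = true; * * where must_commit tells the caller to fall back to a full transaction * commit, since the log alone cannot safely persist the new name. */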