/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"
#include "compression.h"
#include "qgroup.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 * LOG_OTHER_INODE means to log an inode other than the one the
 * fsync was requested on
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2). After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant. Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir. After a crash the rm -rf must
 * be replayed. This must be able to recurse down the entire
 * directory tree. The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking. The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction. Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree. Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
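 *
 * (These passes correspond to the LOG_WALK_* stages defined above.)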
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/*
		 * Implicit memory barrier after atomic_dec_and_test
		 */
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}


/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree. The stage field tells us which part
 * of the log tree processing we are currently doing. The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done? This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer? This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish? Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree. Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(fs_info, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging. eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten. If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(fs_info, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(fs_info, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero. This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'. path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet. So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
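		 * A hole has a file extent item with a disk_bytenr of 0,
		 * which is what the check below tests for.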
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file. This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans, fs_info,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree? If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						fs_info,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our
			 * extent starting at an offset of 40K or higher, will
			 * end up looking at the second csum item only, which
			 * does not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
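			 *
			 * Deleting the existing csum items for our range
			 * first, as done below, avoids creating such an
			 * overlap.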
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans, fs_info,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
			name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, fs_info);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode. This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log. if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, fs_info);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans,
								fs_info);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, int slot,
			     unsigned long ref_ptr, u32 *namelen, char **name,
			     u64 *index, u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
				     *namelen))
		return -EIO;

	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, int slot,
			  unsigned long ref_ptr, u32 *namelen, char **name,
			  u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
				     *namelen))
		return -EIO;

	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function. (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode. If we don't find the dir, just don't
	 * copy the back ref in. The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
					&name, &ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
					     &name, &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
					btrfs_ino(BTRFS_I(inode)), ref_index,
					name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link. Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, BTRFS_I(dir),
					BTRFS_I(inode),
					name, namelen, 0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay. So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found. If it goes down to zero, the iput
 * will free the inode.
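 *
 * The count is taken from both the old style INODE_REF items and the
 * INODE_EXTREF items (see count_inode_refs() and count_inode_extrefs()
 * above).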
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done. The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist. This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped. fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(fs_info, eb, slot, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fixup the link count of
		 * the inode the entry points to. Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for this
		 * dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir. This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}

/*
 * directory replay has two parts. There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for. During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
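 *
 * For example, a dir log item with key offset N and an end field of M
 * (see btrfs_dir_log_end()) marks the log as authoritative for all
 * directory keys in the range [N, M] for that key type.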
1920 */ 1921 static noinline int find_dir_range(struct btrfs_root *root, 1922 struct btrfs_path *path, 1923 u64 dirid, int key_type, 1924 u64 *start_ret, u64 *end_ret) 1925 { 1926 struct btrfs_key key; 1927 u64 found_end; 1928 struct btrfs_dir_log_item *item; 1929 int ret; 1930 int nritems; 1931 1932 if (*start_ret == (u64)-1) 1933 return 1; 1934 1935 key.objectid = dirid; 1936 key.type = key_type; 1937 key.offset = *start_ret; 1938 1939 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1940 if (ret < 0) 1941 goto out; 1942 if (ret > 0) { 1943 if (path->slots[0] == 0) 1944 goto out; 1945 path->slots[0]--; 1946 } 1947 if (ret != 0) 1948 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1949 1950 if (key.type != key_type || key.objectid != dirid) { 1951 ret = 1; 1952 goto next; 1953 } 1954 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1955 struct btrfs_dir_log_item); 1956 found_end = btrfs_dir_log_end(path->nodes[0], item); 1957 1958 if (*start_ret >= key.offset && *start_ret <= found_end) { 1959 ret = 0; 1960 *start_ret = key.offset; 1961 *end_ret = found_end; 1962 goto out; 1963 } 1964 ret = 1; 1965 next: 1966 /* check the next slot in the tree to see if it is a valid item */ 1967 nritems = btrfs_header_nritems(path->nodes[0]); 1968 path->slots[0]++; 1969 if (path->slots[0] >= nritems) { 1970 ret = btrfs_next_leaf(root, path); 1971 if (ret) 1972 goto out; 1973 } 1974 1975 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1976 1977 if (key.type != key_type || key.objectid != dirid) { 1978 ret = 1; 1979 goto out; 1980 } 1981 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1982 struct btrfs_dir_log_item); 1983 found_end = btrfs_dir_log_end(path->nodes[0], item); 1984 *start_ret = key.offset; 1985 *end_ret = found_end; 1986 ret = 0; 1987 out: 1988 btrfs_release_path(path); 1989 return ret; 1990 } 1991 1992 /* 1993 * this looks for a given directory item in the log. 
If the directory 1994 * item is not in the log, the item is removed and the inode it points 1995 * to is unlinked 1996 */ 1997 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1998 struct btrfs_root *root, 1999 struct btrfs_root *log, 2000 struct btrfs_path *path, 2001 struct btrfs_path *log_path, 2002 struct inode *dir, 2003 struct btrfs_key *dir_key) 2004 { 2005 struct btrfs_fs_info *fs_info = root->fs_info; 2006 int ret; 2007 struct extent_buffer *eb; 2008 int slot; 2009 u32 item_size; 2010 struct btrfs_dir_item *di; 2011 struct btrfs_dir_item *log_di; 2012 int name_len; 2013 unsigned long ptr; 2014 unsigned long ptr_end; 2015 char *name; 2016 struct inode *inode; 2017 struct btrfs_key location; 2018 2019 again: 2020 eb = path->nodes[0]; 2021 slot = path->slots[0]; 2022 item_size = btrfs_item_size_nr(eb, slot); 2023 ptr = btrfs_item_ptr_offset(eb, slot); 2024 ptr_end = ptr + item_size; 2025 while (ptr < ptr_end) { 2026 di = (struct btrfs_dir_item *)ptr; 2027 if (verify_dir_item(fs_info, eb, slot, di)) { 2028 ret = -EIO; 2029 goto out; 2030 } 2031 2032 name_len = btrfs_dir_name_len(eb, di); 2033 name = kmalloc(name_len, GFP_NOFS); 2034 if (!name) { 2035 ret = -ENOMEM; 2036 goto out; 2037 } 2038 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2039 name_len); 2040 log_di = NULL; 2041 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2042 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2043 dir_key->objectid, 2044 name, name_len, 0); 2045 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2046 log_di = btrfs_lookup_dir_index_item(trans, log, 2047 log_path, 2048 dir_key->objectid, 2049 dir_key->offset, 2050 name, name_len, 0); 2051 } 2052 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2053 btrfs_dir_item_key_to_cpu(eb, di, &location); 2054 btrfs_release_path(path); 2055 btrfs_release_path(log_path); 2056 inode = read_one_inode(root, location.objectid); 2057 if (!inode) { 2058 kfree(name); 2059 return -EIO; 2060 } 2061 2062 ret = link_to_fixup_dir(trans, root, 2063 path, location.objectid); 2064 if (ret) { 2065 kfree(name); 2066 iput(inode); 2067 goto out; 2068 } 2069 2070 inc_nlink(inode); 2071 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2072 BTRFS_I(inode), name, name_len); 2073 if (!ret) 2074 ret = btrfs_run_delayed_items(trans, fs_info); 2075 kfree(name); 2076 iput(inode); 2077 if (ret) 2078 goto out; 2079 2080 /* there might still be more names under this key 2081 * check and repeat if required 2082 */ 2083 ret = btrfs_search_slot(NULL, root, dir_key, path, 2084 0, 0); 2085 if (ret == 0) 2086 goto again; 2087 ret = 0; 2088 goto out; 2089 } else if (IS_ERR(log_di)) { 2090 kfree(name); 2091 return PTR_ERR(log_di); 2092 } 2093 btrfs_release_path(log_path); 2094 kfree(name); 2095 2096 ptr = (unsigned long)(di + 1); 2097 ptr += name_len; 2098 } 2099 ret = 0; 2100 out: 2101 btrfs_release_path(path); 2102 btrfs_release_path(log_path); 2103 return ret; 2104 } 2105 2106 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2107 struct btrfs_root *root, 2108 struct btrfs_root *log, 2109 struct btrfs_path *path, 2110 const u64 ino) 2111 { 2112 struct btrfs_fs_info *fs_info = root->fs_info; 2113 struct btrfs_key search_key; 2114 struct btrfs_path *log_path; 2115 int i; 2116 int nritems; 2117 int ret; 2118 2119 log_path = btrfs_alloc_path(); 2120 if (!log_path) 2121 return -ENOMEM; 2122 2123 search_key.objectid = ino; 2124 search_key.type = BTRFS_XATTR_ITEM_KEY; 2125 search_key.offset = 0; 2126 again: 2127 ret = 
btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2128 if (ret < 0) 2129 goto out; 2130 process_leaf: 2131 nritems = btrfs_header_nritems(path->nodes[0]); 2132 for (i = path->slots[0]; i < nritems; i++) { 2133 struct btrfs_key key; 2134 struct btrfs_dir_item *di; 2135 struct btrfs_dir_item *log_di; 2136 u32 total_size; 2137 u32 cur; 2138 2139 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2140 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2141 ret = 0; 2142 goto out; 2143 } 2144 2145 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2146 total_size = btrfs_item_size_nr(path->nodes[0], i); 2147 cur = 0; 2148 while (cur < total_size) { 2149 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2150 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2151 u32 this_len = sizeof(*di) + name_len + data_len; 2152 char *name; 2153 2154 ret = verify_dir_item(fs_info, path->nodes[0], i, di); 2155 if (ret) { 2156 ret = -EIO; 2157 goto out; 2158 } 2159 name = kmalloc(name_len, GFP_NOFS); 2160 if (!name) { 2161 ret = -ENOMEM; 2162 goto out; 2163 } 2164 read_extent_buffer(path->nodes[0], name, 2165 (unsigned long)(di + 1), name_len); 2166 2167 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2168 name, name_len, 0); 2169 btrfs_release_path(log_path); 2170 if (!log_di) { 2171 /* Doesn't exist in log tree, so delete it. */ 2172 btrfs_release_path(path); 2173 di = btrfs_lookup_xattr(trans, root, path, ino, 2174 name, name_len, -1); 2175 kfree(name); 2176 if (IS_ERR(di)) { 2177 ret = PTR_ERR(di); 2178 goto out; 2179 } 2180 ASSERT(di); 2181 ret = btrfs_delete_one_dir_name(trans, root, 2182 path, di); 2183 if (ret) 2184 goto out; 2185 btrfs_release_path(path); 2186 search_key = key; 2187 goto again; 2188 } 2189 kfree(name); 2190 if (IS_ERR(log_di)) { 2191 ret = PTR_ERR(log_di); 2192 goto out; 2193 } 2194 cur += this_len; 2195 di = (struct btrfs_dir_item *)((char *)di + this_len); 2196 } 2197 } 2198 ret = btrfs_next_leaf(root, path); 2199 if (ret > 0) 2200 ret = 0; 2201 else if (ret == 0) 2202 goto process_leaf; 2203 out: 2204 btrfs_free_path(log_path); 2205 btrfs_release_path(path); 2206 return ret; 2207 } 2208 2209 2210 /* 2211 * deletion replay happens before we copy any new directory items 2212 * out of the log or out of backreferences from inodes. It 2213 * scans the log to find ranges of keys that log is authoritative for, 2214 * and then scans the directory to find items in those ranges that are 2215 * not present in the log. 2216 * 2217 * Anything we don't find in the log is unlinked and removed from the 2218 * directory. 
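 *
 * The scan runs twice, once per directory key space: first the
 * name-hashed BTRFS_DIR_ITEM_KEY entries are checked against
 * BTRFS_DIR_LOG_ITEM_KEY range items, and then the BTRFS_DIR_INDEX_KEY
 * entries are checked against BTRFS_DIR_LOG_INDEX_KEY range items.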
2219 */ 2220 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2221 struct btrfs_root *root, 2222 struct btrfs_root *log, 2223 struct btrfs_path *path, 2224 u64 dirid, int del_all) 2225 { 2226 u64 range_start; 2227 u64 range_end; 2228 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2229 int ret = 0; 2230 struct btrfs_key dir_key; 2231 struct btrfs_key found_key; 2232 struct btrfs_path *log_path; 2233 struct inode *dir; 2234 2235 dir_key.objectid = dirid; 2236 dir_key.type = BTRFS_DIR_ITEM_KEY; 2237 log_path = btrfs_alloc_path(); 2238 if (!log_path) 2239 return -ENOMEM; 2240 2241 dir = read_one_inode(root, dirid); 2242 /* it isn't an error if the inode isn't there, that can happen 2243 * because we replay the deletes before we copy in the inode item 2244 * from the log 2245 */ 2246 if (!dir) { 2247 btrfs_free_path(log_path); 2248 return 0; 2249 } 2250 again: 2251 range_start = 0; 2252 range_end = 0; 2253 while (1) { 2254 if (del_all) 2255 range_end = (u64)-1; 2256 else { 2257 ret = find_dir_range(log, path, dirid, key_type, 2258 &range_start, &range_end); 2259 if (ret != 0) 2260 break; 2261 } 2262 2263 dir_key.offset = range_start; 2264 while (1) { 2265 int nritems; 2266 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2267 0, 0); 2268 if (ret < 0) 2269 goto out; 2270 2271 nritems = btrfs_header_nritems(path->nodes[0]); 2272 if (path->slots[0] >= nritems) { 2273 ret = btrfs_next_leaf(root, path); 2274 if (ret) 2275 break; 2276 } 2277 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2278 path->slots[0]); 2279 if (found_key.objectid != dirid || 2280 found_key.type != dir_key.type) 2281 goto next_type; 2282 2283 if (found_key.offset > range_end) 2284 break; 2285 2286 ret = check_item_in_log(trans, root, log, path, 2287 log_path, dir, 2288 &found_key); 2289 if (ret) 2290 goto out; 2291 if (found_key.offset == (u64)-1) 2292 break; 2293 dir_key.offset = found_key.offset + 1; 2294 } 2295 btrfs_release_path(path); 2296 if (range_end == (u64)-1) 2297 break; 2298 range_start = range_end + 1; 2299 } 2300 2301 next_type: 2302 ret = 0; 2303 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2304 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2305 dir_key.type = BTRFS_DIR_INDEX_KEY; 2306 btrfs_release_path(path); 2307 goto again; 2308 } 2309 out: 2310 btrfs_release_path(path); 2311 btrfs_free_path(log_path); 2312 iput(dir); 2313 return ret; 2314 } 2315 2316 /* 2317 * the process_func used to replay items from the log tree. This 2318 * gets called in two different stages. The first stage just looks 2319 * for inodes and makes sure they are all copied into the subvolume. 2320 * 2321 * The second stage copies all the other item types from the log into 2322 * the subvolume. The two stage approach is slower, but gets rid of 2323 * lots of complexity around inodes referencing other inodes that exist 2324 * only in the log (references come from either directory items or inode 2325 * back refs). 
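 *
 * (A third, intermediate stage, LOG_WALK_REPLAY_DIR_INDEX, replays just
 * the BTRFS_DIR_INDEX_KEY items once the inodes exist, before the
 * remaining item types are copied in the final stage.)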
2326  */
2327 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2328 			     struct walk_control *wc, u64 gen)
2329 {
2330 	int nritems;
2331 	struct btrfs_path *path;
2332 	struct btrfs_root *root = wc->replay_dest;
2333 	struct btrfs_key key;
2334 	int level;
2335 	int i;
2336 	int ret;
2337 
2338 	ret = btrfs_read_buffer(eb, gen);
2339 	if (ret)
2340 		return ret;
2341 
2342 	level = btrfs_header_level(eb);
2343 
2344 	if (level != 0)
2345 		return 0;
2346 
2347 	path = btrfs_alloc_path();
2348 	if (!path)
2349 		return -ENOMEM;
2350 
2351 	nritems = btrfs_header_nritems(eb);
2352 	for (i = 0; i < nritems; i++) {
2353 		btrfs_item_key_to_cpu(eb, &key, i);
2354 
2355 		/* inode keys are done during the first stage */
2356 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2357 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2358 			struct btrfs_inode_item *inode_item;
2359 			u32 mode;
2360 
2361 			inode_item = btrfs_item_ptr(eb, i,
2362 					    struct btrfs_inode_item);
2363 			ret = replay_xattr_deletes(wc->trans, root, log,
2364 						   path, key.objectid);
2365 			if (ret)
2366 				break;
2367 			mode = btrfs_inode_mode(eb, inode_item);
2368 			if (S_ISDIR(mode)) {
2369 				ret = replay_dir_deletes(wc->trans,
2370 					 root, log, path, key.objectid, 0);
2371 				if (ret)
2372 					break;
2373 			}
2374 			ret = overwrite_item(wc->trans, root, path,
2375 					     eb, i, &key);
2376 			if (ret)
2377 				break;
2378 
2379 			/* for regular files, make sure the corresponding
2380 			 * orphan item exists. extents past the new EOF
2381 			 * will be truncated later by orphan cleanup.
2382 			 */
2383 			if (S_ISREG(mode)) {
2384 				ret = insert_orphan_item(wc->trans, root,
2385 							 key.objectid);
2386 				if (ret)
2387 					break;
2388 			}
2389 
2390 			ret = link_to_fixup_dir(wc->trans, root,
2391 						path, key.objectid);
2392 			if (ret)
2393 				break;
2394 		}
2395 
2396 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2397 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2398 			ret = replay_one_dir_item(wc->trans, root, path,
2399 						  eb, i, &key);
2400 			if (ret)
2401 				break;
2402 		}
2403 
2404 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2405 			continue;
2406 
2407 		/* these keys are simply copied */
2408 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2409 			ret = overwrite_item(wc->trans, root, path,
2410 					     eb, i, &key);
2411 			if (ret)
2412 				break;
2413 		} else if (key.type == BTRFS_INODE_REF_KEY ||
2414 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2415 			ret = add_inode_ref(wc->trans, root, log, path,
2416 					    eb, i, &key);
2417 			if (ret && ret != -ENOENT)
2418 				break;
2419 			ret = 0;
2420 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2421 			ret = replay_one_extent(wc->trans, root, path,
2422 						eb, i, &key);
2423 			if (ret)
2424 				break;
2425 		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2426 			ret = replay_one_dir_item(wc->trans, root, path,
2427 						  eb, i, &key);
2428 			if (ret)
2429 				break;
2430 		}
2431 	}
2432 	btrfs_free_path(path);
2433 	return ret;
2434 }
2435 
2436 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2437 				   struct btrfs_root *root,
2438 				   struct btrfs_path *path, int *level,
2439 				   struct walk_control *wc)
2440 {
2441 	struct btrfs_fs_info *fs_info = root->fs_info;
2442 	u64 root_owner;
2443 	u64 bytenr;
2444 	u64 ptr_gen;
2445 	struct extent_buffer *next;
2446 	struct extent_buffer *cur;
2447 	struct extent_buffer *parent;
2448 	u32 blocksize;
2449 	int ret = 0;
2450 
2451 	WARN_ON(*level < 0);
2452 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2453 
2454 	while (*level > 0) {
2455 		WARN_ON(*level < 0);
2456 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
2457 		cur = path->nodes[*level];
2458 
2459 		WARN_ON(btrfs_header_level(cur) != *level);
2460 
2461 		if (path->slots[*level] >=
2462 		    btrfs_header_nritems(cur))
2463 			break;
2464 
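		/*
		 * Grab the block pointer and generation of the child block at
		 * the current slot so it can be read and processed below.
		 */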
2465 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2466 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2467 		blocksize = fs_info->nodesize;
2468 
2469 		parent = path->nodes[*level];
2470 		root_owner = btrfs_header_owner(parent);
2471 
2472 		next = btrfs_find_create_tree_block(fs_info, bytenr);
2473 		if (IS_ERR(next))
2474 			return PTR_ERR(next);
2475 
2476 		if (*level == 1) {
2477 			ret = wc->process_func(root, next, wc, ptr_gen);
2478 			if (ret) {
2479 				free_extent_buffer(next);
2480 				return ret;
2481 			}
2482 
2483 			path->slots[*level]++;
2484 			if (wc->free) {
2485 				ret = btrfs_read_buffer(next, ptr_gen);
2486 				if (ret) {
2487 					free_extent_buffer(next);
2488 					return ret;
2489 				}
2490 
2491 				if (trans) {
2492 					btrfs_tree_lock(next);
2493 					btrfs_set_lock_blocking(next);
2494 					clean_tree_block(fs_info, next);
2495 					btrfs_wait_tree_block_writeback(next);
2496 					btrfs_tree_unlock(next);
2497 				}
2498 
2499 				WARN_ON(root_owner !=
2500 					BTRFS_TREE_LOG_OBJECTID);
2501 				ret = btrfs_free_and_pin_reserved_extent(
2502 							fs_info, bytenr,
2503 							blocksize);
2504 				if (ret) {
2505 					free_extent_buffer(next);
2506 					return ret;
2507 				}
2508 			}
2509 			free_extent_buffer(next);
2510 			continue;
2511 		}
2512 		ret = btrfs_read_buffer(next, ptr_gen);
2513 		if (ret) {
2514 			free_extent_buffer(next);
2515 			return ret;
2516 		}
2517 
2518 		WARN_ON(*level <= 0);
2519 		if (path->nodes[*level-1])
2520 			free_extent_buffer(path->nodes[*level-1]);
2521 		path->nodes[*level-1] = next;
2522 		*level = btrfs_header_level(next);
2523 		path->slots[*level] = 0;
2524 		cond_resched();
2525 	}
2526 	WARN_ON(*level < 0);
2527 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2528 
2529 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2530 
2531 	cond_resched();
2532 	return 0;
2533 }
2534 
2535 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2536 				 struct btrfs_root *root,
2537 				 struct btrfs_path *path, int *level,
2538 				 struct walk_control *wc)
2539 {
2540 	struct btrfs_fs_info *fs_info = root->fs_info;
2541 	u64 root_owner;
2542 	int i;
2543 	int slot;
2544 	int ret;
2545 
2546 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2547 		slot = path->slots[i];
2548 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2549 			path->slots[i]++;
2550 			*level = i;
2551 			WARN_ON(*level == 0);
2552 			return 0;
2553 		} else {
2554 			struct extent_buffer *parent;
2555 			if (path->nodes[*level] == root->node)
2556 				parent = path->nodes[*level];
2557 			else
2558 				parent = path->nodes[*level + 1];
2559 
2560 			root_owner = btrfs_header_owner(parent);
2561 			ret = wc->process_func(root, path->nodes[*level], wc,
2562 				 btrfs_header_generation(path->nodes[*level]));
2563 			if (ret)
2564 				return ret;
2565 
2566 			if (wc->free) {
2567 				struct extent_buffer *next;
2568 
2569 				next = path->nodes[*level];
2570 
2571 				if (trans) {
2572 					btrfs_tree_lock(next);
2573 					btrfs_set_lock_blocking(next);
2574 					clean_tree_block(fs_info, next);
2575 					btrfs_wait_tree_block_writeback(next);
2576 					btrfs_tree_unlock(next);
2577 				}
2578 
2579 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2580 				ret = btrfs_free_and_pin_reserved_extent(
2581 						fs_info,
2582 						path->nodes[*level]->start,
2583 						path->nodes[*level]->len);
2584 				if (ret)
2585 					return ret;
2586 			}
2587 			free_extent_buffer(path->nodes[*level]);
2588 			path->nodes[*level] = NULL;
2589 			*level = i + 1;
2590 		}
2591 	}
2592 	return 1;
2593 }
2594 
2595 /*
2596  * walk the log tree, calling wc->process_func on every block. When
2597  * wc->free is set, each block is also freed via
2598  * btrfs_free_and_pin_reserved_extent() once it has been processed.
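 *
 * free_log_tree() below drives this with walk_control.free set, so
 * that the whole log tree is released once it has been processed.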
2599 */ 2600 static int walk_log_tree(struct btrfs_trans_handle *trans, 2601 struct btrfs_root *log, struct walk_control *wc) 2602 { 2603 struct btrfs_fs_info *fs_info = log->fs_info; 2604 int ret = 0; 2605 int wret; 2606 int level; 2607 struct btrfs_path *path; 2608 int orig_level; 2609 2610 path = btrfs_alloc_path(); 2611 if (!path) 2612 return -ENOMEM; 2613 2614 level = btrfs_header_level(log->node); 2615 orig_level = level; 2616 path->nodes[level] = log->node; 2617 extent_buffer_get(log->node); 2618 path->slots[level] = 0; 2619 2620 while (1) { 2621 wret = walk_down_log_tree(trans, log, path, &level, wc); 2622 if (wret > 0) 2623 break; 2624 if (wret < 0) { 2625 ret = wret; 2626 goto out; 2627 } 2628 2629 wret = walk_up_log_tree(trans, log, path, &level, wc); 2630 if (wret > 0) 2631 break; 2632 if (wret < 0) { 2633 ret = wret; 2634 goto out; 2635 } 2636 } 2637 2638 /* was the root node processed? if not, catch it here */ 2639 if (path->nodes[orig_level]) { 2640 ret = wc->process_func(log, path->nodes[orig_level], wc, 2641 btrfs_header_generation(path->nodes[orig_level])); 2642 if (ret) 2643 goto out; 2644 if (wc->free) { 2645 struct extent_buffer *next; 2646 2647 next = path->nodes[orig_level]; 2648 2649 if (trans) { 2650 btrfs_tree_lock(next); 2651 btrfs_set_lock_blocking(next); 2652 clean_tree_block(fs_info, next); 2653 btrfs_wait_tree_block_writeback(next); 2654 btrfs_tree_unlock(next); 2655 } 2656 2657 WARN_ON(log->root_key.objectid != 2658 BTRFS_TREE_LOG_OBJECTID); 2659 ret = btrfs_free_and_pin_reserved_extent(fs_info, 2660 next->start, next->len); 2661 if (ret) 2662 goto out; 2663 } 2664 } 2665 2666 out: 2667 btrfs_free_path(path); 2668 return ret; 2669 } 2670 2671 /* 2672 * helper function to update the item for a given subvolumes log root 2673 * in the tree of log roots 2674 */ 2675 static int update_log_root(struct btrfs_trans_handle *trans, 2676 struct btrfs_root *log) 2677 { 2678 struct btrfs_fs_info *fs_info = log->fs_info; 2679 int ret; 2680 2681 if (log->log_transid == 1) { 2682 /* insert root item on the first sync */ 2683 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2684 &log->root_key, &log->root_item); 2685 } else { 2686 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2687 &log->root_key, &log->root_item); 2688 } 2689 return ret; 2690 } 2691 2692 static void wait_log_commit(struct btrfs_root *root, int transid) 2693 { 2694 DEFINE_WAIT(wait); 2695 int index = transid % 2; 2696 2697 /* 2698 * we only allow two pending log transactions at a time, 2699 * so we know that if ours is more than 2 older than the 2700 * current transaction, we're done 2701 */ 2702 for (;;) { 2703 prepare_to_wait(&root->log_commit_wait[index], 2704 &wait, TASK_UNINTERRUPTIBLE); 2705 2706 if (!(root->log_transid_committed < transid && 2707 atomic_read(&root->log_commit[index]))) 2708 break; 2709 2710 mutex_unlock(&root->log_mutex); 2711 schedule(); 2712 mutex_lock(&root->log_mutex); 2713 } 2714 finish_wait(&root->log_commit_wait[index], &wait); 2715 } 2716 2717 static void wait_for_writer(struct btrfs_root *root) 2718 { 2719 DEFINE_WAIT(wait); 2720 2721 for (;;) { 2722 prepare_to_wait(&root->log_writer_wait, &wait, 2723 TASK_UNINTERRUPTIBLE); 2724 if (!atomic_read(&root->log_writers)) 2725 break; 2726 2727 mutex_unlock(&root->log_mutex); 2728 schedule(); 2729 mutex_lock(&root->log_mutex); 2730 } 2731 finish_wait(&root->log_writer_wait, &wait); 2732 } 2733 2734 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2735 struct btrfs_log_ctx *ctx) 2736 { 2737 if (!ctx) 
2738 		return;
2739 
2740 	mutex_lock(&root->log_mutex);
2741 	list_del_init(&ctx->list);
2742 	mutex_unlock(&root->log_mutex);
2743 }
2744 
2745 /*
2746  * Must be invoked with the log mutex held, or by a caller that is
2747  * sure no other task can access the list.
2748  */
2749 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2750 					     int index, int error)
2751 {
2752 	struct btrfs_log_ctx *ctx;
2753 	struct btrfs_log_ctx *safe;
2754 
2755 	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2756 		list_del_init(&ctx->list);
2757 		ctx->log_ret = error;
2758 	}
2759 
2760 	INIT_LIST_HEAD(&root->log_ctxs[index]);
2761 }
2762 
2763 /*
2764  * btrfs_sync_log sends a given tree log down to the disk and
2765  * updates the super blocks to record it. When this call is done,
2766  * you know that any inodes previously logged are safely on disk only
2767  * if it returns 0.
2768  *
2769  * Any other return value means you need to call btrfs_commit_transaction.
2770  * Some of the edge cases for fsyncing directories that have had unlinks
2771  * or renames done in the past mean that sometimes the only safe
2772  * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
2773  * that has happened.
2774  */
2775 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2776 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2777 {
2778 	int index1;
2779 	int index2;
2780 	int mark;
2781 	int ret;
2782 	struct btrfs_fs_info *fs_info = root->fs_info;
2783 	struct btrfs_root *log = root->log_root;
2784 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2785 	int log_transid = 0;
2786 	struct btrfs_log_ctx root_log_ctx;
2787 	struct blk_plug plug;
2788 
2789 	mutex_lock(&root->log_mutex);
2790 	log_transid = ctx->log_transid;
2791 	if (root->log_transid_committed >= log_transid) {
2792 		mutex_unlock(&root->log_mutex);
2793 		return ctx->log_ret;
2794 	}
2795 
2796 	index1 = log_transid % 2;
2797 	if (atomic_read(&root->log_commit[index1])) {
2798 		wait_log_commit(root, log_transid);
2799 		mutex_unlock(&root->log_mutex);
2800 		return ctx->log_ret;
2801 	}
2802 	ASSERT(log_transid == root->log_transid);
2803 	atomic_set(&root->log_commit[index1], 1);
2804 
2805 	/* wait for previous tree log sync to complete */
2806 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2807 		wait_log_commit(root, log_transid - 1);
2808 
2809 	while (1) {
2810 		int batch = atomic_read(&root->log_batch);
2811 		/* when we're on an ssd, just kick the log commit out */
2812 		if (!btrfs_test_opt(fs_info, SSD) &&
2813 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2814 			mutex_unlock(&root->log_mutex);
2815 			schedule_timeout_uninterruptible(1);
2816 			mutex_lock(&root->log_mutex);
2817 		}
2818 		wait_for_writer(root);
2819 		if (batch == atomic_read(&root->log_batch))
2820 			break;
2821 	}
2822 
2823 	/* bail out if we need to do a full commit */
2824 	if (btrfs_need_log_full_commit(fs_info, trans)) {
2825 		ret = -EAGAIN;
2826 		btrfs_free_logged_extents(log, log_transid);
2827 		mutex_unlock(&root->log_mutex);
2828 		goto out;
2829 	}
2830 
2831 	if (log_transid % 2 == 0)
2832 		mark = EXTENT_DIRTY;
2833 	else
2834 		mark = EXTENT_NEW;
2835 
2836 	/* we start IO on all the marked extents here, but we don't actually
2837 	 * wait for them until later.
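	 *
	 * The mark alternates between EXTENT_DIRTY and EXTENT_NEW with the
	 * parity of log_transid, so writeback for two consecutive log
	 * transactions can be tracked separately.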
2838 */ 2839 blk_start_plug(&plug); 2840 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2841 if (ret) { 2842 blk_finish_plug(&plug); 2843 btrfs_abort_transaction(trans, ret); 2844 btrfs_free_logged_extents(log, log_transid); 2845 btrfs_set_log_full_commit(fs_info, trans); 2846 mutex_unlock(&root->log_mutex); 2847 goto out; 2848 } 2849 2850 btrfs_set_root_node(&log->root_item, log->node); 2851 2852 root->log_transid++; 2853 log->log_transid = root->log_transid; 2854 root->log_start_pid = 0; 2855 /* 2856 * IO has been started, blocks of the log tree have WRITTEN flag set 2857 * in their headers. new modifications of the log will be written to 2858 * new positions. so it's safe to allow log writers to go in. 2859 */ 2860 mutex_unlock(&root->log_mutex); 2861 2862 btrfs_init_log_ctx(&root_log_ctx, NULL); 2863 2864 mutex_lock(&log_root_tree->log_mutex); 2865 atomic_inc(&log_root_tree->log_batch); 2866 atomic_inc(&log_root_tree->log_writers); 2867 2868 index2 = log_root_tree->log_transid % 2; 2869 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2870 root_log_ctx.log_transid = log_root_tree->log_transid; 2871 2872 mutex_unlock(&log_root_tree->log_mutex); 2873 2874 ret = update_log_root(trans, log); 2875 2876 mutex_lock(&log_root_tree->log_mutex); 2877 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2878 /* 2879 * Implicit memory barrier after atomic_dec_and_test 2880 */ 2881 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2882 wake_up(&log_root_tree->log_writer_wait); 2883 } 2884 2885 if (ret) { 2886 if (!list_empty(&root_log_ctx.list)) 2887 list_del_init(&root_log_ctx.list); 2888 2889 blk_finish_plug(&plug); 2890 btrfs_set_log_full_commit(fs_info, trans); 2891 2892 if (ret != -ENOSPC) { 2893 btrfs_abort_transaction(trans, ret); 2894 mutex_unlock(&log_root_tree->log_mutex); 2895 goto out; 2896 } 2897 btrfs_wait_tree_log_extents(log, mark); 2898 btrfs_free_logged_extents(log, log_transid); 2899 mutex_unlock(&log_root_tree->log_mutex); 2900 ret = -EAGAIN; 2901 goto out; 2902 } 2903 2904 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2905 blk_finish_plug(&plug); 2906 list_del_init(&root_log_ctx.list); 2907 mutex_unlock(&log_root_tree->log_mutex); 2908 ret = root_log_ctx.log_ret; 2909 goto out; 2910 } 2911 2912 index2 = root_log_ctx.log_transid % 2; 2913 if (atomic_read(&log_root_tree->log_commit[index2])) { 2914 blk_finish_plug(&plug); 2915 ret = btrfs_wait_tree_log_extents(log, mark); 2916 btrfs_wait_logged_extents(trans, log, log_transid); 2917 wait_log_commit(log_root_tree, 2918 root_log_ctx.log_transid); 2919 mutex_unlock(&log_root_tree->log_mutex); 2920 if (!ret) 2921 ret = root_log_ctx.log_ret; 2922 goto out; 2923 } 2924 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2925 atomic_set(&log_root_tree->log_commit[index2], 1); 2926 2927 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2928 wait_log_commit(log_root_tree, 2929 root_log_ctx.log_transid - 1); 2930 } 2931 2932 wait_for_writer(log_root_tree); 2933 2934 /* 2935 * now that we've moved on to the tree of log tree roots, 2936 * check the full commit flag again 2937 */ 2938 if (btrfs_need_log_full_commit(fs_info, trans)) { 2939 blk_finish_plug(&plug); 2940 btrfs_wait_tree_log_extents(log, mark); 2941 btrfs_free_logged_extents(log, log_transid); 2942 mutex_unlock(&log_root_tree->log_mutex); 2943 ret = -EAGAIN; 2944 goto out_wake_log_root; 2945 } 2946 2947 ret = btrfs_write_marked_extents(fs_info, 2948 
&log_root_tree->dirty_log_pages,
2949 					 EXTENT_DIRTY | EXTENT_NEW);
2950 	blk_finish_plug(&plug);
2951 	if (ret) {
2952 		btrfs_set_log_full_commit(fs_info, trans);
2953 		btrfs_abort_transaction(trans, ret);
2954 		btrfs_free_logged_extents(log, log_transid);
2955 		mutex_unlock(&log_root_tree->log_mutex);
2956 		goto out_wake_log_root;
2957 	}
2958 	ret = btrfs_wait_tree_log_extents(log, mark);
2959 	if (!ret)
2960 		ret = btrfs_wait_tree_log_extents(log_root_tree,
2961 						  EXTENT_NEW | EXTENT_DIRTY);
2962 	if (ret) {
2963 		btrfs_set_log_full_commit(fs_info, trans);
2964 		btrfs_free_logged_extents(log, log_transid);
2965 		mutex_unlock(&log_root_tree->log_mutex);
2966 		goto out_wake_log_root;
2967 	}
2968 	btrfs_wait_logged_extents(trans, log, log_transid);
2969 
2970 	btrfs_set_super_log_root(fs_info->super_for_commit,
2971 				 log_root_tree->node->start);
2972 	btrfs_set_super_log_root_level(fs_info->super_for_commit,
2973 				       btrfs_header_level(log_root_tree->node));
2974 
2975 	log_root_tree->log_transid++;
2976 	mutex_unlock(&log_root_tree->log_mutex);
2977 
2978 	/*
2979 	 * nobody else is going to jump in and write the ctree
2980 	 * super here because the log_commit atomic below is protecting
2981 	 * us. We must be called with a transaction handle pinning
2982 	 * the running transaction open, so a full commit can't hop
2983 	 * in and cause problems either.
2984 	 */
2985 	ret = write_all_supers(fs_info, 1);
2986 	if (ret) {
2987 		btrfs_set_log_full_commit(fs_info, trans);
2988 		btrfs_abort_transaction(trans, ret);
2989 		goto out_wake_log_root;
2990 	}
2991 
2992 	mutex_lock(&root->log_mutex);
2993 	if (root->last_log_commit < log_transid)
2994 		root->last_log_commit = log_transid;
2995 	mutex_unlock(&root->log_mutex);
2996 
2997 out_wake_log_root:
2998 	mutex_lock(&log_root_tree->log_mutex);
2999 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3000 
3001 	log_root_tree->log_transid_committed++;
3002 	atomic_set(&log_root_tree->log_commit[index2], 0);
3003 	mutex_unlock(&log_root_tree->log_mutex);
3004 
3005 	/*
3006 	 * The barrier before waitqueue_active is implied by mutex_unlock
3007 	 */
3008 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
3009 		wake_up(&log_root_tree->log_commit_wait[index2]);
3010 out:
3011 	mutex_lock(&root->log_mutex);
3012 	btrfs_remove_all_log_ctxs(root, index1, ret);
3013 	root->log_transid_committed++;
3014 	atomic_set(&root->log_commit[index1], 0);
3015 	mutex_unlock(&root->log_mutex);
3016 
3017 	/*
3018 	 * The barrier before waitqueue_active is implied by mutex_unlock
3019 	 */
3020 	if (waitqueue_active(&root->log_commit_wait[index1]))
3021 		wake_up(&root->log_commit_wait[index1]);
3022 	return ret;
3023 }
3024 
3025 static void free_log_tree(struct btrfs_trans_handle *trans,
3026 			  struct btrfs_root *log)
3027 {
3028 	int ret;
3029 	u64 start;
3030 	u64 end;
3031 	struct walk_control wc = {
3032 		.free = 1,
3033 		.process_func = process_one_buffer
3034 	};
3035 
3036 	ret = walk_log_tree(trans, log, &wc);
3037 	/* I don't think this can happen but just in case */
3038 	if (ret)
3039 		btrfs_abort_transaction(trans, ret);
3040 
3041 	while (1) {
3042 		ret = find_first_extent_bit(&log->dirty_log_pages,
3043 				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
3044 				NULL);
3045 		if (ret)
3046 			break;
3047 
3048 		clear_extent_bits(&log->dirty_log_pages, start, end,
3049 				  EXTENT_DIRTY | EXTENT_NEW);
3050 	}
3051 
3052 	/*
3053 	 * We may have short-circuited the log tree with the full commit logic
3054 	 * and left ordered extents on our list, so clear these out to keep us
3055 	 * from leaking inodes and memory.
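	 *
	 * Both log transids (0 and 1) are cleared below, since either list
	 * may still hold logged extents at this point.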
3056 	 */
3057 	btrfs_free_logged_extents(log, 0);
3058 	btrfs_free_logged_extents(log, 1);
3059 
3060 	free_extent_buffer(log->node);
3061 	kfree(log);
3062 }
3063 
3064 /*
3065  * free all the extents used by the tree log. This should be called
3066  * at commit time of the full transaction
3067  */
3068 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3069 {
3070 	if (root->log_root) {
3071 		free_log_tree(trans, root->log_root);
3072 		root->log_root = NULL;
3073 	}
3074 	return 0;
3075 }
3076 
3077 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3078 			     struct btrfs_fs_info *fs_info)
3079 {
3080 	if (fs_info->log_root_tree) {
3081 		free_log_tree(trans, fs_info->log_root_tree);
3082 		fs_info->log_root_tree = NULL;
3083 	}
3084 	return 0;
3085 }
3086 
3087 /*
3088  * If both a file and directory are logged, and unlinks or renames are
3089  * mixed in, we have a few interesting corners:
3090  *
3091  * create file X in dir Y
3092  * link file X to X.link in dir Y
3093  * fsync file X
3094  * unlink file X but leave X.link
3095  * fsync dir Y
3096  *
3097  * After a crash we would expect only X.link to exist. But file X
3098  * didn't get fsync'd again so the log has back refs for X and X.link.
3099  *
3100  * We solve this by removing directory entries and inode backrefs from the
3101  * log when a file that was logged in the current transaction is
3102  * unlinked. Any later fsync will include the updated log entries, and
3103  * we'll be able to reconstruct the proper directory items from backrefs.
3104  *
3105  * This optimization allows us to avoid relogging the entire inode
3106  * or the entire directory.
3107  */
3108 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3109 				 struct btrfs_root *root,
3110 				 const char *name, int name_len,
3111 				 struct btrfs_inode *dir, u64 index)
3112 {
3113 	struct btrfs_root *log;
3114 	struct btrfs_dir_item *di;
3115 	struct btrfs_path *path;
3116 	int ret;
3117 	int err = 0;
3118 	int bytes_del = 0;
3119 	u64 dir_ino = btrfs_ino(dir);
3120 
3121 	if (dir->logged_trans < trans->transid)
3122 		return 0;
3123 
3124 	ret = join_running_log_trans(root);
3125 	if (ret)
3126 		return 0;
3127 
3128 	mutex_lock(&dir->log_mutex);
3129 
3130 	log = root->log_root;
3131 	path = btrfs_alloc_path();
3132 	if (!path) {
3133 		err = -ENOMEM;
3134 		goto out_unlock;
3135 	}
3136 
3137 	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3138 				   name, name_len, -1);
3139 	if (IS_ERR(di)) {
3140 		err = PTR_ERR(di);
3141 		goto fail;
3142 	}
3143 	if (di) {
3144 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3145 		bytes_del += name_len;
3146 		if (ret) {
3147 			err = ret;
3148 			goto fail;
3149 		}
3150 	}
3151 	btrfs_release_path(path);
3152 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3153 					 index, name, name_len, -1);
3154 	if (IS_ERR(di)) {
3155 		err = PTR_ERR(di);
3156 		goto fail;
3157 	}
3158 	if (di) {
3159 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3160 		bytes_del += name_len;
3161 		if (ret) {
3162 			err = ret;
3163 			goto fail;
3164 		}
3165 	}
3166 
3167 	/* update the directory size in the log to reflect the names
3168 	 * we have removed
3169 	 */
3170 	if (bytes_del) {
3171 		struct btrfs_key key;
3172 
3173 		key.objectid = dir_ino;
3174 		key.offset = 0;
3175 		key.type = BTRFS_INODE_ITEM_KEY;
3176 		btrfs_release_path(path);
3177 
3178 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3179 		if (ret < 0) {
3180 			err = ret;
3181 			goto fail;
3182 		}
3183 		if (ret == 0) {
3184 			struct btrfs_inode_item *item;
3185 			u64 i_size;
3186 
3187 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3188 struct btrfs_inode_item); 3189 i_size = btrfs_inode_size(path->nodes[0], item); 3190 if (i_size > bytes_del) 3191 i_size -= bytes_del; 3192 else 3193 i_size = 0; 3194 btrfs_set_inode_size(path->nodes[0], item, i_size); 3195 btrfs_mark_buffer_dirty(path->nodes[0]); 3196 } else 3197 ret = 0; 3198 btrfs_release_path(path); 3199 } 3200 fail: 3201 btrfs_free_path(path); 3202 out_unlock: 3203 mutex_unlock(&dir->log_mutex); 3204 if (ret == -ENOSPC) { 3205 btrfs_set_log_full_commit(root->fs_info, trans); 3206 ret = 0; 3207 } else if (ret < 0) 3208 btrfs_abort_transaction(trans, ret); 3209 3210 btrfs_end_log_trans(root); 3211 3212 return err; 3213 } 3214 3215 /* see comments for btrfs_del_dir_entries_in_log */ 3216 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3217 struct btrfs_root *root, 3218 const char *name, int name_len, 3219 struct btrfs_inode *inode, u64 dirid) 3220 { 3221 struct btrfs_fs_info *fs_info = root->fs_info; 3222 struct btrfs_root *log; 3223 u64 index; 3224 int ret; 3225 3226 if (inode->logged_trans < trans->transid) 3227 return 0; 3228 3229 ret = join_running_log_trans(root); 3230 if (ret) 3231 return 0; 3232 log = root->log_root; 3233 mutex_lock(&inode->log_mutex); 3234 3235 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3236 dirid, &index); 3237 mutex_unlock(&inode->log_mutex); 3238 if (ret == -ENOSPC) { 3239 btrfs_set_log_full_commit(fs_info, trans); 3240 ret = 0; 3241 } else if (ret < 0 && ret != -ENOENT) 3242 btrfs_abort_transaction(trans, ret); 3243 btrfs_end_log_trans(root); 3244 3245 return ret; 3246 } 3247 3248 /* 3249 * creates a range item in the log for 'dirid'. first_offset and 3250 * last_offset tell us which parts of the key space the log should 3251 * be considered authoritative for. 3252 */ 3253 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3254 struct btrfs_root *log, 3255 struct btrfs_path *path, 3256 int key_type, u64 dirid, 3257 u64 first_offset, u64 last_offset) 3258 { 3259 int ret; 3260 struct btrfs_key key; 3261 struct btrfs_dir_log_item *item; 3262 3263 key.objectid = dirid; 3264 key.offset = first_offset; 3265 if (key_type == BTRFS_DIR_ITEM_KEY) 3266 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3267 else 3268 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3269 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3270 if (ret) 3271 return ret; 3272 3273 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3274 struct btrfs_dir_log_item); 3275 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3276 btrfs_mark_buffer_dirty(path->nodes[0]); 3277 btrfs_release_path(path); 3278 return 0; 3279 } 3280 3281 /* 3282 * log all the items included in the current transaction for a given 3283 * directory. 
This also creates the range items in the log tree required 3284 * to replay anything deleted before the fsync 3285 */ 3286 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3287 struct btrfs_root *root, struct btrfs_inode *inode, 3288 struct btrfs_path *path, 3289 struct btrfs_path *dst_path, int key_type, 3290 struct btrfs_log_ctx *ctx, 3291 u64 min_offset, u64 *last_offset_ret) 3292 { 3293 struct btrfs_key min_key; 3294 struct btrfs_root *log = root->log_root; 3295 struct extent_buffer *src; 3296 int err = 0; 3297 int ret; 3298 int i; 3299 int nritems; 3300 u64 first_offset = min_offset; 3301 u64 last_offset = (u64)-1; 3302 u64 ino = btrfs_ino(inode); 3303 3304 log = root->log_root; 3305 3306 min_key.objectid = ino; 3307 min_key.type = key_type; 3308 min_key.offset = min_offset; 3309 3310 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3311 3312 /* 3313 * we didn't find anything from this transaction, see if there 3314 * is anything at all 3315 */ 3316 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3317 min_key.objectid = ino; 3318 min_key.type = key_type; 3319 min_key.offset = (u64)-1; 3320 btrfs_release_path(path); 3321 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3322 if (ret < 0) { 3323 btrfs_release_path(path); 3324 return ret; 3325 } 3326 ret = btrfs_previous_item(root, path, ino, key_type); 3327 3328 /* if ret == 0 there are items for this type, 3329 * create a range to tell us the last key of this type. 3330 * otherwise, there are no items in this directory after 3331 * *min_offset, and we create a range to indicate that. 3332 */ 3333 if (ret == 0) { 3334 struct btrfs_key tmp; 3335 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3336 path->slots[0]); 3337 if (key_type == tmp.type) 3338 first_offset = max(min_offset, tmp.offset) + 1; 3339 } 3340 goto done; 3341 } 3342 3343 /* go backward to find any previous key */ 3344 ret = btrfs_previous_item(root, path, ino, key_type); 3345 if (ret == 0) { 3346 struct btrfs_key tmp; 3347 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3348 if (key_type == tmp.type) { 3349 first_offset = tmp.offset; 3350 ret = overwrite_item(trans, log, dst_path, 3351 path->nodes[0], path->slots[0], 3352 &tmp); 3353 if (ret) { 3354 err = ret; 3355 goto done; 3356 } 3357 } 3358 } 3359 btrfs_release_path(path); 3360 3361 /* find the first key from this transaction again */ 3362 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3363 if (WARN_ON(ret != 0)) 3364 goto done; 3365 3366 /* 3367 * we have a block from this transaction, log every item in it 3368 * from our directory 3369 */ 3370 while (1) { 3371 struct btrfs_key tmp; 3372 src = path->nodes[0]; 3373 nritems = btrfs_header_nritems(src); 3374 for (i = path->slots[0]; i < nritems; i++) { 3375 struct btrfs_dir_item *di; 3376 3377 btrfs_item_key_to_cpu(src, &min_key, i); 3378 3379 if (min_key.objectid != ino || min_key.type != key_type) 3380 goto done; 3381 ret = overwrite_item(trans, log, dst_path, src, i, 3382 &min_key); 3383 if (ret) { 3384 err = ret; 3385 goto done; 3386 } 3387 3388 /* 3389 * We must make sure that when we log a directory entry, 3390 * the corresponding inode, after log replay, has a 3391 * matching link count. 
For example:
3392 		 *
3393 		 * touch foo
3394 		 * mkdir mydir
3395 		 * sync
3396 		 * ln foo mydir/bar
3397 		 * xfs_io -c "fsync" mydir
3398 		 * <crash>
3399 		 * <mount fs and log replay>
3400 		 *
3401 		 * Would result in a fsync log that when replayed, our
3402 		 * file inode would have a link count of 1, but we get
3403 		 * two directory entries pointing to the same inode.
3404 		 * After removing one of the names, it would not be
3405 		 * possible to remove the other name, which always
3406 		 * resulted in stale file handle errors, and it would
3407 		 * not be possible to rmdir the parent directory, since
3408 		 * its i_size could never decrement to the value
3409 		 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3410 		 */
3411 		di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3412 		btrfs_dir_item_key_to_cpu(src, di, &tmp);
3413 		if (ctx &&
3414 		    (btrfs_dir_transid(src, di) == trans->transid ||
3415 		     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3416 		    tmp.type != BTRFS_ROOT_ITEM_KEY)
3417 			ctx->log_new_dentries = true;
3418 		}
3419 		path->slots[0] = nritems;
3420 
3421 		/*
3422 		 * look ahead to the next item and see if it is also
3423 		 * from this directory and from this transaction
3424 		 */
3425 		ret = btrfs_next_leaf(root, path);
3426 		if (ret == 1) {
3427 			last_offset = (u64)-1;
3428 			goto done;
3429 		}
3430 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3431 		if (tmp.objectid != ino || tmp.type != key_type) {
3432 			last_offset = (u64)-1;
3433 			goto done;
3434 		}
3435 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3436 			ret = overwrite_item(trans, log, dst_path,
3437 					     path->nodes[0], path->slots[0],
3438 					     &tmp);
3439 			if (ret)
3440 				err = ret;
3441 			else
3442 				last_offset = tmp.offset;
3443 			goto done;
3444 		}
3445 	}
3446 done:
3447 	btrfs_release_path(path);
3448 	btrfs_release_path(dst_path);
3449 
3450 	if (err == 0) {
3451 		*last_offset_ret = last_offset;
3452 		/*
3453 		 * insert the log range keys to indicate where the log
3454 		 * is valid
3455 		 */
3456 		ret = insert_dir_log_key(trans, log, path, key_type,
3457 					 ino, first_offset, last_offset);
3458 		if (ret)
3459 			err = ret;
3460 	}
3461 	return err;
3462 }
3463 
3464 /*
3465  * logging directories is very similar to logging inodes; we find all the
3466  * items from the current transaction and write them to the log.
3467  *
3468  * The recovery code scans the directory in the subvolume, and if it finds a
3469  * key in the range logged that is not present in the log tree, then it means
3470  * that dir entry was unlinked during the transaction.
3471  *
3472  * In order for that scan to work, we must include one key smaller than
3473  * the smallest logged by this transaction and one key larger than the largest
3474  * key logged by this transaction.
3475  */
3476 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3477 			  struct btrfs_root *root, struct btrfs_inode *inode,
3478 			  struct btrfs_path *path,
3479 			  struct btrfs_path *dst_path,
3480 			  struct btrfs_log_ctx *ctx)
3481 {
3482 	u64 min_key;
3483 	u64 max_key;
3484 	int ret;
3485 	int key_type = BTRFS_DIR_ITEM_KEY;
3486 
3487 again:
3488 	min_key = 0;
3489 	max_key = 0;
3490 	while (1) {
3491 		ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3492 				    ctx, min_key, &max_key);
3493 		if (ret)
3494 			return ret;
3495 		if (max_key == (u64)-1)
3496 			break;
3497 		min_key = max_key + 1;
3498 	}
3499 
3500 	if (key_type == BTRFS_DIR_ITEM_KEY) {
3501 		key_type = BTRFS_DIR_INDEX_KEY;
3502 		goto again;
3503 	}
3504 	return 0;
3505 }
3506 
3507 /*
3508  * a helper function to drop items from the log before we relog an
3509  * inode.
max_key_type indicates the highest item type to remove.
3510  * This cannot be run for file data extents because it does not
3511  * free the extents they point to.
3512  */
3513 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3514 				  struct btrfs_root *log,
3515 				  struct btrfs_path *path,
3516 				  u64 objectid, int max_key_type)
3517 {
3518 	int ret;
3519 	struct btrfs_key key;
3520 	struct btrfs_key found_key;
3521 	int start_slot;
3522 
3523 	key.objectid = objectid;
3524 	key.type = max_key_type;
3525 	key.offset = (u64)-1;
3526 
3527 	while (1) {
3528 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3529 		BUG_ON(ret == 0); /* Logic error */
3530 		if (ret < 0)
3531 			break;
3532 
3533 		if (path->slots[0] == 0)
3534 			break;
3535 
3536 		path->slots[0]--;
3537 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3538 				      path->slots[0]);
3539 
3540 		if (found_key.objectid != objectid)
3541 			break;
3542 
3543 		found_key.offset = 0;
3544 		found_key.type = 0;
3545 		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3546 				       &start_slot);
3547 
3548 		ret = btrfs_del_items(trans, log, path, start_slot,
3549 				      path->slots[0] - start_slot + 1);
3550 		/*
3551 		 * If start slot isn't 0 then we don't need to re-search, we've
3552 		 * found the last guy with the objectid in this tree.
3553 		 */
3554 		if (ret || start_slot != 0)
3555 			break;
3556 		btrfs_release_path(path);
3557 	}
3558 	btrfs_release_path(path);
3559 	if (ret > 0)
3560 		ret = 0;
3561 	return ret;
3562 }
3563 
3564 static void fill_inode_item(struct btrfs_trans_handle *trans,
3565 			    struct extent_buffer *leaf,
3566 			    struct btrfs_inode_item *item,
3567 			    struct inode *inode, int log_inode_only,
3568 			    u64 logged_isize)
3569 {
3570 	struct btrfs_map_token token;
3571 
3572 	btrfs_init_map_token(&token);
3573 
3574 	if (log_inode_only) {
3575 		/* set the generation to zero so the recovery code
3576 		 * can tell the difference between a log entry made just
3577 		 * to say 'this inode exists' and one made to say
3578 		 * 'update this inode with these values'
3579 		 */
3580 		btrfs_set_token_inode_generation(leaf, item, 0, &token);
3581 		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3582 	} else {
3583 		btrfs_set_token_inode_generation(leaf, item,
3584 						 BTRFS_I(inode)->generation,
3585 						 &token);
3586 		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3587 	}
3588 
3589 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3590 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3591 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3592 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3593 
3594 	btrfs_set_token_timespec_sec(leaf, &item->atime,
3595 				     inode->i_atime.tv_sec, &token);
3596 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
3597 				      inode->i_atime.tv_nsec, &token);
3598 
3599 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
3600 				     inode->i_mtime.tv_sec, &token);
3601 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3602 				      inode->i_mtime.tv_nsec, &token);
3603 
3604 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
3605 				     inode->i_ctime.tv_sec, &token);
3606 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3607 				      inode->i_ctime.tv_nsec, &token);
3608 
3609 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3610 				     &token);
3611 
3612 	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3613 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3614 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3615 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
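	/*
	 * The inode item's block_group field is a legacy field; it is
	 * always filled with zero here.
	 */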
btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3617 } 3618 3619 static int log_inode_item(struct btrfs_trans_handle *trans, 3620 struct btrfs_root *log, struct btrfs_path *path, 3621 struct btrfs_inode *inode) 3622 { 3623 struct btrfs_inode_item *inode_item; 3624 int ret; 3625 3626 ret = btrfs_insert_empty_item(trans, log, path, 3627 &inode->location, sizeof(*inode_item)); 3628 if (ret && ret != -EEXIST) 3629 return ret; 3630 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3631 struct btrfs_inode_item); 3632 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3633 0, 0); 3634 btrfs_release_path(path); 3635 return 0; 3636 } 3637 3638 static noinline int copy_items(struct btrfs_trans_handle *trans, 3639 struct btrfs_inode *inode, 3640 struct btrfs_path *dst_path, 3641 struct btrfs_path *src_path, u64 *last_extent, 3642 int start_slot, int nr, int inode_only, 3643 u64 logged_isize) 3644 { 3645 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 3646 unsigned long src_offset; 3647 unsigned long dst_offset; 3648 struct btrfs_root *log = inode->root->log_root; 3649 struct btrfs_file_extent_item *extent; 3650 struct btrfs_inode_item *inode_item; 3651 struct extent_buffer *src = src_path->nodes[0]; 3652 struct btrfs_key first_key, last_key, key; 3653 int ret; 3654 struct btrfs_key *ins_keys; 3655 u32 *ins_sizes; 3656 char *ins_data; 3657 int i; 3658 struct list_head ordered_sums; 3659 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3660 bool has_extents = false; 3661 bool need_find_last_extent = true; 3662 bool done = false; 3663 3664 INIT_LIST_HEAD(&ordered_sums); 3665 3666 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3667 nr * sizeof(u32), GFP_NOFS); 3668 if (!ins_data) 3669 return -ENOMEM; 3670 3671 first_key.objectid = (u64)-1; 3672 3673 ins_sizes = (u32 *)ins_data; 3674 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3675 3676 for (i = 0; i < nr; i++) { 3677 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3678 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3679 } 3680 ret = btrfs_insert_empty_items(trans, log, dst_path, 3681 ins_keys, ins_sizes, nr); 3682 if (ret) { 3683 kfree(ins_data); 3684 return ret; 3685 } 3686 3687 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3688 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3689 dst_path->slots[0]); 3690 3691 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3692 3693 if (i == nr - 1) 3694 last_key = ins_keys[i]; 3695 3696 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3697 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3698 dst_path->slots[0], 3699 struct btrfs_inode_item); 3700 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3701 &inode->vfs_inode, 3702 inode_only == LOG_INODE_EXISTS, 3703 logged_isize); 3704 } else { 3705 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3706 src_offset, ins_sizes[i]); 3707 } 3708 3709 /* 3710 * We set need_find_last_extent here in case we know we were 3711 * processing other items and then walk into the first extent in 3712 * the inode. If we don't hit an extent then nothing changes, 3713 * we'll do the last search the next time around. 
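		 *
		 * (need_find_last_extent starts out true and is only cleared
		 * once an item that is not a file extent item is processed,
		 * so it stays set when the copied range begins with extents.)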
3714 */ 3715 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3716 has_extents = true; 3717 if (first_key.objectid == (u64)-1) 3718 first_key = ins_keys[i]; 3719 } else { 3720 need_find_last_extent = false; 3721 } 3722 3723 /* take a reference on file data extents so that truncates 3724 * or deletes of this inode don't have to relog the inode 3725 * again 3726 */ 3727 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3728 !skip_csum) { 3729 int found_type; 3730 extent = btrfs_item_ptr(src, start_slot + i, 3731 struct btrfs_file_extent_item); 3732 3733 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3734 continue; 3735 3736 found_type = btrfs_file_extent_type(src, extent); 3737 if (found_type == BTRFS_FILE_EXTENT_REG) { 3738 u64 ds, dl, cs, cl; 3739 ds = btrfs_file_extent_disk_bytenr(src, 3740 extent); 3741 /* ds == 0 is a hole */ 3742 if (ds == 0) 3743 continue; 3744 3745 dl = btrfs_file_extent_disk_num_bytes(src, 3746 extent); 3747 cs = btrfs_file_extent_offset(src, extent); 3748 cl = btrfs_file_extent_num_bytes(src, 3749 extent); 3750 if (btrfs_file_extent_compression(src, 3751 extent)) { 3752 cs = 0; 3753 cl = dl; 3754 } 3755 3756 ret = btrfs_lookup_csums_range( 3757 fs_info->csum_root, 3758 ds + cs, ds + cs + cl - 1, 3759 &ordered_sums, 0); 3760 if (ret) { 3761 btrfs_release_path(dst_path); 3762 kfree(ins_data); 3763 return ret; 3764 } 3765 } 3766 } 3767 } 3768 3769 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3770 btrfs_release_path(dst_path); 3771 kfree(ins_data); 3772 3773 /* 3774 * we have to do this after the loop above to avoid changing the 3775 * log tree while trying to change the log tree. 3776 */ 3777 ret = 0; 3778 while (!list_empty(&ordered_sums)) { 3779 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3780 struct btrfs_ordered_sum, 3781 list); 3782 if (!ret) 3783 ret = btrfs_csum_file_blocks(trans, log, sums); 3784 list_del(&sums->list); 3785 kfree(sums); 3786 } 3787 3788 if (!has_extents) 3789 return ret; 3790 3791 if (need_find_last_extent && *last_extent == first_key.offset) { 3792 /* 3793 * We don't have any leafs between our current one and the one 3794 * we processed before that can have file extent items for our 3795 * inode (and have a generation number smaller than our current 3796 * transaction id). 3797 */ 3798 need_find_last_extent = false; 3799 } 3800 3801 /* 3802 * Because we use btrfs_search_forward we could skip leaves that were 3803 * not modified and then assume *last_extent is valid when it really 3804 * isn't. So back up to the previous leaf and read the end of the last 3805 * extent before we go and fill in holes. 
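	 *
	 * Note that for an inline extent the end offset is rounded up to
	 * the sector size, matching how extent_end is computed in the hole
	 * filling loop further below.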
3806 */ 3807 if (need_find_last_extent) { 3808 u64 len; 3809 3810 ret = btrfs_prev_leaf(inode->root, src_path); 3811 if (ret < 0) 3812 return ret; 3813 if (ret) 3814 goto fill_holes; 3815 if (src_path->slots[0]) 3816 src_path->slots[0]--; 3817 src = src_path->nodes[0]; 3818 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3819 if (key.objectid != btrfs_ino(inode) || 3820 key.type != BTRFS_EXTENT_DATA_KEY) 3821 goto fill_holes; 3822 extent = btrfs_item_ptr(src, src_path->slots[0], 3823 struct btrfs_file_extent_item); 3824 if (btrfs_file_extent_type(src, extent) == 3825 BTRFS_FILE_EXTENT_INLINE) { 3826 len = btrfs_file_extent_inline_len(src, 3827 src_path->slots[0], 3828 extent); 3829 *last_extent = ALIGN(key.offset + len, 3830 fs_info->sectorsize); 3831 } else { 3832 len = btrfs_file_extent_num_bytes(src, extent); 3833 *last_extent = key.offset + len; 3834 } 3835 } 3836 fill_holes: 3837 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3838 * things could have happened 3839 * 3840 * 1) A merge could have happened, so we could currently be on a leaf 3841 * that holds what we were copying in the first place. 3842 * 2) A split could have happened, and now not all of the items we want 3843 * are on the same leaf. 3844 * 3845 * So we need to adjust how we search for holes, we need to drop the 3846 * path and re-search for the first extent key we found, and then walk 3847 * forward until we hit the last one we copied. 3848 */ 3849 if (need_find_last_extent) { 3850 /* btrfs_prev_leaf could return 1 without releasing the path */ 3851 btrfs_release_path(src_path); 3852 ret = btrfs_search_slot(NULL, inode->root, &first_key, 3853 src_path, 0, 0); 3854 if (ret < 0) 3855 return ret; 3856 ASSERT(ret == 0); 3857 src = src_path->nodes[0]; 3858 i = src_path->slots[0]; 3859 } else { 3860 i = start_slot; 3861 } 3862 3863 /* 3864 * Ok so here we need to go through and fill in any holes we may have 3865 * to make sure that holes are punched for those areas in case they had 3866 * extents previously. 3867 */ 3868 while (!done) { 3869 u64 offset, len; 3870 u64 extent_end; 3871 3872 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3873 ret = btrfs_next_leaf(inode->root, src_path); 3874 if (ret < 0) 3875 return ret; 3876 ASSERT(ret == 0); 3877 src = src_path->nodes[0]; 3878 i = 0; 3879 } 3880 3881 btrfs_item_key_to_cpu(src, &key, i); 3882 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3883 done = true; 3884 if (key.objectid != btrfs_ino(inode) || 3885 key.type != BTRFS_EXTENT_DATA_KEY) { 3886 i++; 3887 continue; 3888 } 3889 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3890 if (btrfs_file_extent_type(src, extent) == 3891 BTRFS_FILE_EXTENT_INLINE) { 3892 len = btrfs_file_extent_inline_len(src, i, extent); 3893 extent_end = ALIGN(key.offset + len, 3894 fs_info->sectorsize); 3895 } else { 3896 len = btrfs_file_extent_num_bytes(src, extent); 3897 extent_end = key.offset + len; 3898 } 3899 i++; 3900 3901 if (*last_extent == key.offset) { 3902 *last_extent = extent_end; 3903 continue; 3904 } 3905 offset = *last_extent; 3906 len = key.offset - *last_extent; 3907 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3908 offset, 0, 0, len, 0, len, 0, 0, 0); 3909 if (ret) 3910 break; 3911 *last_extent = extent_end; 3912 } 3913 /* 3914 * Need to let the callers know we dropped the path so they should 3915 * re-search. 
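	 *
	 * A return value of 1 is not an error here, it only tells the
	 * caller that the path was dropped.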
3916 	 */
3917 	if (!ret && need_find_last_extent)
3918 		ret = 1;
3919 	return ret;
3920 }
3921 
3922 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3923 {
3924 	struct extent_map *em1, *em2;
3925 
3926 	em1 = list_entry(a, struct extent_map, list);
3927 	em2 = list_entry(b, struct extent_map, list);
3928 
3929 	if (em1->start < em2->start)
3930 		return -1;
3931 	else if (em1->start > em2->start)
3932 		return 1;
3933 	return 0;
3934 }
3935 
3936 static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3937 				struct inode *inode,
3938 				struct btrfs_root *root,
3939 				const struct extent_map *em,
3940 				const struct list_head *logged_list,
3941 				bool *ordered_io_error)
3942 {
3943 	struct btrfs_fs_info *fs_info = root->fs_info;
3944 	struct btrfs_ordered_extent *ordered;
3945 	struct btrfs_root *log = root->log_root;
3946 	u64 mod_start = em->mod_start;
3947 	u64 mod_len = em->mod_len;
3948 	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3949 	u64 csum_offset;
3950 	u64 csum_len;
3951 	LIST_HEAD(ordered_sums);
3952 	int ret = 0;
3953 
3954 	*ordered_io_error = false;
3955 
3956 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3957 	    em->block_start == EXTENT_MAP_HOLE)
3958 		return 0;
3959 
3960 	/*
3961 	 * Wait for any ordered extent that covers our extent map. If it
3962 	 * finishes without an error, first check and see if our csums are on
3963 	 * our outstanding ordered extents.
3964 	 */
3965 	list_for_each_entry(ordered, logged_list, log_list) {
3966 		struct btrfs_ordered_sum *sum;
3967 
3968 		if (!mod_len)
3969 			break;
3970 
3971 		if (ordered->file_offset + ordered->len <= mod_start ||
3972 		    mod_start + mod_len <= ordered->file_offset)
3973 			continue;
3974 
3975 		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3976 		    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3977 		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3978 			const u64 start = ordered->file_offset;
3979 			const u64 end = ordered->file_offset + ordered->len - 1;
3980 
3981 			WARN_ON(ordered->inode != inode);
3982 			filemap_fdatawrite_range(inode->i_mapping, start, end);
3983 		}
3984 
3985 		wait_event(ordered->wait,
3986 			   (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3987 			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3988 
3989 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3990 			/*
3991 			 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3992 			 * i_mapping flags, so that the next fsync won't get
3993 			 * an outdated io error too.
3994 			 */
3995 			filemap_check_errors(inode->i_mapping);
3996 			*ordered_io_error = true;
3997 			break;
3998 		}
3999 		/*
4000 		 * We are going to copy all the csums on this ordered extent, so
4001 		 * go ahead and adjust mod_start and mod_len in case this
4002 		 * ordered extent has already been logged.
4003 		 */
4004 		if (ordered->file_offset > mod_start) {
4005 			if (ordered->file_offset + ordered->len >=
4006 			    mod_start + mod_len)
4007 				mod_len = ordered->file_offset - mod_start;
4008 			/*
4009 			 * If we have this case
4010 			 *
4011 			 * |--------- logged extent ---------|
4012 			 *       |----- ordered extent ----|
4013 			 *
4014 			 * Just don't mess with mod_start and mod_len, we'll
4015 			 * just end up logging more csums than we need and it
4016 			 * will be ok.
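			 *
			 * The else branch below handles the mirror case,
			 * where the ordered extent starts at or before
			 * mod_start:
			 *
			 *       |--------- logged extent ---------|
			 * |----- ordered extent ----|
			 *
			 * There the front of the logged range is trimmed
			 * (mod_start is moved up to the end of the ordered
			 * extent), and if the ordered extent covers the
			 * whole logged range, mod_len becomes zero.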
4017 */ 4018 } else { 4019 if (ordered->file_offset + ordered->len < 4020 mod_start + mod_len) { 4021 mod_len = (mod_start + mod_len) - 4022 (ordered->file_offset + ordered->len); 4023 mod_start = ordered->file_offset + 4024 ordered->len; 4025 } else { 4026 mod_len = 0; 4027 } 4028 } 4029 4030 if (skip_csum) 4031 continue; 4032 4033 /* 4034 * To keep us from looping for the above case of an ordered 4035 * extent that falls inside of the logged extent. 4036 */ 4037 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 4038 &ordered->flags)) 4039 continue; 4040 4041 list_for_each_entry(sum, &ordered->list, list) { 4042 ret = btrfs_csum_file_blocks(trans, log, sum); 4043 if (ret) 4044 break; 4045 } 4046 } 4047 4048 if (*ordered_io_error || !mod_len || ret || skip_csum) 4049 return ret; 4050 4051 if (em->compress_type) { 4052 csum_offset = 0; 4053 csum_len = max(em->block_len, em->orig_block_len); 4054 } else { 4055 csum_offset = mod_start - em->start; 4056 csum_len = mod_len; 4057 } 4058 4059 /* block start is already adjusted for the file extent offset. */ 4060 ret = btrfs_lookup_csums_range(fs_info->csum_root, 4061 em->block_start + csum_offset, 4062 em->block_start + csum_offset + 4063 csum_len - 1, &ordered_sums, 0); 4064 if (ret) 4065 return ret; 4066 4067 while (!list_empty(&ordered_sums)) { 4068 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4069 struct btrfs_ordered_sum, 4070 list); 4071 if (!ret) 4072 ret = btrfs_csum_file_blocks(trans, log, sums); 4073 list_del(&sums->list); 4074 kfree(sums); 4075 } 4076 4077 return ret; 4078 } 4079 4080 static int log_one_extent(struct btrfs_trans_handle *trans, 4081 struct btrfs_inode *inode, struct btrfs_root *root, 4082 const struct extent_map *em, 4083 struct btrfs_path *path, 4084 const struct list_head *logged_list, 4085 struct btrfs_log_ctx *ctx) 4086 { 4087 struct btrfs_root *log = root->log_root; 4088 struct btrfs_file_extent_item *fi; 4089 struct extent_buffer *leaf; 4090 struct btrfs_map_token token; 4091 struct btrfs_key key; 4092 u64 extent_offset = em->start - em->orig_start; 4093 u64 block_len; 4094 int ret; 4095 int extent_inserted = 0; 4096 bool ordered_io_err = false; 4097 4098 ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, 4099 logged_list, &ordered_io_err); 4100 if (ret) 4101 return ret; 4102 4103 if (ordered_io_err) { 4104 ctx->io_err = -EIO; 4105 return ctx->io_err; 4106 } 4107 4108 btrfs_init_map_token(&token); 4109 4110 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4111 em->start + em->len, NULL, 0, 1, 4112 sizeof(*fi), &extent_inserted); 4113 if (ret) 4114 return ret; 4115 4116 if (!extent_inserted) { 4117 key.objectid = btrfs_ino(inode); 4118 key.type = BTRFS_EXTENT_DATA_KEY; 4119 key.offset = em->start; 4120 4121 ret = btrfs_insert_empty_item(trans, log, path, &key, 4122 sizeof(*fi)); 4123 if (ret) 4124 return ret; 4125 } 4126 leaf = path->nodes[0]; 4127 fi = btrfs_item_ptr(leaf, path->slots[0], 4128 struct btrfs_file_extent_item); 4129 4130 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4131 &token); 4132 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4133 btrfs_set_token_file_extent_type(leaf, fi, 4134 BTRFS_FILE_EXTENT_PREALLOC, 4135 &token); 4136 else 4137 btrfs_set_token_file_extent_type(leaf, fi, 4138 BTRFS_FILE_EXTENT_REG, 4139 &token); 4140 4141 block_len = max(em->block_len, em->orig_block_len); 4142 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4143 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4144 em->block_start, 4145 &token); 
4146 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4147 &token); 4148 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4149 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4150 em->block_start - 4151 extent_offset, &token); 4152 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4153 &token); 4154 } else { 4155 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4156 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4157 &token); 4158 } 4159 4160 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4161 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4162 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4163 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4164 &token); 4165 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4166 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4167 btrfs_mark_buffer_dirty(leaf); 4168 4169 btrfs_release_path(path); 4170 4171 return ret; 4172 } 4173 4174 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4175 struct btrfs_root *root, 4176 struct btrfs_inode *inode, 4177 struct btrfs_path *path, 4178 struct list_head *logged_list, 4179 struct btrfs_log_ctx *ctx, 4180 const u64 start, 4181 const u64 end) 4182 { 4183 struct extent_map *em, *n; 4184 struct list_head extents; 4185 struct extent_map_tree *tree = &inode->extent_tree; 4186 u64 logged_start, logged_end; 4187 u64 test_gen; 4188 int ret = 0; 4189 int num = 0; 4190 4191 INIT_LIST_HEAD(&extents); 4192 4193 down_write(&inode->dio_sem); 4194 write_lock(&tree->lock); 4195 test_gen = root->fs_info->last_trans_committed; 4196 logged_start = start; 4197 logged_end = end; 4198 4199 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4200 list_del_init(&em->list); 4201 /* 4202 * Just an arbitrary number: processing the extents can get 4203 * really CPU intensive once there are a lot of them, and 4204 * once we have that many extents a full transaction commit 4205 * is going to be faster anyway. 4206 */ 4207 if (++num > 32768) { 4208 list_del_init(&tree->modified_extents); 4209 ret = -EFBIG; 4210 goto process; 4211 } 4212 4213 if (em->generation <= test_gen) 4214 continue; 4215 4216 if (em->start < logged_start) 4217 logged_start = em->start; 4218 if ((em->start + em->len - 1) > logged_end) 4219 logged_end = em->start + em->len - 1; 4220 4221 /* Need a ref to keep it from getting evicted from cache */ 4222 refcount_inc(&em->refs); 4223 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4224 list_add_tail(&em->list, &extents); 4225 num++; 4226 } 4227 4228 list_sort(NULL, &extents, extent_cmp); 4229 btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); 4230 /* 4231 * Some ordered extents started by fsync might have completed 4232 * before we could collect them into the list logged_list, which 4233 * means they're gone, not in our logged_list nor in the inode's 4234 * ordered tree. We want the application/user space to know an 4235 * error happened while attempting to persist file data so that 4236 * it can take proper action. If such an error happened, we leave 4237 * without writing to the log tree and the fsync must report the 4238 * file data write error and not commit the current transaction.
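* Once an ordered extent is gone, the only memory of its failure is the
* AS_EIO/AS_ENOSPC flag left on the inode's address_space, which the
* filemap_check_errors() call below reads back (and clears). So, for
* example, a writeback that failed with -EIO and completed before we
* built logged_list still makes this fsync return the error.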
4239 */ 4240 ret = filemap_check_errors(inode->vfs_inode.i_mapping); 4241 if (ret) 4242 ctx->io_err = ret; 4243 process: 4244 while (!list_empty(&extents)) { 4245 em = list_entry(extents.next, struct extent_map, list); 4246 4247 list_del_init(&em->list); 4248 4249 /* 4250 * If we had an error we just need to delete everybody from our 4251 * private list. 4252 */ 4253 if (ret) { 4254 clear_em_logging(tree, em); 4255 free_extent_map(em); 4256 continue; 4257 } 4258 4259 write_unlock(&tree->lock); 4260 4261 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4262 ctx); 4263 write_lock(&tree->lock); 4264 clear_em_logging(tree, em); 4265 free_extent_map(em); 4266 } 4267 WARN_ON(!list_empty(&extents)); 4268 write_unlock(&tree->lock); 4269 up_write(&inode->dio_sem); 4270 4271 btrfs_release_path(path); 4272 return ret; 4273 } 4274 4275 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4276 struct btrfs_path *path, u64 *size_ret) 4277 { 4278 struct btrfs_key key; 4279 int ret; 4280 4281 key.objectid = btrfs_ino(inode); 4282 key.type = BTRFS_INODE_ITEM_KEY; 4283 key.offset = 0; 4284 4285 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4286 if (ret < 0) { 4287 return ret; 4288 } else if (ret > 0) { 4289 *size_ret = 0; 4290 } else { 4291 struct btrfs_inode_item *item; 4292 4293 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4294 struct btrfs_inode_item); 4295 *size_ret = btrfs_inode_size(path->nodes[0], item); 4296 } 4297 4298 btrfs_release_path(path); 4299 return 0; 4300 } 4301 4302 /* 4303 * At the moment we always log all xattrs. This is to figure out at log replay 4304 * time which xattrs must have their deletion replayed. If a xattr is missing 4305 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4306 * because if a xattr is deleted, the inode is fsynced and a power failure 4307 * happens, then after the log is replayed the next time the fs is mounted, 4308 * we want the xattr to not exist anymore (same behaviour as other filesystems 4309 * with a journal: ext3/4, xfs, f2fs, etc).
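*
* A sequence (purely illustrative) where this matters:
*
*   setfattr -n user.test -v val file
*   xfs_io -c fsync file
*   setfattr -x user.test file
*   xfs_io -c fsync file
*   <power failure>
*   mount fs, trigger log replay
*
* Since the second fsync logged all the remaining xattrs, replay sees
* that user.test exists in the fs/subvol tree but not in the log tree,
* and deletes it, matching what the application observed before the
* power failure.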
4310 */ 4311 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4312 struct btrfs_root *root, 4313 struct btrfs_inode *inode, 4314 struct btrfs_path *path, 4315 struct btrfs_path *dst_path) 4316 { 4317 int ret; 4318 struct btrfs_key key; 4319 const u64 ino = btrfs_ino(inode); 4320 int ins_nr = 0; 4321 int start_slot = 0; 4322 4323 key.objectid = ino; 4324 key.type = BTRFS_XATTR_ITEM_KEY; 4325 key.offset = 0; 4326 4327 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4328 if (ret < 0) 4329 return ret; 4330 4331 while (true) { 4332 int slot = path->slots[0]; 4333 struct extent_buffer *leaf = path->nodes[0]; 4334 int nritems = btrfs_header_nritems(leaf); 4335 4336 if (slot >= nritems) { 4337 if (ins_nr > 0) { 4338 u64 last_extent = 0; 4339 4340 ret = copy_items(trans, inode, dst_path, path, 4341 &last_extent, start_slot, 4342 ins_nr, 1, 0); 4343 /* can't be 1, extent items aren't processed */ 4344 ASSERT(ret <= 0); 4345 if (ret < 0) 4346 return ret; 4347 ins_nr = 0; 4348 } 4349 ret = btrfs_next_leaf(root, path); 4350 if (ret < 0) 4351 return ret; 4352 else if (ret > 0) 4353 break; 4354 continue; 4355 } 4356 4357 btrfs_item_key_to_cpu(leaf, &key, slot); 4358 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4359 break; 4360 4361 if (ins_nr == 0) 4362 start_slot = slot; 4363 ins_nr++; 4364 path->slots[0]++; 4365 cond_resched(); 4366 } 4367 if (ins_nr > 0) { 4368 u64 last_extent = 0; 4369 4370 ret = copy_items(trans, inode, dst_path, path, 4371 &last_extent, start_slot, 4372 ins_nr, 1, 0); 4373 /* can't be 1, extent items aren't processed */ 4374 ASSERT(ret <= 0); 4375 if (ret < 0) 4376 return ret; 4377 } 4378 4379 return 0; 4380 } 4381 4382 /* 4383 * If the no holes feature is enabled we need to make sure any hole between the 4384 * last extent and the i_size of our inode is explicitly marked in the log. This 4385 * is to make sure that doing something like: 4386 * 4387 * 1) create file with 128Kb of data 4388 * 2) truncate file to 64Kb 4389 * 3) truncate file to 256Kb 4390 * 4) fsync file 4391 * 5) <crash/power failure> 4392 * 6) mount fs and trigger log replay 4393 * 4394 * will give us a file with a size of 256Kb, where the first 64Kb of data match 4395 * what the file had at step 1 and the last 192Kb of the 4396 * file correspond to a hole. The presence of explicit holes in a log tree is 4397 * what guarantees that log replay will remove/adjust file extent items in the 4398 * fs/subvol tree. 4399 * 4400 * Here we do not need to care about holes between extents; that is already done 4401 * by copy_items(). We also only need to do this in the full sync path, where we 4402 * look up extents from the fs/subvol tree only. In the fast path case, we 4403 * walk the list of modified extent maps and if any represents a hole, we 4404 * insert a corresponding extent representing a hole in the log tree.
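*
* Walking through the steps above (hypothetical layout): after step 3
* the last extent in the fs/subvol tree ends at offset 64Kb while
* i_size is 256Kb, so below we compute hole_start = 64Kb and
* hole_size = 192Kb and insert a file extent item with a disk_bytenr
* of 0 covering that range, which log replay turns back into an
* explicit hole.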
4405 */ 4406 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4407 struct btrfs_root *root, 4408 struct btrfs_inode *inode, 4409 struct btrfs_path *path) 4410 { 4411 struct btrfs_fs_info *fs_info = root->fs_info; 4412 int ret; 4413 struct btrfs_key key; 4414 u64 hole_start; 4415 u64 hole_size; 4416 struct extent_buffer *leaf; 4417 struct btrfs_root *log = root->log_root; 4418 const u64 ino = btrfs_ino(inode); 4419 const u64 i_size = i_size_read(&inode->vfs_inode); 4420 4421 if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4422 return 0; 4423 4424 key.objectid = ino; 4425 key.type = BTRFS_EXTENT_DATA_KEY; 4426 key.offset = (u64)-1; 4427 4428 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4429 ASSERT(ret != 0); 4430 if (ret < 0) 4431 return ret; 4432 4433 ASSERT(path->slots[0] > 0); 4434 path->slots[0]--; 4435 leaf = path->nodes[0]; 4436 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4437 4438 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4439 /* inode does not have any extents */ 4440 hole_start = 0; 4441 hole_size = i_size; 4442 } else { 4443 struct btrfs_file_extent_item *extent; 4444 u64 len; 4445 4446 /* 4447 * If there's an extent beyond i_size, an explicit hole was 4448 * already inserted by copy_items(). 4449 */ 4450 if (key.offset >= i_size) 4451 return 0; 4452 4453 extent = btrfs_item_ptr(leaf, path->slots[0], 4454 struct btrfs_file_extent_item); 4455 4456 if (btrfs_file_extent_type(leaf, extent) == 4457 BTRFS_FILE_EXTENT_INLINE) { 4458 len = btrfs_file_extent_inline_len(leaf, 4459 path->slots[0], 4460 extent); 4461 ASSERT(len == i_size || 4462 (len == fs_info->sectorsize && 4463 btrfs_file_extent_compression(leaf, extent) != 4464 BTRFS_COMPRESS_NONE)); 4465 return 0; 4466 } 4467 4468 len = btrfs_file_extent_num_bytes(leaf, extent); 4469 /* Last extent goes beyond i_size, no need to log a hole. */ 4470 if (key.offset + len > i_size) 4471 return 0; 4472 hole_start = key.offset + len; 4473 hole_size = i_size - hole_start; 4474 } 4475 btrfs_release_path(path); 4476 4477 /* Last extent ends at i_size. */ 4478 if (hole_size == 0) 4479 return 0; 4480 4481 hole_size = ALIGN(hole_size, fs_info->sectorsize); 4482 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4483 hole_size, 0, hole_size, 0, 0, 0); 4484 return ret; 4485 } 4486 4487 /* 4488 * When we are logging a new inode X, check if it doesn't have a reference that 4489 * matches the reference from some other inode Y created in a past transaction 4490 * and that was renamed in the current transaction. If we don't do this, then at 4491 * log replay time we can lose inode Y (and all its files if it's a directory): 4492 * 4493 * mkdir /mnt/x 4494 * echo "hello world" > /mnt/x/foobar 4495 * sync 4496 * mv /mnt/x /mnt/y 4497 * mkdir /mnt/x # or touch /mnt/x 4498 * xfs_io -c fsync /mnt/x 4499 * <power fail> 4500 * mount fs, trigger log replay 4501 * 4502 * After the log replay procedure, we would lose the first directory and all its 4503 * files (file foobar). 
4504 * For the case where inode Y is not a directory we simply end up losing it: 4505 * 4506 * echo "123" > /mnt/foo 4507 * sync 4508 * mv /mnt/foo /mnt/bar 4509 * echo "abc" > /mnt/foo 4510 * xfs_io -c fsync /mnt/foo 4511 * <power fail> 4512 * 4513 * We also need this for cases where a snapshot entry is replaced by some other 4514 * entry (file or directory) otherwise we end up with an unreplayable log due to 4515 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4516 * if it were a regular entry: 4517 * 4518 * mkdir /mnt/x 4519 * btrfs subvolume snapshot /mnt /mnt/x/snap 4520 * btrfs subvolume delete /mnt/x/snap 4521 * rmdir /mnt/x 4522 * mkdir /mnt/x 4523 * fsync /mnt/x or fsync some new file inside it 4524 * <power fail> 4525 * 4526 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4527 * the same transaction. 4528 */ 4529 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4530 const int slot, 4531 const struct btrfs_key *key, 4532 struct btrfs_inode *inode, 4533 u64 *other_ino) 4534 { 4535 int ret; 4536 struct btrfs_path *search_path; 4537 char *name = NULL; 4538 u32 name_len = 0; 4539 u32 item_size = btrfs_item_size_nr(eb, slot); 4540 u32 cur_offset = 0; 4541 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4542 4543 search_path = btrfs_alloc_path(); 4544 if (!search_path) 4545 return -ENOMEM; 4546 search_path->search_commit_root = 1; 4547 search_path->skip_locking = 1; 4548 4549 while (cur_offset < item_size) { 4550 u64 parent; 4551 u32 this_name_len; 4552 u32 this_len; 4553 unsigned long name_ptr; 4554 struct btrfs_dir_item *di; 4555 4556 if (key->type == BTRFS_INODE_REF_KEY) { 4557 struct btrfs_inode_ref *iref; 4558 4559 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4560 parent = key->offset; 4561 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4562 name_ptr = (unsigned long)(iref + 1); 4563 this_len = sizeof(*iref) + this_name_len; 4564 } else { 4565 struct btrfs_inode_extref *extref; 4566 4567 extref = (struct btrfs_inode_extref *)(ptr + 4568 cur_offset); 4569 parent = btrfs_inode_extref_parent(eb, extref); 4570 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4571 name_ptr = (unsigned long)&extref->name; 4572 this_len = sizeof(*extref) + this_name_len; 4573 } 4574 4575 ret = btrfs_is_name_len_valid(eb, slot, name_ptr, 4576 this_name_len); 4577 if (!ret) { 4578 ret = -EIO; 4579 goto out; 4580 } 4581 if (this_name_len > name_len) { 4582 char *new_name; 4583 4584 new_name = krealloc(name, this_name_len, GFP_NOFS); 4585 if (!new_name) { 4586 ret = -ENOMEM; 4587 goto out; 4588 } 4589 name_len = this_name_len; 4590 name = new_name; 4591 } 4592 4593 read_extent_buffer(eb, name, name_ptr, this_name_len); 4594 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4595 parent, name, this_name_len, 0); 4596 if (di && !IS_ERR(di)) { 4597 struct btrfs_key di_key; 4598 4599 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4600 di, &di_key); 4601 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4602 ret = 1; 4603 *other_ino = di_key.objectid; 4604 } else { 4605 ret = -EAGAIN; 4606 } 4607 goto out; 4608 } else if (IS_ERR(di)) { 4609 ret = PTR_ERR(di); 4610 goto out; 4611 } 4612 btrfs_release_path(search_path); 4613 4614 cur_offset += this_len; 4615 } 4616 ret = 0; 4617 out: 4618 btrfs_free_path(search_path); 4619 kfree(name); 4620 return ret; 4621 } 4622 4623 /* log a single inode in the tree log. 
4624 * At least one parent directory for this inode must exist in the tree 4625 * or be logged already. 4626 * 4627 * Any items from this inode changed by the current transaction are copied 4628 * to the log tree. An extra reference is taken on any extents in this 4629 * file, allowing us to avoid a whole pile of corner cases around logging 4630 * blocks that have been removed from the tree. 4631 * 4632 * See LOG_INODE_ALL and related defines for a description of what inode_only 4633 * does. 4634 * 4635 * This handles both files and directories. 4636 */ 4637 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4638 struct btrfs_root *root, struct btrfs_inode *inode, 4639 int inode_only, 4640 const loff_t start, 4641 const loff_t end, 4642 struct btrfs_log_ctx *ctx) 4643 { 4644 struct btrfs_fs_info *fs_info = root->fs_info; 4645 struct btrfs_path *path; 4646 struct btrfs_path *dst_path; 4647 struct btrfs_key min_key; 4648 struct btrfs_key max_key; 4649 struct btrfs_root *log = root->log_root; 4650 LIST_HEAD(logged_list); 4651 u64 last_extent = 0; 4652 int err = 0; 4653 int ret; 4654 int nritems; 4655 int ins_start_slot = 0; 4656 int ins_nr; 4657 bool fast_search = false; 4658 u64 ino = btrfs_ino(inode); 4659 struct extent_map_tree *em_tree = &inode->extent_tree; 4660 u64 logged_isize = 0; 4661 bool need_log_inode_item = true; 4662 4663 path = btrfs_alloc_path(); 4664 if (!path) 4665 return -ENOMEM; 4666 dst_path = btrfs_alloc_path(); 4667 if (!dst_path) { 4668 btrfs_free_path(path); 4669 return -ENOMEM; 4670 } 4671 4672 min_key.objectid = ino; 4673 min_key.type = BTRFS_INODE_ITEM_KEY; 4674 min_key.offset = 0; 4675 4676 max_key.objectid = ino; 4677 4678 4679 /* today the code can only do partial logging of directories */ 4680 if (S_ISDIR(inode->vfs_inode.i_mode) || 4681 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4682 &inode->runtime_flags) && 4683 inode_only >= LOG_INODE_EXISTS)) 4684 max_key.type = BTRFS_XATTR_ITEM_KEY; 4685 else 4686 max_key.type = (u8)-1; 4687 max_key.offset = (u64)-1; 4688 4689 /* 4690 * Only run delayed items if we are a dir or a new file. 4691 * Otherwise commit the delayed inode only, which is needed in 4692 * order for the log replay code to mark inodes for link count 4693 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4694 */ 4695 if (S_ISDIR(inode->vfs_inode.i_mode) || 4696 inode->generation > fs_info->last_trans_committed) 4697 ret = btrfs_commit_inode_delayed_items(trans, inode); 4698 else 4699 ret = btrfs_commit_inode_delayed_inode(inode); 4700 4701 if (ret) { 4702 btrfs_free_path(path); 4703 btrfs_free_path(dst_path); 4704 return ret; 4705 } 4706 4707 if (inode_only == LOG_OTHER_INODE) { 4708 inode_only = LOG_INODE_EXISTS; 4709 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4710 } else { 4711 mutex_lock(&inode->log_mutex); 4712 } 4713 4714 /* 4715 * a brute force approach to making sure we get the most uptodate 4716 * copies of everything. 4717 */ 4718 if (S_ISDIR(inode->vfs_inode.i_mode)) { 4719 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4720 4721 if (inode_only == LOG_INODE_EXISTS) 4722 max_key_type = BTRFS_XATTR_ITEM_KEY; 4723 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4724 } else { 4725 if (inode_only == LOG_INODE_EXISTS) { 4726 /* 4727 * Make sure the new inode item we write to the log has 4728 * the same isize as the current one (if it exists). 4729 * This is necessary to prevent data loss after log 4730 * replay, and also to prevent doing a wrong expanding 4731 * truncate - for example: create file, write 4K into offset 4732 * 0, fsync, write 4K into offset 4096, add hard link, 4733 * fsync some other file (to sync log), power fail - if 4734 * we use the inode's current i_size, after log replay 4735 * we get an 8Kb file, with the last 4Kb extent as a hole 4736 * (zeroes), as if an expanding truncate happened, 4737 * instead of getting a file of 4Kb only. 4738 */ 4739 err = logged_inode_size(log, inode, path, &logged_isize); 4740 if (err) 4741 goto out_unlock; 4742 } 4743 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4744 &inode->runtime_flags)) { 4745 if (inode_only == LOG_INODE_EXISTS) { 4746 max_key.type = BTRFS_XATTR_ITEM_KEY; 4747 ret = drop_objectid_items(trans, log, path, ino, 4748 max_key.type); 4749 } else { 4750 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4751 &inode->runtime_flags); 4752 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4753 &inode->runtime_flags); 4754 while (1) { 4755 ret = btrfs_truncate_inode_items(trans, 4756 log, &inode->vfs_inode, 0, 0); 4757 if (ret != -EAGAIN) 4758 break; 4759 } 4760 } 4761 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4762 &inode->runtime_flags) || 4763 inode_only == LOG_INODE_EXISTS) { 4764 if (inode_only == LOG_INODE_ALL) 4765 fast_search = true; 4766 max_key.type = BTRFS_XATTR_ITEM_KEY; 4767 ret = drop_objectid_items(trans, log, path, ino, 4768 max_key.type); 4769 } else { 4770 if (inode_only == LOG_INODE_ALL) 4771 fast_search = true; 4772 goto log_extents; 4773 } 4774 4775 } 4776 if (ret) { 4777 err = ret; 4778 goto out_unlock; 4779 } 4780 4781 while (1) { 4782 ins_nr = 0; 4783 ret = btrfs_search_forward(root, &min_key, 4784 path, trans->transid); 4785 if (ret < 0) { 4786 err = ret; 4787 goto out_unlock; 4788 } 4789 if (ret != 0) 4790 break; 4791 again: 4792 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4793 if (min_key.objectid != ino) 4794 break; 4795 if (min_key.type > max_key.type) 4796 break; 4797 4798 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4799 need_log_inode_item = false; 4800 4801 if ((min_key.type == BTRFS_INODE_REF_KEY || 4802 min_key.type == BTRFS_INODE_EXTREF_KEY) && 4803 inode->generation == trans->transid) { 4804 u64 other_ino = 0; 4805 4806 ret = btrfs_check_ref_name_override(path->nodes[0], 4807 path->slots[0], &min_key, inode, 4808 &other_ino); 4809 if (ret < 0) { 4810 err = ret; 4811 goto out_unlock; 4812 } else if (ret > 0 && ctx && 4813 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 4814 struct btrfs_key inode_key; 4815 struct inode *other_inode; 4816 4817 if (ins_nr > 0) { 4818 ins_nr++; 4819 } else { 4820 ins_nr = 1; 4821 ins_start_slot = path->slots[0]; 4822 } 4823 ret = copy_items(trans, inode, dst_path, path, 4824 &last_extent, ins_start_slot, 4825 ins_nr, inode_only, 4826 logged_isize); 4827 if (ret < 0) { 4828 err = ret; 4829 goto out_unlock; 4830 } 4831 ins_nr = 0; 4832 btrfs_release_path(path); 4833 inode_key.objectid = other_ino; 4834 inode_key.type = BTRFS_INODE_ITEM_KEY; 4835 inode_key.offset = 0; 4836 other_inode = btrfs_iget(fs_info->sb, 4837 &inode_key, root, 4838 NULL); 4839 /* 4840 * If the other inode that had a conflicting dir 4841 * entry was deleted in the current transaction, 4842 * we don't need to do more work nor fall back to 4843 * a transaction commit.
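*
* For example (hypothetical sequence, everything after the
* sync in one transaction):
*
*   touch foo
*   sync
*   rm foo
*   touch foo
*   xfs_io -c fsync foo
*
* The commit root still has the dir entry of the old (deleted)
* foo, so the name lookup reports a conflict against the old
* inode, but btrfs_iget() on it returns -ENOENT and we simply
* move on to the next key.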
4844 */ 4845 if (IS_ERR(other_inode) && 4846 PTR_ERR(other_inode) == -ENOENT) { 4847 goto next_key; 4848 } else if (IS_ERR(other_inode)) { 4849 err = PTR_ERR(other_inode); 4850 goto out_unlock; 4851 } 4852 /* 4853 * We are safe logging the other inode without 4854 * acquiring its i_mutex as long as we log with 4855 * the LOG_INODE_EXISTS mode. We're safe against 4856 * concurrent renames of the other inode as well 4857 * because during a rename we pin the log and 4858 * update the log with the new name before we 4859 * unpin it. 4860 */ 4861 err = btrfs_log_inode(trans, root, 4862 BTRFS_I(other_inode), 4863 LOG_OTHER_INODE, 0, LLONG_MAX, 4864 ctx); 4865 iput(other_inode); 4866 if (err) 4867 goto out_unlock; 4868 else 4869 goto next_key; 4870 } 4871 } 4872 4873 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4874 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4875 if (ins_nr == 0) 4876 goto next_slot; 4877 ret = copy_items(trans, inode, dst_path, path, 4878 &last_extent, ins_start_slot, 4879 ins_nr, inode_only, logged_isize); 4880 if (ret < 0) { 4881 err = ret; 4882 goto out_unlock; 4883 } 4884 ins_nr = 0; 4885 if (ret) { 4886 btrfs_release_path(path); 4887 continue; 4888 } 4889 goto next_slot; 4890 } 4891 4892 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4893 ins_nr++; 4894 goto next_slot; 4895 } else if (!ins_nr) { 4896 ins_start_slot = path->slots[0]; 4897 ins_nr = 1; 4898 goto next_slot; 4899 } 4900 4901 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4902 ins_start_slot, ins_nr, inode_only, 4903 logged_isize); 4904 if (ret < 0) { 4905 err = ret; 4906 goto out_unlock; 4907 } 4908 if (ret) { 4909 ins_nr = 0; 4910 btrfs_release_path(path); 4911 continue; 4912 } 4913 ins_nr = 1; 4914 ins_start_slot = path->slots[0]; 4915 next_slot: 4916 4917 nritems = btrfs_header_nritems(path->nodes[0]); 4918 path->slots[0]++; 4919 if (path->slots[0] < nritems) { 4920 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 4921 path->slots[0]); 4922 goto again; 4923 } 4924 if (ins_nr) { 4925 ret = copy_items(trans, inode, dst_path, path, 4926 &last_extent, ins_start_slot, 4927 ins_nr, inode_only, logged_isize); 4928 if (ret < 0) { 4929 err = ret; 4930 goto out_unlock; 4931 } 4932 ret = 0; 4933 ins_nr = 0; 4934 } 4935 btrfs_release_path(path); 4936 next_key: 4937 if (min_key.offset < (u64)-1) { 4938 min_key.offset++; 4939 } else if (min_key.type < max_key.type) { 4940 min_key.type++; 4941 min_key.offset = 0; 4942 } else { 4943 break; 4944 } 4945 } 4946 if (ins_nr) { 4947 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4948 ins_start_slot, ins_nr, inode_only, 4949 logged_isize); 4950 if (ret < 0) { 4951 err = ret; 4952 goto out_unlock; 4953 } 4954 ret = 0; 4955 ins_nr = 0; 4956 } 4957 4958 btrfs_release_path(path); 4959 btrfs_release_path(dst_path); 4960 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 4961 if (err) 4962 goto out_unlock; 4963 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 4964 btrfs_release_path(path); 4965 btrfs_release_path(dst_path); 4966 err = btrfs_log_trailing_hole(trans, root, inode, path); 4967 if (err) 4968 goto out_unlock; 4969 } 4970 log_extents: 4971 btrfs_release_path(path); 4972 btrfs_release_path(dst_path); 4973 if (need_log_inode_item) { 4974 err = log_inode_item(trans, log, dst_path, inode); 4975 if (err) 4976 goto out_unlock; 4977 } 4978 if (fast_search) { 4979 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4980 &logged_list, ctx, start, end); 4981 if (ret) { 4982 err = ret; 4983 goto 
out_unlock; 4984 } 4985 } else if (inode_only == LOG_INODE_ALL) { 4986 struct extent_map *em, *n; 4987 4988 write_lock(&em_tree->lock); 4989 /* 4990 * We can't just remove every em if we're called for a ranged 4991 * fsync - that is, one that doesn't cover the whole possible 4992 * file range (0 to LLONG_MAX). This is because we can have 4993 * em's that fall outside the range we're logging and therefore 4994 * their ordered operations haven't completed yet 4995 * (btrfs_finish_ordered_io() not invoked yet). This means we 4996 * didn't get their respective file extent item in the fs/subvol 4997 * tree yet, and need to let the next fast fsync (one which 4998 * consults the list of modified extent maps) find the em so 4999 * that it logs a matching file extent item and waits for the 5000 * respective ordered operation to complete (if it's still 5001 * running). 5002 * 5003 * Removing every em outside the range we're logging would make 5004 * the next fast fsync not log their matching file extent items, 5005 * therefore making us lose data after a log replay. 5006 */ 5007 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5008 list) { 5009 const u64 mod_end = em->mod_start + em->mod_len - 1; 5010 5011 if (em->mod_start >= start && mod_end <= end) 5012 list_del_init(&em->list); 5013 } 5014 write_unlock(&em_tree->lock); 5015 } 5016 5017 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5018 ret = log_directory_changes(trans, root, inode, path, dst_path, 5019 ctx); 5020 if (ret) { 5021 err = ret; 5022 goto out_unlock; 5023 } 5024 } 5025 5026 spin_lock(&inode->lock); 5027 inode->logged_trans = trans->transid; 5028 inode->last_log_commit = inode->last_sub_trans; 5029 spin_unlock(&inode->lock); 5030 out_unlock: 5031 if (unlikely(err)) 5032 btrfs_put_logged_extents(&logged_list); 5033 else 5034 btrfs_submit_logged_extents(&logged_list, log); 5035 mutex_unlock(&inode->log_mutex); 5036 5037 btrfs_free_path(path); 5038 btrfs_free_path(dst_path); 5039 return err; 5040 } 5041 5042 /* 5043 * Check if we must fall back to a transaction commit when logging an inode. 5044 * This must be called after logging the inode and is used only in the context 5045 * where fsyncing an inode requires logging some other inode - in which 5046 * case we can't lock the i_mutex of each other inode we need to log as that 5047 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5048 * log inodes up or down in the hierarchy) or rename operations for example. So 5049 * we take the log_mutex of the inode after we have logged it and then check for 5050 * its last_unlink_trans value - this is safe because any task setting 5051 * last_unlink_trans must take the log_mutex and it must do this before it does 5052 * the actual unlink operation, so if we do this check before a concurrent task 5053 * sets last_unlink_trans it means we've logged a consistent version/state of 5054 * all the inode items, otherwise we are not sure and must do a transaction 5055 * commit (the concurrent task might have only updated last_unlink_trans before 5056 * we logged the inode or it might have also done the unlink).
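*
* A possible interleaving (illustrative only):
*
*   fsync task                        rename/unlink task
*   ----------                        ------------------
*   log the other inode
*                                     take log_mutex
*                                     set last_unlink_trans
*                                     release log_mutex
*                                     do the unlink
*   take log_mutex
*   see the new last_unlink_trans
*   -> force a full transaction commit
*
* If the fsync task instead takes the log_mutex before the setter
* does, it has logged a state from before the unlink started, which
* is also a consistent state to replay.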
5057 */ 5058 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5059 struct btrfs_inode *inode) 5060 { 5061 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5062 bool ret = false; 5063 5064 mutex_lock(&inode->log_mutex); 5065 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5066 /* 5067 * Make sure any commits to the log are forced to be full 5068 * commits. 5069 */ 5070 btrfs_set_log_full_commit(fs_info, trans); 5071 ret = true; 5072 } 5073 mutex_unlock(&inode->log_mutex); 5074 5075 return ret; 5076 } 5077 5078 /* 5079 * follow the dentry parent pointers up the chain and see if any 5080 * of the directories in the chain require a full commit before they can 5081 * be logged. Returns zero if nothing special needs to be done or 1 if 5082 * a full commit is required. 5083 */ 5084 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5085 struct btrfs_inode *inode, 5086 struct dentry *parent, 5087 struct super_block *sb, 5088 u64 last_committed) 5089 { 5090 int ret = 0; 5091 struct dentry *old_parent = NULL; 5092 struct btrfs_inode *orig_inode = inode; 5093 5094 /* 5095 * for a regular file, if its inode is already on disk, we don't 5096 * have to worry about the parents at all. This is because 5097 * we can use the last_unlink_trans field to record renames 5098 * and other fun in this file. 5099 */ 5100 if (S_ISREG(inode->vfs_inode.i_mode) && 5101 inode->generation <= last_committed && 5102 inode->last_unlink_trans <= last_committed) 5103 goto out; 5104 5105 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5106 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5107 goto out; 5108 inode = BTRFS_I(d_inode(parent)); 5109 } 5110 5111 while (1) { 5112 /* 5113 * If we are logging a directory then we start with our inode, 5114 * not our parent's inode, so we need to skip setting the 5115 * logged_trans so that further down in the log code we don't 5116 * think this inode has already been logged. 5117 */ 5118 if (inode != orig_inode) 5119 inode->logged_trans = trans->transid; 5120 smp_mb(); 5121 5122 if (btrfs_must_commit_transaction(trans, inode)) { 5123 ret = 1; 5124 break; 5125 } 5126 5127 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5128 break; 5129 5130 if (IS_ROOT(parent)) { 5131 inode = BTRFS_I(d_inode(parent)); 5132 if (btrfs_must_commit_transaction(trans, inode)) 5133 ret = 1; 5134 break; 5135 } 5136 5137 parent = dget_parent(parent); 5138 dput(old_parent); 5139 old_parent = parent; 5140 inode = BTRFS_I(d_inode(parent)); 5141 5142 } 5143 dput(old_parent); 5144 out: 5145 return ret; 5146 } 5147 5148 struct btrfs_dir_list { 5149 u64 ino; 5150 struct list_head list; 5151 }; 5152 5153 /* 5154 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5155 * details about why it is needed. 5156 * This is a recursive operation - if an existing dentry corresponds to a 5157 * directory, that directory's new entries are logged too (same behaviour as 5158 * ext3/4, xfs, f2fs, reiserfs, nilfs2).
Note that when logging the inodes 5159 * the dentries point to, we do not lock their i_mutex; otherwise lockdep 5160 * complains about the following circular lock dependency / possible deadlock: 5161 * 5162 * CPU0 CPU1 5163 * ---- ---- 5164 * lock(&type->i_mutex_dir_key#3/2); 5165 * lock(sb_internal#2); 5166 * lock(&type->i_mutex_dir_key#3/2); 5167 * lock(&sb->s_type->i_mutex_key#14); 5168 * 5169 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5170 * sb_start_intwrite() in btrfs_start_transaction(). 5171 * Not locking i_mutex of the inodes is still safe because: 5172 * 5173 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5174 * that while logging the inode new references (names) are added or removed 5175 * from the inode, leaving the logged inode item with a link count that does 5176 * not match the number of logged inode reference items. This is fine because 5177 * at log replay time we compute the real number of links and correct the 5178 * link count in the inode item (see replay_one_buffer() and 5179 * link_to_fixup_dir()); 5180 * 5181 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5182 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5183 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5184 * has a size that doesn't match the sum of the lengths of all the logged 5185 * names. This does not result in a problem because if a dir_item key is 5186 * logged but its matching dir_index key is not logged, at log replay time we 5187 * don't use it to replay the respective name (see replay_one_name()). On the 5188 * other hand if only the dir_index key ends up being logged, the respective 5189 * name is added to the fs/subvol tree with both the dir_item and dir_index 5190 * keys created (see replay_one_name()). 5191 * The directory's inode item with a wrong i_size is not a problem either, 5192 * since we don't use it at log replay time to set the i_size in the inode 5193 * item of the fs/subvol tree (see overwrite_item()).
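*
* As a (hypothetical) example of the recursion:
*
*   mkdir /mnt/A
*   mkdir /mnt/A/B
*   touch /mnt/A/B/foo
*   xfs_io -c fsync /mnt/A
*
* Logging A finds the new dentry B; because B is a directory it is
* queued on dir_list below and its own new dentry foo gets logged too
* (with LOG_INODE_EXISTS, enough to recreate the inode), so replay
* after a crash recreates the whole A/B/foo hierarchy.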
5194 */ 5195 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5196 struct btrfs_root *root, 5197 struct btrfs_inode *start_inode, 5198 struct btrfs_log_ctx *ctx) 5199 { 5200 struct btrfs_fs_info *fs_info = root->fs_info; 5201 struct btrfs_root *log = root->log_root; 5202 struct btrfs_path *path; 5203 LIST_HEAD(dir_list); 5204 struct btrfs_dir_list *dir_elem; 5205 int ret = 0; 5206 5207 path = btrfs_alloc_path(); 5208 if (!path) 5209 return -ENOMEM; 5210 5211 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5212 if (!dir_elem) { 5213 btrfs_free_path(path); 5214 return -ENOMEM; 5215 } 5216 dir_elem->ino = btrfs_ino(start_inode); 5217 list_add_tail(&dir_elem->list, &dir_list); 5218 5219 while (!list_empty(&dir_list)) { 5220 struct extent_buffer *leaf; 5221 struct btrfs_key min_key; 5222 int nritems; 5223 int i; 5224 5225 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5226 list); 5227 if (ret) 5228 goto next_dir_inode; 5229 5230 min_key.objectid = dir_elem->ino; 5231 min_key.type = BTRFS_DIR_ITEM_KEY; 5232 min_key.offset = 0; 5233 again: 5234 btrfs_release_path(path); 5235 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5236 if (ret < 0) { 5237 goto next_dir_inode; 5238 } else if (ret > 0) { 5239 ret = 0; 5240 goto next_dir_inode; 5241 } 5242 5243 process_leaf: 5244 leaf = path->nodes[0]; 5245 nritems = btrfs_header_nritems(leaf); 5246 for (i = path->slots[0]; i < nritems; i++) { 5247 struct btrfs_dir_item *di; 5248 struct btrfs_key di_key; 5249 struct inode *di_inode; 5250 struct btrfs_dir_list *new_dir_elem; 5251 int log_mode = LOG_INODE_EXISTS; 5252 int type; 5253 5254 btrfs_item_key_to_cpu(leaf, &min_key, i); 5255 if (min_key.objectid != dir_elem->ino || 5256 min_key.type != BTRFS_DIR_ITEM_KEY) 5257 goto next_dir_inode; 5258 5259 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5260 type = btrfs_dir_type(leaf, di); 5261 if (btrfs_dir_transid(leaf, di) < trans->transid && 5262 type != BTRFS_FT_DIR) 5263 continue; 5264 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5265 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5266 continue; 5267 5268 btrfs_release_path(path); 5269 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5270 if (IS_ERR(di_inode)) { 5271 ret = PTR_ERR(di_inode); 5272 goto next_dir_inode; 5273 } 5274 5275 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5276 iput(di_inode); 5277 break; 5278 } 5279 5280 ctx->log_new_dentries = false; 5281 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5282 log_mode = LOG_INODE_ALL; 5283 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5284 log_mode, 0, LLONG_MAX, ctx); 5285 if (!ret && 5286 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5287 ret = 1; 5288 iput(di_inode); 5289 if (ret) 5290 goto next_dir_inode; 5291 if (ctx->log_new_dentries) { 5292 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5293 GFP_NOFS); 5294 if (!new_dir_elem) { 5295 ret = -ENOMEM; 5296 goto next_dir_inode; 5297 } 5298 new_dir_elem->ino = di_key.objectid; 5299 list_add_tail(&new_dir_elem->list, &dir_list); 5300 } 5301 break; 5302 } 5303 if (i == nritems) { 5304 ret = btrfs_next_leaf(log, path); 5305 if (ret < 0) { 5306 goto next_dir_inode; 5307 } else if (ret > 0) { 5308 ret = 0; 5309 goto next_dir_inode; 5310 } 5311 goto process_leaf; 5312 } 5313 if (min_key.offset < (u64)-1) { 5314 min_key.offset++; 5315 goto again; 5316 } 5317 next_dir_inode: 5318 list_del(&dir_elem->list); 5319 kfree(dir_elem); 5320 } 5321 5322 btrfs_free_path(path); 5323 return ret; 5324 } 5325 5326 static int 
btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5327 struct btrfs_inode *inode, 5328 struct btrfs_log_ctx *ctx) 5329 { 5330 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5331 int ret; 5332 struct btrfs_path *path; 5333 struct btrfs_key key; 5334 struct btrfs_root *root = inode->root; 5335 const u64 ino = btrfs_ino(inode); 5336 5337 path = btrfs_alloc_path(); 5338 if (!path) 5339 return -ENOMEM; 5340 path->skip_locking = 1; 5341 path->search_commit_root = 1; 5342 5343 key.objectid = ino; 5344 key.type = BTRFS_INODE_REF_KEY; 5345 key.offset = 0; 5346 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5347 if (ret < 0) 5348 goto out; 5349 5350 while (true) { 5351 struct extent_buffer *leaf = path->nodes[0]; 5352 int slot = path->slots[0]; 5353 u32 cur_offset = 0; 5354 u32 item_size; 5355 unsigned long ptr; 5356 5357 if (slot >= btrfs_header_nritems(leaf)) { 5358 ret = btrfs_next_leaf(root, path); 5359 if (ret < 0) 5360 goto out; 5361 else if (ret > 0) 5362 break; 5363 continue; 5364 } 5365 5366 btrfs_item_key_to_cpu(leaf, &key, slot); 5367 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5368 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5369 break; 5370 5371 item_size = btrfs_item_size_nr(leaf, slot); 5372 ptr = btrfs_item_ptr_offset(leaf, slot); 5373 while (cur_offset < item_size) { 5374 struct btrfs_key inode_key; 5375 struct inode *dir_inode; 5376 5377 inode_key.type = BTRFS_INODE_ITEM_KEY; 5378 inode_key.offset = 0; 5379 5380 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5381 struct btrfs_inode_extref *extref; 5382 5383 extref = (struct btrfs_inode_extref *) 5384 (ptr + cur_offset); 5385 inode_key.objectid = btrfs_inode_extref_parent( 5386 leaf, extref); 5387 cur_offset += sizeof(*extref); 5388 cur_offset += btrfs_inode_extref_name_len(leaf, 5389 extref); 5390 } else { 5391 inode_key.objectid = key.offset; 5392 cur_offset = item_size; 5393 } 5394 5395 dir_inode = btrfs_iget(fs_info->sb, &inode_key, 5396 root, NULL); 5397 /* If parent inode was deleted, skip it. */ 5398 if (IS_ERR(dir_inode)) 5399 continue; 5400 5401 if (ctx) 5402 ctx->log_new_dentries = false; 5403 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5404 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5405 if (!ret && 5406 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5407 ret = 1; 5408 if (!ret && ctx && ctx->log_new_dentries) 5409 ret = log_new_dir_dentries(trans, root, 5410 BTRFS_I(dir_inode), ctx); 5411 iput(dir_inode); 5412 if (ret) 5413 goto out; 5414 } 5415 path->slots[0]++; 5416 } 5417 ret = 0; 5418 out: 5419 btrfs_free_path(path); 5420 return ret; 5421 } 5422 5423 /* 5424 * helper function around btrfs_log_inode to make sure newly created 5425 * parent directories also end up in the log. Only a minimal amount of 5426 * logging (the inode item and backrefs) is done for any parent directories 5427 * that are older than the last committed transaction 5428 */ 5429 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5430 struct btrfs_root *root, 5431 struct btrfs_inode *inode, 5432 struct dentry *parent, 5433 const loff_t start, 5434 const loff_t end, 5435 int exists_only, 5436 struct btrfs_log_ctx *ctx) 5437 { 5438 struct btrfs_fs_info *fs_info = root->fs_info; 5439 int inode_only = exists_only ?
LOG_INODE_EXISTS : LOG_INODE_ALL; 5440 struct super_block *sb; 5441 struct dentry *old_parent = NULL; 5442 int ret = 0; 5443 u64 last_committed = fs_info->last_trans_committed; 5444 bool log_dentries = false; 5445 struct btrfs_inode *orig_inode = inode; 5446 5447 sb = inode->vfs_inode.i_sb; 5448 5449 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5450 ret = 1; 5451 goto end_no_trans; 5452 } 5453 5454 /* 5455 * If the previous transaction commit didn't complete, we have to do a 5456 * full commit ourselves. 5457 */ 5458 if (fs_info->last_trans_log_full_commit > 5459 fs_info->last_trans_committed) { 5460 ret = 1; 5461 goto end_no_trans; 5462 } 5463 5464 if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { 5465 ret = 1; 5466 goto end_no_trans; 5467 } 5468 5469 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5470 last_committed); 5471 if (ret) 5472 goto end_no_trans; 5473 5474 if (btrfs_inode_in_log(inode, trans->transid)) { 5475 ret = BTRFS_NO_LOG_SYNC; 5476 goto end_no_trans; 5477 } 5478 5479 ret = start_log_trans(trans, root, ctx); 5480 if (ret) 5481 goto end_no_trans; 5482 5483 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5484 if (ret) 5485 goto end_trans; 5486 5487 /* 5488 * for a regular file, if its inode is already on disk, we don't 5489 * have to worry about the parents at all. This is because 5490 * we can use the last_unlink_trans field to record renames 5491 * and other fun in this file. 5492 */ 5493 if (S_ISREG(inode->vfs_inode.i_mode) && 5494 inode->generation <= last_committed && 5495 inode->last_unlink_trans <= last_committed) { 5496 ret = 0; 5497 goto end_trans; 5498 } 5499 5500 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 5501 log_dentries = true; 5502 5503 /* 5504 * On unlink we must make sure all our current and old parent directory 5505 * inodes are fully logged. This is to prevent leaving dangling 5506 * directory index entries in directories that were our parents but are 5507 * not anymore. Not doing this results in the old parent directory being 5508 * impossible to delete after log replay (rmdir will always fail with 5509 * error -ENOTEMPTY). 5510 * 5511 * Example 1: 5512 * 5513 * mkdir testdir 5514 * touch testdir/foo 5515 * ln testdir/foo testdir/bar 5516 * sync 5517 * unlink testdir/bar 5518 * xfs_io -c fsync testdir/foo 5519 * <power failure> 5520 * mount fs, triggers log replay 5521 * 5522 * If we don't log the parent directory (testdir), after log replay the 5523 * directory still has an entry pointing to the file inode using the bar 5524 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5525 * the file inode has a link count of 1. 5526 * 5527 * Example 2: 5528 * 5529 * mkdir testdir 5530 * touch foo 5531 * ln foo testdir/foo2 5532 * ln foo testdir/foo3 5533 * sync 5534 * unlink testdir/foo3 5535 * xfs_io -c fsync foo 5536 * <power failure> 5537 * mount fs, triggers log replay 5538 * 5539 * Similar to the first example, after log replay the parent directory 5540 * testdir still has an entry pointing to the file inode with name foo3 5541 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5542 * and has a link count of 2.
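*
* In both examples the fix is the same: because last_unlink_trans was
* updated in the current transaction, btrfs_log_all_parents() below
* walks every BTRFS_INODE_[REF|EXTREF]_KEY item of the inode and fully
* logs each parent directory, so the stale index entries are gone
* after log replay.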
5543 */ 5544 if (inode->last_unlink_trans > last_committed) { 5545 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5546 if (ret) 5547 goto end_trans; 5548 } 5549 5550 while (1) { 5551 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5552 break; 5553 5554 inode = BTRFS_I(d_inode(parent)); 5555 if (root != inode->root) 5556 break; 5557 5558 if (inode->generation > last_committed) { 5559 ret = btrfs_log_inode(trans, root, inode, 5560 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5561 if (ret) 5562 goto end_trans; 5563 } 5564 if (IS_ROOT(parent)) 5565 break; 5566 5567 parent = dget_parent(parent); 5568 dput(old_parent); 5569 old_parent = parent; 5570 } 5571 if (log_dentries) 5572 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5573 else 5574 ret = 0; 5575 end_trans: 5576 dput(old_parent); 5577 if (ret < 0) { 5578 btrfs_set_log_full_commit(fs_info, trans); 5579 ret = 1; 5580 } 5581 5582 if (ret) 5583 btrfs_remove_log_ctx(root, ctx); 5584 btrfs_end_log_trans(root); 5585 end_no_trans: 5586 return ret; 5587 } 5588 5589 /* 5590 * it is not safe to log a dentry if the chunk root has added new 5591 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5592 * If this returns 1, you must commit the transaction to safely get your 5593 * data on disk. 5594 */ 5595 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5596 struct btrfs_root *root, struct dentry *dentry, 5597 const loff_t start, 5598 const loff_t end, 5599 struct btrfs_log_ctx *ctx) 5600 { 5601 struct dentry *parent = dget_parent(dentry); 5602 int ret; 5603 5604 ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), 5605 parent, start, end, 0, ctx); 5606 dput(parent); 5607 5608 return ret; 5609 } 5610 5611 /* 5612 * should be called during mount to recover and replay any log trees 5613 * from the FS 5614 */ 5615 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5616 { 5617 int ret; 5618 struct btrfs_path *path; 5619 struct btrfs_trans_handle *trans; 5620 struct btrfs_key key; 5621 struct btrfs_key found_key; 5622 struct btrfs_key tmp_key; 5623 struct btrfs_root *log; 5624 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5625 struct walk_control wc = { 5626 .process_func = process_one_buffer, 5627 .stage = 0, 5628 }; 5629 5630 path = btrfs_alloc_path(); 5631 if (!path) 5632 return -ENOMEM; 5633 5634 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5635 5636 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5637 if (IS_ERR(trans)) { 5638 ret = PTR_ERR(trans); 5639 goto error; 5640 } 5641 5642 wc.trans = trans; 5643 wc.pin = 1; 5644 5645 ret = walk_log_tree(trans, log_root_tree, &wc); 5646 if (ret) { 5647 btrfs_handle_fs_error(fs_info, ret, 5648 "Failed to pin buffers while recovering log root tree."); 5649 goto error; 5650 } 5651 5652 again: 5653 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5654 key.offset = (u64)-1; 5655 key.type = BTRFS_ROOT_ITEM_KEY; 5656 5657 while (1) { 5658 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5659 5660 if (ret < 0) { 5661 btrfs_handle_fs_error(fs_info, ret, 5662 "Couldn't find tree log root."); 5663 goto error; 5664 } 5665 if (ret > 0) { 5666 if (path->slots[0] == 0) 5667 break; 5668 path->slots[0]--; 5669 } 5670 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5671 path->slots[0]); 5672 btrfs_release_path(path); 5673 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5674 break; 5675 5676 log = btrfs_read_fs_root(log_root_tree, &found_key); 5677 if (IS_ERR(log)) { 5678 ret = PTR_ERR(log); 5679
btrfs_handle_fs_error(fs_info, ret, 5680 "Couldn't read tree log root."); 5681 goto error; 5682 } 5683 5684 tmp_key.objectid = found_key.offset; 5685 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5686 tmp_key.offset = (u64)-1; 5687 5688 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5689 if (IS_ERR(wc.replay_dest)) { 5690 ret = PTR_ERR(wc.replay_dest); 5691 free_extent_buffer(log->node); 5692 free_extent_buffer(log->commit_root); 5693 kfree(log); 5694 btrfs_handle_fs_error(fs_info, ret, 5695 "Couldn't read target root for tree log recovery."); 5696 goto error; 5697 } 5698 5699 wc.replay_dest->log_root = log; 5700 btrfs_record_root_in_trans(trans, wc.replay_dest); 5701 ret = walk_log_tree(trans, log, &wc); 5702 5703 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5704 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5705 path); 5706 } 5707 5708 key.offset = found_key.offset - 1; 5709 wc.replay_dest->log_root = NULL; 5710 free_extent_buffer(log->node); 5711 free_extent_buffer(log->commit_root); 5712 kfree(log); 5713 5714 if (ret) 5715 goto error; 5716 5717 if (found_key.offset == 0) 5718 break; 5719 } 5720 btrfs_release_path(path); 5721 5722 /* step one is to pin it all, step two is to replay just inodes */ 5723 if (wc.pin) { 5724 wc.pin = 0; 5725 wc.process_func = replay_one_buffer; 5726 wc.stage = LOG_WALK_REPLAY_INODES; 5727 goto again; 5728 } 5729 /* step three is to replay everything */ 5730 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5731 wc.stage++; 5732 goto again; 5733 } 5734 5735 btrfs_free_path(path); 5736 5737 /* step 4: commit the transaction, which also unpins the blocks */ 5738 ret = btrfs_commit_transaction(trans); 5739 if (ret) 5740 return ret; 5741 5742 free_extent_buffer(log_root_tree->node); 5743 log_root_tree->log_root = NULL; 5744 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5745 kfree(log_root_tree); 5746 5747 return 0; 5748 error: 5749 if (wc.trans) 5750 btrfs_end_transaction(wc.trans); 5751 btrfs_free_path(path); 5752 return ret; 5753 } 5754 5755 /* 5756 * there are some corner cases where we want to force a full 5757 * commit instead of allowing a directory to be logged. 5758 * 5759 * They revolve around files that were unlinked from the directory, and 5760 * this function updates the parent directory so that a full commit is 5761 * properly done if it is fsync'd later after the unlinks are done. 5762 * 5763 * Must be called before the unlink operations (updates to the subvolume tree, 5764 * inodes, etc) are done. 5765 */ 5766 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5767 struct btrfs_inode *dir, struct btrfs_inode *inode, 5768 int for_rename) 5769 { 5770 /* 5771 * when we're logging a file, if it hasn't been renamed 5772 * or unlinked, and its inode is fully committed on disk, 5773 * we don't have to worry about walking up the directory chain 5774 * to log its parents. 5775 * 5776 * So, we use the last_unlink_trans field to put this transid 5777 * into the file. When the file is logged we check it and 5778 * don't log the parents if the file is fully on disk.
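*
* Sketching the effect (hypothetical timeline):
*
*   creat foo          transaction N
*   sync               foo now fully on disk
*   fsync foo          parents are not walked
*   mv foo bar         last_unlink_trans set to N+1
*   fsync bar          last_unlink_trans is recent, so the dentry
*                      chain is walked and the parents logged too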
5779 */ 5780 mutex_lock(&inode->log_mutex); 5781 inode->last_unlink_trans = trans->transid; 5782 mutex_unlock(&inode->log_mutex); 5783 5784 /* 5785 * if this directory was already logged any new 5786 * names for this file/dir will get recorded 5787 */ 5788 smp_mb(); 5789 if (dir->logged_trans == trans->transid) 5790 return; 5791 5792 /* 5793 * if the inode we're about to unlink was logged, 5794 * the log will be properly updated for any new names 5795 */ 5796 if (inode->logged_trans == trans->transid) 5797 return; 5798 5799 /* 5800 * when renaming files across directories, if the directory 5801 * we're unlinking from gets fsync'd later on, there's 5802 * no way to find the destination directory later and fsync it 5803 * properly. So, we have to be conservative and force commits 5804 * so the new name gets discovered. 5805 */ 5806 if (for_rename) 5807 goto record; 5808 5809 /* we can safely do the unlink without any special recording */ 5810 return; 5811 5812 record: 5813 mutex_lock(&dir->log_mutex); 5814 dir->last_unlink_trans = trans->transid; 5815 mutex_unlock(&dir->log_mutex); 5816 } 5817 5818 /* 5819 * Make sure that if someone attempts to fsync the parent directory of a deleted 5820 * snapshot, it ends up triggering a transaction commit. This is to guarantee 5821 * that after replaying the log tree of the parent directory's root we will not 5822 * see the snapshot anymore and at log replay time we will not see any log tree 5823 * corresponding to the deleted snapshot's root, which could lead to replaying 5824 * it after replaying the log tree of the parent directory (which would replay 5825 * the snapshot delete operation). 5826 * 5827 * Must be called before the actual snapshot destroy operation (updates to the 5828 * parent root and tree of tree roots trees, etc) are done. 5829 */ 5830 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 5831 struct btrfs_inode *dir) 5832 { 5833 mutex_lock(&dir->log_mutex); 5834 dir->last_unlink_trans = trans->transid; 5835 mutex_unlock(&dir->log_mutex); 5836 } 5837 5838 /* 5839 * Call this after adding a new name for a file and it will properly 5840 * update the log to reflect the new name. 5841 * 5842 * It will return zero if all goes well, and it will return 1 if a 5843 * full transaction commit is required. 5844 */ 5845 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5846 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 5847 struct dentry *parent) 5848 { 5849 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5850 struct btrfs_root *root = inode->root; 5851 5852 /* 5853 * this will force the logging code to walk the dentry chain 5854 * up for the file 5855 */ 5856 if (S_ISREG(inode->vfs_inode.i_mode)) 5857 inode->last_unlink_trans = trans->transid; 5858 5859 /* 5860 * if this inode hasn't been logged and the directory we're renaming it 5861 * from hasn't been logged, we don't need to log it 5862 */ 5863 if (inode->logged_trans <= fs_info->last_trans_committed && 5864 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) 5865 return 0; 5866 5867 return btrfs_log_inode_parent(trans, root, inode, parent, 0, 5868 LLONG_MAX, 1, NULL); 5869 } 5870 5871