/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"
#include "compression.h"
#include "qgroup.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 * LOG_OTHER_INODE means we are logging some inode other than the one
 * fsync was called on, as part of logging that fsynced inode
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
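 * A separate stage (2) then replays just the directory index
 * items (see LOG_WALK_REPLAY_DIR_INDEX below).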
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
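 *
 * For example (an illustrative sequence, not a specific test case):
 *
 * creat foo/bar
 * write foo/bar
 * fsync foo/bar   ---> bar's items are copied into foo's log tree and
 *                      only the log is written, no full commit
 * <crash>
 * mount           ---> replay copies bar's items from the log back
 *                      into the subvolume tree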
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction in
 * progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/*
		 * Implicit memory barrier after atomic_dec_and_test
		 */
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}


/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?
	 * Also used while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(fs_info, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
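 *
 * Unless the zero-generation inode special case below applies, the
 * final step is the same in every path: resize the destination item
 * if needed, then copy the raw item bytes over from 'eb' with
 * copy_extent_buffer().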
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(fs_info, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(fs_info, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
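			 * (A typical way to get here: create a file, fsync
			 * only its parent directory, and crash before the
			 * file itself was ever fsynced.)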
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record a dirty extent for qgroup.  We did a
		 * shallow copy of the file extent item here and skipped
		 * the normal backref update, modifying the extent tree
		 * ourselves, so qgroup must be told that the owner of the
		 * file extent changed from the log tree (doesn't affect
		 * qgroup) to the fs/file tree (affects qgroup).
		 */
		ret = btrfs_qgroup_trace_extent(trans, fs_info,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?
			 * If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, fs_info,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						fs_info,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range.  We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls).  In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other.  For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.  Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
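			 *
			 * So the loop below first deletes any existing csum
			 * items in the csum root that overlap
			 * [csum_start, csum_end) and only then inserts the
			 * csums found in the log.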
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_del_csums(trans, fs_info,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
							fs_info->csum_root,
							sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, fs_info);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.
 * This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.
		 * if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir,
						inode, victim_name,
						victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, fs_info);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name,
					   (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								  trans,
								  fs_info);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
			if (ret)
				return ret;
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, int slot,
			     unsigned long ref_ptr, u32 *namelen, char **name,
			     u64 *index, u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
				     *namelen))
		return -EIO;

	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, int slot,
			  unsigned long ref_ptr, u32 *namelen, char **name,
			  u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
				     *namelen))
		return -EIO;

	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.
	 * The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
						&name, &ref_index,
						&parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
					     &name, &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				  btrfs_ino(BTRFS_I(inode)), ref_index,
				  name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata.  if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, BTRFS_I(dir),
					     BTRFS_I(inode),
					     name, namelen, 0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct btrfs_inode *inode,
			       struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
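 *
 * The inodes to scan are recorded as BTRFS_TREE_LOG_FIXUP_OBJECTID
 * orphan items by link_to_fixup_dir() during replay;
 * fixup_inode_link_counts() below walks those items and calls this
 * helper for each one.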
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.
 * This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
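 *
 * For example, if the log carries a dentry for an inode that exists in
 * neither the log nor the subvolume (it was never logged and never
 * committed), the name is skipped and 0 is returned.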
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.
 * Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both key types (BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY)
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(fs_info, eb, slot, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fixup the link count of
		 * the inode the entry points to.  Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for this
		 * dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir.  This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
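 *
 * For example, a dir log item covering index range [0, 128] says the
 * log is authoritative for that slice of the directory: any name whose
 * index falls in [0, 128] and exists in the subvolume but not in the
 * log must have been unlinked before the fsync and gets removed here.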
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	path->slots[0]++;
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * this looks for a given directory item in the log.
If the directory 1996 * item is not in the log, the item is removed and the inode it points 1997 * to is unlinked 1998 */ 1999 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2000 struct btrfs_root *root, 2001 struct btrfs_root *log, 2002 struct btrfs_path *path, 2003 struct btrfs_path *log_path, 2004 struct inode *dir, 2005 struct btrfs_key *dir_key) 2006 { 2007 struct btrfs_fs_info *fs_info = root->fs_info; 2008 int ret; 2009 struct extent_buffer *eb; 2010 int slot; 2011 u32 item_size; 2012 struct btrfs_dir_item *di; 2013 struct btrfs_dir_item *log_di; 2014 int name_len; 2015 unsigned long ptr; 2016 unsigned long ptr_end; 2017 char *name; 2018 struct inode *inode; 2019 struct btrfs_key location; 2020 2021 again: 2022 eb = path->nodes[0]; 2023 slot = path->slots[0]; 2024 item_size = btrfs_item_size_nr(eb, slot); 2025 ptr = btrfs_item_ptr_offset(eb, slot); 2026 ptr_end = ptr + item_size; 2027 while (ptr < ptr_end) { 2028 di = (struct btrfs_dir_item *)ptr; 2029 if (verify_dir_item(fs_info, eb, slot, di)) { 2030 ret = -EIO; 2031 goto out; 2032 } 2033 2034 name_len = btrfs_dir_name_len(eb, di); 2035 name = kmalloc(name_len, GFP_NOFS); 2036 if (!name) { 2037 ret = -ENOMEM; 2038 goto out; 2039 } 2040 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2041 name_len); 2042 log_di = NULL; 2043 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2044 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2045 dir_key->objectid, 2046 name, name_len, 0); 2047 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2048 log_di = btrfs_lookup_dir_index_item(trans, log, 2049 log_path, 2050 dir_key->objectid, 2051 dir_key->offset, 2052 name, name_len, 0); 2053 } 2054 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2055 btrfs_dir_item_key_to_cpu(eb, di, &location); 2056 btrfs_release_path(path); 2057 btrfs_release_path(log_path); 2058 inode = read_one_inode(root, location.objectid); 2059 if (!inode) { 2060 kfree(name); 2061 return -EIO; 2062 } 2063 2064 ret = link_to_fixup_dir(trans, root, 2065 path, location.objectid); 2066 if (ret) { 2067 kfree(name); 2068 iput(inode); 2069 goto out; 2070 } 2071 2072 inc_nlink(inode); 2073 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2074 BTRFS_I(inode), name, name_len); 2075 if (!ret) 2076 ret = btrfs_run_delayed_items(trans, fs_info); 2077 kfree(name); 2078 iput(inode); 2079 if (ret) 2080 goto out; 2081 2082 /* there might still be more names under this key 2083 * check and repeat if required 2084 */ 2085 ret = btrfs_search_slot(NULL, root, dir_key, path, 2086 0, 0); 2087 if (ret == 0) 2088 goto again; 2089 ret = 0; 2090 goto out; 2091 } else if (IS_ERR(log_di)) { 2092 kfree(name); 2093 return PTR_ERR(log_di); 2094 } 2095 btrfs_release_path(log_path); 2096 kfree(name); 2097 2098 ptr = (unsigned long)(di + 1); 2099 ptr += name_len; 2100 } 2101 ret = 0; 2102 out: 2103 btrfs_release_path(path); 2104 btrfs_release_path(log_path); 2105 return ret; 2106 } 2107 2108 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2109 struct btrfs_root *root, 2110 struct btrfs_root *log, 2111 struct btrfs_path *path, 2112 const u64 ino) 2113 { 2114 struct btrfs_fs_info *fs_info = root->fs_info; 2115 struct btrfs_key search_key; 2116 struct btrfs_path *log_path; 2117 int i; 2118 int nritems; 2119 int ret; 2120 2121 log_path = btrfs_alloc_path(); 2122 if (!log_path) 2123 return -ENOMEM; 2124 2125 search_key.objectid = ino; 2126 search_key.type = BTRFS_XATTR_ITEM_KEY; 2127 search_key.offset = 0; 2128 again: 2129 ret = 
btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2130 if (ret < 0) 2131 goto out; 2132 process_leaf: 2133 nritems = btrfs_header_nritems(path->nodes[0]); 2134 for (i = path->slots[0]; i < nritems; i++) { 2135 struct btrfs_key key; 2136 struct btrfs_dir_item *di; 2137 struct btrfs_dir_item *log_di; 2138 u32 total_size; 2139 u32 cur; 2140 2141 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2142 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2143 ret = 0; 2144 goto out; 2145 } 2146 2147 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2148 total_size = btrfs_item_size_nr(path->nodes[0], i); 2149 cur = 0; 2150 while (cur < total_size) { 2151 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2152 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2153 u32 this_len = sizeof(*di) + name_len + data_len; 2154 char *name; 2155 2156 ret = verify_dir_item(fs_info, path->nodes[0], i, di); 2157 if (ret) { 2158 ret = -EIO; 2159 goto out; 2160 } 2161 name = kmalloc(name_len, GFP_NOFS); 2162 if (!name) { 2163 ret = -ENOMEM; 2164 goto out; 2165 } 2166 read_extent_buffer(path->nodes[0], name, 2167 (unsigned long)(di + 1), name_len); 2168 2169 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2170 name, name_len, 0); 2171 btrfs_release_path(log_path); 2172 if (!log_di) { 2173 /* Doesn't exist in log tree, so delete it. */ 2174 btrfs_release_path(path); 2175 di = btrfs_lookup_xattr(trans, root, path, ino, 2176 name, name_len, -1); 2177 kfree(name); 2178 if (IS_ERR(di)) { 2179 ret = PTR_ERR(di); 2180 goto out; 2181 } 2182 ASSERT(di); 2183 ret = btrfs_delete_one_dir_name(trans, root, 2184 path, di); 2185 if (ret) 2186 goto out; 2187 btrfs_release_path(path); 2188 search_key = key; 2189 goto again; 2190 } 2191 kfree(name); 2192 if (IS_ERR(log_di)) { 2193 ret = PTR_ERR(log_di); 2194 goto out; 2195 } 2196 cur += this_len; 2197 di = (struct btrfs_dir_item *)((char *)di + this_len); 2198 } 2199 } 2200 ret = btrfs_next_leaf(root, path); 2201 if (ret > 0) 2202 ret = 0; 2203 else if (ret == 0) 2204 goto process_leaf; 2205 out: 2206 btrfs_free_path(log_path); 2207 btrfs_release_path(path); 2208 return ret; 2209 } 2210 2211 2212 /* 2213 * deletion replay happens before we copy any new directory items 2214 * out of the log or out of backreferences from inodes. It 2215 * scans the log to find ranges of keys that log is authoritative for, 2216 * and then scans the directory to find items in those ranges that are 2217 * not present in the log. 2218 * 2219 * Anything we don't find in the log is unlinked and removed from the 2220 * directory. 
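 *
 * A sketch of the scenario this handles (file names are illustrative,
 * not part of the original comment):
 *
 *   touch dir/a dir/b
 *   sync
 *   rm dir/b
 *   xfs_io -c "fsync" dir
 *   <crash>
 *
 * On replay, the log's range items cover b's old keys, but the log
 * itself has no entry for b, so the code below unlinks b from the
 * directory.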
2221 */ 2222 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2223 struct btrfs_root *root, 2224 struct btrfs_root *log, 2225 struct btrfs_path *path, 2226 u64 dirid, int del_all) 2227 { 2228 u64 range_start; 2229 u64 range_end; 2230 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2231 int ret = 0; 2232 struct btrfs_key dir_key; 2233 struct btrfs_key found_key; 2234 struct btrfs_path *log_path; 2235 struct inode *dir; 2236 2237 dir_key.objectid = dirid; 2238 dir_key.type = BTRFS_DIR_ITEM_KEY; 2239 log_path = btrfs_alloc_path(); 2240 if (!log_path) 2241 return -ENOMEM; 2242 2243 dir = read_one_inode(root, dirid); 2244 /* it isn't an error if the inode isn't there, that can happen 2245 * because we replay the deletes before we copy in the inode item 2246 * from the log 2247 */ 2248 if (!dir) { 2249 btrfs_free_path(log_path); 2250 return 0; 2251 } 2252 again: 2253 range_start = 0; 2254 range_end = 0; 2255 while (1) { 2256 if (del_all) 2257 range_end = (u64)-1; 2258 else { 2259 ret = find_dir_range(log, path, dirid, key_type, 2260 &range_start, &range_end); 2261 if (ret != 0) 2262 break; 2263 } 2264 2265 dir_key.offset = range_start; 2266 while (1) { 2267 int nritems; 2268 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2269 0, 0); 2270 if (ret < 0) 2271 goto out; 2272 2273 nritems = btrfs_header_nritems(path->nodes[0]); 2274 if (path->slots[0] >= nritems) { 2275 ret = btrfs_next_leaf(root, path); 2276 if (ret) 2277 break; 2278 } 2279 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2280 path->slots[0]); 2281 if (found_key.objectid != dirid || 2282 found_key.type != dir_key.type) 2283 goto next_type; 2284 2285 if (found_key.offset > range_end) 2286 break; 2287 2288 ret = check_item_in_log(trans, root, log, path, 2289 log_path, dir, 2290 &found_key); 2291 if (ret) 2292 goto out; 2293 if (found_key.offset == (u64)-1) 2294 break; 2295 dir_key.offset = found_key.offset + 1; 2296 } 2297 btrfs_release_path(path); 2298 if (range_end == (u64)-1) 2299 break; 2300 range_start = range_end + 1; 2301 } 2302 2303 next_type: 2304 ret = 0; 2305 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2306 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2307 dir_key.type = BTRFS_DIR_INDEX_KEY; 2308 btrfs_release_path(path); 2309 goto again; 2310 } 2311 out: 2312 btrfs_release_path(path); 2313 btrfs_free_path(log_path); 2314 iput(dir); 2315 return ret; 2316 } 2317 2318 /* 2319 * the process_func used to replay items from the log tree. This 2320 * gets called in two different stages. The first stage just looks 2321 * for inodes and makes sure they are all copied into the subvolume. 2322 * 2323 * The second stage copies all the other item types from the log into 2324 * the subvolume. The two stage approach is slower, but gets rid of 2325 * lots of complexity around inodes referencing other inodes that exist 2326 * only in the log (references come from either directory items or inode 2327 * back refs). 
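 *
 * Roughly, per leaf item, the stages below act as follows:
 *
 *   LOG_WALK_REPLAY_INODES:    copy inode items (replaying xattr and
 *                              dir deletes for each inode first)
 *   LOG_WALK_REPLAY_DIR_INDEX: replay BTRFS_DIR_INDEX_KEY items
 *   LOG_WALK_REPLAY_ALL:       copy xattrs, inode refs, file extents
 *                              and dir items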
2328 */ 2329 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2330 struct walk_control *wc, u64 gen) 2331 { 2332 int nritems; 2333 struct btrfs_path *path; 2334 struct btrfs_root *root = wc->replay_dest; 2335 struct btrfs_key key; 2336 int level; 2337 int i; 2338 int ret; 2339 2340 ret = btrfs_read_buffer(eb, gen); 2341 if (ret) 2342 return ret; 2343 2344 level = btrfs_header_level(eb); 2345 2346 if (level != 0) 2347 return 0; 2348 2349 path = btrfs_alloc_path(); 2350 if (!path) 2351 return -ENOMEM; 2352 2353 nritems = btrfs_header_nritems(eb); 2354 for (i = 0; i < nritems; i++) { 2355 btrfs_item_key_to_cpu(eb, &key, i); 2356 2357 /* inode keys are done during the first stage */ 2358 if (key.type == BTRFS_INODE_ITEM_KEY && 2359 wc->stage == LOG_WALK_REPLAY_INODES) { 2360 struct btrfs_inode_item *inode_item; 2361 u32 mode; 2362 2363 inode_item = btrfs_item_ptr(eb, i, 2364 struct btrfs_inode_item); 2365 ret = replay_xattr_deletes(wc->trans, root, log, 2366 path, key.objectid); 2367 if (ret) 2368 break; 2369 mode = btrfs_inode_mode(eb, inode_item); 2370 if (S_ISDIR(mode)) { 2371 ret = replay_dir_deletes(wc->trans, 2372 root, log, path, key.objectid, 0); 2373 if (ret) 2374 break; 2375 } 2376 ret = overwrite_item(wc->trans, root, path, 2377 eb, i, &key); 2378 if (ret) 2379 break; 2380 2381 /* for regular files, make sure corresponding 2382 * orphan item exist. extents past the new EOF 2383 * will be truncated later by orphan cleanup. 2384 */ 2385 if (S_ISREG(mode)) { 2386 ret = insert_orphan_item(wc->trans, root, 2387 key.objectid); 2388 if (ret) 2389 break; 2390 } 2391 2392 ret = link_to_fixup_dir(wc->trans, root, 2393 path, key.objectid); 2394 if (ret) 2395 break; 2396 } 2397 2398 if (key.type == BTRFS_DIR_INDEX_KEY && 2399 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2400 ret = replay_one_dir_item(wc->trans, root, path, 2401 eb, i, &key); 2402 if (ret) 2403 break; 2404 } 2405 2406 if (wc->stage < LOG_WALK_REPLAY_ALL) 2407 continue; 2408 2409 /* these keys are simply copied */ 2410 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2411 ret = overwrite_item(wc->trans, root, path, 2412 eb, i, &key); 2413 if (ret) 2414 break; 2415 } else if (key.type == BTRFS_INODE_REF_KEY || 2416 key.type == BTRFS_INODE_EXTREF_KEY) { 2417 ret = add_inode_ref(wc->trans, root, log, path, 2418 eb, i, &key); 2419 if (ret && ret != -ENOENT) 2420 break; 2421 ret = 0; 2422 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2423 ret = replay_one_extent(wc->trans, root, path, 2424 eb, i, &key); 2425 if (ret) 2426 break; 2427 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2428 ret = replay_one_dir_item(wc->trans, root, path, 2429 eb, i, &key); 2430 if (ret) 2431 break; 2432 } 2433 } 2434 btrfs_free_path(path); 2435 return ret; 2436 } 2437 2438 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2439 struct btrfs_root *root, 2440 struct btrfs_path *path, int *level, 2441 struct walk_control *wc) 2442 { 2443 struct btrfs_fs_info *fs_info = root->fs_info; 2444 u64 root_owner; 2445 u64 bytenr; 2446 u64 ptr_gen; 2447 struct extent_buffer *next; 2448 struct extent_buffer *cur; 2449 struct extent_buffer *parent; 2450 u32 blocksize; 2451 int ret = 0; 2452 2453 WARN_ON(*level < 0); 2454 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2455 2456 while (*level > 0) { 2457 WARN_ON(*level < 0); 2458 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2459 cur = path->nodes[*level]; 2460 2461 WARN_ON(btrfs_header_level(cur) != *level); 2462 2463 if (path->slots[*level] >= 2464 btrfs_header_nritems(cur)) 2465 break; 2466 2467 
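/*
 * Read the block pointer and generation of the child at the current
 * slot; the generation is passed down so btrfs_read_buffer() can
 * verify the block really belongs to this log tree.
 */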
bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2468 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2469 blocksize = fs_info->nodesize; 2470 2471 parent = path->nodes[*level]; 2472 root_owner = btrfs_header_owner(parent); 2473 2474 next = btrfs_find_create_tree_block(fs_info, bytenr); 2475 if (IS_ERR(next)) 2476 return PTR_ERR(next); 2477 2478 if (*level == 1) { 2479 ret = wc->process_func(root, next, wc, ptr_gen); 2480 if (ret) { 2481 free_extent_buffer(next); 2482 return ret; 2483 } 2484 2485 path->slots[*level]++; 2486 if (wc->free) { 2487 ret = btrfs_read_buffer(next, ptr_gen); 2488 if (ret) { 2489 free_extent_buffer(next); 2490 return ret; 2491 } 2492 2493 if (trans) { 2494 btrfs_tree_lock(next); 2495 btrfs_set_lock_blocking(next); 2496 clean_tree_block(fs_info, next); 2497 btrfs_wait_tree_block_writeback(next); 2498 btrfs_tree_unlock(next); 2499 } 2500 2501 WARN_ON(root_owner != 2502 BTRFS_TREE_LOG_OBJECTID); 2503 ret = btrfs_free_and_pin_reserved_extent( 2504 fs_info, bytenr, 2505 blocksize); 2506 if (ret) { 2507 free_extent_buffer(next); 2508 return ret; 2509 } 2510 } 2511 free_extent_buffer(next); 2512 continue; 2513 } 2514 ret = btrfs_read_buffer(next, ptr_gen); 2515 if (ret) { 2516 free_extent_buffer(next); 2517 return ret; 2518 } 2519 2520 WARN_ON(*level <= 0); 2521 if (path->nodes[*level-1]) 2522 free_extent_buffer(path->nodes[*level-1]); 2523 path->nodes[*level-1] = next; 2524 *level = btrfs_header_level(next); 2525 path->slots[*level] = 0; 2526 cond_resched(); 2527 } 2528 WARN_ON(*level < 0); 2529 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2530 2531 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2532 2533 cond_resched(); 2534 return 0; 2535 } 2536 2537 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2538 struct btrfs_root *root, 2539 struct btrfs_path *path, int *level, 2540 struct walk_control *wc) 2541 { 2542 struct btrfs_fs_info *fs_info = root->fs_info; 2543 u64 root_owner; 2544 int i; 2545 int slot; 2546 int ret; 2547 2548 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2549 slot = path->slots[i]; 2550 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2551 path->slots[i]++; 2552 *level = i; 2553 WARN_ON(*level == 0); 2554 return 0; 2555 } else { 2556 struct extent_buffer *parent; 2557 if (path->nodes[*level] == root->node) 2558 parent = path->nodes[*level]; 2559 else 2560 parent = path->nodes[*level + 1]; 2561 2562 root_owner = btrfs_header_owner(parent); 2563 ret = wc->process_func(root, path->nodes[*level], wc, 2564 btrfs_header_generation(path->nodes[*level])); 2565 if (ret) 2566 return ret; 2567 2568 if (wc->free) { 2569 struct extent_buffer *next; 2570 2571 next = path->nodes[*level]; 2572 2573 if (trans) { 2574 btrfs_tree_lock(next); 2575 btrfs_set_lock_blocking(next); 2576 clean_tree_block(fs_info, next); 2577 btrfs_wait_tree_block_writeback(next); 2578 btrfs_tree_unlock(next); 2579 } 2580 2581 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2582 ret = btrfs_free_and_pin_reserved_extent( 2583 fs_info, 2584 path->nodes[*level]->start, 2585 path->nodes[*level]->len); 2586 if (ret) 2587 return ret; 2588 } 2589 free_extent_buffer(path->nodes[*level]); 2590 path->nodes[*level] = NULL; 2591 *level = i + 1; 2592 } 2593 } 2594 return 1; 2595 } 2596 2597 /* 2598 * drop the reference count on the tree rooted at 'snap'. This traverses 2599 * the tree freeing any blocks that have a ref count of zero after being 2600 * decremented. 
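 *
 * (Despite the mention of 'snap' above, the tree walked here is the
 * log tree passed in as 'log'; when wc->free is set, each block is
 * cleaned and its reserved extent freed once wc->process_func has
 * seen it.)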
2601 */ 2602 static int walk_log_tree(struct btrfs_trans_handle *trans, 2603 struct btrfs_root *log, struct walk_control *wc) 2604 { 2605 struct btrfs_fs_info *fs_info = log->fs_info; 2606 int ret = 0; 2607 int wret; 2608 int level; 2609 struct btrfs_path *path; 2610 int orig_level; 2611 2612 path = btrfs_alloc_path(); 2613 if (!path) 2614 return -ENOMEM; 2615 2616 level = btrfs_header_level(log->node); 2617 orig_level = level; 2618 path->nodes[level] = log->node; 2619 extent_buffer_get(log->node); 2620 path->slots[level] = 0; 2621 2622 while (1) { 2623 wret = walk_down_log_tree(trans, log, path, &level, wc); 2624 if (wret > 0) 2625 break; 2626 if (wret < 0) { 2627 ret = wret; 2628 goto out; 2629 } 2630 2631 wret = walk_up_log_tree(trans, log, path, &level, wc); 2632 if (wret > 0) 2633 break; 2634 if (wret < 0) { 2635 ret = wret; 2636 goto out; 2637 } 2638 } 2639 2640 /* was the root node processed? if not, catch it here */ 2641 if (path->nodes[orig_level]) { 2642 ret = wc->process_func(log, path->nodes[orig_level], wc, 2643 btrfs_header_generation(path->nodes[orig_level])); 2644 if (ret) 2645 goto out; 2646 if (wc->free) { 2647 struct extent_buffer *next; 2648 2649 next = path->nodes[orig_level]; 2650 2651 if (trans) { 2652 btrfs_tree_lock(next); 2653 btrfs_set_lock_blocking(next); 2654 clean_tree_block(fs_info, next); 2655 btrfs_wait_tree_block_writeback(next); 2656 btrfs_tree_unlock(next); 2657 } 2658 2659 WARN_ON(log->root_key.objectid != 2660 BTRFS_TREE_LOG_OBJECTID); 2661 ret = btrfs_free_and_pin_reserved_extent(fs_info, 2662 next->start, next->len); 2663 if (ret) 2664 goto out; 2665 } 2666 } 2667 2668 out: 2669 btrfs_free_path(path); 2670 return ret; 2671 } 2672 2673 /* 2674 * helper function to update the item for a given subvolumes log root 2675 * in the tree of log roots 2676 */ 2677 static int update_log_root(struct btrfs_trans_handle *trans, 2678 struct btrfs_root *log) 2679 { 2680 struct btrfs_fs_info *fs_info = log->fs_info; 2681 int ret; 2682 2683 if (log->log_transid == 1) { 2684 /* insert root item on the first sync */ 2685 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2686 &log->root_key, &log->root_item); 2687 } else { 2688 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2689 &log->root_key, &log->root_item); 2690 } 2691 return ret; 2692 } 2693 2694 static void wait_log_commit(struct btrfs_root *root, int transid) 2695 { 2696 DEFINE_WAIT(wait); 2697 int index = transid % 2; 2698 2699 /* 2700 * we only allow two pending log transactions at a time, 2701 * so we know that if ours is more than 2 older than the 2702 * current transaction, we're done 2703 */ 2704 do { 2705 prepare_to_wait(&root->log_commit_wait[index], 2706 &wait, TASK_UNINTERRUPTIBLE); 2707 mutex_unlock(&root->log_mutex); 2708 2709 if (root->log_transid_committed < transid && 2710 atomic_read(&root->log_commit[index])) 2711 schedule(); 2712 2713 finish_wait(&root->log_commit_wait[index], &wait); 2714 mutex_lock(&root->log_mutex); 2715 } while (root->log_transid_committed < transid && 2716 atomic_read(&root->log_commit[index])); 2717 } 2718 2719 static void wait_for_writer(struct btrfs_root *root) 2720 { 2721 DEFINE_WAIT(wait); 2722 2723 while (atomic_read(&root->log_writers)) { 2724 prepare_to_wait(&root->log_writer_wait, 2725 &wait, TASK_UNINTERRUPTIBLE); 2726 mutex_unlock(&root->log_mutex); 2727 if (atomic_read(&root->log_writers)) 2728 schedule(); 2729 finish_wait(&root->log_writer_wait, &wait); 2730 mutex_lock(&root->log_mutex); 2731 } 2732 } 2733 2734 static inline void 
btrfs_remove_log_ctx(struct btrfs_root *root,
2735 struct btrfs_log_ctx *ctx)
2736 {
2737 if (!ctx)
2738 return;
2739
2740 mutex_lock(&root->log_mutex);
2741 list_del_init(&ctx->list);
2742 mutex_unlock(&root->log_mutex);
2743 }
2744
2745 /*
2746 * Invoked under the log mutex, or otherwise be sure there is no other
2747 * task which can access the list.
2748 */
2749 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2750 int index, int error)
2751 {
2752 struct btrfs_log_ctx *ctx;
2753 struct btrfs_log_ctx *safe;
2754
2755 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2756 list_del_init(&ctx->list);
2757 ctx->log_ret = error;
2758 }
2759
2760 INIT_LIST_HEAD(&root->log_ctxs[index]);
2761 }
2762
2763 /*
2764 * btrfs_sync_log sends a given tree log down to the disk and
2765 * updates the super blocks to record it. When this call is done,
2766 * you know that any inodes previously logged are safely on disk only
2767 * if it returns 0.
2768 *
2769 * Any other return value means you need to call btrfs_commit_transaction.
2770 * Some of the edge cases for fsyncing directories that have had unlinks
2771 * or renames done in the past mean that sometimes the only safe
2772 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
2773 * that has happened.
2774 */
2775 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2776 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2777 {
2778 int index1;
2779 int index2;
2780 int mark;
2781 int ret;
2782 struct btrfs_fs_info *fs_info = root->fs_info;
2783 struct btrfs_root *log = root->log_root;
2784 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2785 int log_transid = 0;
2786 struct btrfs_log_ctx root_log_ctx;
2787 struct blk_plug plug;
2788
2789 mutex_lock(&root->log_mutex);
2790 log_transid = ctx->log_transid;
2791 if (root->log_transid_committed >= log_transid) {
2792 mutex_unlock(&root->log_mutex);
2793 return ctx->log_ret;
2794 }
2795
2796 index1 = log_transid % 2;
2797 if (atomic_read(&root->log_commit[index1])) {
2798 wait_log_commit(root, log_transid);
2799 mutex_unlock(&root->log_mutex);
2800 return ctx->log_ret;
2801 }
2802 ASSERT(log_transid == root->log_transid);
2803 atomic_set(&root->log_commit[index1], 1);
2804
2805 /* wait for previous tree log sync to complete */
2806 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2807 wait_log_commit(root, log_transid - 1);
2808
2809 while (1) {
2810 int batch = atomic_read(&root->log_batch);
2811 /* when we're on an ssd, just kick the log commit out */
2812 if (!btrfs_test_opt(fs_info, SSD) &&
2813 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2814 mutex_unlock(&root->log_mutex);
2815 schedule_timeout_uninterruptible(1);
2816 mutex_lock(&root->log_mutex);
2817 }
2818 wait_for_writer(root);
2819 if (batch == atomic_read(&root->log_batch))
2820 break;
2821 }
2822
2823 /* bail out if we need to do a full commit */
2824 if (btrfs_need_log_full_commit(fs_info, trans)) {
2825 ret = -EAGAIN;
2826 btrfs_free_logged_extents(log, log_transid);
2827 mutex_unlock(&root->log_mutex);
2828 goto out;
2829 }
2830
2831 if (log_transid % 2 == 0)
2832 mark = EXTENT_DIRTY;
2833 else
2834 mark = EXTENT_NEW;
2835
2836 /* we start IO on all the marked extents here, but we don't actually
2837 * wait for them until later.
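 *
 * A rough timeline of the rest of this function (a sketch of the code
 * below, not an authoritative spec):
 *
 *   write this root's marked log extents (IO plugged, not waited on)
 *   update the log root item in the log root tree
 *   write the log root tree's marked extents
 *   wait for both sets of extents
 *   point the super block at the log root tree and write the supers
 *
 * Only once the super blocks are on disk is the fsync durable.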
2838 */ 2839 blk_start_plug(&plug); 2840 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2841 if (ret) { 2842 blk_finish_plug(&plug); 2843 btrfs_abort_transaction(trans, ret); 2844 btrfs_free_logged_extents(log, log_transid); 2845 btrfs_set_log_full_commit(fs_info, trans); 2846 mutex_unlock(&root->log_mutex); 2847 goto out; 2848 } 2849 2850 btrfs_set_root_node(&log->root_item, log->node); 2851 2852 root->log_transid++; 2853 log->log_transid = root->log_transid; 2854 root->log_start_pid = 0; 2855 /* 2856 * IO has been started, blocks of the log tree have WRITTEN flag set 2857 * in their headers. new modifications of the log will be written to 2858 * new positions. so it's safe to allow log writers to go in. 2859 */ 2860 mutex_unlock(&root->log_mutex); 2861 2862 btrfs_init_log_ctx(&root_log_ctx, NULL); 2863 2864 mutex_lock(&log_root_tree->log_mutex); 2865 atomic_inc(&log_root_tree->log_batch); 2866 atomic_inc(&log_root_tree->log_writers); 2867 2868 index2 = log_root_tree->log_transid % 2; 2869 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2870 root_log_ctx.log_transid = log_root_tree->log_transid; 2871 2872 mutex_unlock(&log_root_tree->log_mutex); 2873 2874 ret = update_log_root(trans, log); 2875 2876 mutex_lock(&log_root_tree->log_mutex); 2877 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2878 /* 2879 * Implicit memory barrier after atomic_dec_and_test 2880 */ 2881 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2882 wake_up(&log_root_tree->log_writer_wait); 2883 } 2884 2885 if (ret) { 2886 if (!list_empty(&root_log_ctx.list)) 2887 list_del_init(&root_log_ctx.list); 2888 2889 blk_finish_plug(&plug); 2890 btrfs_set_log_full_commit(fs_info, trans); 2891 2892 if (ret != -ENOSPC) { 2893 btrfs_abort_transaction(trans, ret); 2894 mutex_unlock(&log_root_tree->log_mutex); 2895 goto out; 2896 } 2897 btrfs_wait_tree_log_extents(log, mark); 2898 btrfs_free_logged_extents(log, log_transid); 2899 mutex_unlock(&log_root_tree->log_mutex); 2900 ret = -EAGAIN; 2901 goto out; 2902 } 2903 2904 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2905 blk_finish_plug(&plug); 2906 list_del_init(&root_log_ctx.list); 2907 mutex_unlock(&log_root_tree->log_mutex); 2908 ret = root_log_ctx.log_ret; 2909 goto out; 2910 } 2911 2912 index2 = root_log_ctx.log_transid % 2; 2913 if (atomic_read(&log_root_tree->log_commit[index2])) { 2914 blk_finish_plug(&plug); 2915 ret = btrfs_wait_tree_log_extents(log, mark); 2916 btrfs_wait_logged_extents(trans, log, log_transid); 2917 wait_log_commit(log_root_tree, 2918 root_log_ctx.log_transid); 2919 mutex_unlock(&log_root_tree->log_mutex); 2920 if (!ret) 2921 ret = root_log_ctx.log_ret; 2922 goto out; 2923 } 2924 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2925 atomic_set(&log_root_tree->log_commit[index2], 1); 2926 2927 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2928 wait_log_commit(log_root_tree, 2929 root_log_ctx.log_transid - 1); 2930 } 2931 2932 wait_for_writer(log_root_tree); 2933 2934 /* 2935 * now that we've moved on to the tree of log tree roots, 2936 * check the full commit flag again 2937 */ 2938 if (btrfs_need_log_full_commit(fs_info, trans)) { 2939 blk_finish_plug(&plug); 2940 btrfs_wait_tree_log_extents(log, mark); 2941 btrfs_free_logged_extents(log, log_transid); 2942 mutex_unlock(&log_root_tree->log_mutex); 2943 ret = -EAGAIN; 2944 goto out_wake_log_root; 2945 } 2946 2947 ret = btrfs_write_marked_extents(fs_info, 2948 
&log_root_tree->dirty_log_pages,
2949 EXTENT_DIRTY | EXTENT_NEW);
2950 blk_finish_plug(&plug);
2951 if (ret) {
2952 btrfs_set_log_full_commit(fs_info, trans);
2953 btrfs_abort_transaction(trans, ret);
2954 btrfs_free_logged_extents(log, log_transid);
2955 mutex_unlock(&log_root_tree->log_mutex);
2956 goto out_wake_log_root;
2957 }
2958 ret = btrfs_wait_tree_log_extents(log, mark);
2959 if (!ret)
2960 ret = btrfs_wait_tree_log_extents(log_root_tree,
2961 EXTENT_NEW | EXTENT_DIRTY);
2962 if (ret) {
2963 btrfs_set_log_full_commit(fs_info, trans);
2964 btrfs_free_logged_extents(log, log_transid);
2965 mutex_unlock(&log_root_tree->log_mutex);
2966 goto out_wake_log_root;
2967 }
2968 btrfs_wait_logged_extents(trans, log, log_transid);
2969
2970 btrfs_set_super_log_root(fs_info->super_for_commit,
2971 log_root_tree->node->start);
2972 btrfs_set_super_log_root_level(fs_info->super_for_commit,
2973 btrfs_header_level(log_root_tree->node));
2974
2975 log_root_tree->log_transid++;
2976 mutex_unlock(&log_root_tree->log_mutex);
2977
2978 /*
2979 * nobody else is going to jump in and write the ctree
2980 * super here because the log_commit atomic below is protecting
2981 * us. We must be called with a transaction handle pinning
2982 * the running transaction open, so a full commit can't hop
2983 * in and cause problems either.
2984 */
2985 ret = write_all_supers(fs_info, 1);
2986 if (ret) {
2987 btrfs_set_log_full_commit(fs_info, trans);
2988 btrfs_abort_transaction(trans, ret);
2989 goto out_wake_log_root;
2990 }
2991
2992 mutex_lock(&root->log_mutex);
2993 if (root->last_log_commit < log_transid)
2994 root->last_log_commit = log_transid;
2995 mutex_unlock(&root->log_mutex);
2996
2997 out_wake_log_root:
2998 mutex_lock(&log_root_tree->log_mutex);
2999 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3000
3001 log_root_tree->log_transid_committed++;
3002 atomic_set(&log_root_tree->log_commit[index2], 0);
3003 mutex_unlock(&log_root_tree->log_mutex);
3004
3005 /*
3006 * The barrier before waitqueue_active is implied by mutex_unlock
3007 */
3008 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
3009 wake_up(&log_root_tree->log_commit_wait[index2]);
3010 out:
3011 mutex_lock(&root->log_mutex);
3012 btrfs_remove_all_log_ctxs(root, index1, ret);
3013 root->log_transid_committed++;
3014 atomic_set(&root->log_commit[index1], 0);
3015 mutex_unlock(&root->log_mutex);
3016
3017 /*
3018 * The barrier before waitqueue_active is implied by mutex_unlock
3019 */
3020 if (waitqueue_active(&root->log_commit_wait[index1]))
3021 wake_up(&root->log_commit_wait[index1]);
3022 return ret;
3023 }
3024
3025 static void free_log_tree(struct btrfs_trans_handle *trans,
3026 struct btrfs_root *log)
3027 {
3028 int ret;
3029 u64 start;
3030 u64 end;
3031 struct walk_control wc = {
3032 .free = 1,
3033 .process_func = process_one_buffer
3034 };
3035
3036 ret = walk_log_tree(trans, log, &wc);
3037 /* I don't think this can happen but just in case */
3038 if (ret)
3039 btrfs_abort_transaction(trans, ret);
3040
3041 while (1) {
3042 ret = find_first_extent_bit(&log->dirty_log_pages,
3043 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
3044 NULL);
3045 if (ret)
3046 break;
3047
3048 clear_extent_bits(&log->dirty_log_pages, start, end,
3049 EXTENT_DIRTY | EXTENT_NEW);
3050 }
3051
3052 /*
3053 * We may have short-circuited the log tree with the full commit logic
3054 * and left ordered extents on our list, so clear these out to keep us
3055 * from leaking inodes and memory.
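 * The 0 and 1 passed below select the two logged-extents lists, one
 * per in-flight log transaction (indexed by log_transid % 2), so both
 * slots get drained no matter which transid was active.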
3056 */
3057 btrfs_free_logged_extents(log, 0);
3058 btrfs_free_logged_extents(log, 1);
3059
3060 free_extent_buffer(log->node);
3061 kfree(log);
3062 }
3063
3064 /*
3065 * free all the extents used by the tree log. This should be called
3066 * at commit time of the full transaction
3067 */
3068 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3069 {
3070 if (root->log_root) {
3071 free_log_tree(trans, root->log_root);
3072 root->log_root = NULL;
3073 }
3074 return 0;
3075 }
3076
3077 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3078 struct btrfs_fs_info *fs_info)
3079 {
3080 if (fs_info->log_root_tree) {
3081 free_log_tree(trans, fs_info->log_root_tree);
3082 fs_info->log_root_tree = NULL;
3083 }
3084 return 0;
3085 }
3086
3087 /*
3088 * If both a file and directory are logged, and unlinks or renames are
3089 * mixed in, we have a few interesting corners:
3090 *
3091 * create file X in dir Y
3092 * link file X to X.link in dir Y
3093 * fsync file X
3094 * unlink file X but leave X.link
3095 * fsync dir Y
3096 *
3097 * After a crash we would expect only X.link to exist. But file X
3098 * didn't get fsync'd again so the log has back refs for X and X.link.
3099 *
3100 * We solve this by removing directory entries and inode backrefs from the
3101 * log when a file that was logged in the current transaction is
3102 * unlinked. Any later fsync will include the updated log entries, and
3103 * we'll be able to reconstruct the proper directory items from backrefs.
3104 *
3105 * This optimization allows us to avoid relogging the entire inode
3106 * or the entire directory.
3107 */
3108 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3109 struct btrfs_root *root,
3110 const char *name, int name_len,
3111 struct btrfs_inode *dir, u64 index)
3112 {
3113 struct btrfs_root *log;
3114 struct btrfs_dir_item *di;
3115 struct btrfs_path *path;
3116 int ret;
3117 int err = 0;
3118 int bytes_del = 0;
3119 u64 dir_ino = btrfs_ino(dir);
3120
3121 if (dir->logged_trans < trans->transid)
3122 return 0;
3123
3124 ret = join_running_log_trans(root);
3125 if (ret)
3126 return 0;
3127
3128 mutex_lock(&dir->log_mutex);
3129
3130 log = root->log_root;
3131 path = btrfs_alloc_path();
3132 if (!path) {
3133 err = -ENOMEM;
3134 goto out_unlock;
3135 }
3136
3137 di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3138 name, name_len, -1);
3139 if (IS_ERR(di)) {
3140 err = PTR_ERR(di);
3141 goto fail;
3142 }
3143 if (di) {
3144 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3145 bytes_del += name_len;
3146 if (ret) {
3147 err = ret;
3148 goto fail;
3149 }
3150 }
3151 btrfs_release_path(path);
3152 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3153 index, name, name_len, -1);
3154 if (IS_ERR(di)) {
3155 err = PTR_ERR(di);
3156 goto fail;
3157 }
3158 if (di) {
3159 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3160 bytes_del += name_len;
3161 if (ret) {
3162 err = ret;
3163 goto fail;
3164 }
3165 }
3166
3167 /* update the directory size in the log to reflect the names
3168 * we have removed
3169 */
3170 if (bytes_del) {
3171 struct btrfs_key key;
3172
3173 key.objectid = dir_ino;
3174 key.offset = 0;
3175 key.type = BTRFS_INODE_ITEM_KEY;
3176 btrfs_release_path(path);
3177
3178 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3179 if (ret < 0) {
3180 err = ret;
3181 goto fail;
3182 }
3183 if (ret == 0) {
3184 struct btrfs_inode_item *item;
3185 u64 i_size;
3186
3187 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3188 struct btrfs_inode_item); 3189 i_size = btrfs_inode_size(path->nodes[0], item); 3190 if (i_size > bytes_del) 3191 i_size -= bytes_del; 3192 else 3193 i_size = 0; 3194 btrfs_set_inode_size(path->nodes[0], item, i_size); 3195 btrfs_mark_buffer_dirty(path->nodes[0]); 3196 } else 3197 ret = 0; 3198 btrfs_release_path(path); 3199 } 3200 fail: 3201 btrfs_free_path(path); 3202 out_unlock: 3203 mutex_unlock(&dir->log_mutex); 3204 if (ret == -ENOSPC) { 3205 btrfs_set_log_full_commit(root->fs_info, trans); 3206 ret = 0; 3207 } else if (ret < 0) 3208 btrfs_abort_transaction(trans, ret); 3209 3210 btrfs_end_log_trans(root); 3211 3212 return err; 3213 } 3214 3215 /* see comments for btrfs_del_dir_entries_in_log */ 3216 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3217 struct btrfs_root *root, 3218 const char *name, int name_len, 3219 struct btrfs_inode *inode, u64 dirid) 3220 { 3221 struct btrfs_fs_info *fs_info = root->fs_info; 3222 struct btrfs_root *log; 3223 u64 index; 3224 int ret; 3225 3226 if (inode->logged_trans < trans->transid) 3227 return 0; 3228 3229 ret = join_running_log_trans(root); 3230 if (ret) 3231 return 0; 3232 log = root->log_root; 3233 mutex_lock(&inode->log_mutex); 3234 3235 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3236 dirid, &index); 3237 mutex_unlock(&inode->log_mutex); 3238 if (ret == -ENOSPC) { 3239 btrfs_set_log_full_commit(fs_info, trans); 3240 ret = 0; 3241 } else if (ret < 0 && ret != -ENOENT) 3242 btrfs_abort_transaction(trans, ret); 3243 btrfs_end_log_trans(root); 3244 3245 return ret; 3246 } 3247 3248 /* 3249 * creates a range item in the log for 'dirid'. first_offset and 3250 * last_offset tell us which parts of the key space the log should 3251 * be considered authoritative for. 3252 */ 3253 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3254 struct btrfs_root *log, 3255 struct btrfs_path *path, 3256 int key_type, u64 dirid, 3257 u64 first_offset, u64 last_offset) 3258 { 3259 int ret; 3260 struct btrfs_key key; 3261 struct btrfs_dir_log_item *item; 3262 3263 key.objectid = dirid; 3264 key.offset = first_offset; 3265 if (key_type == BTRFS_DIR_ITEM_KEY) 3266 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3267 else 3268 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3269 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3270 if (ret) 3271 return ret; 3272 3273 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3274 struct btrfs_dir_log_item); 3275 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3276 btrfs_mark_buffer_dirty(path->nodes[0]); 3277 btrfs_release_path(path); 3278 return 0; 3279 } 3280 3281 /* 3282 * log all the items included in the current transaction for a given 3283 * directory. 
This also creates the range items in the log tree required 3284 * to replay anything deleted before the fsync 3285 */ 3286 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3287 struct btrfs_root *root, struct btrfs_inode *inode, 3288 struct btrfs_path *path, 3289 struct btrfs_path *dst_path, int key_type, 3290 struct btrfs_log_ctx *ctx, 3291 u64 min_offset, u64 *last_offset_ret) 3292 { 3293 struct btrfs_key min_key; 3294 struct btrfs_root *log = root->log_root; 3295 struct extent_buffer *src; 3296 int err = 0; 3297 int ret; 3298 int i; 3299 int nritems; 3300 u64 first_offset = min_offset; 3301 u64 last_offset = (u64)-1; 3302 u64 ino = btrfs_ino(inode); 3303 3304 log = root->log_root; 3305 3306 min_key.objectid = ino; 3307 min_key.type = key_type; 3308 min_key.offset = min_offset; 3309 3310 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3311 3312 /* 3313 * we didn't find anything from this transaction, see if there 3314 * is anything at all 3315 */ 3316 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3317 min_key.objectid = ino; 3318 min_key.type = key_type; 3319 min_key.offset = (u64)-1; 3320 btrfs_release_path(path); 3321 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3322 if (ret < 0) { 3323 btrfs_release_path(path); 3324 return ret; 3325 } 3326 ret = btrfs_previous_item(root, path, ino, key_type); 3327 3328 /* if ret == 0 there are items for this type, 3329 * create a range to tell us the last key of this type. 3330 * otherwise, there are no items in this directory after 3331 * *min_offset, and we create a range to indicate that. 3332 */ 3333 if (ret == 0) { 3334 struct btrfs_key tmp; 3335 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3336 path->slots[0]); 3337 if (key_type == tmp.type) 3338 first_offset = max(min_offset, tmp.offset) + 1; 3339 } 3340 goto done; 3341 } 3342 3343 /* go backward to find any previous key */ 3344 ret = btrfs_previous_item(root, path, ino, key_type); 3345 if (ret == 0) { 3346 struct btrfs_key tmp; 3347 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3348 if (key_type == tmp.type) { 3349 first_offset = tmp.offset; 3350 ret = overwrite_item(trans, log, dst_path, 3351 path->nodes[0], path->slots[0], 3352 &tmp); 3353 if (ret) { 3354 err = ret; 3355 goto done; 3356 } 3357 } 3358 } 3359 btrfs_release_path(path); 3360 3361 /* find the first key from this transaction again */ 3362 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3363 if (WARN_ON(ret != 0)) 3364 goto done; 3365 3366 /* 3367 * we have a block from this transaction, log every item in it 3368 * from our directory 3369 */ 3370 while (1) { 3371 struct btrfs_key tmp; 3372 src = path->nodes[0]; 3373 nritems = btrfs_header_nritems(src); 3374 for (i = path->slots[0]; i < nritems; i++) { 3375 struct btrfs_dir_item *di; 3376 3377 btrfs_item_key_to_cpu(src, &min_key, i); 3378 3379 if (min_key.objectid != ino || min_key.type != key_type) 3380 goto done; 3381 ret = overwrite_item(trans, log, dst_path, src, i, 3382 &min_key); 3383 if (ret) { 3384 err = ret; 3385 goto done; 3386 } 3387 3388 /* 3389 * We must make sure that when we log a directory entry, 3390 * the corresponding inode, after log replay, has a 3391 * matching link count. 
For example:
3392 *
3393 * touch foo
3394 * mkdir mydir
3395 * sync
3396 * ln foo mydir/bar
3397 * xfs_io -c "fsync" mydir
3398 * <crash>
3399 * <mount fs and log replay>
3400 *
3401 * This would result in an fsync log that, when replayed, would
3402 * leave our file inode with a link count of 1, but with
3403 * two directory entries pointing to the same inode.
3404 * After removing one of the names, it would not be
3405 * possible to remove the other name, which always
3406 * resulted in stale file handle errors, and it would not
3407 * be possible to rmdir the parent directory, since
3408 * its i_size could never decrement to the value
3409 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3410 */
3411 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3412 btrfs_dir_item_key_to_cpu(src, di, &tmp);
3413 if (ctx &&
3414 (btrfs_dir_transid(src, di) == trans->transid ||
3415 btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3416 tmp.type != BTRFS_ROOT_ITEM_KEY)
3417 ctx->log_new_dentries = true;
3418 }
3419 path->slots[0] = nritems;
3420
3421 /*
3422 * look ahead to the next item and see if it is also
3423 * from this directory and from this transaction
3424 */
3425 ret = btrfs_next_leaf(root, path);
3426 if (ret == 1) {
3427 last_offset = (u64)-1;
3428 goto done;
3429 }
3430 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3431 if (tmp.objectid != ino || tmp.type != key_type) {
3432 last_offset = (u64)-1;
3433 goto done;
3434 }
3435 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3436 ret = overwrite_item(trans, log, dst_path,
3437 path->nodes[0], path->slots[0],
3438 &tmp);
3439 if (ret)
3440 err = ret;
3441 else
3442 last_offset = tmp.offset;
3443 goto done;
3444 }
3445 }
3446 done:
3447 btrfs_release_path(path);
3448 btrfs_release_path(dst_path);
3449
3450 if (err == 0) {
3451 *last_offset_ret = last_offset;
3452 /*
3453 * insert the log range keys to indicate where the log
3454 * is valid
3455 */
3456 ret = insert_dir_log_key(trans, log, path, key_type,
3457 ino, first_offset, last_offset);
3458 if (ret)
3459 err = ret;
3460 }
3461 return err;
3462 }
3463
3464 /*
3465 * logging directories is very similar to logging inodes. We find all the items
3466 * from the current transaction and write them to the log.
3467 *
3468 * The recovery code scans the directory in the subvolume, and if it finds a
3469 * key in the range logged that is not present in the log tree, then it means
3470 * that dir entry was unlinked during the transaction.
3471 *
3472 * In order for that scan to work, we must include one key smaller than
3473 * the smallest logged by this transaction and one key larger than the largest
3474 * key logged by this transaction.
3475 */
3476 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3477 struct btrfs_root *root, struct btrfs_inode *inode,
3478 struct btrfs_path *path,
3479 struct btrfs_path *dst_path,
3480 struct btrfs_log_ctx *ctx)
3481 {
3482 u64 min_key;
3483 u64 max_key;
3484 int ret;
3485 int key_type = BTRFS_DIR_ITEM_KEY;
3486
3487 again:
3488 min_key = 0;
3489 max_key = 0;
3490 while (1) {
3491 ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3492 ctx, min_key, &max_key);
3493 if (ret)
3494 return ret;
3495 if (max_key == (u64)-1)
3496 break;
3497 min_key = max_key + 1;
3498 }
3499
3500 if (key_type == BTRFS_DIR_ITEM_KEY) {
3501 key_type = BTRFS_DIR_INDEX_KEY;
3502 goto again;
3503 }
3504 return 0;
3505 }
3506
3507 /*
3508 * a helper function to drop items from the log before we relog an
3509 * inode.
max_key_type indicates the highest item type to remove.
3510 * This cannot be run for file data extents because it does not
3511 * free the extents they point to.
3512 */
3513 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3514 struct btrfs_root *log,
3515 struct btrfs_path *path,
3516 u64 objectid, int max_key_type)
3517 {
3518 int ret;
3519 struct btrfs_key key;
3520 struct btrfs_key found_key;
3521 int start_slot;
3522
3523 key.objectid = objectid;
3524 key.type = max_key_type;
3525 key.offset = (u64)-1;
3526
3527 while (1) {
3528 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3529 BUG_ON(ret == 0); /* Logic error */
3530 if (ret < 0)
3531 break;
3532
3533 if (path->slots[0] == 0)
3534 break;
3535
3536 path->slots[0]--;
3537 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3538 path->slots[0]);
3539
3540 if (found_key.objectid != objectid)
3541 break;
3542
3543 found_key.offset = 0;
3544 found_key.type = 0;
3545 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3546 &start_slot);
3547
3548 ret = btrfs_del_items(trans, log, path, start_slot,
3549 path->slots[0] - start_slot + 1);
3550 /*
3551 * If start slot isn't 0 then we don't need to re-search, we've
3552 * found the last guy with the objectid in this tree.
3553 */
3554 if (ret || start_slot != 0)
3555 break;
3556 btrfs_release_path(path);
3557 }
3558 btrfs_release_path(path);
3559 if (ret > 0)
3560 ret = 0;
3561 return ret;
3562 }
3563
3564 static void fill_inode_item(struct btrfs_trans_handle *trans,
3565 struct extent_buffer *leaf,
3566 struct btrfs_inode_item *item,
3567 struct inode *inode, int log_inode_only,
3568 u64 logged_isize)
3569 {
3570 struct btrfs_map_token token;
3571
3572 btrfs_init_map_token(&token);
3573
3574 if (log_inode_only) {
3575 /* set the generation to zero so the recovery code
3576 * can tell the difference between logging
3577 * just to say 'this inode exists' and logging
3578 * to say 'update this inode with these values'
3579 */
3580 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3581 btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3582 } else {
3583 btrfs_set_token_inode_generation(leaf, item,
3584 BTRFS_I(inode)->generation,
3585 &token);
3586 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3587 }
3588
3589 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3590 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3591 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3592 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3593
3594 btrfs_set_token_timespec_sec(leaf, &item->atime,
3595 inode->i_atime.tv_sec, &token);
3596 btrfs_set_token_timespec_nsec(leaf, &item->atime,
3597 inode->i_atime.tv_nsec, &token);
3598
3599 btrfs_set_token_timespec_sec(leaf, &item->mtime,
3600 inode->i_mtime.tv_sec, &token);
3601 btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3602 inode->i_mtime.tv_nsec, &token);
3603
3604 btrfs_set_token_timespec_sec(leaf, &item->ctime,
3605 inode->i_ctime.tv_sec, &token);
3606 btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3607 inode->i_ctime.tv_nsec, &token);
3608
3609 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3610 &token);
3611
3612 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3613 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3614 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3615 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3616
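/* the inode item's block group field is a legacy one, always zero */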
btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3617 } 3618 3619 static int log_inode_item(struct btrfs_trans_handle *trans, 3620 struct btrfs_root *log, struct btrfs_path *path, 3621 struct btrfs_inode *inode) 3622 { 3623 struct btrfs_inode_item *inode_item; 3624 int ret; 3625 3626 ret = btrfs_insert_empty_item(trans, log, path, 3627 &inode->location, sizeof(*inode_item)); 3628 if (ret && ret != -EEXIST) 3629 return ret; 3630 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3631 struct btrfs_inode_item); 3632 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3633 0, 0); 3634 btrfs_release_path(path); 3635 return 0; 3636 } 3637 3638 static noinline int copy_items(struct btrfs_trans_handle *trans, 3639 struct btrfs_inode *inode, 3640 struct btrfs_path *dst_path, 3641 struct btrfs_path *src_path, u64 *last_extent, 3642 int start_slot, int nr, int inode_only, 3643 u64 logged_isize) 3644 { 3645 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 3646 unsigned long src_offset; 3647 unsigned long dst_offset; 3648 struct btrfs_root *log = inode->root->log_root; 3649 struct btrfs_file_extent_item *extent; 3650 struct btrfs_inode_item *inode_item; 3651 struct extent_buffer *src = src_path->nodes[0]; 3652 struct btrfs_key first_key, last_key, key; 3653 int ret; 3654 struct btrfs_key *ins_keys; 3655 u32 *ins_sizes; 3656 char *ins_data; 3657 int i; 3658 struct list_head ordered_sums; 3659 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3660 bool has_extents = false; 3661 bool need_find_last_extent = true; 3662 bool done = false; 3663 3664 INIT_LIST_HEAD(&ordered_sums); 3665 3666 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3667 nr * sizeof(u32), GFP_NOFS); 3668 if (!ins_data) 3669 return -ENOMEM; 3670 3671 first_key.objectid = (u64)-1; 3672 3673 ins_sizes = (u32 *)ins_data; 3674 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3675 3676 for (i = 0; i < nr; i++) { 3677 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3678 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3679 } 3680 ret = btrfs_insert_empty_items(trans, log, dst_path, 3681 ins_keys, ins_sizes, nr); 3682 if (ret) { 3683 kfree(ins_data); 3684 return ret; 3685 } 3686 3687 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3688 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3689 dst_path->slots[0]); 3690 3691 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3692 3693 if ((i == (nr - 1))) 3694 last_key = ins_keys[i]; 3695 3696 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3697 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3698 dst_path->slots[0], 3699 struct btrfs_inode_item); 3700 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3701 &inode->vfs_inode, 3702 inode_only == LOG_INODE_EXISTS, 3703 logged_isize); 3704 } else { 3705 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3706 src_offset, ins_sizes[i]); 3707 } 3708 3709 /* 3710 * We set need_find_last_extent here in case we know we were 3711 * processing other items and then walk into the first extent in 3712 * the inode. If we don't hit an extent then nothing changes, 3713 * we'll do the last search the next time around. 
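 *
 * Hypothetical offsets as an example: if the previous batch ended
 * with an extent covering 0..4K and this batch starts with an extent
 * at 8K, the 4K..8K gap has to be logged as a hole, which is what the
 * fill_holes code further down does once *last_extent is known.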
3714 */ 3715 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3716 has_extents = true; 3717 if (first_key.objectid == (u64)-1) 3718 first_key = ins_keys[i]; 3719 } else { 3720 need_find_last_extent = false; 3721 } 3722 3723 /* take a reference on file data extents so that truncates 3724 * or deletes of this inode don't have to relog the inode 3725 * again 3726 */ 3727 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3728 !skip_csum) { 3729 int found_type; 3730 extent = btrfs_item_ptr(src, start_slot + i, 3731 struct btrfs_file_extent_item); 3732 3733 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3734 continue; 3735 3736 found_type = btrfs_file_extent_type(src, extent); 3737 if (found_type == BTRFS_FILE_EXTENT_REG) { 3738 u64 ds, dl, cs, cl; 3739 ds = btrfs_file_extent_disk_bytenr(src, 3740 extent); 3741 /* ds == 0 is a hole */ 3742 if (ds == 0) 3743 continue; 3744 3745 dl = btrfs_file_extent_disk_num_bytes(src, 3746 extent); 3747 cs = btrfs_file_extent_offset(src, extent); 3748 cl = btrfs_file_extent_num_bytes(src, 3749 extent); 3750 if (btrfs_file_extent_compression(src, 3751 extent)) { 3752 cs = 0; 3753 cl = dl; 3754 } 3755 3756 ret = btrfs_lookup_csums_range( 3757 fs_info->csum_root, 3758 ds + cs, ds + cs + cl - 1, 3759 &ordered_sums, 0); 3760 if (ret) { 3761 btrfs_release_path(dst_path); 3762 kfree(ins_data); 3763 return ret; 3764 } 3765 } 3766 } 3767 } 3768 3769 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3770 btrfs_release_path(dst_path); 3771 kfree(ins_data); 3772 3773 /* 3774 * we have to do this after the loop above to avoid changing the 3775 * log tree while trying to change the log tree. 3776 */ 3777 ret = 0; 3778 while (!list_empty(&ordered_sums)) { 3779 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3780 struct btrfs_ordered_sum, 3781 list); 3782 if (!ret) 3783 ret = btrfs_csum_file_blocks(trans, log, sums); 3784 list_del(&sums->list); 3785 kfree(sums); 3786 } 3787 3788 if (!has_extents) 3789 return ret; 3790 3791 if (need_find_last_extent && *last_extent == first_key.offset) { 3792 /* 3793 * We don't have any leafs between our current one and the one 3794 * we processed before that can have file extent items for our 3795 * inode (and have a generation number smaller than our current 3796 * transaction id). 3797 */ 3798 need_find_last_extent = false; 3799 } 3800 3801 /* 3802 * Because we use btrfs_search_forward we could skip leaves that were 3803 * not modified and then assume *last_extent is valid when it really 3804 * isn't. So back up to the previous leaf and read the end of the last 3805 * extent before we go and fill in holes. 
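 * The end of that last extent is key.offset + num_bytes for a regular
 * extent, and key.offset + inline length rounded up to sectorsize for
 * an inline extent, exactly as computed below.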
3806 */ 3807 if (need_find_last_extent) { 3808 u64 len; 3809 3810 ret = btrfs_prev_leaf(inode->root, src_path); 3811 if (ret < 0) 3812 return ret; 3813 if (ret) 3814 goto fill_holes; 3815 if (src_path->slots[0]) 3816 src_path->slots[0]--; 3817 src = src_path->nodes[0]; 3818 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3819 if (key.objectid != btrfs_ino(inode) || 3820 key.type != BTRFS_EXTENT_DATA_KEY) 3821 goto fill_holes; 3822 extent = btrfs_item_ptr(src, src_path->slots[0], 3823 struct btrfs_file_extent_item); 3824 if (btrfs_file_extent_type(src, extent) == 3825 BTRFS_FILE_EXTENT_INLINE) { 3826 len = btrfs_file_extent_inline_len(src, 3827 src_path->slots[0], 3828 extent); 3829 *last_extent = ALIGN(key.offset + len, 3830 fs_info->sectorsize); 3831 } else { 3832 len = btrfs_file_extent_num_bytes(src, extent); 3833 *last_extent = key.offset + len; 3834 } 3835 } 3836 fill_holes: 3837 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3838 * things could have happened 3839 * 3840 * 1) A merge could have happened, so we could currently be on a leaf 3841 * that holds what we were copying in the first place. 3842 * 2) A split could have happened, and now not all of the items we want 3843 * are on the same leaf. 3844 * 3845 * So we need to adjust how we search for holes, we need to drop the 3846 * path and re-search for the first extent key we found, and then walk 3847 * forward until we hit the last one we copied. 3848 */ 3849 if (need_find_last_extent) { 3850 /* btrfs_prev_leaf could return 1 without releasing the path */ 3851 btrfs_release_path(src_path); 3852 ret = btrfs_search_slot(NULL, inode->root, &first_key, 3853 src_path, 0, 0); 3854 if (ret < 0) 3855 return ret; 3856 ASSERT(ret == 0); 3857 src = src_path->nodes[0]; 3858 i = src_path->slots[0]; 3859 } else { 3860 i = start_slot; 3861 } 3862 3863 /* 3864 * Ok so here we need to go through and fill in any holes we may have 3865 * to make sure that holes are punched for those areas in case they had 3866 * extents previously. 3867 */ 3868 while (!done) { 3869 u64 offset, len; 3870 u64 extent_end; 3871 3872 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3873 ret = btrfs_next_leaf(inode->root, src_path); 3874 if (ret < 0) 3875 return ret; 3876 ASSERT(ret == 0); 3877 src = src_path->nodes[0]; 3878 i = 0; 3879 } 3880 3881 btrfs_item_key_to_cpu(src, &key, i); 3882 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3883 done = true; 3884 if (key.objectid != btrfs_ino(inode) || 3885 key.type != BTRFS_EXTENT_DATA_KEY) { 3886 i++; 3887 continue; 3888 } 3889 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3890 if (btrfs_file_extent_type(src, extent) == 3891 BTRFS_FILE_EXTENT_INLINE) { 3892 len = btrfs_file_extent_inline_len(src, i, extent); 3893 extent_end = ALIGN(key.offset + len, 3894 fs_info->sectorsize); 3895 } else { 3896 len = btrfs_file_extent_num_bytes(src, extent); 3897 extent_end = key.offset + len; 3898 } 3899 i++; 3900 3901 if (*last_extent == key.offset) { 3902 *last_extent = extent_end; 3903 continue; 3904 } 3905 offset = *last_extent; 3906 len = key.offset - *last_extent; 3907 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3908 offset, 0, 0, len, 0, len, 0, 0, 0); 3909 if (ret) 3910 break; 3911 *last_extent = extent_end; 3912 } 3913 /* 3914 * Need to let the callers know we dropped the path so they should 3915 * re-search. 
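 * Returning 1 is not an error: it only tells the caller that src_path
 * no longer points at the slots it passed in and must be re-searched.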
3916 */
3917 if (!ret && need_find_last_extent)
3918 ret = 1;
3919 return ret;
3920 }
3921
3922 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3923 {
3924 struct extent_map *em1, *em2;
3925
3926 em1 = list_entry(a, struct extent_map, list);
3927 em2 = list_entry(b, struct extent_map, list);
3928
3929 if (em1->start < em2->start)
3930 return -1;
3931 else if (em1->start > em2->start)
3932 return 1;
3933 return 0;
3934 }
3935
3936 static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3937 struct inode *inode,
3938 struct btrfs_root *root,
3939 const struct extent_map *em,
3940 const struct list_head *logged_list,
3941 bool *ordered_io_error)
3942 {
3943 struct btrfs_fs_info *fs_info = root->fs_info;
3944 struct btrfs_ordered_extent *ordered;
3945 struct btrfs_root *log = root->log_root;
3946 u64 mod_start = em->mod_start;
3947 u64 mod_len = em->mod_len;
3948 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3949 u64 csum_offset;
3950 u64 csum_len;
3951 LIST_HEAD(ordered_sums);
3952 int ret = 0;
3953
3954 *ordered_io_error = false;
3955
3956 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3957 em->block_start == EXTENT_MAP_HOLE)
3958 return 0;
3959
3960 /*
3961 * Wait for any ordered extent that covers our extent map. If it
3962 * finishes without an error, first check and see if our csums are on
3963 * our outstanding ordered extents.
3964 */
3965 list_for_each_entry(ordered, logged_list, log_list) {
3966 struct btrfs_ordered_sum *sum;
3967
3968 if (!mod_len)
3969 break;
3970
3971 if (ordered->file_offset + ordered->len <= mod_start ||
3972 mod_start + mod_len <= ordered->file_offset)
3973 continue;
3974
3975 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3976 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3977 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3978 const u64 start = ordered->file_offset;
3979 const u64 end = ordered->file_offset + ordered->len - 1;
3980
3981 WARN_ON(ordered->inode != inode);
3982 filemap_fdatawrite_range(inode->i_mapping, start, end);
3983 }
3984
3985 wait_event(ordered->wait,
3986 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3987 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3988
3989 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3990 /*
3991 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3992 * i_mapping flags, so that the next fsync won't get
3993 * an outdated io error too.
3994 */
3995 filemap_check_errors(inode->i_mapping);
3996 *ordered_io_error = true;
3997 break;
3998 }
3999 /*
4000 * We are going to copy all the csums on this ordered extent, so
4001 * go ahead and adjust mod_start and mod_len in case this
4002 * ordered extent has already been logged.
4003 */
4004 if (ordered->file_offset > mod_start) {
4005 if (ordered->file_offset + ordered->len >=
4006 mod_start + mod_len)
4007 mod_len = ordered->file_offset - mod_start;
4008 /*
4009 * If we have this case
4010 *
4011 * |--------- logged extent ---------|
4012 * |----- ordered extent ----|
4013 *
4014 * Just don't mess with mod_start and mod_len, we'll
4015 * just end up logging more csums than we need and it
4016 * will be ok.
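 *
 * (The else branch below handles the mirrored layout,
 *
 *        |--------- logged extent ---------|
 *   |----- ordered extent ----|
 *
 * by advancing mod_start past the ordered extent instead.)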
4017 */ 4018 } else { 4019 if (ordered->file_offset + ordered->len < 4020 mod_start + mod_len) { 4021 mod_len = (mod_start + mod_len) - 4022 (ordered->file_offset + ordered->len); 4023 mod_start = ordered->file_offset + 4024 ordered->len; 4025 } else { 4026 mod_len = 0; 4027 } 4028 } 4029 4030 if (skip_csum) 4031 continue; 4032 4033 /* 4034 * To keep us from looping for the above case of an ordered 4035 * extent that falls inside of the logged extent. 4036 */ 4037 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 4038 &ordered->flags)) 4039 continue; 4040 4041 list_for_each_entry(sum, &ordered->list, list) { 4042 ret = btrfs_csum_file_blocks(trans, log, sum); 4043 if (ret) 4044 break; 4045 } 4046 } 4047 4048 if (*ordered_io_error || !mod_len || ret || skip_csum) 4049 return ret; 4050 4051 if (em->compress_type) { 4052 csum_offset = 0; 4053 csum_len = max(em->block_len, em->orig_block_len); 4054 } else { 4055 csum_offset = mod_start - em->start; 4056 csum_len = mod_len; 4057 } 4058 4059 /* block start is already adjusted for the file extent offset. */ 4060 ret = btrfs_lookup_csums_range(fs_info->csum_root, 4061 em->block_start + csum_offset, 4062 em->block_start + csum_offset + 4063 csum_len - 1, &ordered_sums, 0); 4064 if (ret) 4065 return ret; 4066 4067 while (!list_empty(&ordered_sums)) { 4068 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4069 struct btrfs_ordered_sum, 4070 list); 4071 if (!ret) 4072 ret = btrfs_csum_file_blocks(trans, log, sums); 4073 list_del(&sums->list); 4074 kfree(sums); 4075 } 4076 4077 return ret; 4078 } 4079 4080 static int log_one_extent(struct btrfs_trans_handle *trans, 4081 struct btrfs_inode *inode, struct btrfs_root *root, 4082 const struct extent_map *em, 4083 struct btrfs_path *path, 4084 const struct list_head *logged_list, 4085 struct btrfs_log_ctx *ctx) 4086 { 4087 struct btrfs_root *log = root->log_root; 4088 struct btrfs_file_extent_item *fi; 4089 struct extent_buffer *leaf; 4090 struct btrfs_map_token token; 4091 struct btrfs_key key; 4092 u64 extent_offset = em->start - em->orig_start; 4093 u64 block_len; 4094 int ret; 4095 int extent_inserted = 0; 4096 bool ordered_io_err = false; 4097 4098 ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, 4099 logged_list, &ordered_io_err); 4100 if (ret) 4101 return ret; 4102 4103 if (ordered_io_err) { 4104 ctx->io_err = -EIO; 4105 return 0; 4106 } 4107 4108 btrfs_init_map_token(&token); 4109 4110 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4111 em->start + em->len, NULL, 0, 1, 4112 sizeof(*fi), &extent_inserted); 4113 if (ret) 4114 return ret; 4115 4116 if (!extent_inserted) { 4117 key.objectid = btrfs_ino(inode); 4118 key.type = BTRFS_EXTENT_DATA_KEY; 4119 key.offset = em->start; 4120 4121 ret = btrfs_insert_empty_item(trans, log, path, &key, 4122 sizeof(*fi)); 4123 if (ret) 4124 return ret; 4125 } 4126 leaf = path->nodes[0]; 4127 fi = btrfs_item_ptr(leaf, path->slots[0], 4128 struct btrfs_file_extent_item); 4129 4130 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4131 &token); 4132 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4133 btrfs_set_token_file_extent_type(leaf, fi, 4134 BTRFS_FILE_EXTENT_PREALLOC, 4135 &token); 4136 else 4137 btrfs_set_token_file_extent_type(leaf, fi, 4138 BTRFS_FILE_EXTENT_REG, 4139 &token); 4140 4141 block_len = max(em->block_len, em->orig_block_len); 4142 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4143 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4144 em->block_start, 4145 &token); 4146 
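		/*
		 * For compressed extents the disk_bytenr is deliberately not
		 * adjusted by extent_offset: it must point at the start of the
		 * compressed data on disk, while the logical offset into the
		 * uncompressed data is carried by the extent item's offset
		 * field, set further below.
		 */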
btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4147 &token); 4148 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4149 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4150 em->block_start - 4151 extent_offset, &token); 4152 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4153 &token); 4154 } else { 4155 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4156 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4157 &token); 4158 } 4159 4160 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4161 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4162 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4163 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4164 &token); 4165 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4166 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4167 btrfs_mark_buffer_dirty(leaf); 4168 4169 btrfs_release_path(path); 4170 4171 return ret; 4172 } 4173 4174 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4175 struct btrfs_root *root, 4176 struct btrfs_inode *inode, 4177 struct btrfs_path *path, 4178 struct list_head *logged_list, 4179 struct btrfs_log_ctx *ctx, 4180 const u64 start, 4181 const u64 end) 4182 { 4183 struct extent_map *em, *n; 4184 struct list_head extents; 4185 struct extent_map_tree *tree = &inode->extent_tree; 4186 u64 test_gen; 4187 int ret = 0; 4188 int num = 0; 4189 4190 INIT_LIST_HEAD(&extents); 4191 4192 down_write(&inode->dio_sem); 4193 write_lock(&tree->lock); 4194 test_gen = root->fs_info->last_trans_committed; 4195 4196 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4197 list_del_init(&em->list); 4198 4199 /* 4200 * Just an arbitrary number, this can be really CPU intensive 4201 * once we start getting a lot of extents, and really once we 4202 * have a bunch of extents we just want to commit since it will 4203 * be faster. 4204 */ 4205 if (++num > 32768) { 4206 list_del_init(&tree->modified_extents); 4207 ret = -EFBIG; 4208 goto process; 4209 } 4210 4211 if (em->generation <= test_gen) 4212 continue; 4213 /* Need a ref to keep it from getting evicted from cache */ 4214 refcount_inc(&em->refs); 4215 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4216 list_add_tail(&em->list, &extents); 4217 num++; 4218 } 4219 4220 list_sort(NULL, &extents, extent_cmp); 4221 btrfs_get_logged_extents(inode, logged_list, start, end); 4222 /* 4223 * Some ordered extents started by fsync might have completed 4224 * before we could collect them into the list logged_list, which 4225 * means they're gone, not in our logged_list nor in the inode's 4226 * ordered tree. We want the application/user space to know an 4227 * error happened while attempting to persist file data so that 4228 * it can take proper action. If such error happened, we leave 4229 * without writing to the log tree and the fsync must report the 4230 * file data write error and not commit the current transaction. 4231 */ 4232 ret = filemap_check_errors(inode->vfs_inode.i_mapping); 4233 if (ret) 4234 ctx->io_err = ret; 4235 process: 4236 while (!list_empty(&extents)) { 4237 em = list_entry(extents.next, struct extent_map, list); 4238 4239 list_del_init(&em->list); 4240 4241 /* 4242 * If we had an error we just need to delete everybody from our 4243 * private list. 
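		 * Note that even in the error case each extent map still
		 * needs clear_em_logging() and free_extent_map(), to drop the
		 * EXTENT_FLAG_LOGGING bit and the extra reference taken when
		 * it was added to our private list above.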
		 */
		if (ret) {
			clear_em_logging(tree, em);
			free_extent_map(em);
			continue;
		}

		write_unlock(&tree->lock);

		ret = log_one_extent(trans, inode, root, em, path, logged_list,
				     ctx);
		write_lock(&tree->lock);
		clear_em_logging(tree, em);
		free_extent_map(em);
	}
	WARN_ON(!list_empty(&extents));
	write_unlock(&tree->lock);
	up_write(&inode->dio_sem);

	btrfs_release_path(path);
	return ret;
}

static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
			     struct btrfs_path *path, u64 *size_ret)
{
	struct btrfs_key key;
	int ret;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		*size_ret = 0;
	} else {
		struct btrfs_inode_item *item;

		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				      struct btrfs_inode_item);
		*size_ret = btrfs_inode_size(path->nodes[0], item);
	}

	btrfs_release_path(path);
	return 0;
}

/*
 * At the moment we always log all xattrs. This is to figure out at log replay
 * time which xattrs must have their deletion replayed. If an xattr is missing
 * in the log tree and exists in the fs/subvol tree, we delete it. This is
 * because if an xattr is deleted, the inode is fsynced, and then a power
 * failure happens (causing the log to be replayed the next time the fs is
 * mounted), we want the xattr to not exist anymore (same behaviour as other
 * filesystems with a journal, such as ext3/4, xfs, f2fs, etc).
 */
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_inode *inode,
				struct btrfs_path *path,
				struct btrfs_path *dst_path)
{
	int ret;
	struct btrfs_key key;
	const u64 ino = btrfs_ino(inode);
	int ins_nr = 0;
	int start_slot = 0;

	key.objectid = ino;
	key.type = BTRFS_XATTR_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	while (true) {
		int slot = path->slots[0];
		struct extent_buffer *leaf = path->nodes[0];
		int nritems = btrfs_header_nritems(leaf);

		if (slot >= nritems) {
			if (ins_nr > 0) {
				u64 last_extent = 0;

				ret = copy_items(trans, inode, dst_path, path,
						 &last_extent, start_slot,
						 ins_nr, 1, 0);
				/* can't be 1, extent items aren't processed */
				ASSERT(ret <= 0);
				if (ret < 0)
					return ret;
				ins_nr = 0;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
			break;

		if (ins_nr == 0)
			start_slot = slot;
		ins_nr++;
		path->slots[0]++;
		cond_resched();
	}
	if (ins_nr > 0) {
		u64 last_extent = 0;

		ret = copy_items(trans, inode, dst_path, path,
				 &last_extent, start_slot,
				 ins_nr, 1, 0);
		/* can't be 1, extent items aren't processed */
		ASSERT(ret <= 0);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * If the no holes feature is enabled we need to make sure any hole between the
 * last extent and the i_size of our inode is explicitly marked in the log.
 * This is to make sure that doing something like:
 *
 * 1) create file with 128Kb of data
 * 2) truncate file to 64Kb
 * 3) truncate file to 256Kb
 * 4) fsync file
 * 5) <crash/power failure>
 * 6) mount fs and trigger log replay
 *
 * will give us a file with a size of 256Kb, where the first 64Kb of data
 * match what the file had in its first 64Kb of data at step 1 and the last
 * 192Kb of the file correspond to a hole. The presence of explicit holes in a
 * log tree is what guarantees that log replay will remove/adjust file extent
 * items in the fs/subvol tree.
 *
 * Here we do not need to care about holes between extents, as that is already
 * done by copy_items(). We also only need to do this in the full sync path,
 * where we look up extents from the fs/subvol tree only. In the fast path
 * case, we look up the list of modified extent maps and if any represents a
 * hole, we insert a corresponding extent representing a hole in the log tree.
 */
static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_inode *inode,
				   struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_key key;
	u64 hole_start;
	u64 hole_size;
	struct extent_buffer *leaf;
	struct btrfs_root *log = root->log_root;
	const u64 ino = btrfs_ino(inode);
	const u64 i_size = i_size_read(&inode->vfs_inode);

	if (!btrfs_fs_incompat(fs_info, NO_HOLES))
		return 0;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	ASSERT(ret != 0);
	if (ret < 0)
		return ret;

	ASSERT(path->slots[0] > 0);
	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
		/* inode does not have any extents */
		hole_start = 0;
		hole_size = i_size;
	} else {
		struct btrfs_file_extent_item *extent;
		u64 len;

		/*
		 * If there's an extent beyond i_size, an explicit hole was
		 * already inserted by copy_items().
		 */
		if (key.offset >= i_size)
			return 0;

		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);

		if (btrfs_file_extent_type(leaf, extent) ==
		    BTRFS_FILE_EXTENT_INLINE) {
			len = btrfs_file_extent_inline_len(leaf,
							   path->slots[0],
							   extent);
			ASSERT(len == i_size);
			return 0;
		}

		len = btrfs_file_extent_num_bytes(leaf, extent);
		/* Last extent goes beyond i_size, no need to log a hole. */
		if (key.offset + len > i_size)
			return 0;
		hole_start = key.offset + len;
		hole_size = i_size - hole_start;
	}
	btrfs_release_path(path);

	/* Last extent ends at i_size. */
	if (hole_size == 0)
		return 0;

	hole_size = ALIGN(hole_size, fs_info->sectorsize);
	ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
				       hole_size, 0, hole_size, 0, 0, 0);
	return ret;
}

/*
 * When we are logging a new inode X, check that it doesn't have a reference
 * that matches the reference from some other inode Y created in a past
 * transaction and that was renamed in the current transaction.
If we don't do this, then at 4480 * log replay time we can lose inode Y (and all its files if it's a directory): 4481 * 4482 * mkdir /mnt/x 4483 * echo "hello world" > /mnt/x/foobar 4484 * sync 4485 * mv /mnt/x /mnt/y 4486 * mkdir /mnt/x # or touch /mnt/x 4487 * xfs_io -c fsync /mnt/x 4488 * <power fail> 4489 * mount fs, trigger log replay 4490 * 4491 * After the log replay procedure, we would lose the first directory and all its 4492 * files (file foobar). 4493 * For the case where inode Y is not a directory we simply end up losing it: 4494 * 4495 * echo "123" > /mnt/foo 4496 * sync 4497 * mv /mnt/foo /mnt/bar 4498 * echo "abc" > /mnt/foo 4499 * xfs_io -c fsync /mnt/foo 4500 * <power fail> 4501 * 4502 * We also need this for cases where a snapshot entry is replaced by some other 4503 * entry (file or directory) otherwise we end up with an unreplayable log due to 4504 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4505 * if it were a regular entry: 4506 * 4507 * mkdir /mnt/x 4508 * btrfs subvolume snapshot /mnt /mnt/x/snap 4509 * btrfs subvolume delete /mnt/x/snap 4510 * rmdir /mnt/x 4511 * mkdir /mnt/x 4512 * fsync /mnt/x or fsync some new file inside it 4513 * <power fail> 4514 * 4515 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4516 * the same transaction. 4517 */ 4518 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4519 const int slot, 4520 const struct btrfs_key *key, 4521 struct btrfs_inode *inode, 4522 u64 *other_ino) 4523 { 4524 int ret; 4525 struct btrfs_path *search_path; 4526 char *name = NULL; 4527 u32 name_len = 0; 4528 u32 item_size = btrfs_item_size_nr(eb, slot); 4529 u32 cur_offset = 0; 4530 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4531 4532 search_path = btrfs_alloc_path(); 4533 if (!search_path) 4534 return -ENOMEM; 4535 search_path->search_commit_root = 1; 4536 search_path->skip_locking = 1; 4537 4538 while (cur_offset < item_size) { 4539 u64 parent; 4540 u32 this_name_len; 4541 u32 this_len; 4542 unsigned long name_ptr; 4543 struct btrfs_dir_item *di; 4544 4545 if (key->type == BTRFS_INODE_REF_KEY) { 4546 struct btrfs_inode_ref *iref; 4547 4548 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4549 parent = key->offset; 4550 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4551 name_ptr = (unsigned long)(iref + 1); 4552 this_len = sizeof(*iref) + this_name_len; 4553 } else { 4554 struct btrfs_inode_extref *extref; 4555 4556 extref = (struct btrfs_inode_extref *)(ptr + 4557 cur_offset); 4558 parent = btrfs_inode_extref_parent(eb, extref); 4559 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4560 name_ptr = (unsigned long)&extref->name; 4561 this_len = sizeof(*extref) + this_name_len; 4562 } 4563 4564 ret = btrfs_is_name_len_valid(eb, slot, name_ptr, 4565 this_name_len); 4566 if (!ret) { 4567 ret = -EIO; 4568 goto out; 4569 } 4570 if (this_name_len > name_len) { 4571 char *new_name; 4572 4573 new_name = krealloc(name, this_name_len, GFP_NOFS); 4574 if (!new_name) { 4575 ret = -ENOMEM; 4576 goto out; 4577 } 4578 name_len = this_name_len; 4579 name = new_name; 4580 } 4581 4582 read_extent_buffer(eb, name, name_ptr, this_name_len); 4583 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4584 parent, name, this_name_len, 0); 4585 if (di && !IS_ERR(di)) { 4586 struct btrfs_key di_key; 4587 4588 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4589 di, &di_key); 4590 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4591 ret = 1; 4592 *other_ino = di_key.objectid; 
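				/*
				 * Returning 1 with *other_ino set tells the
				 * caller (btrfs_log_inode()) which inode owns
				 * the conflicting name, so it can log that
				 * inode with LOG_OTHER_INODE before retrying.
				 */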
4593 } else { 4594 ret = -EAGAIN; 4595 } 4596 goto out; 4597 } else if (IS_ERR(di)) { 4598 ret = PTR_ERR(di); 4599 goto out; 4600 } 4601 btrfs_release_path(search_path); 4602 4603 cur_offset += this_len; 4604 } 4605 ret = 0; 4606 out: 4607 btrfs_free_path(search_path); 4608 kfree(name); 4609 return ret; 4610 } 4611 4612 /* log a single inode in the tree log. 4613 * At least one parent directory for this inode must exist in the tree 4614 * or be logged already. 4615 * 4616 * Any items from this inode changed by the current transaction are copied 4617 * to the log tree. An extra reference is taken on any extents in this 4618 * file, allowing us to avoid a whole pile of corner cases around logging 4619 * blocks that have been removed from the tree. 4620 * 4621 * See LOG_INODE_ALL and related defines for a description of what inode_only 4622 * does. 4623 * 4624 * This handles both files and directories. 4625 */ 4626 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4627 struct btrfs_root *root, struct btrfs_inode *inode, 4628 int inode_only, 4629 const loff_t start, 4630 const loff_t end, 4631 struct btrfs_log_ctx *ctx) 4632 { 4633 struct btrfs_fs_info *fs_info = root->fs_info; 4634 struct btrfs_path *path; 4635 struct btrfs_path *dst_path; 4636 struct btrfs_key min_key; 4637 struct btrfs_key max_key; 4638 struct btrfs_root *log = root->log_root; 4639 struct extent_buffer *src = NULL; 4640 LIST_HEAD(logged_list); 4641 u64 last_extent = 0; 4642 int err = 0; 4643 int ret; 4644 int nritems; 4645 int ins_start_slot = 0; 4646 int ins_nr; 4647 bool fast_search = false; 4648 u64 ino = btrfs_ino(inode); 4649 struct extent_map_tree *em_tree = &inode->extent_tree; 4650 u64 logged_isize = 0; 4651 bool need_log_inode_item = true; 4652 4653 path = btrfs_alloc_path(); 4654 if (!path) 4655 return -ENOMEM; 4656 dst_path = btrfs_alloc_path(); 4657 if (!dst_path) { 4658 btrfs_free_path(path); 4659 return -ENOMEM; 4660 } 4661 4662 min_key.objectid = ino; 4663 min_key.type = BTRFS_INODE_ITEM_KEY; 4664 min_key.offset = 0; 4665 4666 max_key.objectid = ino; 4667 4668 4669 /* today the code can only do partial logging of directories */ 4670 if (S_ISDIR(inode->vfs_inode.i_mode) || 4671 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4672 &inode->runtime_flags) && 4673 inode_only >= LOG_INODE_EXISTS)) 4674 max_key.type = BTRFS_XATTR_ITEM_KEY; 4675 else 4676 max_key.type = (u8)-1; 4677 max_key.offset = (u64)-1; 4678 4679 /* 4680 * Only run delayed items if we are a dir or a new file. 4681 * Otherwise commit the delayed inode only, which is needed in 4682 * order for the log replay code to mark inodes for link count 4683 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4684 */ 4685 if (S_ISDIR(inode->vfs_inode.i_mode) || 4686 inode->generation > fs_info->last_trans_committed) 4687 ret = btrfs_commit_inode_delayed_items(trans, inode); 4688 else 4689 ret = btrfs_commit_inode_delayed_inode(inode); 4690 4691 if (ret) { 4692 btrfs_free_path(path); 4693 btrfs_free_path(dst_path); 4694 return ret; 4695 } 4696 4697 if (inode_only == LOG_OTHER_INODE) { 4698 inode_only = LOG_INODE_EXISTS; 4699 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4700 } else { 4701 mutex_lock(&inode->log_mutex); 4702 } 4703 4704 /* 4705 * a brute force approach to making sure we get the most uptodate 4706 * copies of everything. 
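	 * In practice this means first dropping anything previously logged
	 * for this inode (drop_objectid_items() or, for a full sync, a
	 * truncate of the logged items) and then copying fresh items from
	 * the fs/subvol tree.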
	 */
	if (S_ISDIR(inode->vfs_inode.i_mode)) {
		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;

		if (inode_only == LOG_INODE_EXISTS)
			max_key_type = BTRFS_XATTR_ITEM_KEY;
		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
	} else {
		if (inode_only == LOG_INODE_EXISTS) {
			/*
			 * Make sure the new inode item we write to the log has
			 * the same isize as the current one (if it exists).
			 * This is necessary to prevent data loss after log
			 * replay, and also to prevent doing a wrong expanding
			 * truncate - e.g. create a file, write 4K into offset
			 * 0, fsync, write 4K into offset 4096, add a hard
			 * link, fsync some other file (to sync the log),
			 * power fail - if we use the inode's current i_size,
			 * after log replay we get an 8Kb file, with the last
			 * 4Kb extent as a hole (zeroes), as if an expanding
			 * truncate happened, instead of getting a file of 4Kb
			 * only.
			 */
			err = logged_inode_size(log, inode, path, &logged_isize);
			if (err)
				goto out_unlock;
		}
		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			     &inode->runtime_flags)) {
			if (inode_only == LOG_INODE_EXISTS) {
				max_key.type = BTRFS_XATTR_ITEM_KEY;
				ret = drop_objectid_items(trans, log, path, ino,
							  max_key.type);
			} else {
				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
					  &inode->runtime_flags);
				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
					  &inode->runtime_flags);
				while (1) {
					ret = btrfs_truncate_inode_items(trans,
						log, &inode->vfs_inode, 0, 0);
					if (ret != -EAGAIN)
						break;
				}
			}
		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
					      &inode->runtime_flags) ||
			   inode_only == LOG_INODE_EXISTS) {
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			max_key.type = BTRFS_XATTR_ITEM_KEY;
			ret = drop_objectid_items(trans, log, path, ino,
						  max_key.type);
		} else {
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			goto log_extents;
		}

	}
	if (ret) {
		err = ret;
		goto out_unlock;
	}

	while (1) {
		ins_nr = 0;
		ret = btrfs_search_forward(root, &min_key,
					   path, trans->transid);
		if (ret < 0) {
			err = ret;
			goto out_unlock;
		}
		if (ret != 0)
			break;
again:
		/* note, ins_nr might be > 0 here, cleanup outside the loop */
		if (min_key.objectid != ino)
			break;
		if (min_key.type > max_key.type)
			break;

		if (min_key.type == BTRFS_INODE_ITEM_KEY)
			need_log_inode_item = false;

		if ((min_key.type == BTRFS_INODE_REF_KEY ||
		     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
		    inode->generation == trans->transid) {
			u64 other_ino = 0;

			ret = btrfs_check_ref_name_override(path->nodes[0],
					path->slots[0], &min_key, inode,
					&other_ino);
			if (ret < 0) {
				err = ret;
				goto out_unlock;
			} else if (ret > 0 && ctx &&
				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
				struct btrfs_key inode_key;
				struct inode *other_inode;

				if (ins_nr > 0) {
					ins_nr++;
				} else {
					ins_nr = 1;
					ins_start_slot = path->slots[0];
				}
				ret = copy_items(trans, inode, dst_path, path,
						 &last_extent, ins_start_slot,
						 ins_nr, inode_only,
						 logged_isize);
				if (ret < 0) {
					err = ret;
					goto out_unlock;
				}
				ins_nr = 0;
				btrfs_release_path(path);
				inode_key.objectid = other_ino;
				inode_key.type = BTRFS_INODE_ITEM_KEY;
				inode_key.offset = 0;
4826 other_inode = btrfs_iget(fs_info->sb, 4827 &inode_key, root, 4828 NULL); 4829 /* 4830 * If the other inode that had a conflicting dir 4831 * entry was deleted in the current transaction, 4832 * we don't need to do more work nor fallback to 4833 * a transaction commit. 4834 */ 4835 if (IS_ERR(other_inode) && 4836 PTR_ERR(other_inode) == -ENOENT) { 4837 goto next_key; 4838 } else if (IS_ERR(other_inode)) { 4839 err = PTR_ERR(other_inode); 4840 goto out_unlock; 4841 } 4842 /* 4843 * We are safe logging the other inode without 4844 * acquiring its i_mutex as long as we log with 4845 * the LOG_INODE_EXISTS mode. We're safe against 4846 * concurrent renames of the other inode as well 4847 * because during a rename we pin the log and 4848 * update the log with the new name before we 4849 * unpin it. 4850 */ 4851 err = btrfs_log_inode(trans, root, 4852 BTRFS_I(other_inode), 4853 LOG_OTHER_INODE, 0, LLONG_MAX, 4854 ctx); 4855 iput(other_inode); 4856 if (err) 4857 goto out_unlock; 4858 else 4859 goto next_key; 4860 } 4861 } 4862 4863 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4864 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4865 if (ins_nr == 0) 4866 goto next_slot; 4867 ret = copy_items(trans, inode, dst_path, path, 4868 &last_extent, ins_start_slot, 4869 ins_nr, inode_only, logged_isize); 4870 if (ret < 0) { 4871 err = ret; 4872 goto out_unlock; 4873 } 4874 ins_nr = 0; 4875 if (ret) { 4876 btrfs_release_path(path); 4877 continue; 4878 } 4879 goto next_slot; 4880 } 4881 4882 src = path->nodes[0]; 4883 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4884 ins_nr++; 4885 goto next_slot; 4886 } else if (!ins_nr) { 4887 ins_start_slot = path->slots[0]; 4888 ins_nr = 1; 4889 goto next_slot; 4890 } 4891 4892 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4893 ins_start_slot, ins_nr, inode_only, 4894 logged_isize); 4895 if (ret < 0) { 4896 err = ret; 4897 goto out_unlock; 4898 } 4899 if (ret) { 4900 ins_nr = 0; 4901 btrfs_release_path(path); 4902 continue; 4903 } 4904 ins_nr = 1; 4905 ins_start_slot = path->slots[0]; 4906 next_slot: 4907 4908 nritems = btrfs_header_nritems(path->nodes[0]); 4909 path->slots[0]++; 4910 if (path->slots[0] < nritems) { 4911 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 4912 path->slots[0]); 4913 goto again; 4914 } 4915 if (ins_nr) { 4916 ret = copy_items(trans, inode, dst_path, path, 4917 &last_extent, ins_start_slot, 4918 ins_nr, inode_only, logged_isize); 4919 if (ret < 0) { 4920 err = ret; 4921 goto out_unlock; 4922 } 4923 ret = 0; 4924 ins_nr = 0; 4925 } 4926 btrfs_release_path(path); 4927 next_key: 4928 if (min_key.offset < (u64)-1) { 4929 min_key.offset++; 4930 } else if (min_key.type < max_key.type) { 4931 min_key.type++; 4932 min_key.offset = 0; 4933 } else { 4934 break; 4935 } 4936 } 4937 if (ins_nr) { 4938 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4939 ins_start_slot, ins_nr, inode_only, 4940 logged_isize); 4941 if (ret < 0) { 4942 err = ret; 4943 goto out_unlock; 4944 } 4945 ret = 0; 4946 ins_nr = 0; 4947 } 4948 4949 btrfs_release_path(path); 4950 btrfs_release_path(dst_path); 4951 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 4952 if (err) 4953 goto out_unlock; 4954 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 4955 btrfs_release_path(path); 4956 btrfs_release_path(dst_path); 4957 err = btrfs_log_trailing_hole(trans, root, inode, path); 4958 if (err) 4959 goto out_unlock; 4960 } 4961 log_extents: 4962 btrfs_release_path(path); 4963 btrfs_release_path(dst_path); 
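	/*
	 * If the search loop above never copied the inode item (e.g. we
	 * jumped straight to log_extents on the fast path), write an up to
	 * date copy of it now so log replay can recreate the inode.
	 */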
	if (need_log_inode_item) {
		err = log_inode_item(trans, log, dst_path, inode);
		if (err)
			goto out_unlock;
	}
	if (fast_search) {
		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
						&logged_list, ctx, start, end);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	} else if (inode_only == LOG_INODE_ALL) {
		struct extent_map *em, *n;

		write_lock(&em_tree->lock);
		/*
		 * We can't just remove every em if we're called for a ranged
		 * fsync - that is, one that doesn't cover the whole possible
		 * file range (0 to LLONG_MAX). This is because we can have
		 * em's that fall outside the range we're logging and therefore
		 * their ordered operations haven't completed yet
		 * (btrfs_finish_ordered_io() not invoked yet). This means we
		 * didn't get their respective file extent item in the fs/subvol
		 * tree yet, and need to let the next fast fsync (one which
		 * consults the list of modified extent maps) find the em so
		 * that it logs a matching file extent item and waits for the
		 * respective ordered operation to complete (if it's still
		 * running).
		 *
		 * Removing every em outside the range we're logging would make
		 * the next fast fsync not log their matching file extent items,
		 * therefore making us lose data after a log replay.
		 */
		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
					 list) {
			const u64 mod_end = em->mod_start + em->mod_len - 1;

			if (em->mod_start >= start && mod_end <= end)
				list_del_init(&em->list);
		}
		write_unlock(&em_tree->lock);
	}

	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
		ret = log_directory_changes(trans, root, inode, path, dst_path,
					    ctx);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	}

	spin_lock(&inode->lock);
	inode->logged_trans = trans->transid;
	inode->last_log_commit = inode->last_sub_trans;
	spin_unlock(&inode->lock);
out_unlock:
	if (unlikely(err))
		btrfs_put_logged_extents(&logged_list);
	else
		btrfs_submit_logged_extents(&logged_list, log);
	mutex_unlock(&inode->log_mutex);

	btrfs_free_path(path);
	btrfs_free_path(dst_path);
	return err;
}

/*
 * Check if we must fallback to a transaction commit when logging an inode.
 * This must be called after logging the inode and is used only in the context
 * when fsyncing an inode requires logging some other inode - in which case we
 * can't lock the i_mutex of each other inode we need to log, as that can lead
 * to deadlocks with concurrent fsync against other inodes (as we can log
 * inodes up or down in the hierarchy) or rename operations for example. So we
 * take the log_mutex of the inode after we have logged it and then check for
 * its last_unlink_trans value - this is safe because any task setting
 * last_unlink_trans must take the log_mutex and it must do this before it does
 * the actual unlink operation, so if we do this check before a concurrent task
 * sets last_unlink_trans it means we've logged a consistent version/state of
 * all the inode items, otherwise we are not sure and must do a transaction
 * commit (the concurrent task might have only updated last_unlink_trans before
 * we logged the inode or it might have also done the unlink).
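 *
 * A possible interleaving, for illustration only:
 *
 *        fsync task                      unlink/rename task
 *        ----------                      ------------------
 *   logs the inode
 *                                        takes log_mutex
 *                                        sets last_unlink_trans
 *                                        drops log_mutex
 *                                        does the unlink
 *   takes log_mutex (here)
 *   sees the new last_unlink_trans
 *   -> must fall back to a full
 *      transaction commit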
 */
static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
					  struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool ret = false;

	mutex_lock(&inode->log_mutex);
	if (inode->last_unlink_trans > fs_info->last_trans_committed) {
		/*
		 * Make sure any commits to the log are forced to be full
		 * commits.
		 */
		btrfs_set_log_full_commit(fs_info, trans);
		ret = true;
	}
	mutex_unlock(&inode->log_mutex);

	return ret;
}

/*
 * follow the dentry parent pointers up the chain and see if any
 * of the directories in it require a full commit before they can
 * be logged. Returns zero if nothing special needs to be done or 1 if
 * a full commit is required.
 */
static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
					       struct btrfs_inode *inode,
					       struct dentry *parent,
					       struct super_block *sb,
					       u64 last_committed)
{
	int ret = 0;
	struct dentry *old_parent = NULL;
	struct btrfs_inode *orig_inode = inode;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->vfs_inode.i_mode) &&
	    inode->generation <= last_committed &&
	    inode->last_unlink_trans <= last_committed)
		goto out;

	if (!S_ISDIR(inode->vfs_inode.i_mode)) {
		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
			goto out;
		inode = BTRFS_I(d_inode(parent));
	}

	while (1) {
		/*
		 * If we are logging a directory then we start with our inode,
		 * not our parent's inode, so we need to skip setting the
		 * logged_trans so that further down in the log code we don't
		 * think this inode has already been logged.
		 */
		if (inode != orig_inode)
			inode->logged_trans = trans->transid;
		smp_mb();

		if (btrfs_must_commit_transaction(trans, inode)) {
			ret = 1;
			break;
		}

		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
			break;

		if (IS_ROOT(parent)) {
			inode = BTRFS_I(d_inode(parent));
			if (btrfs_must_commit_transaction(trans, inode))
				ret = 1;
			break;
		}

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
		inode = BTRFS_I(d_inode(parent));

	}
	dput(old_parent);
out:
	return ret;
}

struct btrfs_dir_list {
	u64 ino;
	struct list_head list;
};

/*
 * Log the inodes of the new dentries of a directory. See log_dir_items() for
 * details about why it is needed.
 * This is a recursive operation - if an existing dentry corresponds to a
 * directory, that directory's new entries are logged too (same behaviour as
 * ext3/4, xfs, f2fs, reiserfs, nilfs2).
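 *
 * For example (illustration only):
 *
 * mkdir /mnt/dir
 * touch /mnt/dir/foo
 * mkdir /mnt/dir/subdir
 * touch /mnt/dir/subdir/bar
 * xfs_io -c fsync /mnt/dir
 *
 * Logging /mnt/dir makes us log its new dentries, so the inodes of foo and
 * subdir get logged too, and since subdir is a directory with new dentries
 * of its own, so does the inode of bar.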
Note that when logging the inodes 5150 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5151 * complains about the following circular lock dependency / possible deadlock: 5152 * 5153 * CPU0 CPU1 5154 * ---- ---- 5155 * lock(&type->i_mutex_dir_key#3/2); 5156 * lock(sb_internal#2); 5157 * lock(&type->i_mutex_dir_key#3/2); 5158 * lock(&sb->s_type->i_mutex_key#14); 5159 * 5160 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5161 * sb_start_intwrite() in btrfs_start_transaction(). 5162 * Not locking i_mutex of the inodes is still safe because: 5163 * 5164 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5165 * that while logging the inode new references (names) are added or removed 5166 * from the inode, leaving the logged inode item with a link count that does 5167 * not match the number of logged inode reference items. This is fine because 5168 * at log replay time we compute the real number of links and correct the 5169 * link count in the inode item (see replay_one_buffer() and 5170 * link_to_fixup_dir()); 5171 * 5172 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5173 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5174 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5175 * has a size that doesn't match the sum of the lengths of all the logged 5176 * names. This does not result in a problem because if a dir_item key is 5177 * logged but its matching dir_index key is not logged, at log replay time we 5178 * don't use it to replay the respective name (see replay_one_name()). On the 5179 * other hand if only the dir_index key ends up being logged, the respective 5180 * name is added to the fs/subvol tree with both the dir_item and dir_index 5181 * keys created (see replay_one_name()). 5182 * The directory's inode item with a wrong i_size is not a problem as well, 5183 * since we don't use it at log replay time to set the i_size in the inode 5184 * item of the fs/subvol tree (see overwrite_item()). 
5185 */ 5186 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5187 struct btrfs_root *root, 5188 struct btrfs_inode *start_inode, 5189 struct btrfs_log_ctx *ctx) 5190 { 5191 struct btrfs_fs_info *fs_info = root->fs_info; 5192 struct btrfs_root *log = root->log_root; 5193 struct btrfs_path *path; 5194 LIST_HEAD(dir_list); 5195 struct btrfs_dir_list *dir_elem; 5196 int ret = 0; 5197 5198 path = btrfs_alloc_path(); 5199 if (!path) 5200 return -ENOMEM; 5201 5202 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5203 if (!dir_elem) { 5204 btrfs_free_path(path); 5205 return -ENOMEM; 5206 } 5207 dir_elem->ino = btrfs_ino(start_inode); 5208 list_add_tail(&dir_elem->list, &dir_list); 5209 5210 while (!list_empty(&dir_list)) { 5211 struct extent_buffer *leaf; 5212 struct btrfs_key min_key; 5213 int nritems; 5214 int i; 5215 5216 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5217 list); 5218 if (ret) 5219 goto next_dir_inode; 5220 5221 min_key.objectid = dir_elem->ino; 5222 min_key.type = BTRFS_DIR_ITEM_KEY; 5223 min_key.offset = 0; 5224 again: 5225 btrfs_release_path(path); 5226 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5227 if (ret < 0) { 5228 goto next_dir_inode; 5229 } else if (ret > 0) { 5230 ret = 0; 5231 goto next_dir_inode; 5232 } 5233 5234 process_leaf: 5235 leaf = path->nodes[0]; 5236 nritems = btrfs_header_nritems(leaf); 5237 for (i = path->slots[0]; i < nritems; i++) { 5238 struct btrfs_dir_item *di; 5239 struct btrfs_key di_key; 5240 struct inode *di_inode; 5241 struct btrfs_dir_list *new_dir_elem; 5242 int log_mode = LOG_INODE_EXISTS; 5243 int type; 5244 5245 btrfs_item_key_to_cpu(leaf, &min_key, i); 5246 if (min_key.objectid != dir_elem->ino || 5247 min_key.type != BTRFS_DIR_ITEM_KEY) 5248 goto next_dir_inode; 5249 5250 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5251 type = btrfs_dir_type(leaf, di); 5252 if (btrfs_dir_transid(leaf, di) < trans->transid && 5253 type != BTRFS_FT_DIR) 5254 continue; 5255 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5256 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5257 continue; 5258 5259 btrfs_release_path(path); 5260 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5261 if (IS_ERR(di_inode)) { 5262 ret = PTR_ERR(di_inode); 5263 goto next_dir_inode; 5264 } 5265 5266 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5267 iput(di_inode); 5268 break; 5269 } 5270 5271 ctx->log_new_dentries = false; 5272 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5273 log_mode = LOG_INODE_ALL; 5274 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5275 log_mode, 0, LLONG_MAX, ctx); 5276 if (!ret && 5277 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5278 ret = 1; 5279 iput(di_inode); 5280 if (ret) 5281 goto next_dir_inode; 5282 if (ctx->log_new_dentries) { 5283 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5284 GFP_NOFS); 5285 if (!new_dir_elem) { 5286 ret = -ENOMEM; 5287 goto next_dir_inode; 5288 } 5289 new_dir_elem->ino = di_key.objectid; 5290 list_add_tail(&new_dir_elem->list, &dir_list); 5291 } 5292 break; 5293 } 5294 if (i == nritems) { 5295 ret = btrfs_next_leaf(log, path); 5296 if (ret < 0) { 5297 goto next_dir_inode; 5298 } else if (ret > 0) { 5299 ret = 0; 5300 goto next_dir_inode; 5301 } 5302 goto process_leaf; 5303 } 5304 if (min_key.offset < (u64)-1) { 5305 min_key.offset++; 5306 goto again; 5307 } 5308 next_dir_inode: 5309 list_del(&dir_elem->list); 5310 kfree(dir_elem); 5311 } 5312 5313 btrfs_free_path(path); 5314 return ret; 5315 } 5316 5317 static int 
btrfs_log_all_parents(struct btrfs_trans_handle *trans,
		      struct btrfs_inode *inode,
		      struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root *root = inode->root;
	const u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->skip_locking = 1;
	path->search_commit_root = 1;

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		u32 cur_offset = 0;
		u32 item_size;
		unsigned long ptr;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		item_size = btrfs_item_size_nr(leaf, slot);
		ptr = btrfs_item_ptr_offset(leaf, slot);
		while (cur_offset < item_size) {
			struct btrfs_key inode_key;
			struct inode *dir_inode;

			inode_key.type = BTRFS_INODE_ITEM_KEY;
			inode_key.offset = 0;

			if (key.type == BTRFS_INODE_EXTREF_KEY) {
				struct btrfs_inode_extref *extref;

				extref = (struct btrfs_inode_extref *)
					(ptr + cur_offset);
				inode_key.objectid = btrfs_inode_extref_parent(
					leaf, extref);
				cur_offset += sizeof(*extref);
				cur_offset += btrfs_inode_extref_name_len(leaf,
					extref);
			} else {
				inode_key.objectid = key.offset;
				cur_offset = item_size;
			}

			dir_inode = btrfs_iget(fs_info->sb, &inode_key,
					       root, NULL);
			/* If parent inode was deleted, skip it. */
			if (IS_ERR(dir_inode))
				continue;

			if (ctx)
				ctx->log_new_dentries = false;
			ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
			if (!ret &&
			    btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
				ret = 1;
			if (!ret && ctx && ctx->log_new_dentries)
				ret = log_new_dir_dentries(trans, root,
						   BTRFS_I(dir_inode), ctx);
			iput(dir_inode);
			if (ret)
				goto out;
		}
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log. Minimal logging (inode item
 * and backrefs only) is done for any parent directories that are older
 * than the last committed transaction.
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  const loff_t start,
				  const loff_t end,
				  int exists_only,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
	struct super_block *sb;
	struct dentry *old_parent = NULL;
	int ret = 0;
	u64 last_committed = fs_info->last_trans_committed;
	bool log_dentries = false;
	struct btrfs_inode *orig_inode = inode;

	sb = inode->vfs_inode.i_sb;

	if (btrfs_test_opt(fs_info, NOTREELOG)) {
		ret = 1;
		goto end_no_trans;
	}

	/*
	 * If the previous transaction commit didn't complete, we have to do
	 * a full commit by ourselves.
	 */
	if (fs_info->last_trans_log_full_commit >
	    fs_info->last_trans_committed) {
		ret = 1;
		goto end_no_trans;
	}

	if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) {
		ret = 1;
		goto end_no_trans;
	}

	ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
					 last_committed);
	if (ret)
		goto end_no_trans;

	if (btrfs_inode_in_log(inode, trans->transid)) {
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
	if (ret)
		goto end_trans;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->vfs_inode.i_mode) &&
	    inode->generation <= last_committed &&
	    inode->last_unlink_trans <= last_committed) {
		ret = 0;
		goto end_trans;
	}

	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
		log_dentries = true;

	/*
	 * On unlink we must make sure all our current and old parent directory
	 * inodes are fully logged. This is to prevent leaving dangling
	 * directory index entries in directories that were our parents but are
	 * not anymore. Not doing this results in the old parent directory
	 * being impossible to delete after log replay (rmdir will always fail
	 * with error -ENOTEMPTY).
	 *
	 * Example 1:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * ln testdir/foo testdir/bar
	 * sync
	 * unlink testdir/bar
	 * xfs_io -c fsync testdir/foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * If we don't log the parent directory (testdir), after log replay the
	 * directory still has an entry pointing to the file inode using the bar
	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
	 * the file inode has a link count of 1.
	 *
	 * Example 2:
	 *
	 * mkdir testdir
	 * touch foo
	 * ln foo testdir/foo2
	 * ln foo testdir/foo3
	 * sync
	 * unlink testdir/foo3
	 * xfs_io -c fsync foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * Similar to the first example, after log replay the parent directory
	 * testdir still has an entry pointing to the file inode with name foo3
	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
	 * and has a link count of 2.
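	 *
	 * In both examples this is why, right below, we log all the parent
	 * inodes (btrfs_log_all_parents()) whenever last_unlink_trans is
	 * newer than the last committed transaction.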
	 */
	if (inode->last_unlink_trans > last_committed) {
		ret = btrfs_log_all_parents(trans, orig_inode, ctx);
		if (ret)
			goto end_trans;
	}

	while (1) {
		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
			break;

		inode = BTRFS_I(d_inode(parent));
		if (root != inode->root)
			break;

		if (inode->generation > last_committed) {
			ret = btrfs_log_inode(trans, root, inode,
					LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
			if (ret)
				goto end_trans;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	if (log_dentries)
		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
	else
		ret = 0;
end_trans:
	dput(old_parent);
	if (ret < 0) {
		btrfs_set_log_full_commit(fs_info, trans);
		ret = 1;
	}

	if (ret)
		btrfs_remove_log_ctx(root, ctx);
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}

/*
 * it is not safe to log the dentry if the chunk root has added new chunks.
 * This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
			  const loff_t start,
			  const loff_t end,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)),
				     parent, start, end, 0, ctx);
	dput(parent);

	return ret;
}

/*
 * should be called during mount to recover (replay) any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key tmp_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = 0,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
			"Failed to pin buffers while recovering log root tree.");
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_handle_fs_error(fs_info, ret,
				    "Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_fs_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_handle_fs_error(fs_info, ret,
				    "Couldn't read tree log root.");
			goto error;
		}

		tmp_key.objectid = found_key.offset;
		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
		tmp_key.offset = (u64)-1;

		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);
			free_extent_buffer(log->node);
			free_extent_buffer(log->commit_root);
			kfree(log);
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read target root for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		key.offset = found_key.offset - 1;
		wc.replay_dest->log_root = NULL;
		free_extent_buffer(log->node);
		free_extent_buffer(log->commit_root);
		kfree(log);

		if (ret)
			goto error;

		if (found_key.offset == 0)
			break;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	free_extent_buffer(log_root_tree->node);
	log_root_tree->log_root = NULL;
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	kfree(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans);
	btrfs_free_path(path);
	return ret;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
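	 *
	 * For example (illustration only):
	 *
	 * touch dir/foo
	 * ln dir/foo dir/bar
	 * sync
	 * unlink dir/bar
	 * xfs_io -c fsync dir/foo
	 *
	 * Because the unlink stored this transid in foo's
	 * last_unlink_trans, the fsync of foo knows it cannot take the
	 * "fully on disk" shortcut and must also log the parent directory.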
	 */
	mutex_lock(&inode->log_mutex);
	inode->last_unlink_trans = trans->transid;
	mutex_unlock(&inode->log_mutex);

	/*
	 * if this directory was already logged any new
	 * names for this file/dir will get recorded
	 */
	smp_mb();
	if (dir->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (inode->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Make sure that if someone attempts to fsync the parent directory of a deleted
 * snapshot, it ends up triggering a transaction commit. This is to guarantee
 * that after replaying the log tree of the parent directory's root we will not
 * see the snapshot anymore and at log replay time we will not see any log tree
 * corresponding to the deleted snapshot's root, which could lead to replaying
 * it after replaying the log tree of the parent directory (which would replay
 * the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and tree of tree roots trees, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 *
 * It will return zero if all goes well, and it will return 1 if a
 * full transaction commit is required.
 */
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		       struct btrfs_inode *inode, struct btrfs_inode *old_dir,
		       struct dentry *parent)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
	struct btrfs_root *root = inode->root;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (S_ISREG(inode->vfs_inode.i_mode))
		inode->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming
	 * it from hasn't been logged, we don't need to log it
	 */
	if (inode->logged_trans <= fs_info->last_trans_committed &&
	    (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
		return 0;

	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
				      LLONG_MAX, 1, NULL);
}