/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"
#include "compression.h"
#include "qgroup.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */
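/*
 * Rough shape of the fsync path that drives this machinery, for
 * orientation only (the callers named here live elsewhere in btrfs):
 *
 *	btrfs_sync_file()
 *	    btrfs_log_dentry_safe()
 *	        btrfs_log_inode_parent()
 *	            btrfs_log_inode()	copy changed items into the log tree
 *	    btrfs_sync_log()		write the log root to disk
 */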
/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/*
		 * Implicit memory barrier after atomic_dec_and_test
		 */
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}
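/*
 * Typical pairing of the helpers above (a sketch, not a verbatim caller):
 *
 *	ret = start_log_trans(trans, root, ctx);
 *	if (ret)
 *		return ret;	(-EAGAIN means fall back to a full commit)
 *	ret = btrfs_log_inode(trans, root, inode, LOG_INODE_ALL,
 *			      start, end, ctx);
 *	btrfs_end_log_trans(root);
 */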
/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(fs_info, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
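/*
 * Sketch of how a walk is set up during log recovery, modeled on the use
 * in btrfs_recover_log_trees() later in this file:
 *
 *	struct walk_control wc = {
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *	};
 *
 * The later passes reuse the same struct with wc.stage advanced through
 * the LOG_WALK_REPLAY_* values and wc.process_func switched to
 * replay_one_buffer().
 */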
/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(fs_info, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(fs_info, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}
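/*
 * overwrite_item() thus behaves like an idempotent upsert keyed by the
 * (objectid, type, offset) triple: replaying the same item twice is a
 * no-op the second time (the memcmp of identical bytes returns early),
 * while a logged item of a different size first gets the destination
 * item truncated or extended to match before the copy.
 */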
/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans, fs_info,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?
			 * If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, fs_info,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						fs_info,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range.  We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls).  In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other.  For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.  Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will
			 * end up looking at the second csum item only, which
			 * does not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans, fs_info,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
							fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
			name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, fs_info);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}
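/*
 * For example, a name "foo" at index 3 in directory 256 pointing at
 * inode 257 shows up twice in the subvolume:
 *
 *	key (256 DIR_INDEX 3)		-> points at inode 257
 *	key (256 DIR_ITEM hash("foo"))	-> points at inode 257
 *
 * inode_in_dir() only reports a match when both entries exist and both
 * point at the expected inode.
 */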
/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}
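/*
 * e.g. if the subvolume has the link "dir/a" -> inode 257 but the log
 * only recorded "dir/b" -> 257, backref_in_log() returns 0 for "a" and
 * the conflict resolution below unlinks the stale name during replay,
 * while "b" is allowed to stay.
 */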
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, fs_info);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans,
								fs_info);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
			if (ret)
				return ret;
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}
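/*
 * On-disk layout of the two back reference flavours parsed by the
 * helpers below (field order as defined in ctree.h):
 *
 *	INODE_REF item body:	(index, name_len, name bytes),
 *				possibly repeated within one item
 *	INODE_EXTREF item body:	(parent objectid, index, name_len,
 *				name bytes), also possibly repeated
 */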
static int extref_get_fields(struct extent_buffer *eb, int slot,
			     unsigned long ref_ptr, u32 *namelen, char **name,
			     u64 *index, u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
				     *namelen))
		return -EIO;

	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, int slot,
			  unsigned long ref_ptr, u32 *namelen, char **name,
			  u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
				     *namelen))
		return -EIO;

	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 *
 * it is possible that we didn't log all the parent directories
 * for a given inode.  If we don't find the dir, we just don't
 * copy the back ref in.  The link count fixup code will take
 * care of the rest.
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
						&name, &ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
					     &name, &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
					btrfs_ino(BTRFS_I(inode)), ref_index,
					name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, BTRFS_I(dir),
					     BTRFS_I(inode),
					     name, namelen, 0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}
static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}
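/*
 * For example, an inode 257 hard linked as "a" and "b" inside directory
 * 256 carries a single item with key (257 INODE_REF 256) holding two
 * (index, name) entries; count_inode_refs() walks that item and counts
 * 2.  Links from other directories live in separate items, since the
 * key offset is the parent dirid, and are picked up by the outer loop.
 */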
/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}
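/*
 * e.g. after replay adds a name for inode 258, link_to_fixup_dir()
 * leaves this marker behind:
 *
 *	key (BTRFS_TREE_LOG_FIXUP_OBJECTID ORPHAN_ITEM 258)
 *
 * fixup_inode_link_counts() then consumes the markers highest offset
 * first, recounting the back references of each flagged inode.
 */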
/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}
/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}
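/*
 * Summary of the outcomes above, matching the return convention
 * documented before replay_one_name():
 *
 *	existing entry already matches the log	-> nothing to do, 0
 *	conflicting entry and the target inode
 *	exists in the subvolume			-> old entry dropped, new
 *						   name inserted, 1
 *	logged name points at an inode that was
 *	never created during replay		-> skipped, 0
 */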
/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(fs_info, eb, slot, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fixup the link count of
		 * the inode the entry points to.  Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for this
		 * dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir.  This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}
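/*
 * Example of the range items described below, with illustrative values:
 * an item with key (256 DIR_LOG_INDEX 0) whose btrfs_dir_log_item end
 * field is 37 declares the log authoritative for index keys [0, 37] of
 * directory 256.  Any DIR_INDEX key of that directory inside the range
 * but absent from the log was deleted before the fsync.
 */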
/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	path->slots[0]++;
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
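/*
 * Callers walk the authoritative windows one after another; a sketch of
 * the loop shape used by the deletion replay code:
 *
 *	range_start = 0;
 *	while (find_dir_range(log, path, dirid, key_type,
 *			      &range_start, &range_end) == 0) {
 *		... remove subvolume keys in [range_start, range_end]
 *		    that are missing from the log ...
 *		range_start = range_end + 1;
 *	}
 */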
If the directory 1996 * item is not in the log, the item is removed and the inode it points 1997 * to is unlinked 1998 */ 1999 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2000 struct btrfs_root *root, 2001 struct btrfs_root *log, 2002 struct btrfs_path *path, 2003 struct btrfs_path *log_path, 2004 struct inode *dir, 2005 struct btrfs_key *dir_key) 2006 { 2007 struct btrfs_fs_info *fs_info = root->fs_info; 2008 int ret; 2009 struct extent_buffer *eb; 2010 int slot; 2011 u32 item_size; 2012 struct btrfs_dir_item *di; 2013 struct btrfs_dir_item *log_di; 2014 int name_len; 2015 unsigned long ptr; 2016 unsigned long ptr_end; 2017 char *name; 2018 struct inode *inode; 2019 struct btrfs_key location; 2020 2021 again: 2022 eb = path->nodes[0]; 2023 slot = path->slots[0]; 2024 item_size = btrfs_item_size_nr(eb, slot); 2025 ptr = btrfs_item_ptr_offset(eb, slot); 2026 ptr_end = ptr + item_size; 2027 while (ptr < ptr_end) { 2028 di = (struct btrfs_dir_item *)ptr; 2029 if (verify_dir_item(fs_info, eb, slot, di)) { 2030 ret = -EIO; 2031 goto out; 2032 } 2033 2034 name_len = btrfs_dir_name_len(eb, di); 2035 name = kmalloc(name_len, GFP_NOFS); 2036 if (!name) { 2037 ret = -ENOMEM; 2038 goto out; 2039 } 2040 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2041 name_len); 2042 log_di = NULL; 2043 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2044 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2045 dir_key->objectid, 2046 name, name_len, 0); 2047 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2048 log_di = btrfs_lookup_dir_index_item(trans, log, 2049 log_path, 2050 dir_key->objectid, 2051 dir_key->offset, 2052 name, name_len, 0); 2053 } 2054 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2055 btrfs_dir_item_key_to_cpu(eb, di, &location); 2056 btrfs_release_path(path); 2057 btrfs_release_path(log_path); 2058 inode = read_one_inode(root, location.objectid); 2059 if (!inode) { 2060 kfree(name); 2061 return -EIO; 2062 } 2063 2064 ret = link_to_fixup_dir(trans, root, 2065 path, location.objectid); 2066 if (ret) { 2067 kfree(name); 2068 iput(inode); 2069 goto out; 2070 } 2071 2072 inc_nlink(inode); 2073 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2074 BTRFS_I(inode), name, name_len); 2075 if (!ret) 2076 ret = btrfs_run_delayed_items(trans, fs_info); 2077 kfree(name); 2078 iput(inode); 2079 if (ret) 2080 goto out; 2081 2082 /* there might still be more names under this key 2083 * check and repeat if required 2084 */ 2085 ret = btrfs_search_slot(NULL, root, dir_key, path, 2086 0, 0); 2087 if (ret == 0) 2088 goto again; 2089 ret = 0; 2090 goto out; 2091 } else if (IS_ERR(log_di)) { 2092 kfree(name); 2093 return PTR_ERR(log_di); 2094 } 2095 btrfs_release_path(log_path); 2096 kfree(name); 2097 2098 ptr = (unsigned long)(di + 1); 2099 ptr += name_len; 2100 } 2101 ret = 0; 2102 out: 2103 btrfs_release_path(path); 2104 btrfs_release_path(log_path); 2105 return ret; 2106 } 2107 2108 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2109 struct btrfs_root *root, 2110 struct btrfs_root *log, 2111 struct btrfs_path *path, 2112 const u64 ino) 2113 { 2114 struct btrfs_fs_info *fs_info = root->fs_info; 2115 struct btrfs_key search_key; 2116 struct btrfs_path *log_path; 2117 int i; 2118 int nritems; 2119 int ret; 2120 2121 log_path = btrfs_alloc_path(); 2122 if (!log_path) 2123 return -ENOMEM; 2124 2125 search_key.objectid = ino; 2126 search_key.type = BTRFS_XATTR_ITEM_KEY; 2127 search_key.offset = 0; 2128 again: 2129 ret = 
btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2130 if (ret < 0) 2131 goto out; 2132 process_leaf: 2133 nritems = btrfs_header_nritems(path->nodes[0]); 2134 for (i = path->slots[0]; i < nritems; i++) { 2135 struct btrfs_key key; 2136 struct btrfs_dir_item *di; 2137 struct btrfs_dir_item *log_di; 2138 u32 total_size; 2139 u32 cur; 2140 2141 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2142 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2143 ret = 0; 2144 goto out; 2145 } 2146 2147 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2148 total_size = btrfs_item_size_nr(path->nodes[0], i); 2149 cur = 0; 2150 while (cur < total_size) { 2151 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2152 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2153 u32 this_len = sizeof(*di) + name_len + data_len; 2154 char *name; 2155 2156 ret = verify_dir_item(fs_info, path->nodes[0], 2157 path->slots[0], di); 2158 if (ret) { 2159 ret = -EIO; 2160 goto out; 2161 } 2162 name = kmalloc(name_len, GFP_NOFS); 2163 if (!name) { 2164 ret = -ENOMEM; 2165 goto out; 2166 } 2167 read_extent_buffer(path->nodes[0], name, 2168 (unsigned long)(di + 1), name_len); 2169 2170 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2171 name, name_len, 0); 2172 btrfs_release_path(log_path); 2173 if (!log_di) { 2174 /* Doesn't exist in log tree, so delete it. */ 2175 btrfs_release_path(path); 2176 di = btrfs_lookup_xattr(trans, root, path, ino, 2177 name, name_len, -1); 2178 kfree(name); 2179 if (IS_ERR(di)) { 2180 ret = PTR_ERR(di); 2181 goto out; 2182 } 2183 ASSERT(di); 2184 ret = btrfs_delete_one_dir_name(trans, root, 2185 path, di); 2186 if (ret) 2187 goto out; 2188 btrfs_release_path(path); 2189 search_key = key; 2190 goto again; 2191 } 2192 kfree(name); 2193 if (IS_ERR(log_di)) { 2194 ret = PTR_ERR(log_di); 2195 goto out; 2196 } 2197 cur += this_len; 2198 di = (struct btrfs_dir_item *)((char *)di + this_len); 2199 } 2200 } 2201 ret = btrfs_next_leaf(root, path); 2202 if (ret > 0) 2203 ret = 0; 2204 else if (ret == 0) 2205 goto process_leaf; 2206 out: 2207 btrfs_free_path(log_path); 2208 btrfs_release_path(path); 2209 return ret; 2210 } 2211 2212 2213 /* 2214 * deletion replay happens before we copy any new directory items 2215 * out of the log or out of backreferences from inodes. It 2216 * scans the log to find ranges of keys that the log is authoritative for, 2217 * and then scans the directory to find items in those ranges that are 2218 * not present in the log. 2219 * 2220 * Anything we don't find in the log is unlinked and removed from the 2221 * directory.
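* replay_dir_deletes() below drives this: for each directory it walks the logged ranges with find_dir_range() and runs check_item_in_log() on every subvolume entry inside a range, unlinking anything the log no longer contains.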
2222 */ 2223 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2224 struct btrfs_root *root, 2225 struct btrfs_root *log, 2226 struct btrfs_path *path, 2227 u64 dirid, int del_all) 2228 { 2229 u64 range_start; 2230 u64 range_end; 2231 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2232 int ret = 0; 2233 struct btrfs_key dir_key; 2234 struct btrfs_key found_key; 2235 struct btrfs_path *log_path; 2236 struct inode *dir; 2237 2238 dir_key.objectid = dirid; 2239 dir_key.type = BTRFS_DIR_ITEM_KEY; 2240 log_path = btrfs_alloc_path(); 2241 if (!log_path) 2242 return -ENOMEM; 2243 2244 dir = read_one_inode(root, dirid); 2245 /* it isn't an error if the inode isn't there, that can happen 2246 * because we replay the deletes before we copy in the inode item 2247 * from the log 2248 */ 2249 if (!dir) { 2250 btrfs_free_path(log_path); 2251 return 0; 2252 } 2253 again: 2254 range_start = 0; 2255 range_end = 0; 2256 while (1) { 2257 if (del_all) 2258 range_end = (u64)-1; 2259 else { 2260 ret = find_dir_range(log, path, dirid, key_type, 2261 &range_start, &range_end); 2262 if (ret != 0) 2263 break; 2264 } 2265 2266 dir_key.offset = range_start; 2267 while (1) { 2268 int nritems; 2269 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2270 0, 0); 2271 if (ret < 0) 2272 goto out; 2273 2274 nritems = btrfs_header_nritems(path->nodes[0]); 2275 if (path->slots[0] >= nritems) { 2276 ret = btrfs_next_leaf(root, path); 2277 if (ret) 2278 break; 2279 } 2280 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2281 path->slots[0]); 2282 if (found_key.objectid != dirid || 2283 found_key.type != dir_key.type) 2284 goto next_type; 2285 2286 if (found_key.offset > range_end) 2287 break; 2288 2289 ret = check_item_in_log(trans, root, log, path, 2290 log_path, dir, 2291 &found_key); 2292 if (ret) 2293 goto out; 2294 if (found_key.offset == (u64)-1) 2295 break; 2296 dir_key.offset = found_key.offset + 1; 2297 } 2298 btrfs_release_path(path); 2299 if (range_end == (u64)-1) 2300 break; 2301 range_start = range_end + 1; 2302 } 2303 2304 next_type: 2305 ret = 0; 2306 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2307 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2308 dir_key.type = BTRFS_DIR_INDEX_KEY; 2309 btrfs_release_path(path); 2310 goto again; 2311 } 2312 out: 2313 btrfs_release_path(path); 2314 btrfs_free_path(log_path); 2315 iput(dir); 2316 return ret; 2317 } 2318 2319 /* 2320 * the process_func used to replay items from the log tree. This 2321 * gets called in two different stages. The first stage just looks 2322 * for inodes and makes sure they are all copied into the subvolume. 2323 * 2324 * The second stage copies all the other item types from the log into 2325 * the subvolume. The two stage approach is slower, but gets rid of 2326 * lots of complexity around inodes referencing other inodes that exist 2327 * only in the log (references come from either directory items or inode 2328 * back refs). 
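* (The stage is tracked in wc->stage; directory index keys get their own intermediate pass, LOG_WALK_REPLAY_DIR_INDEX, between LOG_WALK_REPLAY_INODES and LOG_WALK_REPLAY_ALL.)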
2329 */ 2330 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2331 struct walk_control *wc, u64 gen) 2332 { 2333 int nritems; 2334 struct btrfs_path *path; 2335 struct btrfs_root *root = wc->replay_dest; 2336 struct btrfs_key key; 2337 int level; 2338 int i; 2339 int ret; 2340 2341 ret = btrfs_read_buffer(eb, gen); 2342 if (ret) 2343 return ret; 2344 2345 level = btrfs_header_level(eb); 2346 2347 if (level != 0) 2348 return 0; 2349 2350 path = btrfs_alloc_path(); 2351 if (!path) 2352 return -ENOMEM; 2353 2354 nritems = btrfs_header_nritems(eb); 2355 for (i = 0; i < nritems; i++) { 2356 btrfs_item_key_to_cpu(eb, &key, i); 2357 2358 /* inode keys are done during the first stage */ 2359 if (key.type == BTRFS_INODE_ITEM_KEY && 2360 wc->stage == LOG_WALK_REPLAY_INODES) { 2361 struct btrfs_inode_item *inode_item; 2362 u32 mode; 2363 2364 inode_item = btrfs_item_ptr(eb, i, 2365 struct btrfs_inode_item); 2366 ret = replay_xattr_deletes(wc->trans, root, log, 2367 path, key.objectid); 2368 if (ret) 2369 break; 2370 mode = btrfs_inode_mode(eb, inode_item); 2371 if (S_ISDIR(mode)) { 2372 ret = replay_dir_deletes(wc->trans, 2373 root, log, path, key.objectid, 0); 2374 if (ret) 2375 break; 2376 } 2377 ret = overwrite_item(wc->trans, root, path, 2378 eb, i, &key); 2379 if (ret) 2380 break; 2381 2382 /* for regular files, make sure corresponding 2383 * orphan item exists. extents past the new EOF 2384 * will be truncated later by orphan cleanup. 2385 */ 2386 if (S_ISREG(mode)) { 2387 ret = insert_orphan_item(wc->trans, root, 2388 key.objectid); 2389 if (ret) 2390 break; 2391 } 2392 2393 ret = link_to_fixup_dir(wc->trans, root, 2394 path, key.objectid); 2395 if (ret) 2396 break; 2397 } 2398 2399 if (key.type == BTRFS_DIR_INDEX_KEY && 2400 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2401 ret = replay_one_dir_item(wc->trans, root, path, 2402 eb, i, &key); 2403 if (ret) 2404 break; 2405 } 2406 2407 if (wc->stage < LOG_WALK_REPLAY_ALL) 2408 continue; 2409 2410 /* these keys are simply copied */ 2411 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2412 ret = overwrite_item(wc->trans, root, path, 2413 eb, i, &key); 2414 if (ret) 2415 break; 2416 } else if (key.type == BTRFS_INODE_REF_KEY || 2417 key.type == BTRFS_INODE_EXTREF_KEY) { 2418 ret = add_inode_ref(wc->trans, root, log, path, 2419 eb, i, &key); 2420 if (ret && ret != -ENOENT) 2421 break; 2422 ret = 0; 2423 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2424 ret = replay_one_extent(wc->trans, root, path, 2425 eb, i, &key); 2426 if (ret) 2427 break; 2428 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2429 ret = replay_one_dir_item(wc->trans, root, path, 2430 eb, i, &key); 2431 if (ret) 2432 break; 2433 } 2434 } 2435 btrfs_free_path(path); 2436 return ret; 2437 } 2438 2439 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2440 struct btrfs_root *root, 2441 struct btrfs_path *path, int *level, 2442 struct walk_control *wc) 2443 { 2444 struct btrfs_fs_info *fs_info = root->fs_info; 2445 u64 root_owner; 2446 u64 bytenr; 2447 u64 ptr_gen; 2448 struct extent_buffer *next; 2449 struct extent_buffer *cur; 2450 struct extent_buffer *parent; 2451 u32 blocksize; 2452 int ret = 0; 2453 2454 WARN_ON(*level < 0); 2455 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2456 2457 while (*level > 0) { 2458 WARN_ON(*level < 0); 2459 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2460 cur = path->nodes[*level]; 2461 2462 WARN_ON(btrfs_header_level(cur) != *level); 2463 2464 if (path->slots[*level] >= 2465 btrfs_header_nritems(cur)) 2466 break; 2467 2468
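/* read the block pointer and generation of the child node at the current slot */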
bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2469 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2470 blocksize = fs_info->nodesize; 2471 2472 parent = path->nodes[*level]; 2473 root_owner = btrfs_header_owner(parent); 2474 2475 next = btrfs_find_create_tree_block(fs_info, bytenr); 2476 if (IS_ERR(next)) 2477 return PTR_ERR(next); 2478 2479 if (*level == 1) { 2480 ret = wc->process_func(root, next, wc, ptr_gen); 2481 if (ret) { 2482 free_extent_buffer(next); 2483 return ret; 2484 } 2485 2486 path->slots[*level]++; 2487 if (wc->free) { 2488 ret = btrfs_read_buffer(next, ptr_gen); 2489 if (ret) { 2490 free_extent_buffer(next); 2491 return ret; 2492 } 2493 2494 if (trans) { 2495 btrfs_tree_lock(next); 2496 btrfs_set_lock_blocking(next); 2497 clean_tree_block(fs_info, next); 2498 btrfs_wait_tree_block_writeback(next); 2499 btrfs_tree_unlock(next); 2500 } 2501 2502 WARN_ON(root_owner != 2503 BTRFS_TREE_LOG_OBJECTID); 2504 ret = btrfs_free_and_pin_reserved_extent( 2505 fs_info, bytenr, 2506 blocksize); 2507 if (ret) { 2508 free_extent_buffer(next); 2509 return ret; 2510 } 2511 } 2512 free_extent_buffer(next); 2513 continue; 2514 } 2515 ret = btrfs_read_buffer(next, ptr_gen); 2516 if (ret) { 2517 free_extent_buffer(next); 2518 return ret; 2519 } 2520 2521 WARN_ON(*level <= 0); 2522 if (path->nodes[*level-1]) 2523 free_extent_buffer(path->nodes[*level-1]); 2524 path->nodes[*level-1] = next; 2525 *level = btrfs_header_level(next); 2526 path->slots[*level] = 0; 2527 cond_resched(); 2528 } 2529 WARN_ON(*level < 0); 2530 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2531 2532 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2533 2534 cond_resched(); 2535 return 0; 2536 } 2537 2538 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2539 struct btrfs_root *root, 2540 struct btrfs_path *path, int *level, 2541 struct walk_control *wc) 2542 { 2543 struct btrfs_fs_info *fs_info = root->fs_info; 2544 u64 root_owner; 2545 int i; 2546 int slot; 2547 int ret; 2548 2549 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2550 slot = path->slots[i]; 2551 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2552 path->slots[i]++; 2553 *level = i; 2554 WARN_ON(*level == 0); 2555 return 0; 2556 } else { 2557 struct extent_buffer *parent; 2558 if (path->nodes[*level] == root->node) 2559 parent = path->nodes[*level]; 2560 else 2561 parent = path->nodes[*level + 1]; 2562 2563 root_owner = btrfs_header_owner(parent); 2564 ret = wc->process_func(root, path->nodes[*level], wc, 2565 btrfs_header_generation(path->nodes[*level])); 2566 if (ret) 2567 return ret; 2568 2569 if (wc->free) { 2570 struct extent_buffer *next; 2571 2572 next = path->nodes[*level]; 2573 2574 if (trans) { 2575 btrfs_tree_lock(next); 2576 btrfs_set_lock_blocking(next); 2577 clean_tree_block(fs_info, next); 2578 btrfs_wait_tree_block_writeback(next); 2579 btrfs_tree_unlock(next); 2580 } 2581 2582 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2583 ret = btrfs_free_and_pin_reserved_extent( 2584 fs_info, 2585 path->nodes[*level]->start, 2586 path->nodes[*level]->len); 2587 if (ret) 2588 return ret; 2589 } 2590 free_extent_buffer(path->nodes[*level]); 2591 path->nodes[*level] = NULL; 2592 *level = i + 1; 2593 } 2594 } 2595 return 1; 2596 } 2597 2598 /* 2599 * drop the reference count on the tree rooted at 'log'. This traverses 2600 * the tree freeing any blocks that have a ref count of zero after being 2601 * decremented.
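* When wc->free is set, each block is also freed via btrfs_free_and_pin_reserved_extent(); walk_down_log_tree() and walk_up_log_tree() do the per-level work.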
2602 */ 2603 static int walk_log_tree(struct btrfs_trans_handle *trans, 2604 struct btrfs_root *log, struct walk_control *wc) 2605 { 2606 struct btrfs_fs_info *fs_info = log->fs_info; 2607 int ret = 0; 2608 int wret; 2609 int level; 2610 struct btrfs_path *path; 2611 int orig_level; 2612 2613 path = btrfs_alloc_path(); 2614 if (!path) 2615 return -ENOMEM; 2616 2617 level = btrfs_header_level(log->node); 2618 orig_level = level; 2619 path->nodes[level] = log->node; 2620 extent_buffer_get(log->node); 2621 path->slots[level] = 0; 2622 2623 while (1) { 2624 wret = walk_down_log_tree(trans, log, path, &level, wc); 2625 if (wret > 0) 2626 break; 2627 if (wret < 0) { 2628 ret = wret; 2629 goto out; 2630 } 2631 2632 wret = walk_up_log_tree(trans, log, path, &level, wc); 2633 if (wret > 0) 2634 break; 2635 if (wret < 0) { 2636 ret = wret; 2637 goto out; 2638 } 2639 } 2640 2641 /* was the root node processed? if not, catch it here */ 2642 if (path->nodes[orig_level]) { 2643 ret = wc->process_func(log, path->nodes[orig_level], wc, 2644 btrfs_header_generation(path->nodes[orig_level])); 2645 if (ret) 2646 goto out; 2647 if (wc->free) { 2648 struct extent_buffer *next; 2649 2650 next = path->nodes[orig_level]; 2651 2652 if (trans) { 2653 btrfs_tree_lock(next); 2654 btrfs_set_lock_blocking(next); 2655 clean_tree_block(fs_info, next); 2656 btrfs_wait_tree_block_writeback(next); 2657 btrfs_tree_unlock(next); 2658 } 2659 2660 WARN_ON(log->root_key.objectid != 2661 BTRFS_TREE_LOG_OBJECTID); 2662 ret = btrfs_free_and_pin_reserved_extent(fs_info, 2663 next->start, next->len); 2664 if (ret) 2665 goto out; 2666 } 2667 } 2668 2669 out: 2670 btrfs_free_path(path); 2671 return ret; 2672 } 2673 2674 /* 2675 * helper function to update the item for a given subvolume's log root 2676 * in the tree of log roots 2677 */ 2678 static int update_log_root(struct btrfs_trans_handle *trans, 2679 struct btrfs_root *log) 2680 { 2681 struct btrfs_fs_info *fs_info = log->fs_info; 2682 int ret; 2683 2684 if (log->log_transid == 1) { 2685 /* insert root item on the first sync */ 2686 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2687 &log->root_key, &log->root_item); 2688 } else { 2689 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2690 &log->root_key, &log->root_item); 2691 } 2692 return ret; 2693 } 2694 2695 static void wait_log_commit(struct btrfs_root *root, int transid) 2696 { 2697 DEFINE_WAIT(wait); 2698 int index = transid % 2; 2699 2700 /* 2701 * we only allow two pending log transactions at a time, 2702 * so we know that if ours is more than 2 older than the 2703 * current transaction, we're done 2704 */ 2705 do { 2706 prepare_to_wait(&root->log_commit_wait[index], 2707 &wait, TASK_UNINTERRUPTIBLE); 2708 mutex_unlock(&root->log_mutex); 2709 2710 if (root->log_transid_committed < transid && 2711 atomic_read(&root->log_commit[index])) 2712 schedule(); 2713 2714 finish_wait(&root->log_commit_wait[index], &wait); 2715 mutex_lock(&root->log_mutex); 2716 } while (root->log_transid_committed < transid && 2717 atomic_read(&root->log_commit[index])); 2718 } 2719 2720 static void wait_for_writer(struct btrfs_root *root) 2721 { 2722 DEFINE_WAIT(wait); 2723 2724 while (atomic_read(&root->log_writers)) { 2725 prepare_to_wait(&root->log_writer_wait, 2726 &wait, TASK_UNINTERRUPTIBLE); 2727 mutex_unlock(&root->log_mutex); 2728 if (atomic_read(&root->log_writers)) 2729 schedule(); 2730 finish_wait(&root->log_writer_wait, &wait); 2731 mutex_lock(&root->log_mutex); 2732 } 2733 } 2734 2735 static inline void
btrfs_remove_log_ctx(struct btrfs_root *root, 2736 struct btrfs_log_ctx *ctx) 2737 { 2738 if (!ctx) 2739 return; 2740 2741 mutex_lock(&root->log_mutex); 2742 list_del_init(&ctx->list); 2743 mutex_unlock(&root->log_mutex); 2744 } 2745 2746 /* 2747 * Invoked in log mutex context, or the caller must make sure that no other 2748 * task can access the list. 2749 */ 2750 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2751 int index, int error) 2752 { 2753 struct btrfs_log_ctx *ctx; 2754 struct btrfs_log_ctx *safe; 2755 2756 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2757 list_del_init(&ctx->list); 2758 ctx->log_ret = error; 2759 } 2760 2761 INIT_LIST_HEAD(&root->log_ctxs[index]); 2762 } 2763 2764 /* 2765 * btrfs_sync_log sends a given tree log down to the disk and 2766 * updates the super blocks to record it. When this call is done, 2767 * you know that any inodes previously logged are safely on disk only 2768 * if it returns 0. 2769 * 2770 * Any other return value means you need to call btrfs_commit_transaction. 2771 * Some of the edge cases for fsyncing directories that have had unlinks 2772 * or renames done in the past mean that sometimes the only safe 2773 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2774 * that has happened. 2775 */ 2776 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2777 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2778 { 2779 int index1; 2780 int index2; 2781 int mark; 2782 int ret; 2783 struct btrfs_fs_info *fs_info = root->fs_info; 2784 struct btrfs_root *log = root->log_root; 2785 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2786 int log_transid = 0; 2787 struct btrfs_log_ctx root_log_ctx; 2788 struct blk_plug plug; 2789 2790 mutex_lock(&root->log_mutex); 2791 log_transid = ctx->log_transid; 2792 if (root->log_transid_committed >= log_transid) { 2793 mutex_unlock(&root->log_mutex); 2794 return ctx->log_ret; 2795 } 2796 2797 index1 = log_transid % 2; 2798 if (atomic_read(&root->log_commit[index1])) { 2799 wait_log_commit(root, log_transid); 2800 mutex_unlock(&root->log_mutex); 2801 return ctx->log_ret; 2802 } 2803 ASSERT(log_transid == root->log_transid); 2804 atomic_set(&root->log_commit[index1], 1); 2805 2806 /* wait for previous tree log sync to complete */ 2807 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2808 wait_log_commit(root, log_transid - 1); 2809 2810 while (1) { 2811 int batch = atomic_read(&root->log_batch); 2812 /* when we're on an ssd, just kick the log commit out */ 2813 if (!btrfs_test_opt(fs_info, SSD) && 2814 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2815 mutex_unlock(&root->log_mutex); 2816 schedule_timeout_uninterruptible(1); 2817 mutex_lock(&root->log_mutex); 2818 } 2819 wait_for_writer(root); 2820 if (batch == atomic_read(&root->log_batch)) 2821 break; 2822 } 2823 2824 /* bail out if we need to do a full commit */ 2825 if (btrfs_need_log_full_commit(fs_info, trans)) { 2826 ret = -EAGAIN; 2827 btrfs_free_logged_extents(log, log_transid); 2828 mutex_unlock(&root->log_mutex); 2829 goto out; 2830 } 2831 2832 if (log_transid % 2 == 0) 2833 mark = EXTENT_DIRTY; 2834 else 2835 mark = EXTENT_NEW; 2836 2837 /* we start IO on all the marked extents here, but we don't actually 2838 * wait for them until later.
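* The IO is issued under a blk_plug so the block layer can merge the writes; btrfs_wait_tree_log_extents() further down is what waits for them, after the log root tree has been written as well.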
2839 */ 2840 blk_start_plug(&plug); 2841 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2842 if (ret) { 2843 blk_finish_plug(&plug); 2844 btrfs_abort_transaction(trans, ret); 2845 btrfs_free_logged_extents(log, log_transid); 2846 btrfs_set_log_full_commit(fs_info, trans); 2847 mutex_unlock(&root->log_mutex); 2848 goto out; 2849 } 2850 2851 btrfs_set_root_node(&log->root_item, log->node); 2852 2853 root->log_transid++; 2854 log->log_transid = root->log_transid; 2855 root->log_start_pid = 0; 2856 /* 2857 * IO has been started, blocks of the log tree have WRITTEN flag set 2858 * in their headers. new modifications of the log will be written to 2859 * new positions. so it's safe to allow log writers to go in. 2860 */ 2861 mutex_unlock(&root->log_mutex); 2862 2863 btrfs_init_log_ctx(&root_log_ctx, NULL); 2864 2865 mutex_lock(&log_root_tree->log_mutex); 2866 atomic_inc(&log_root_tree->log_batch); 2867 atomic_inc(&log_root_tree->log_writers); 2868 2869 index2 = log_root_tree->log_transid % 2; 2870 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2871 root_log_ctx.log_transid = log_root_tree->log_transid; 2872 2873 mutex_unlock(&log_root_tree->log_mutex); 2874 2875 ret = update_log_root(trans, log); 2876 2877 mutex_lock(&log_root_tree->log_mutex); 2878 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2879 /* 2880 * Implicit memory barrier after atomic_dec_and_test 2881 */ 2882 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2883 wake_up(&log_root_tree->log_writer_wait); 2884 } 2885 2886 if (ret) { 2887 if (!list_empty(&root_log_ctx.list)) 2888 list_del_init(&root_log_ctx.list); 2889 2890 blk_finish_plug(&plug); 2891 btrfs_set_log_full_commit(fs_info, trans); 2892 2893 if (ret != -ENOSPC) { 2894 btrfs_abort_transaction(trans, ret); 2895 mutex_unlock(&log_root_tree->log_mutex); 2896 goto out; 2897 } 2898 btrfs_wait_tree_log_extents(log, mark); 2899 btrfs_free_logged_extents(log, log_transid); 2900 mutex_unlock(&log_root_tree->log_mutex); 2901 ret = -EAGAIN; 2902 goto out; 2903 } 2904 2905 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2906 blk_finish_plug(&plug); 2907 list_del_init(&root_log_ctx.list); 2908 mutex_unlock(&log_root_tree->log_mutex); 2909 ret = root_log_ctx.log_ret; 2910 goto out; 2911 } 2912 2913 index2 = root_log_ctx.log_transid % 2; 2914 if (atomic_read(&log_root_tree->log_commit[index2])) { 2915 blk_finish_plug(&plug); 2916 ret = btrfs_wait_tree_log_extents(log, mark); 2917 btrfs_wait_logged_extents(trans, log, log_transid); 2918 wait_log_commit(log_root_tree, 2919 root_log_ctx.log_transid); 2920 mutex_unlock(&log_root_tree->log_mutex); 2921 if (!ret) 2922 ret = root_log_ctx.log_ret; 2923 goto out; 2924 } 2925 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2926 atomic_set(&log_root_tree->log_commit[index2], 1); 2927 2928 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2929 wait_log_commit(log_root_tree, 2930 root_log_ctx.log_transid - 1); 2931 } 2932 2933 wait_for_writer(log_root_tree); 2934 2935 /* 2936 * now that we've moved on to the tree of log tree roots, 2937 * check the full commit flag again 2938 */ 2939 if (btrfs_need_log_full_commit(fs_info, trans)) { 2940 blk_finish_plug(&plug); 2941 btrfs_wait_tree_log_extents(log, mark); 2942 btrfs_free_logged_extents(log, log_transid); 2943 mutex_unlock(&log_root_tree->log_mutex); 2944 ret = -EAGAIN; 2945 goto out_wake_log_root; 2946 } 2947 2948 ret = btrfs_write_marked_extents(fs_info, 2949 
&log_root_tree->dirty_log_pages, 2950 EXTENT_DIRTY | EXTENT_NEW); 2951 blk_finish_plug(&plug); 2952 if (ret) { 2953 btrfs_set_log_full_commit(fs_info, trans); 2954 btrfs_abort_transaction(trans, ret); 2955 btrfs_free_logged_extents(log, log_transid); 2956 mutex_unlock(&log_root_tree->log_mutex); 2957 goto out_wake_log_root; 2958 } 2959 ret = btrfs_wait_tree_log_extents(log, mark); 2960 if (!ret) 2961 ret = btrfs_wait_tree_log_extents(log_root_tree, 2962 EXTENT_NEW | EXTENT_DIRTY); 2963 if (ret) { 2964 btrfs_set_log_full_commit(fs_info, trans); 2965 btrfs_free_logged_extents(log, log_transid); 2966 mutex_unlock(&log_root_tree->log_mutex); 2967 goto out_wake_log_root; 2968 } 2969 btrfs_wait_logged_extents(trans, log, log_transid); 2970 2971 btrfs_set_super_log_root(fs_info->super_for_commit, 2972 log_root_tree->node->start); 2973 btrfs_set_super_log_root_level(fs_info->super_for_commit, 2974 btrfs_header_level(log_root_tree->node)); 2975 2976 log_root_tree->log_transid++; 2977 mutex_unlock(&log_root_tree->log_mutex); 2978 2979 /* 2980 * nobody else is going to jump in and write the ctree 2981 * super here because the log_commit atomic below is protecting 2982 * us. We must be called with a transaction handle pinning 2983 * the running transaction open, so a full commit can't hop 2984 * in and cause problems either. 2985 */ 2986 ret = write_all_supers(fs_info, 1); 2987 if (ret) { 2988 btrfs_set_log_full_commit(fs_info, trans); 2989 btrfs_abort_transaction(trans, ret); 2990 goto out_wake_log_root; 2991 } 2992 2993 mutex_lock(&root->log_mutex); 2994 if (root->last_log_commit < log_transid) 2995 root->last_log_commit = log_transid; 2996 mutex_unlock(&root->log_mutex); 2997 2998 out_wake_log_root: 2999 mutex_lock(&log_root_tree->log_mutex); 3000 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3001 3002 log_root_tree->log_transid_committed++; 3003 atomic_set(&log_root_tree->log_commit[index2], 0); 3004 mutex_unlock(&log_root_tree->log_mutex); 3005 3006 /* 3007 * The barrier before waitqueue_active is implied by mutex_unlock 3008 */ 3009 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 3010 wake_up(&log_root_tree->log_commit_wait[index2]); 3011 out: 3012 mutex_lock(&root->log_mutex); 3013 btrfs_remove_all_log_ctxs(root, index1, ret); 3014 root->log_transid_committed++; 3015 atomic_set(&root->log_commit[index1], 0); 3016 mutex_unlock(&root->log_mutex); 3017 3018 /* 3019 * The barrier before waitqueue_active is implied by mutex_unlock 3020 */ 3021 if (waitqueue_active(&root->log_commit_wait[index1])) 3022 wake_up(&root->log_commit_wait[index1]); 3023 return ret; 3024 } 3025 3026 static void free_log_tree(struct btrfs_trans_handle *trans, 3027 struct btrfs_root *log) 3028 { 3029 int ret; 3030 u64 start; 3031 u64 end; 3032 struct walk_control wc = { 3033 .free = 1, 3034 .process_func = process_one_buffer 3035 }; 3036 3037 ret = walk_log_tree(trans, log, &wc); 3038 /* I don't think this can happen but just in case */ 3039 if (ret) 3040 btrfs_abort_transaction(trans, ret); 3041 3042 while (1) { 3043 ret = find_first_extent_bit(&log->dirty_log_pages, 3044 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 3045 NULL); 3046 if (ret) 3047 break; 3048 3049 clear_extent_bits(&log->dirty_log_pages, start, end, 3050 EXTENT_DIRTY | EXTENT_NEW); 3051 } 3052 3053 /* 3054 * We may have short-circuited the log tree with the full commit logic 3055 * and left ordered extents on our list, so clear these out to keep us 3056 * from leaking inodes and memory.
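* A log root keeps two lists of logged extents, one per in-flight log transaction, so both index 0 and index 1 are cleared here.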
3057 */ 3058 btrfs_free_logged_extents(log, 0); 3059 btrfs_free_logged_extents(log, 1); 3060 3061 free_extent_buffer(log->node); 3062 kfree(log); 3063 } 3064 3065 /* 3066 * free all the extents used by the tree log. This should be called 3067 * at commit time of the full transaction 3068 */ 3069 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3070 { 3071 if (root->log_root) { 3072 free_log_tree(trans, root->log_root); 3073 root->log_root = NULL; 3074 } 3075 return 0; 3076 } 3077 3078 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3079 struct btrfs_fs_info *fs_info) 3080 { 3081 if (fs_info->log_root_tree) { 3082 free_log_tree(trans, fs_info->log_root_tree); 3083 fs_info->log_root_tree = NULL; 3084 } 3085 return 0; 3086 } 3087 3088 /* 3089 * If both a file and directory are logged, and unlinks or renames are 3090 * mixed in, we have a few interesting corners: 3091 * 3092 * create file X in dir Y 3093 * link file X to X.link in dir Y 3094 * fsync file X 3095 * unlink file X but leave X.link 3096 * fsync dir Y 3097 * 3098 * After a crash we would expect only X.link to exist. But file X 3099 * didn't get fsync'd again so the log has back refs for X and X.link. 3100 * 3101 * We solve this by removing directory entries and inode backrefs from the 3102 * log when a file that was logged in the current transaction is 3103 * unlinked. Any later fsync will include the updated log entries, and 3104 * we'll be able to reconstruct the proper directory items from backrefs. 3105 * 3106 * This optimization allows us to avoid relogging the entire inode 3107 * or the entire directory. 3108 */ 3109 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3110 struct btrfs_root *root, 3111 const char *name, int name_len, 3112 struct btrfs_inode *dir, u64 index) 3113 { 3114 struct btrfs_root *log; 3115 struct btrfs_dir_item *di; 3116 struct btrfs_path *path; 3117 int ret; 3118 int err = 0; 3119 int bytes_del = 0; 3120 u64 dir_ino = btrfs_ino(dir); 3121 3122 if (dir->logged_trans < trans->transid) 3123 return 0; 3124 3125 ret = join_running_log_trans(root); 3126 if (ret) 3127 return 0; 3128 3129 mutex_lock(&dir->log_mutex); 3130 3131 log = root->log_root; 3132 path = btrfs_alloc_path(); 3133 if (!path) { 3134 err = -ENOMEM; 3135 goto out_unlock; 3136 } 3137 3138 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3139 name, name_len, -1); 3140 if (IS_ERR(di)) { 3141 err = PTR_ERR(di); 3142 goto fail; 3143 } 3144 if (di) { 3145 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3146 bytes_del += name_len; 3147 if (ret) { 3148 err = ret; 3149 goto fail; 3150 } 3151 } 3152 btrfs_release_path(path); 3153 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3154 index, name, name_len, -1); 3155 if (IS_ERR(di)) { 3156 err = PTR_ERR(di); 3157 goto fail; 3158 } 3159 if (di) { 3160 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3161 bytes_del += name_len; 3162 if (ret) { 3163 err = ret; 3164 goto fail; 3165 } 3166 } 3167 3168 /* update the directory size in the log to reflect the names 3169 * we have removed 3170 */ 3171 if (bytes_del) { 3172 struct btrfs_key key; 3173 3174 key.objectid = dir_ino; 3175 key.offset = 0; 3176 key.type = BTRFS_INODE_ITEM_KEY; 3177 btrfs_release_path(path); 3178 3179 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3180 if (ret < 0) { 3181 err = ret; 3182 goto fail; 3183 } 3184 if (ret == 0) { 3185 struct btrfs_inode_item *item; 3186 u64 i_size; 3187 3188 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3189 struct btrfs_inode_item); 3190 i_size = btrfs_inode_size(path->nodes[0], item); 3191 if (i_size > bytes_del) 3192 i_size -= bytes_del; 3193 else 3194 i_size = 0; 3195 btrfs_set_inode_size(path->nodes[0], item, i_size); 3196 btrfs_mark_buffer_dirty(path->nodes[0]); 3197 } else 3198 ret = 0; 3199 btrfs_release_path(path); 3200 } 3201 fail: 3202 btrfs_free_path(path); 3203 out_unlock: 3204 mutex_unlock(&dir->log_mutex); 3205 if (ret == -ENOSPC) { 3206 btrfs_set_log_full_commit(root->fs_info, trans); 3207 ret = 0; 3208 } else if (ret < 0) 3209 btrfs_abort_transaction(trans, ret); 3210 3211 btrfs_end_log_trans(root); 3212 3213 return err; 3214 } 3215 3216 /* see comments for btrfs_del_dir_entries_in_log */ 3217 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3218 struct btrfs_root *root, 3219 const char *name, int name_len, 3220 struct btrfs_inode *inode, u64 dirid) 3221 { 3222 struct btrfs_fs_info *fs_info = root->fs_info; 3223 struct btrfs_root *log; 3224 u64 index; 3225 int ret; 3226 3227 if (inode->logged_trans < trans->transid) 3228 return 0; 3229 3230 ret = join_running_log_trans(root); 3231 if (ret) 3232 return 0; 3233 log = root->log_root; 3234 mutex_lock(&inode->log_mutex); 3235 3236 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3237 dirid, &index); 3238 mutex_unlock(&inode->log_mutex); 3239 if (ret == -ENOSPC) { 3240 btrfs_set_log_full_commit(fs_info, trans); 3241 ret = 0; 3242 } else if (ret < 0 && ret != -ENOENT) 3243 btrfs_abort_transaction(trans, ret); 3244 btrfs_end_log_trans(root); 3245 3246 return ret; 3247 } 3248 3249 /* 3250 * creates a range item in the log for 'dirid'. first_offset and 3251 * last_offset tell us which parts of the key space the log should 3252 * be considered authoritative for. 3253 */ 3254 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3255 struct btrfs_root *log, 3256 struct btrfs_path *path, 3257 int key_type, u64 dirid, 3258 u64 first_offset, u64 last_offset) 3259 { 3260 int ret; 3261 struct btrfs_key key; 3262 struct btrfs_dir_log_item *item; 3263 3264 key.objectid = dirid; 3265 key.offset = first_offset; 3266 if (key_type == BTRFS_DIR_ITEM_KEY) 3267 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3268 else 3269 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3270 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3271 if (ret) 3272 return ret; 3273 3274 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3275 struct btrfs_dir_log_item); 3276 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3277 btrfs_mark_buffer_dirty(path->nodes[0]); 3278 btrfs_release_path(path); 3279 return 0; 3280 } 3281 3282 /* 3283 * log all the items included in the current transaction for a given 3284 * directory. 
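* (Both BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY items are covered; log_directory_changes() calls this helper once for each key type.)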
This also creates the range items in the log tree required 3285 * to replay anything deleted before the fsync 3286 */ 3287 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3288 struct btrfs_root *root, struct btrfs_inode *inode, 3289 struct btrfs_path *path, 3290 struct btrfs_path *dst_path, int key_type, 3291 struct btrfs_log_ctx *ctx, 3292 u64 min_offset, u64 *last_offset_ret) 3293 { 3294 struct btrfs_key min_key; 3295 struct btrfs_root *log = root->log_root; 3296 struct extent_buffer *src; 3297 int err = 0; 3298 int ret; 3299 int i; 3300 int nritems; 3301 u64 first_offset = min_offset; 3302 u64 last_offset = (u64)-1; 3303 u64 ino = btrfs_ino(inode); 3304 3305 log = root->log_root; 3306 3307 min_key.objectid = ino; 3308 min_key.type = key_type; 3309 min_key.offset = min_offset; 3310 3311 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3312 3313 /* 3314 * we didn't find anything from this transaction, see if there 3315 * is anything at all 3316 */ 3317 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3318 min_key.objectid = ino; 3319 min_key.type = key_type; 3320 min_key.offset = (u64)-1; 3321 btrfs_release_path(path); 3322 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3323 if (ret < 0) { 3324 btrfs_release_path(path); 3325 return ret; 3326 } 3327 ret = btrfs_previous_item(root, path, ino, key_type); 3328 3329 /* if ret == 0 there are items for this type, 3330 * create a range to tell us the last key of this type. 3331 * otherwise, there are no items in this directory after 3332 * *min_offset, and we create a range to indicate that. 3333 */ 3334 if (ret == 0) { 3335 struct btrfs_key tmp; 3336 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3337 path->slots[0]); 3338 if (key_type == tmp.type) 3339 first_offset = max(min_offset, tmp.offset) + 1; 3340 } 3341 goto done; 3342 } 3343 3344 /* go backward to find any previous key */ 3345 ret = btrfs_previous_item(root, path, ino, key_type); 3346 if (ret == 0) { 3347 struct btrfs_key tmp; 3348 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3349 if (key_type == tmp.type) { 3350 first_offset = tmp.offset; 3351 ret = overwrite_item(trans, log, dst_path, 3352 path->nodes[0], path->slots[0], 3353 &tmp); 3354 if (ret) { 3355 err = ret; 3356 goto done; 3357 } 3358 } 3359 } 3360 btrfs_release_path(path); 3361 3362 /* find the first key from this transaction again */ 3363 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3364 if (WARN_ON(ret != 0)) 3365 goto done; 3366 3367 /* 3368 * we have a block from this transaction, log every item in it 3369 * from our directory 3370 */ 3371 while (1) { 3372 struct btrfs_key tmp; 3373 src = path->nodes[0]; 3374 nritems = btrfs_header_nritems(src); 3375 for (i = path->slots[0]; i < nritems; i++) { 3376 struct btrfs_dir_item *di; 3377 3378 btrfs_item_key_to_cpu(src, &min_key, i); 3379 3380 if (min_key.objectid != ino || min_key.type != key_type) 3381 goto done; 3382 ret = overwrite_item(trans, log, dst_path, src, i, 3383 &min_key); 3384 if (ret) { 3385 err = ret; 3386 goto done; 3387 } 3388 3389 /* 3390 * We must make sure that when we log a directory entry, 3391 * the corresponding inode, after log replay, has a 3392 * matching link count. 
For example: 3393 * 3394 * touch foo 3395 * mkdir mydir 3396 * sync 3397 * ln foo mydir/bar 3398 * xfs_io -c "fsync" mydir 3399 * <crash> 3400 * <mount fs and log replay> 3401 * 3402 * Would result in an fsync log such that, when replayed, our 3403 * file inode would have a link count of 1 while there are 3404 * two directory entries pointing to the same inode. 3405 * After removing one of the names, it would not be 3406 * possible to remove the other name, which always 3407 * resulted in stale file handle errors, and it would not 3408 * be possible to rmdir the parent directory, since 3409 * its i_size could never decrement to the value 3410 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 3411 */ 3412 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3413 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3414 if (ctx && 3415 (btrfs_dir_transid(src, di) == trans->transid || 3416 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3417 tmp.type != BTRFS_ROOT_ITEM_KEY) 3418 ctx->log_new_dentries = true; 3419 } 3420 path->slots[0] = nritems; 3421 3422 /* 3423 * look ahead to the next item and see if it is also 3424 * from this directory and from this transaction 3425 */ 3426 ret = btrfs_next_leaf(root, path); 3427 if (ret == 1) { 3428 last_offset = (u64)-1; 3429 goto done; 3430 } 3431 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3432 if (tmp.objectid != ino || tmp.type != key_type) { 3433 last_offset = (u64)-1; 3434 goto done; 3435 } 3436 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3437 ret = overwrite_item(trans, log, dst_path, 3438 path->nodes[0], path->slots[0], 3439 &tmp); 3440 if (ret) 3441 err = ret; 3442 else 3443 last_offset = tmp.offset; 3444 goto done; 3445 } 3446 } 3447 done: 3448 btrfs_release_path(path); 3449 btrfs_release_path(dst_path); 3450 3451 if (err == 0) { 3452 *last_offset_ret = last_offset; 3453 /* 3454 * insert the log range keys to indicate where the log 3455 * is valid 3456 */ 3457 ret = insert_dir_log_key(trans, log, path, key_type, 3458 ino, first_offset, last_offset); 3459 if (ret) 3460 err = ret; 3461 } 3462 return err; 3463 } 3464 3465 /* 3466 * logging directories is very similar to logging inodes. We find all the items 3467 * from the current transaction and write them to the log. 3468 * 3469 * The recovery code scans the directory in the subvolume, and if it finds a 3470 * key in the range logged that is not present in the log tree, then it means 3471 * that dir entry was unlinked during the transaction. 3472 * 3473 * In order for that scan to work, we must include one key smaller than 3474 * the smallest logged by this transaction and one key larger than the largest 3475 * key logged by this transaction. 3476 */ 3477 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3478 struct btrfs_root *root, struct btrfs_inode *inode, 3479 struct btrfs_path *path, 3480 struct btrfs_path *dst_path, 3481 struct btrfs_log_ctx *ctx) 3482 { 3483 u64 min_key; 3484 u64 max_key; 3485 int ret; 3486 int key_type = BTRFS_DIR_ITEM_KEY; 3487 3488 again: 3489 min_key = 0; 3490 max_key = 0; 3491 while (1) { 3492 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3493 ctx, min_key, &max_key); 3494 if (ret) 3495 return ret; 3496 if (max_key == (u64)-1) 3497 break; 3498 min_key = max_key + 1; 3499 } 3500 3501 if (key_type == BTRFS_DIR_ITEM_KEY) { 3502 key_type = BTRFS_DIR_INDEX_KEY; 3503 goto again; 3504 } 3505 return 0; 3506 } 3507 3508 /* 3509 * a helper function to drop items from the log before we relog an 3510 * inode.
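* This keeps stale copies of the inode's items from piling up in the log tree when the same inode is logged more than once in a transaction.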
max_key_type indicates the highest item type to remove. 3511 * This cannot be run for file data extents because it does not 3512 * free the extents they point to. 3513 */ 3514 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3515 struct btrfs_root *log, 3516 struct btrfs_path *path, 3517 u64 objectid, int max_key_type) 3518 { 3519 int ret; 3520 struct btrfs_key key; 3521 struct btrfs_key found_key; 3522 int start_slot; 3523 3524 key.objectid = objectid; 3525 key.type = max_key_type; 3526 key.offset = (u64)-1; 3527 3528 while (1) { 3529 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3530 BUG_ON(ret == 0); /* Logic error */ 3531 if (ret < 0) 3532 break; 3533 3534 if (path->slots[0] == 0) 3535 break; 3536 3537 path->slots[0]--; 3538 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3539 path->slots[0]); 3540 3541 if (found_key.objectid != objectid) 3542 break; 3543 3544 found_key.offset = 0; 3545 found_key.type = 0; 3546 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3547 &start_slot); 3548 3549 ret = btrfs_del_items(trans, log, path, start_slot, 3550 path->slots[0] - start_slot + 1); 3551 /* 3552 * If start slot isn't 0 then we don't need to re-search, we've 3553 * found the last guy with the objectid in this tree. 3554 */ 3555 if (ret || start_slot != 0) 3556 break; 3557 btrfs_release_path(path); 3558 } 3559 btrfs_release_path(path); 3560 if (ret > 0) 3561 ret = 0; 3562 return ret; 3563 } 3564 3565 static void fill_inode_item(struct btrfs_trans_handle *trans, 3566 struct extent_buffer *leaf, 3567 struct btrfs_inode_item *item, 3568 struct inode *inode, int log_inode_only, 3569 u64 logged_isize) 3570 { 3571 struct btrfs_map_token token; 3572 3573 btrfs_init_map_token(&token); 3574 3575 if (log_inode_only) { 3576 /* set the generation to zero so the recovery code 3577 * can tell the difference between a logging 3578 * just to say 'this inode exists' and a logging 3579 * to say 'update this inode with these values' 3580 */ 3581 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3582 btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 3583 } else { 3584 btrfs_set_token_inode_generation(leaf, item, 3585 BTRFS_I(inode)->generation, 3586 &token); 3587 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3588 } 3589 3590 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3591 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3592 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3593 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3594 3595 btrfs_set_token_timespec_sec(leaf, &item->atime, 3596 inode->i_atime.tv_sec, &token); 3597 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3598 inode->i_atime.tv_nsec, &token); 3599 3600 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3601 inode->i_mtime.tv_sec, &token); 3602 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3603 inode->i_mtime.tv_nsec, &token); 3604 3605 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3606 inode->i_ctime.tv_sec, &token); 3607 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3608 inode->i_ctime.tv_nsec, &token); 3609 3610 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3611 &token); 3612 3613 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3614 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3615 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3616 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3617
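/* the block group field is a legacy allocation hint, so the log always records 0 */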
btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3618 } 3619 3620 static int log_inode_item(struct btrfs_trans_handle *trans, 3621 struct btrfs_root *log, struct btrfs_path *path, 3622 struct btrfs_inode *inode) 3623 { 3624 struct btrfs_inode_item *inode_item; 3625 int ret; 3626 3627 ret = btrfs_insert_empty_item(trans, log, path, 3628 &inode->location, sizeof(*inode_item)); 3629 if (ret && ret != -EEXIST) 3630 return ret; 3631 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3632 struct btrfs_inode_item); 3633 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3634 0, 0); 3635 btrfs_release_path(path); 3636 return 0; 3637 } 3638 3639 static noinline int copy_items(struct btrfs_trans_handle *trans, 3640 struct btrfs_inode *inode, 3641 struct btrfs_path *dst_path, 3642 struct btrfs_path *src_path, u64 *last_extent, 3643 int start_slot, int nr, int inode_only, 3644 u64 logged_isize) 3645 { 3646 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 3647 unsigned long src_offset; 3648 unsigned long dst_offset; 3649 struct btrfs_root *log = inode->root->log_root; 3650 struct btrfs_file_extent_item *extent; 3651 struct btrfs_inode_item *inode_item; 3652 struct extent_buffer *src = src_path->nodes[0]; 3653 struct btrfs_key first_key, last_key, key; 3654 int ret; 3655 struct btrfs_key *ins_keys; 3656 u32 *ins_sizes; 3657 char *ins_data; 3658 int i; 3659 struct list_head ordered_sums; 3660 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3661 bool has_extents = false; 3662 bool need_find_last_extent = true; 3663 bool done = false; 3664 3665 INIT_LIST_HEAD(&ordered_sums); 3666 3667 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3668 nr * sizeof(u32), GFP_NOFS); 3669 if (!ins_data) 3670 return -ENOMEM; 3671 3672 first_key.objectid = (u64)-1; 3673 3674 ins_sizes = (u32 *)ins_data; 3675 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3676 3677 for (i = 0; i < nr; i++) { 3678 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3679 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3680 } 3681 ret = btrfs_insert_empty_items(trans, log, dst_path, 3682 ins_keys, ins_sizes, nr); 3683 if (ret) { 3684 kfree(ins_data); 3685 return ret; 3686 } 3687 3688 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3689 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3690 dst_path->slots[0]); 3691 3692 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3693 3694 if ((i == (nr - 1))) 3695 last_key = ins_keys[i]; 3696 3697 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3698 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3699 dst_path->slots[0], 3700 struct btrfs_inode_item); 3701 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3702 &inode->vfs_inode, 3703 inode_only == LOG_INODE_EXISTS, 3704 logged_isize); 3705 } else { 3706 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3707 src_offset, ins_sizes[i]); 3708 } 3709 3710 /* 3711 * We set need_find_last_extent here in case we know we were 3712 * processing other items and then walk into the first extent in 3713 * the inode. If we don't hit an extent then nothing changes, 3714 * we'll do the last search the next time around. 
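* (need_find_last_extent starts out true and is cleared as soon as an item that is not a file extent is copied.)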
3715 */ 3716 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3717 has_extents = true; 3718 if (first_key.objectid == (u64)-1) 3719 first_key = ins_keys[i]; 3720 } else { 3721 need_find_last_extent = false; 3722 } 3723 3724 /* take a reference on file data extents so that truncates 3725 * or deletes of this inode don't have to relog the inode 3726 * again 3727 */ 3728 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3729 !skip_csum) { 3730 int found_type; 3731 extent = btrfs_item_ptr(src, start_slot + i, 3732 struct btrfs_file_extent_item); 3733 3734 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3735 continue; 3736 3737 found_type = btrfs_file_extent_type(src, extent); 3738 if (found_type == BTRFS_FILE_EXTENT_REG) { 3739 u64 ds, dl, cs, cl; 3740 ds = btrfs_file_extent_disk_bytenr(src, 3741 extent); 3742 /* ds == 0 is a hole */ 3743 if (ds == 0) 3744 continue; 3745 3746 dl = btrfs_file_extent_disk_num_bytes(src, 3747 extent); 3748 cs = btrfs_file_extent_offset(src, extent); 3749 cl = btrfs_file_extent_num_bytes(src, 3750 extent); 3751 if (btrfs_file_extent_compression(src, 3752 extent)) { 3753 cs = 0; 3754 cl = dl; 3755 } 3756 3757 ret = btrfs_lookup_csums_range( 3758 fs_info->csum_root, 3759 ds + cs, ds + cs + cl - 1, 3760 &ordered_sums, 0); 3761 if (ret) { 3762 btrfs_release_path(dst_path); 3763 kfree(ins_data); 3764 return ret; 3765 } 3766 } 3767 } 3768 } 3769 3770 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3771 btrfs_release_path(dst_path); 3772 kfree(ins_data); 3773 3774 /* 3775 * we have to do this after the loop above to avoid changing the 3776 * log tree while trying to change the log tree. 3777 */ 3778 ret = 0; 3779 while (!list_empty(&ordered_sums)) { 3780 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3781 struct btrfs_ordered_sum, 3782 list); 3783 if (!ret) 3784 ret = btrfs_csum_file_blocks(trans, log, sums); 3785 list_del(&sums->list); 3786 kfree(sums); 3787 } 3788 3789 if (!has_extents) 3790 return ret; 3791 3792 if (need_find_last_extent && *last_extent == first_key.offset) { 3793 /* 3794 * We don't have any leafs between our current one and the one 3795 * we processed before that can have file extent items for our 3796 * inode (and have a generation number smaller than our current 3797 * transaction id). 3798 */ 3799 need_find_last_extent = false; 3800 } 3801 3802 /* 3803 * Because we use btrfs_search_forward we could skip leaves that were 3804 * not modified and then assume *last_extent is valid when it really 3805 * isn't. So back up to the previous leaf and read the end of the last 3806 * extent before we go and fill in holes. 
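* Any gap then found between *last_extent and the next extent's key offset is logged as a hole: a file extent item with a disk_bytenr of 0.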
3807 */ 3808 if (need_find_last_extent) { 3809 u64 len; 3810 3811 ret = btrfs_prev_leaf(inode->root, src_path); 3812 if (ret < 0) 3813 return ret; 3814 if (ret) 3815 goto fill_holes; 3816 if (src_path->slots[0]) 3817 src_path->slots[0]--; 3818 src = src_path->nodes[0]; 3819 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3820 if (key.objectid != btrfs_ino(inode) || 3821 key.type != BTRFS_EXTENT_DATA_KEY) 3822 goto fill_holes; 3823 extent = btrfs_item_ptr(src, src_path->slots[0], 3824 struct btrfs_file_extent_item); 3825 if (btrfs_file_extent_type(src, extent) == 3826 BTRFS_FILE_EXTENT_INLINE) { 3827 len = btrfs_file_extent_inline_len(src, 3828 src_path->slots[0], 3829 extent); 3830 *last_extent = ALIGN(key.offset + len, 3831 fs_info->sectorsize); 3832 } else { 3833 len = btrfs_file_extent_num_bytes(src, extent); 3834 *last_extent = key.offset + len; 3835 } 3836 } 3837 fill_holes: 3838 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3839 * things could have happened 3840 * 3841 * 1) A merge could have happened, so we could currently be on a leaf 3842 * that holds what we were copying in the first place. 3843 * 2) A split could have happened, and now not all of the items we want 3844 * are on the same leaf. 3845 * 3846 * So we need to adjust how we search for holes, we need to drop the 3847 * path and re-search for the first extent key we found, and then walk 3848 * forward until we hit the last one we copied. 3849 */ 3850 if (need_find_last_extent) { 3851 /* btrfs_prev_leaf could return 1 without releasing the path */ 3852 btrfs_release_path(src_path); 3853 ret = btrfs_search_slot(NULL, inode->root, &first_key, 3854 src_path, 0, 0); 3855 if (ret < 0) 3856 return ret; 3857 ASSERT(ret == 0); 3858 src = src_path->nodes[0]; 3859 i = src_path->slots[0]; 3860 } else { 3861 i = start_slot; 3862 } 3863 3864 /* 3865 * Ok so here we need to go through and fill in any holes we may have 3866 * to make sure that holes are punched for those areas in case they had 3867 * extents previously. 3868 */ 3869 while (!done) { 3870 u64 offset, len; 3871 u64 extent_end; 3872 3873 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3874 ret = btrfs_next_leaf(inode->root, src_path); 3875 if (ret < 0) 3876 return ret; 3877 ASSERT(ret == 0); 3878 src = src_path->nodes[0]; 3879 i = 0; 3880 } 3881 3882 btrfs_item_key_to_cpu(src, &key, i); 3883 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3884 done = true; 3885 if (key.objectid != btrfs_ino(inode) || 3886 key.type != BTRFS_EXTENT_DATA_KEY) { 3887 i++; 3888 continue; 3889 } 3890 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3891 if (btrfs_file_extent_type(src, extent) == 3892 BTRFS_FILE_EXTENT_INLINE) { 3893 len = btrfs_file_extent_inline_len(src, i, extent); 3894 extent_end = ALIGN(key.offset + len, 3895 fs_info->sectorsize); 3896 } else { 3897 len = btrfs_file_extent_num_bytes(src, extent); 3898 extent_end = key.offset + len; 3899 } 3900 i++; 3901 3902 if (*last_extent == key.offset) { 3903 *last_extent = extent_end; 3904 continue; 3905 } 3906 offset = *last_extent; 3907 len = key.offset - *last_extent; 3908 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3909 offset, 0, 0, len, 0, len, 0, 0, 0); 3910 if (ret) 3911 break; 3912 *last_extent = extent_end; 3913 } 3914 /* 3915 * Need to let the callers know we dropped the path so they should 3916 * re-search. 
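* (A return value of 1, as opposed to 0 or a negative error, is that signal.)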
3917 */ 3918 if (!ret && need_find_last_extent) 3919 ret = 1; 3920 return ret; 3921 } 3922 3923 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3924 { 3925 struct extent_map *em1, *em2; 3926 3927 em1 = list_entry(a, struct extent_map, list); 3928 em2 = list_entry(b, struct extent_map, list); 3929 3930 if (em1->start < em2->start) 3931 return -1; 3932 else if (em1->start > em2->start) 3933 return 1; 3934 return 0; 3935 } 3936 3937 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 3938 struct inode *inode, 3939 struct btrfs_root *root, 3940 const struct extent_map *em, 3941 const struct list_head *logged_list, 3942 bool *ordered_io_error) 3943 { 3944 struct btrfs_fs_info *fs_info = root->fs_info; 3945 struct btrfs_ordered_extent *ordered; 3946 struct btrfs_root *log = root->log_root; 3947 u64 mod_start = em->mod_start; 3948 u64 mod_len = em->mod_len; 3949 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3950 u64 csum_offset; 3951 u64 csum_len; 3952 LIST_HEAD(ordered_sums); 3953 int ret = 0; 3954 3955 *ordered_io_error = false; 3956 3957 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 3958 em->block_start == EXTENT_MAP_HOLE) 3959 return 0; 3960 3961 /* 3962 * Wait for any ordered extent that covers our extent map. If it 3963 * finishes without an error, first check and see if our csums are on 3964 * our outstanding ordered extents. 3965 */ 3966 list_for_each_entry(ordered, logged_list, log_list) { 3967 struct btrfs_ordered_sum *sum; 3968 3969 if (!mod_len) 3970 break; 3971 3972 if (ordered->file_offset + ordered->len <= mod_start || 3973 mod_start + mod_len <= ordered->file_offset) 3974 continue; 3975 3976 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 3977 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 3978 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 3979 const u64 start = ordered->file_offset; 3980 const u64 end = ordered->file_offset + ordered->len - 1; 3981 3982 WARN_ON(ordered->inode != inode); 3983 filemap_fdatawrite_range(inode->i_mapping, start, end); 3984 } 3985 3986 wait_event(ordered->wait, 3987 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 3988 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3989 3990 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3991 /* 3992 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 3993 * i_mapping flags, so that the next fsync won't get 3994 * an outdated io error too. 3995 */ 3996 filemap_check_errors(inode->i_mapping); 3997 *ordered_io_error = true; 3998 break; 3999 } 4000 /* 4001 * We are going to copy all the csums on this ordered extent, so 4002 * go ahead and adjust mod_start and mod_len in case this 4003 * ordered extent has already been logged. 4004 */ 4005 if (ordered->file_offset > mod_start) { 4006 if (ordered->file_offset + ordered->len >= 4007 mod_start + mod_len) 4008 mod_len = ordered->file_offset - mod_start; 4009 /* 4010 * If we have this case 4011 * 4012 * |--------- logged extent ---------| 4013 * |----- ordered extent ----| 4014 * 4015 * Just don't mess with mod_start and mod_len, we'll 4016 * just end up logging more csums than we need and it 4017 * will be ok.
4018 */ 4019 } else { 4020 if (ordered->file_offset + ordered->len < 4021 mod_start + mod_len) { 4022 mod_len = (mod_start + mod_len) - 4023 (ordered->file_offset + ordered->len); 4024 mod_start = ordered->file_offset + 4025 ordered->len; 4026 } else { 4027 mod_len = 0; 4028 } 4029 } 4030 4031 if (skip_csum) 4032 continue; 4033 4034 /* 4035 * To keep us from looping for the above case of an ordered 4036 * extent that falls inside of the logged extent. 4037 */ 4038 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 4039 &ordered->flags)) 4040 continue; 4041 4042 list_for_each_entry(sum, &ordered->list, list) { 4043 ret = btrfs_csum_file_blocks(trans, log, sum); 4044 if (ret) 4045 break; 4046 } 4047 } 4048 4049 if (*ordered_io_error || !mod_len || ret || skip_csum) 4050 return ret; 4051 4052 if (em->compress_type) { 4053 csum_offset = 0; 4054 csum_len = max(em->block_len, em->orig_block_len); 4055 } else { 4056 csum_offset = mod_start - em->start; 4057 csum_len = mod_len; 4058 } 4059 4060 /* block start is already adjusted for the file extent offset. */ 4061 ret = btrfs_lookup_csums_range(fs_info->csum_root, 4062 em->block_start + csum_offset, 4063 em->block_start + csum_offset + 4064 csum_len - 1, &ordered_sums, 0); 4065 if (ret) 4066 return ret; 4067 4068 while (!list_empty(&ordered_sums)) { 4069 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4070 struct btrfs_ordered_sum, 4071 list); 4072 if (!ret) 4073 ret = btrfs_csum_file_blocks(trans, log, sums); 4074 list_del(&sums->list); 4075 kfree(sums); 4076 } 4077 4078 return ret; 4079 } 4080 4081 static int log_one_extent(struct btrfs_trans_handle *trans, 4082 struct btrfs_inode *inode, struct btrfs_root *root, 4083 const struct extent_map *em, 4084 struct btrfs_path *path, 4085 const struct list_head *logged_list, 4086 struct btrfs_log_ctx *ctx) 4087 { 4088 struct btrfs_root *log = root->log_root; 4089 struct btrfs_file_extent_item *fi; 4090 struct extent_buffer *leaf; 4091 struct btrfs_map_token token; 4092 struct btrfs_key key; 4093 u64 extent_offset = em->start - em->orig_start; 4094 u64 block_len; 4095 int ret; 4096 int extent_inserted = 0; 4097 bool ordered_io_err = false; 4098 4099 ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, 4100 logged_list, &ordered_io_err); 4101 if (ret) 4102 return ret; 4103 4104 if (ordered_io_err) { 4105 ctx->io_err = -EIO; 4106 return 0; 4107 } 4108 4109 btrfs_init_map_token(&token); 4110 4111 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4112 em->start + em->len, NULL, 0, 1, 4113 sizeof(*fi), &extent_inserted); 4114 if (ret) 4115 return ret; 4116 4117 if (!extent_inserted) { 4118 key.objectid = btrfs_ino(inode); 4119 key.type = BTRFS_EXTENT_DATA_KEY; 4120 key.offset = em->start; 4121 4122 ret = btrfs_insert_empty_item(trans, log, path, &key, 4123 sizeof(*fi)); 4124 if (ret) 4125 return ret; 4126 } 4127 leaf = path->nodes[0]; 4128 fi = btrfs_item_ptr(leaf, path->slots[0], 4129 struct btrfs_file_extent_item); 4130 4131 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4132 &token); 4133 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4134 btrfs_set_token_file_extent_type(leaf, fi, 4135 BTRFS_FILE_EXTENT_PREALLOC, 4136 &token); 4137 else 4138 btrfs_set_token_file_extent_type(leaf, fi, 4139 BTRFS_FILE_EXTENT_REG, 4140 &token); 4141 4142 block_len = max(em->block_len, em->orig_block_len); 4143 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4144 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4145 em->block_start, 4146 &token); 4147 
btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4148 &token); 4149 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4150 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4151 em->block_start - 4152 extent_offset, &token); 4153 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4154 &token); 4155 } else { 4156 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4157 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4158 &token); 4159 } 4160 4161 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4162 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4163 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4164 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4165 &token); 4166 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4167 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4168 btrfs_mark_buffer_dirty(leaf); 4169 4170 btrfs_release_path(path); 4171 4172 return ret; 4173 } 4174 4175 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4176 struct btrfs_root *root, 4177 struct btrfs_inode *inode, 4178 struct btrfs_path *path, 4179 struct list_head *logged_list, 4180 struct btrfs_log_ctx *ctx, 4181 const u64 start, 4182 const u64 end) 4183 { 4184 struct extent_map *em, *n; 4185 struct list_head extents; 4186 struct extent_map_tree *tree = &inode->extent_tree; 4187 u64 test_gen; 4188 int ret = 0; 4189 int num = 0; 4190 4191 INIT_LIST_HEAD(&extents); 4192 4193 down_write(&inode->dio_sem); 4194 write_lock(&tree->lock); 4195 test_gen = root->fs_info->last_trans_committed; 4196 4197 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4198 list_del_init(&em->list); 4199 4200 /* 4201 * Just an arbitrary number, this can be really CPU intensive 4202 * once we start getting a lot of extents, and really once we 4203 * have a bunch of extents we just want to commit since it will 4204 * be faster. 4205 */ 4206 if (++num > 32768) { 4207 list_del_init(&tree->modified_extents); 4208 ret = -EFBIG; 4209 goto process; 4210 } 4211 4212 if (em->generation <= test_gen) 4213 continue; 4214 /* Need a ref to keep it from getting evicted from cache */ 4215 refcount_inc(&em->refs); 4216 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4217 list_add_tail(&em->list, &extents); 4218 num++; 4219 } 4220 4221 list_sort(NULL, &extents, extent_cmp); 4222 btrfs_get_logged_extents(inode, logged_list, start, end); 4223 /* 4224 * Some ordered extents started by fsync might have completed 4225 * before we could collect them into the list logged_list, which 4226 * means they're gone, not in our logged_list nor in the inode's 4227 * ordered tree. We want the application/user space to know an 4228 * error happened while attempting to persist file data so that 4229 * it can take proper action. If such error happened, we leave 4230 * without writing to the log tree and the fsync must report the 4231 * file data write error and not commit the current transaction. 4232 */ 4233 ret = filemap_check_errors(inode->vfs_inode.i_mapping); 4234 if (ret) 4235 ctx->io_err = ret; 4236 process: 4237 while (!list_empty(&extents)) { 4238 em = list_entry(extents.next, struct extent_map, list); 4239 4240 list_del_init(&em->list); 4241 4242 /* 4243 * If we had an error we just need to delete everybody from our 4244 * private list. 
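 * We still need to clear the EXTENT_FLAG_LOGGING bit and drop the
 * reference we took when adding each extent map to the list, which is
 * what the clear_em_logging() and free_extent_map() calls below do.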
4245 */ 4246 if (ret) { 4247 clear_em_logging(tree, em); 4248 free_extent_map(em); 4249 continue; 4250 } 4251 4252 write_unlock(&tree->lock); 4253 4254 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4255 ctx); 4256 write_lock(&tree->lock); 4257 clear_em_logging(tree, em); 4258 free_extent_map(em); 4259 } 4260 WARN_ON(!list_empty(&extents)); 4261 write_unlock(&tree->lock); 4262 up_write(&inode->dio_sem); 4263 4264 btrfs_release_path(path); 4265 return ret; 4266 } 4267 4268 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4269 struct btrfs_path *path, u64 *size_ret) 4270 { 4271 struct btrfs_key key; 4272 int ret; 4273 4274 key.objectid = btrfs_ino(inode); 4275 key.type = BTRFS_INODE_ITEM_KEY; 4276 key.offset = 0; 4277 4278 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4279 if (ret < 0) { 4280 return ret; 4281 } else if (ret > 0) { 4282 *size_ret = 0; 4283 } else { 4284 struct btrfs_inode_item *item; 4285 4286 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4287 struct btrfs_inode_item); 4288 *size_ret = btrfs_inode_size(path->nodes[0], item); 4289 } 4290 4291 btrfs_release_path(path); 4292 return 0; 4293 } 4294 4295 /* 4296 * At the moment we always log all xattrs. This is to figure out at log replay 4297 * time which xattrs must have their deletion replayed. If a xattr is missing 4298 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4299 * because if a xattr is deleted, the inode is fsynced, and then a power failure 4300 * happens, the log is replayed the next time the fs is mounted and 4301 * we want the xattr to no longer exist (the same behaviour as other filesystems 4302 * with a journal: ext3/4, xfs, f2fs, etc). 4303 */ 4304 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4305 struct btrfs_root *root, 4306 struct btrfs_inode *inode, 4307 struct btrfs_path *path, 4308 struct btrfs_path *dst_path) 4309 { 4310 int ret; 4311 struct btrfs_key key; 4312 const u64 ino = btrfs_ino(inode); 4313 int ins_nr = 0; 4314 int start_slot = 0; 4315 4316 key.objectid = ino; 4317 key.type = BTRFS_XATTR_ITEM_KEY; 4318 key.offset = 0; 4319 4320 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4321 if (ret < 0) 4322 return ret; 4323 4324 while (true) { 4325 int slot = path->slots[0]; 4326 struct extent_buffer *leaf = path->nodes[0]; 4327 int nritems = btrfs_header_nritems(leaf); 4328 4329 if (slot >= nritems) { 4330 if (ins_nr > 0) { 4331 u64 last_extent = 0; 4332 4333 ret = copy_items(trans, inode, dst_path, path, 4334 &last_extent, start_slot, 4335 ins_nr, 1, 0); 4336 /* can't be 1, extent items aren't processed */ 4337 ASSERT(ret <= 0); 4338 if (ret < 0) 4339 return ret; 4340 ins_nr = 0; 4341 } 4342 ret = btrfs_next_leaf(root, path); 4343 if (ret < 0) 4344 return ret; 4345 else if (ret > 0) 4346 break; 4347 continue; 4348 } 4349 4350 btrfs_item_key_to_cpu(leaf, &key, slot); 4351 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4352 break; 4353 4354 if (ins_nr == 0) 4355 start_slot = slot; 4356 ins_nr++; 4357 path->slots[0]++; 4358 cond_resched(); 4359 } 4360 if (ins_nr > 0) { 4361 u64 last_extent = 0; 4362 4363 ret = copy_items(trans, inode, dst_path, path, 4364 &last_extent, start_slot, 4365 ins_nr, 1, 0); 4366 /* can't be 1, extent items aren't processed */ 4367 ASSERT(ret <= 0); 4368 if (ret < 0) 4369 return ret; 4370 } 4371 4372 return 0; 4373 } 4374 4375 /* 4376 * If the no holes feature is enabled we need to make sure any hole between the 4377 * last extent and the i_size of our
inode is explicitly marked in the log. This 4378 * is to make sure that doing something like: 4379 * 4380 * 1) create file with 128Kb of data 4381 * 2) truncate file to 64Kb 4382 * 3) truncate file to 256Kb 4383 * 4) fsync file 4384 * 5) <crash/power failure> 4385 * 6) mount fs and trigger log replay 4386 * 4387 * will give us a file with a size of 256Kb, the first 64Kb of data match what 4388 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4389 * file correspond to a hole. The presence of explicit holes in a log tree is 4390 * what guarantees that log replay will remove/adjust file extent items in the 4391 * fs/subvol tree. 4392 * 4393 * Here we do not need to care about holes between extents; that is already done 4394 * by copy_items(). We also only need to do this in the full sync path, where we 4395 * look up extents from the fs/subvol tree only. In the fast path case, we 4396 * go through the list of modified extent maps and if any represents a hole, we 4397 * insert a corresponding extent representing a hole in the log tree. 4398 */ 4399 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4400 struct btrfs_root *root, 4401 struct btrfs_inode *inode, 4402 struct btrfs_path *path) 4403 { 4404 struct btrfs_fs_info *fs_info = root->fs_info; 4405 int ret; 4406 struct btrfs_key key; 4407 u64 hole_start; 4408 u64 hole_size; 4409 struct extent_buffer *leaf; 4410 struct btrfs_root *log = root->log_root; 4411 const u64 ino = btrfs_ino(inode); 4412 const u64 i_size = i_size_read(&inode->vfs_inode); 4413 4414 if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4415 return 0; 4416 4417 key.objectid = ino; 4418 key.type = BTRFS_EXTENT_DATA_KEY; 4419 key.offset = (u64)-1; 4420 4421 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4422 ASSERT(ret != 0); 4423 if (ret < 0) 4424 return ret; 4425 4426 ASSERT(path->slots[0] > 0); 4427 path->slots[0]--; 4428 leaf = path->nodes[0]; 4429 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4430 4431 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4432 /* inode does not have any extents */ 4433 hole_start = 0; 4434 hole_size = i_size; 4435 } else { 4436 struct btrfs_file_extent_item *extent; 4437 u64 len; 4438 4439 /* 4440 * If there's an extent beyond i_size, an explicit hole was 4441 * already inserted by copy_items(). 4442 */ 4443 if (key.offset >= i_size) 4444 return 0; 4445 4446 extent = btrfs_item_ptr(leaf, path->slots[0], 4447 struct btrfs_file_extent_item); 4448 4449 if (btrfs_file_extent_type(leaf, extent) == 4450 BTRFS_FILE_EXTENT_INLINE) { 4451 len = btrfs_file_extent_inline_len(leaf, 4452 path->slots[0], 4453 extent); 4454 ASSERT(len == i_size); 4455 return 0; 4456 } 4457 4458 len = btrfs_file_extent_num_bytes(leaf, extent); 4459 /* Last extent goes beyond i_size, no need to log a hole. */ 4460 if (key.offset + len > i_size) 4461 return 0; 4462 hole_start = key.offset + len; 4463 hole_size = i_size - hole_start; 4464 } 4465 btrfs_release_path(path); 4466 4467 /* Last extent ends at i_size. */ 4468 if (hole_size == 0) 4469 return 0; 4470 4471 hole_size = ALIGN(hole_size, fs_info->sectorsize); 4472 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4473 hole_size, 0, hole_size, 0, 0, 0); 4474 return ret; 4475 } 4476 4477 /* 4478 * When we are logging a new inode X, check if it doesn't have a reference that 4479 * matches the reference from some other inode Y created in a past transaction 4480 * and that was renamed in the current transaction.
If we don't do this, then at 4481 * log replay time we can lose inode Y (and all its files if it's a directory): 4482 * 4483 * mkdir /mnt/x 4484 * echo "hello world" > /mnt/x/foobar 4485 * sync 4486 * mv /mnt/x /mnt/y 4487 * mkdir /mnt/x # or touch /mnt/x 4488 * xfs_io -c fsync /mnt/x 4489 * <power fail> 4490 * mount fs, trigger log replay 4491 * 4492 * After the log replay procedure, we would lose the first directory and all its 4493 * files (file foobar). 4494 * For the case where inode Y is not a directory we simply end up losing it: 4495 * 4496 * echo "123" > /mnt/foo 4497 * sync 4498 * mv /mnt/foo /mnt/bar 4499 * echo "abc" > /mnt/foo 4500 * xfs_io -c fsync /mnt/foo 4501 * <power fail> 4502 * 4503 * We also need this for cases where a snapshot entry is replaced by some other 4504 * entry (file or directory) otherwise we end up with an unreplayable log due to 4505 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4506 * if it were a regular entry: 4507 * 4508 * mkdir /mnt/x 4509 * btrfs subvolume snapshot /mnt /mnt/x/snap 4510 * btrfs subvolume delete /mnt/x/snap 4511 * rmdir /mnt/x 4512 * mkdir /mnt/x 4513 * fsync /mnt/x or fsync some new file inside it 4514 * <power fail> 4515 * 4516 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4517 * the same transaction. 4518 */ 4519 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4520 const int slot, 4521 const struct btrfs_key *key, 4522 struct btrfs_inode *inode, 4523 u64 *other_ino) 4524 { 4525 int ret; 4526 struct btrfs_path *search_path; 4527 char *name = NULL; 4528 u32 name_len = 0; 4529 u32 item_size = btrfs_item_size_nr(eb, slot); 4530 u32 cur_offset = 0; 4531 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4532 4533 search_path = btrfs_alloc_path(); 4534 if (!search_path) 4535 return -ENOMEM; 4536 search_path->search_commit_root = 1; 4537 search_path->skip_locking = 1; 4538 4539 while (cur_offset < item_size) { 4540 u64 parent; 4541 u32 this_name_len; 4542 u32 this_len; 4543 unsigned long name_ptr; 4544 struct btrfs_dir_item *di; 4545 4546 if (key->type == BTRFS_INODE_REF_KEY) { 4547 struct btrfs_inode_ref *iref; 4548 4549 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4550 parent = key->offset; 4551 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4552 name_ptr = (unsigned long)(iref + 1); 4553 this_len = sizeof(*iref) + this_name_len; 4554 } else { 4555 struct btrfs_inode_extref *extref; 4556 4557 extref = (struct btrfs_inode_extref *)(ptr + 4558 cur_offset); 4559 parent = btrfs_inode_extref_parent(eb, extref); 4560 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4561 name_ptr = (unsigned long)&extref->name; 4562 this_len = sizeof(*extref) + this_name_len; 4563 } 4564 4565 ret = btrfs_is_name_len_valid(eb, slot, name_ptr, 4566 this_name_len); 4567 if (!ret) { 4568 ret = -EIO; 4569 goto out; 4570 } 4571 if (this_name_len > name_len) { 4572 char *new_name; 4573 4574 new_name = krealloc(name, this_name_len, GFP_NOFS); 4575 if (!new_name) { 4576 ret = -ENOMEM; 4577 goto out; 4578 } 4579 name_len = this_name_len; 4580 name = new_name; 4581 } 4582 4583 read_extent_buffer(eb, name, name_ptr, this_name_len); 4584 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4585 parent, name, this_name_len, 0); 4586 if (di && !IS_ERR(di)) { 4587 struct btrfs_key di_key; 4588 4589 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4590 di, &di_key); 4591 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4592 ret = 1; 4593 *other_ino = di_key.objectid; 
4594 } else { 4595 ret = -EAGAIN; 4596 } 4597 goto out; 4598 } else if (IS_ERR(di)) { 4599 ret = PTR_ERR(di); 4600 goto out; 4601 } 4602 btrfs_release_path(search_path); 4603 4604 cur_offset += this_len; 4605 } 4606 ret = 0; 4607 out: 4608 btrfs_free_path(search_path); 4609 kfree(name); 4610 return ret; 4611 } 4612 4613 /* log a single inode in the tree log. 4614 * At least one parent directory for this inode must exist in the tree 4615 * or be logged already. 4616 * 4617 * Any items from this inode changed by the current transaction are copied 4618 * to the log tree. An extra reference is taken on any extents in this 4619 * file, allowing us to avoid a whole pile of corner cases around logging 4620 * blocks that have been removed from the tree. 4621 * 4622 * See LOG_INODE_ALL and related defines for a description of what inode_only 4623 * does. 4624 * 4625 * This handles both files and directories. 4626 */ 4627 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4628 struct btrfs_root *root, struct btrfs_inode *inode, 4629 int inode_only, 4630 const loff_t start, 4631 const loff_t end, 4632 struct btrfs_log_ctx *ctx) 4633 { 4634 struct btrfs_fs_info *fs_info = root->fs_info; 4635 struct btrfs_path *path; 4636 struct btrfs_path *dst_path; 4637 struct btrfs_key min_key; 4638 struct btrfs_key max_key; 4639 struct btrfs_root *log = root->log_root; 4640 struct extent_buffer *src = NULL; 4641 LIST_HEAD(logged_list); 4642 u64 last_extent = 0; 4643 int err = 0; 4644 int ret; 4645 int nritems; 4646 int ins_start_slot = 0; 4647 int ins_nr; 4648 bool fast_search = false; 4649 u64 ino = btrfs_ino(inode); 4650 struct extent_map_tree *em_tree = &inode->extent_tree; 4651 u64 logged_isize = 0; 4652 bool need_log_inode_item = true; 4653 4654 path = btrfs_alloc_path(); 4655 if (!path) 4656 return -ENOMEM; 4657 dst_path = btrfs_alloc_path(); 4658 if (!dst_path) { 4659 btrfs_free_path(path); 4660 return -ENOMEM; 4661 } 4662 4663 min_key.objectid = ino; 4664 min_key.type = BTRFS_INODE_ITEM_KEY; 4665 min_key.offset = 0; 4666 4667 max_key.objectid = ino; 4668 4669 4670 /* today the code can only do partial logging of directories */ 4671 if (S_ISDIR(inode->vfs_inode.i_mode) || 4672 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4673 &inode->runtime_flags) && 4674 inode_only >= LOG_INODE_EXISTS)) 4675 max_key.type = BTRFS_XATTR_ITEM_KEY; 4676 else 4677 max_key.type = (u8)-1; 4678 max_key.offset = (u64)-1; 4679 4680 /* 4681 * Only run delayed items if we are a dir or a new file. 4682 * Otherwise commit the delayed inode only, which is needed in 4683 * order for the log replay code to mark inodes for link count 4684 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4685 */ 4686 if (S_ISDIR(inode->vfs_inode.i_mode) || 4687 inode->generation > fs_info->last_trans_committed) 4688 ret = btrfs_commit_inode_delayed_items(trans, inode); 4689 else 4690 ret = btrfs_commit_inode_delayed_inode(inode); 4691 4692 if (ret) { 4693 btrfs_free_path(path); 4694 btrfs_free_path(dst_path); 4695 return ret; 4696 } 4697 4698 if (inode_only == LOG_OTHER_INODE) { 4699 inode_only = LOG_INODE_EXISTS; 4700 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4701 } else { 4702 mutex_lock(&inode->log_mutex); 4703 } 4704 4705 /* 4706 * a brute force approach to making sure we get the most uptodate 4707 * copies of everything. 
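 * Depending on the inode type and logging mode, this means dropping any
 * items previously logged for this inode from the log tree (see the
 * drop_objectid_items() calls below) before copying fresh ones.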
4708 */ 4709 if (S_ISDIR(inode->vfs_inode.i_mode)) { 4710 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4711 4712 if (inode_only == LOG_INODE_EXISTS) 4713 max_key_type = BTRFS_XATTR_ITEM_KEY; 4714 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4715 } else { 4716 if (inode_only == LOG_INODE_EXISTS) { 4717 /* 4718 * Make sure the new inode item we write to the log has 4719 * the same isize as the current one (if it exists). 4720 * This is necessary to prevent data loss after log 4721 * replay, and also to prevent doing a wrong expanding 4722 * truncate - e.g. create file, write 4K into offset 4723 * 0, fsync, write 4K into offset 4096, add hard link, 4724 * fsync some other file (to sync log), power fail - if 4725 * we use the inode's current i_size, after log replay 4726 * we get an 8Kb file, with the last 4Kb extent as a hole 4727 * (zeroes), as if an expanding truncate happened, 4728 * instead of getting a file of 4Kb only. 4729 */ 4730 err = logged_inode_size(log, inode, path, &logged_isize); 4731 if (err) 4732 goto out_unlock; 4733 } 4734 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4735 &inode->runtime_flags)) { 4736 if (inode_only == LOG_INODE_EXISTS) { 4737 max_key.type = BTRFS_XATTR_ITEM_KEY; 4738 ret = drop_objectid_items(trans, log, path, ino, 4739 max_key.type); 4740 } else { 4741 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4742 &inode->runtime_flags); 4743 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4744 &inode->runtime_flags); 4745 while (1) { 4746 ret = btrfs_truncate_inode_items(trans, 4747 log, &inode->vfs_inode, 0, 0); 4748 if (ret != -EAGAIN) 4749 break; 4750 } 4751 } 4752 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4753 &inode->runtime_flags) || 4754 inode_only == LOG_INODE_EXISTS) { 4755 if (inode_only == LOG_INODE_ALL) 4756 fast_search = true; 4757 max_key.type = BTRFS_XATTR_ITEM_KEY; 4758 ret = drop_objectid_items(trans, log, path, ino, 4759 max_key.type); 4760 } else { 4761 if (inode_only == LOG_INODE_ALL) 4762 fast_search = true; 4763 goto log_extents; 4764 } 4765 4766 } 4767 if (ret) { 4768 err = ret; 4769 goto out_unlock; 4770 } 4771 4772 while (1) { 4773 ins_nr = 0; 4774 ret = btrfs_search_forward(root, &min_key, 4775 path, trans->transid); 4776 if (ret < 0) { 4777 err = ret; 4778 goto out_unlock; 4779 } 4780 if (ret != 0) 4781 break; 4782 again: 4783 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4784 if (min_key.objectid != ino) 4785 break; 4786 if (min_key.type > max_key.type) 4787 break; 4788 4789 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4790 need_log_inode_item = false; 4791 4792 if ((min_key.type == BTRFS_INODE_REF_KEY || 4793 min_key.type == BTRFS_INODE_EXTREF_KEY) && 4794 inode->generation == trans->transid) { 4795 u64 other_ino = 0; 4796 4797 ret = btrfs_check_ref_name_override(path->nodes[0], 4798 path->slots[0], &min_key, inode, 4799 &other_ino); 4800 if (ret < 0) { 4801 err = ret; 4802 goto out_unlock; 4803 } else if (ret > 0 && ctx && 4804 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 4805 struct btrfs_key inode_key; 4806 struct inode *other_inode; 4807 4808 if (ins_nr > 0) { 4809 ins_nr++; 4810 } else { 4811 ins_nr = 1; 4812 ins_start_slot = path->slots[0]; 4813 } 4814 ret = copy_items(trans, inode, dst_path, path, 4815 &last_extent, ins_start_slot, 4816 ins_nr, inode_only, 4817 logged_isize); 4818 if (ret < 0) { 4819 err = ret; 4820 goto out_unlock; 4821 } 4822 ins_nr = 0; 4823 btrfs_release_path(path); 4824 inode_key.objectid = other_ino; 4825 inode_key.type = BTRFS_INODE_ITEM_KEY; 4826 inode_key.offset = 0;
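/*
 * Look up the other inode that owns the conflicting name so we
 * can log it too (see the comment below on why logging it
 * without holding its i_mutex is safe).
 */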
4827 other_inode = btrfs_iget(fs_info->sb, 4828 &inode_key, root, 4829 NULL); 4830 /* 4831 * If the other inode that had a conflicting dir 4832 * entry was deleted in the current transaction, 4833 * we don't need to do more work nor fallback to 4834 * a transaction commit. 4835 */ 4836 if (IS_ERR(other_inode) && 4837 PTR_ERR(other_inode) == -ENOENT) { 4838 goto next_key; 4839 } else if (IS_ERR(other_inode)) { 4840 err = PTR_ERR(other_inode); 4841 goto out_unlock; 4842 } 4843 /* 4844 * We are safe logging the other inode without 4845 * acquiring its i_mutex as long as we log with 4846 * the LOG_INODE_EXISTS mode. We're safe against 4847 * concurrent renames of the other inode as well 4848 * because during a rename we pin the log and 4849 * update the log with the new name before we 4850 * unpin it. 4851 */ 4852 err = btrfs_log_inode(trans, root, 4853 BTRFS_I(other_inode), 4854 LOG_OTHER_INODE, 0, LLONG_MAX, 4855 ctx); 4856 iput(other_inode); 4857 if (err) 4858 goto out_unlock; 4859 else 4860 goto next_key; 4861 } 4862 } 4863 4864 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4865 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4866 if (ins_nr == 0) 4867 goto next_slot; 4868 ret = copy_items(trans, inode, dst_path, path, 4869 &last_extent, ins_start_slot, 4870 ins_nr, inode_only, logged_isize); 4871 if (ret < 0) { 4872 err = ret; 4873 goto out_unlock; 4874 } 4875 ins_nr = 0; 4876 if (ret) { 4877 btrfs_release_path(path); 4878 continue; 4879 } 4880 goto next_slot; 4881 } 4882 4883 src = path->nodes[0]; 4884 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4885 ins_nr++; 4886 goto next_slot; 4887 } else if (!ins_nr) { 4888 ins_start_slot = path->slots[0]; 4889 ins_nr = 1; 4890 goto next_slot; 4891 } 4892 4893 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4894 ins_start_slot, ins_nr, inode_only, 4895 logged_isize); 4896 if (ret < 0) { 4897 err = ret; 4898 goto out_unlock; 4899 } 4900 if (ret) { 4901 ins_nr = 0; 4902 btrfs_release_path(path); 4903 continue; 4904 } 4905 ins_nr = 1; 4906 ins_start_slot = path->slots[0]; 4907 next_slot: 4908 4909 nritems = btrfs_header_nritems(path->nodes[0]); 4910 path->slots[0]++; 4911 if (path->slots[0] < nritems) { 4912 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 4913 path->slots[0]); 4914 goto again; 4915 } 4916 if (ins_nr) { 4917 ret = copy_items(trans, inode, dst_path, path, 4918 &last_extent, ins_start_slot, 4919 ins_nr, inode_only, logged_isize); 4920 if (ret < 0) { 4921 err = ret; 4922 goto out_unlock; 4923 } 4924 ret = 0; 4925 ins_nr = 0; 4926 } 4927 btrfs_release_path(path); 4928 next_key: 4929 if (min_key.offset < (u64)-1) { 4930 min_key.offset++; 4931 } else if (min_key.type < max_key.type) { 4932 min_key.type++; 4933 min_key.offset = 0; 4934 } else { 4935 break; 4936 } 4937 } 4938 if (ins_nr) { 4939 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4940 ins_start_slot, ins_nr, inode_only, 4941 logged_isize); 4942 if (ret < 0) { 4943 err = ret; 4944 goto out_unlock; 4945 } 4946 ret = 0; 4947 ins_nr = 0; 4948 } 4949 4950 btrfs_release_path(path); 4951 btrfs_release_path(dst_path); 4952 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 4953 if (err) 4954 goto out_unlock; 4955 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 4956 btrfs_release_path(path); 4957 btrfs_release_path(dst_path); 4958 err = btrfs_log_trailing_hole(trans, root, inode, path); 4959 if (err) 4960 goto out_unlock; 4961 } 4962 log_extents: 4963 btrfs_release_path(path); 4964 btrfs_release_path(dst_path); 
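/*
 * The search loop above clears need_log_inode_item once it copies the
 * inode item, so if it is still set here we must log the inode item
 * explicitly to keep the one in the log tree up to date.
 */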
4965 if (need_log_inode_item) { 4966 err = log_inode_item(trans, log, dst_path, inode); 4967 if (err) 4968 goto out_unlock; 4969 } 4970 if (fast_search) { 4971 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4972 &logged_list, ctx, start, end); 4973 if (ret) { 4974 err = ret; 4975 goto out_unlock; 4976 } 4977 } else if (inode_only == LOG_INODE_ALL) { 4978 struct extent_map *em, *n; 4979 4980 write_lock(&em_tree->lock); 4981 /* 4982 * We can't just remove every em if we're called for a ranged 4983 * fsync - that is, one that doesn't cover the whole possible 4984 * file range (0 to LLONG_MAX). This is because we can have 4985 * em's that fall outside the range we're logging and therefore 4986 * their ordered operations haven't completed yet 4987 * (btrfs_finish_ordered_io() not invoked yet). This means we 4988 * didn't get their respective file extent item in the fs/subvol 4989 * tree yet, and need to let the next fast fsync (one which 4990 * consults the list of modified extent maps) find the em so 4991 * that it logs a matching file extent item and waits for the 4992 * respective ordered operation to complete (if it's still 4993 * running). 4994 * 4995 * Removing every em outside the range we're logging would make 4996 * the next fast fsync not log their matching file extent items, 4997 * therefore making us lose data after a log replay. 4998 */ 4999 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5000 list) { 5001 const u64 mod_end = em->mod_start + em->mod_len - 1; 5002 5003 if (em->mod_start >= start && mod_end <= end) 5004 list_del_init(&em->list); 5005 } 5006 write_unlock(&em_tree->lock); 5007 } 5008 5009 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5010 ret = log_directory_changes(trans, root, inode, path, dst_path, 5011 ctx); 5012 if (ret) { 5013 err = ret; 5014 goto out_unlock; 5015 } 5016 } 5017 5018 spin_lock(&inode->lock); 5019 inode->logged_trans = trans->transid; 5020 inode->last_log_commit = inode->last_sub_trans; 5021 spin_unlock(&inode->lock); 5022 out_unlock: 5023 if (unlikely(err)) 5024 btrfs_put_logged_extents(&logged_list); 5025 else 5026 btrfs_submit_logged_extents(&logged_list, log); 5027 mutex_unlock(&inode->log_mutex); 5028 5029 btrfs_free_path(path); 5030 btrfs_free_path(dst_path); 5031 return err; 5032 } 5033 5034 /* 5035 * Check if we must fall back to a transaction commit when logging an inode. 5036 * This must be called after logging the inode and is used only in the context 5037 * where fsyncing an inode requires logging some other inode - in which 5038 * case we can't lock the i_mutex of each other inode we need to log as that 5039 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5040 * log inodes up or down in the hierarchy) or rename operations for example. So 5041 * we take the log_mutex of the inode after we have logged it and then check for 5042 * its last_unlink_trans value - this is safe because any task setting 5043 * last_unlink_trans must take the log_mutex and it must do this before it does 5044 * the actual unlink operation, so if we do this check before a concurrent task 5045 * sets last_unlink_trans it means we've logged a consistent version/state of 5046 * all the inode items, otherwise we are not sure and must do a transaction 5047 * commit (the concurrent task might have only updated last_unlink_trans before 5048 * we logged the inode or it might have also done the unlink).
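 *
 * Typical usage, as seen in log_new_dir_dentries() and
 * btrfs_log_all_parents():
 *
 *   ret = btrfs_log_inode(trans, root, inode, log_mode, 0, LLONG_MAX, ctx);
 *   if (!ret && btrfs_must_commit_transaction(trans, inode))
 *           ret = 1;
 *
 * where a return value of 1 makes the caller fall back to a full
 * transaction commit.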
5049 */ 5050 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5051 struct btrfs_inode *inode) 5052 { 5053 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5054 bool ret = false; 5055 5056 mutex_lock(&inode->log_mutex); 5057 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5058 /* 5059 * Make sure any commits to the log are forced to be full 5060 * commits. 5061 */ 5062 btrfs_set_log_full_commit(fs_info, trans); 5063 ret = true; 5064 } 5065 mutex_unlock(&inode->log_mutex); 5066 5067 return ret; 5068 } 5069 5070 /* 5071 * follow the dentry parent pointers up the chain and see if any 5072 * of the directories in the chain require a full commit before they can 5073 * be logged. Returns zero if nothing special needs to be done or 1 if 5074 * a full commit is required. 5075 */ 5076 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5077 struct btrfs_inode *inode, 5078 struct dentry *parent, 5079 struct super_block *sb, 5080 u64 last_committed) 5081 { 5082 int ret = 0; 5083 struct dentry *old_parent = NULL; 5084 struct btrfs_inode *orig_inode = inode; 5085 5086 /* 5087 * for a regular file, if its inode is already on disk, we don't 5088 * have to worry about the parents at all. This is because 5089 * we can use the last_unlink_trans field to record renames 5090 * and other fun in this file. 5091 */ 5092 if (S_ISREG(inode->vfs_inode.i_mode) && 5093 inode->generation <= last_committed && 5094 inode->last_unlink_trans <= last_committed) 5095 goto out; 5096 5097 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5098 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5099 goto out; 5100 inode = BTRFS_I(d_inode(parent)); 5101 } 5102 5103 while (1) { 5104 /* 5105 * If we are logging a directory then we start with our inode, 5106 * not our parent's inode, so we need to skip setting the 5107 * logged_trans so that further down in the log code we don't 5108 * think this inode has already been logged. 5109 */ 5110 if (inode != orig_inode) 5111 inode->logged_trans = trans->transid; 5112 smp_mb(); 5113 5114 if (btrfs_must_commit_transaction(trans, inode)) { 5115 ret = 1; 5116 break; 5117 } 5118 5119 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5120 break; 5121 5122 if (IS_ROOT(parent)) { 5123 inode = BTRFS_I(d_inode(parent)); 5124 if (btrfs_must_commit_transaction(trans, inode)) 5125 ret = 1; 5126 break; 5127 } 5128 5129 parent = dget_parent(parent); 5130 dput(old_parent); 5131 old_parent = parent; 5132 inode = BTRFS_I(d_inode(parent)); 5133 5134 } 5135 dput(old_parent); 5136 out: 5137 return ret; 5138 } 5139 5140 struct btrfs_dir_list { 5141 u64 ino; 5142 struct list_head list; 5143 }; 5144 5145 /* 5146 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5147 * details about why it is needed. 5148 * This is a recursive operation - if an existing dentry corresponds to a 5149 * directory, that directory's new entries are logged too (same behaviour as 5150 * ext3/4, xfs, f2fs, reiserfs, nilfs2).
Note that when logging the inodes 5151 * the dentries point to, we do not lock their i_mutex; otherwise lockdep 5152 * complains about the following circular lock dependency / possible deadlock: 5153 * 5154 * CPU0 CPU1 5155 * ---- ---- 5156 * lock(&type->i_mutex_dir_key#3/2); 5157 * lock(sb_internal#2); 5158 * lock(&type->i_mutex_dir_key#3/2); 5159 * lock(&sb->s_type->i_mutex_key#14); 5160 * 5161 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5162 * sb_start_intwrite() in btrfs_start_transaction(). 5163 * Not locking i_mutex of the inodes is still safe because: 5164 * 5165 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5166 * that while logging the inode new references (names) are added or removed 5167 * from the inode, leaving the logged inode item with a link count that does 5168 * not match the number of logged inode reference items. This is fine because 5169 * at log replay time we compute the real number of links and correct the 5170 * link count in the inode item (see replay_one_buffer() and 5171 * link_to_fixup_dir()); 5172 * 5173 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5174 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5175 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5176 * has a size that doesn't match the sum of the lengths of all the logged 5177 * names. This does not result in a problem because if a dir_item key is 5178 * logged but its matching dir_index key is not logged, at log replay time we 5179 * don't use it to replay the respective name (see replay_one_name()). On the 5180 * other hand if only the dir_index key ends up being logged, the respective 5181 * name is added to the fs/subvol tree with both the dir_item and dir_index 5182 * keys created (see replay_one_name()). 5183 * The directory's inode item with a wrong i_size is not a problem either, 5184 * since we don't use it at log replay time to set the i_size in the inode 5185 * item of the fs/subvol tree (see overwrite_item()).
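 *
 * Also note that dir items whose key is of type BTRFS_ROOT_ITEM_KEY
 * (snapshot/subvolume entries) are skipped below, since they do not
 * correspond to inodes in this root that we could log.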
5186 */ 5187 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5188 struct btrfs_root *root, 5189 struct btrfs_inode *start_inode, 5190 struct btrfs_log_ctx *ctx) 5191 { 5192 struct btrfs_fs_info *fs_info = root->fs_info; 5193 struct btrfs_root *log = root->log_root; 5194 struct btrfs_path *path; 5195 LIST_HEAD(dir_list); 5196 struct btrfs_dir_list *dir_elem; 5197 int ret = 0; 5198 5199 path = btrfs_alloc_path(); 5200 if (!path) 5201 return -ENOMEM; 5202 5203 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5204 if (!dir_elem) { 5205 btrfs_free_path(path); 5206 return -ENOMEM; 5207 } 5208 dir_elem->ino = btrfs_ino(start_inode); 5209 list_add_tail(&dir_elem->list, &dir_list); 5210 5211 while (!list_empty(&dir_list)) { 5212 struct extent_buffer *leaf; 5213 struct btrfs_key min_key; 5214 int nritems; 5215 int i; 5216 5217 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5218 list); 5219 if (ret) 5220 goto next_dir_inode; 5221 5222 min_key.objectid = dir_elem->ino; 5223 min_key.type = BTRFS_DIR_ITEM_KEY; 5224 min_key.offset = 0; 5225 again: 5226 btrfs_release_path(path); 5227 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5228 if (ret < 0) { 5229 goto next_dir_inode; 5230 } else if (ret > 0) { 5231 ret = 0; 5232 goto next_dir_inode; 5233 } 5234 5235 process_leaf: 5236 leaf = path->nodes[0]; 5237 nritems = btrfs_header_nritems(leaf); 5238 for (i = path->slots[0]; i < nritems; i++) { 5239 struct btrfs_dir_item *di; 5240 struct btrfs_key di_key; 5241 struct inode *di_inode; 5242 struct btrfs_dir_list *new_dir_elem; 5243 int log_mode = LOG_INODE_EXISTS; 5244 int type; 5245 5246 btrfs_item_key_to_cpu(leaf, &min_key, i); 5247 if (min_key.objectid != dir_elem->ino || 5248 min_key.type != BTRFS_DIR_ITEM_KEY) 5249 goto next_dir_inode; 5250 5251 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5252 type = btrfs_dir_type(leaf, di); 5253 if (btrfs_dir_transid(leaf, di) < trans->transid && 5254 type != BTRFS_FT_DIR) 5255 continue; 5256 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5257 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5258 continue; 5259 5260 btrfs_release_path(path); 5261 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5262 if (IS_ERR(di_inode)) { 5263 ret = PTR_ERR(di_inode); 5264 goto next_dir_inode; 5265 } 5266 5267 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5268 iput(di_inode); 5269 break; 5270 } 5271 5272 ctx->log_new_dentries = false; 5273 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5274 log_mode = LOG_INODE_ALL; 5275 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5276 log_mode, 0, LLONG_MAX, ctx); 5277 if (!ret && 5278 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5279 ret = 1; 5280 iput(di_inode); 5281 if (ret) 5282 goto next_dir_inode; 5283 if (ctx->log_new_dentries) { 5284 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5285 GFP_NOFS); 5286 if (!new_dir_elem) { 5287 ret = -ENOMEM; 5288 goto next_dir_inode; 5289 } 5290 new_dir_elem->ino = di_key.objectid; 5291 list_add_tail(&new_dir_elem->list, &dir_list); 5292 } 5293 break; 5294 } 5295 if (i == nritems) { 5296 ret = btrfs_next_leaf(log, path); 5297 if (ret < 0) { 5298 goto next_dir_inode; 5299 } else if (ret > 0) { 5300 ret = 0; 5301 goto next_dir_inode; 5302 } 5303 goto process_leaf; 5304 } 5305 if (min_key.offset < (u64)-1) { 5306 min_key.offset++; 5307 goto again; 5308 } 5309 next_dir_inode: 5310 list_del(&dir_elem->list); 5311 kfree(dir_elem); 5312 } 5313 5314 btrfs_free_path(path); 5315 return ret; 5316 } 5317 5318 static int 
btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5319 struct btrfs_inode *inode, 5320 struct btrfs_log_ctx *ctx) 5321 { 5322 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5323 int ret; 5324 struct btrfs_path *path; 5325 struct btrfs_key key; 5326 struct btrfs_root *root = inode->root; 5327 const u64 ino = btrfs_ino(inode); 5328 5329 path = btrfs_alloc_path(); 5330 if (!path) 5331 return -ENOMEM; 5332 path->skip_locking = 1; 5333 path->search_commit_root = 1; 5334 5335 key.objectid = ino; 5336 key.type = BTRFS_INODE_REF_KEY; 5337 key.offset = 0; 5338 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5339 if (ret < 0) 5340 goto out; 5341 5342 while (true) { 5343 struct extent_buffer *leaf = path->nodes[0]; 5344 int slot = path->slots[0]; 5345 u32 cur_offset = 0; 5346 u32 item_size; 5347 unsigned long ptr; 5348 5349 if (slot >= btrfs_header_nritems(leaf)) { 5350 ret = btrfs_next_leaf(root, path); 5351 if (ret < 0) 5352 goto out; 5353 else if (ret > 0) 5354 break; 5355 continue; 5356 } 5357 5358 btrfs_item_key_to_cpu(leaf, &key, slot); 5359 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5360 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5361 break; 5362 5363 item_size = btrfs_item_size_nr(leaf, slot); 5364 ptr = btrfs_item_ptr_offset(leaf, slot); 5365 while (cur_offset < item_size) { 5366 struct btrfs_key inode_key; 5367 struct inode *dir_inode; 5368 5369 inode_key.type = BTRFS_INODE_ITEM_KEY; 5370 inode_key.offset = 0; 5371 5372 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5373 struct btrfs_inode_extref *extref; 5374 5375 extref = (struct btrfs_inode_extref *) 5376 (ptr + cur_offset); 5377 inode_key.objectid = btrfs_inode_extref_parent( 5378 leaf, extref); 5379 cur_offset += sizeof(*extref); 5380 cur_offset += btrfs_inode_extref_name_len(leaf, 5381 extref); 5382 } else { 5383 inode_key.objectid = key.offset; 5384 cur_offset = item_size; 5385 } 5386 5387 dir_inode = btrfs_iget(fs_info->sb, &inode_key, 5388 root, NULL); 5389 /* If parent inode was deleted, skip it. */ 5390 if (IS_ERR(dir_inode)) 5391 continue; 5392 5393 if (ctx) 5394 ctx->log_new_dentries = false; 5395 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5396 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5397 if (!ret && 5398 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5399 ret = 1; 5400 if (!ret && ctx && ctx->log_new_dentries) 5401 ret = log_new_dir_dentries(trans, root, 5402 BTRFS_I(dir_inode), ctx); 5403 iput(dir_inode); 5404 if (ret) 5405 goto out; 5406 } 5407 path->slots[0]++; 5408 } 5409 ret = 0; 5410 out: 5411 btrfs_free_path(path); 5412 return ret; 5413 } 5414 5415 /* 5416 * helper function around btrfs_log_inode to make sure newly created 5417 * parent directories also end up in the log. Only minimal logging (the 5418 * inode item and backrefs) is done for any parent directories that are 5419 * older than the last committed transaction. 5420 */ 5421 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5422 struct btrfs_root *root, 5423 struct btrfs_inode *inode, 5424 struct dentry *parent, 5425 const loff_t start, 5426 const loff_t end, 5427 int exists_only, 5428 struct btrfs_log_ctx *ctx) 5429 { 5430 struct btrfs_fs_info *fs_info = root->fs_info; 5431 int inode_only = exists_only ?
LOG_INODE_EXISTS : LOG_INODE_ALL; 5432 struct super_block *sb; 5433 struct dentry *old_parent = NULL; 5434 int ret = 0; 5435 u64 last_committed = fs_info->last_trans_committed; 5436 bool log_dentries = false; 5437 struct btrfs_inode *orig_inode = inode; 5438 5439 sb = inode->vfs_inode.i_sb; 5440 5441 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5442 ret = 1; 5443 goto end_no_trans; 5444 } 5445 5446 /* 5447 * If the previous transaction commit didn't complete, we have to do a 5448 * full commit ourselves. 5449 */ 5450 if (fs_info->last_trans_log_full_commit > 5451 fs_info->last_trans_committed) { 5452 ret = 1; 5453 goto end_no_trans; 5454 } 5455 5456 if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { 5457 ret = 1; 5458 goto end_no_trans; 5459 } 5460 5461 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5462 last_committed); 5463 if (ret) 5464 goto end_no_trans; 5465 5466 if (btrfs_inode_in_log(inode, trans->transid)) { 5467 ret = BTRFS_NO_LOG_SYNC; 5468 goto end_no_trans; 5469 } 5470 5471 ret = start_log_trans(trans, root, ctx); 5472 if (ret) 5473 goto end_no_trans; 5474 5475 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5476 if (ret) 5477 goto end_trans; 5478 5479 /* 5480 * for a regular file, if its inode is already on disk, we don't 5481 * have to worry about the parents at all. This is because 5482 * we can use the last_unlink_trans field to record renames 5483 * and other fun in this file. 5484 */ 5485 if (S_ISREG(inode->vfs_inode.i_mode) && 5486 inode->generation <= last_committed && 5487 inode->last_unlink_trans <= last_committed) { 5488 ret = 0; 5489 goto end_trans; 5490 } 5491 5492 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 5493 log_dentries = true; 5494 5495 /* 5496 * On unlink we must make sure all our current and old parent directory 5497 * inodes are fully logged. This is to prevent leaving dangling 5498 * directory index entries in directories that were our parents but are 5499 * not anymore. Not doing this results in the old parent directory being 5500 * impossible to delete after log replay (rmdir will always fail with 5501 * error -ENOTEMPTY). 5502 * 5503 * Example 1: 5504 * 5505 * mkdir testdir 5506 * touch testdir/foo 5507 * ln testdir/foo testdir/bar 5508 * sync 5509 * unlink testdir/bar 5510 * xfs_io -c fsync testdir/foo 5511 * <power failure> 5512 * mount fs, triggers log replay 5513 * 5514 * If we don't log the parent directory (testdir), after log replay the 5515 * directory still has an entry pointing to the file inode using the bar 5516 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5517 * the file inode has a link count of 1. 5518 * 5519 * Example 2: 5520 * 5521 * mkdir testdir 5522 * touch foo 5523 * ln foo testdir/foo2 5524 * ln foo testdir/foo3 5525 * sync 5526 * unlink testdir/foo3 5527 * xfs_io -c fsync foo 5528 * <power failure> 5529 * mount fs, triggers log replay 5530 * 5531 * Similar to the first example, after log replay the parent directory 5532 * testdir still has an entry named foo3 pointing to the file inode, 5533 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5534 * and has a link count of 2.
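 *
 * This is why, when last_unlink_trans is newer than the last committed
 * transaction, the code below calls btrfs_log_all_parents(), which logs
 * every current parent directory of the inode with LOG_INODE_ALL.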
5535 */ 5536 if (inode->last_unlink_trans > last_committed) { 5537 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5538 if (ret) 5539 goto end_trans; 5540 } 5541 5542 while (1) { 5543 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5544 break; 5545 5546 inode = BTRFS_I(d_inode(parent)); 5547 if (root != inode->root) 5548 break; 5549 5550 if (inode->generation > last_committed) { 5551 ret = btrfs_log_inode(trans, root, inode, 5552 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5553 if (ret) 5554 goto end_trans; 5555 } 5556 if (IS_ROOT(parent)) 5557 break; 5558 5559 parent = dget_parent(parent); 5560 dput(old_parent); 5561 old_parent = parent; 5562 } 5563 if (log_dentries) 5564 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5565 else 5566 ret = 0; 5567 end_trans: 5568 dput(old_parent); 5569 if (ret < 0) { 5570 btrfs_set_log_full_commit(fs_info, trans); 5571 ret = 1; 5572 } 5573 5574 if (ret) 5575 btrfs_remove_log_ctx(root, ctx); 5576 btrfs_end_log_trans(root); 5577 end_no_trans: 5578 return ret; 5579 } 5580 5581 /* 5582 * it is not safe to log dentry if the chunk root has added new 5583 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5584 * If this returns 1, you must commit the transaction to safely get your 5585 * data on disk. 5586 */ 5587 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5588 struct btrfs_root *root, struct dentry *dentry, 5589 const loff_t start, 5590 const loff_t end, 5591 struct btrfs_log_ctx *ctx) 5592 { 5593 struct dentry *parent = dget_parent(dentry); 5594 int ret; 5595 5596 ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), 5597 parent, start, end, 0, ctx); 5598 dput(parent); 5599 5600 return ret; 5601 } 5602 5603 /* 5604 * should be called during mount to recover and replay any log trees 5605 * from the FS 5606 */ 5607 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5608 { 5609 int ret; 5610 struct btrfs_path *path; 5611 struct btrfs_trans_handle *trans; 5612 struct btrfs_key key; 5613 struct btrfs_key found_key; 5614 struct btrfs_key tmp_key; 5615 struct btrfs_root *log; 5616 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5617 struct walk_control wc = { 5618 .process_func = process_one_buffer, 5619 .stage = 0, 5620 }; 5621 5622 path = btrfs_alloc_path(); 5623 if (!path) 5624 return -ENOMEM; 5625 5626 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5627 5628 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5629 if (IS_ERR(trans)) { 5630 ret = PTR_ERR(trans); 5631 goto error; 5632 } 5633 5634 wc.trans = trans; 5635 wc.pin = 1; 5636 5637 ret = walk_log_tree(trans, log_root_tree, &wc); 5638 if (ret) { 5639 btrfs_handle_fs_error(fs_info, ret, 5640 "Failed to pin buffers while recovering log root tree."); 5641 goto error; 5642 } 5643 5644 again: 5645 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5646 key.offset = (u64)-1; 5647 key.type = BTRFS_ROOT_ITEM_KEY; 5648 5649 while (1) { 5650 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5651 5652 if (ret < 0) { 5653 btrfs_handle_fs_error(fs_info, ret, 5654 "Couldn't find tree log root."); 5655 goto error; 5656 } 5657 if (ret > 0) { 5658 if (path->slots[0] == 0) 5659 break; 5660 path->slots[0]--; 5661 } 5662 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5663 path->slots[0]); 5664 btrfs_release_path(path); 5665 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5666 break; 5667 5668 log = btrfs_read_fs_root(log_root_tree, &found_key); 5669 if (IS_ERR(log)) { 5670 ret = PTR_ERR(log); 5671
btrfs_handle_fs_error(fs_info, ret, 5672 "Couldn't read tree log root."); 5673 goto error; 5674 } 5675 5676 tmp_key.objectid = found_key.offset; 5677 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5678 tmp_key.offset = (u64)-1; 5679 5680 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5681 if (IS_ERR(wc.replay_dest)) { 5682 ret = PTR_ERR(wc.replay_dest); 5683 free_extent_buffer(log->node); 5684 free_extent_buffer(log->commit_root); 5685 kfree(log); 5686 btrfs_handle_fs_error(fs_info, ret, 5687 "Couldn't read target root for tree log recovery."); 5688 goto error; 5689 } 5690 5691 wc.replay_dest->log_root = log; 5692 btrfs_record_root_in_trans(trans, wc.replay_dest); 5693 ret = walk_log_tree(trans, log, &wc); 5694 5695 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5696 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5697 path); 5698 } 5699 5700 key.offset = found_key.offset - 1; 5701 wc.replay_dest->log_root = NULL; 5702 free_extent_buffer(log->node); 5703 free_extent_buffer(log->commit_root); 5704 kfree(log); 5705 5706 if (ret) 5707 goto error; 5708 5709 if (found_key.offset == 0) 5710 break; 5711 } 5712 btrfs_release_path(path); 5713 5714 /* step one is to pin it all, step two is to replay just inodes */ 5715 if (wc.pin) { 5716 wc.pin = 0; 5717 wc.process_func = replay_one_buffer; 5718 wc.stage = LOG_WALK_REPLAY_INODES; 5719 goto again; 5720 } 5721 /* step three is to replay everything */ 5722 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5723 wc.stage++; 5724 goto again; 5725 } 5726 5727 btrfs_free_path(path); 5728 5729 /* step 4: commit the transaction, which also unpins the blocks */ 5730 ret = btrfs_commit_transaction(trans); 5731 if (ret) 5732 return ret; 5733 5734 free_extent_buffer(log_root_tree->node); 5735 log_root_tree->log_root = NULL; 5736 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5737 kfree(log_root_tree); 5738 5739 return 0; 5740 error: 5741 if (wc.trans) 5742 btrfs_end_transaction(wc.trans); 5743 btrfs_free_path(path); 5744 return ret; 5745 } 5746 5747 /* 5748 * there are some corner cases where we want to force a full 5749 * commit instead of allowing a directory to be logged. 5750 * 5751 * They revolve around files that were unlinked from the directory, and 5752 * this function updates the parent directory so that a full commit is 5753 * properly done if it is fsync'd later after the unlinks are done. 5754 * 5755 * Must be called before the unlink operations (updates to the subvolume tree, 5756 * inodes, etc) are done. 5757 */ 5758 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5759 struct btrfs_inode *dir, struct btrfs_inode *inode, 5760 int for_rename) 5761 { 5762 /* 5763 * when we're logging a file, if it hasn't been renamed 5764 * or unlinked, and its inode is fully committed on disk, 5765 * we don't have to worry about walking up the directory chain 5766 * to log its parents. 5767 * 5768 * So, we use the last_unlink_trans field to put this transid 5769 * into the file. When the file is logged we check it and 5770 * don't log the parents if the file is fully on disk.
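 * (btrfs_log_inode_parent() and check_parent_dirs_for_sync() are where
 * last_unlink_trans is compared against the last committed transaction
 * to make that decision.)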
5771 */ 5772 mutex_lock(&inode->log_mutex); 5773 inode->last_unlink_trans = trans->transid; 5774 mutex_unlock(&inode->log_mutex); 5775 5776 /* 5777 * if this directory was already logged, any new 5778 * names for this file/dir will get recorded 5779 */ 5780 smp_mb(); 5781 if (dir->logged_trans == trans->transid) 5782 return; 5783 5784 /* 5785 * if the inode we're about to unlink was logged, 5786 * the log will be properly updated for any new names 5787 */ 5788 if (inode->logged_trans == trans->transid) 5789 return; 5790 5791 /* 5792 * when renaming files across directories, if the directory 5793 * we're unlinking from gets fsync'd later on, there's 5794 * no way to find the destination directory later and fsync it 5795 * properly. So, we have to be conservative and force commits 5796 * so the new name gets discovered. 5797 */ 5798 if (for_rename) 5799 goto record; 5800 5801 /* we can safely do the unlink without any special recording */ 5802 return; 5803 5804 record: 5805 mutex_lock(&dir->log_mutex); 5806 dir->last_unlink_trans = trans->transid; 5807 mutex_unlock(&dir->log_mutex); 5808 } 5809 5810 /* 5811 * Make sure that if someone attempts to fsync the parent directory of a deleted 5812 * snapshot, it ends up triggering a transaction commit. This is to guarantee 5813 * that after replaying the log tree of the parent directory's root we will not 5814 * see the snapshot anymore and at log replay time we will not see any log tree 5815 * corresponding to the deleted snapshot's root, which could lead to replaying 5816 * it after replaying the log tree of the parent directory (which would replay 5817 * the snapshot delete operation). 5818 * 5819 * Must be called before the actual snapshot destroy operation (updates to the 5820 * parent root and tree of tree roots trees, etc) are done. 5821 */ 5822 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 5823 struct btrfs_inode *dir) 5824 { 5825 mutex_lock(&dir->log_mutex); 5826 dir->last_unlink_trans = trans->transid; 5827 mutex_unlock(&dir->log_mutex); 5828 } 5829 5830 /* 5831 * Call this after adding a new name for a file and it will properly 5832 * update the log to reflect the new name. 5833 * 5834 * It will return zero if all goes well, and it will return 1 if a 5835 * full transaction commit is required. 5836 */ 5837 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5838 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 5839 struct dentry *parent) 5840 { 5841 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5842 struct btrfs_root *root = inode->root; 5843 5844 /* 5845 * this will force the logging code to walk the dentry chain 5846 * up for the file 5847 */ 5848 if (S_ISREG(inode->vfs_inode.i_mode)) 5849 inode->last_unlink_trans = trans->transid; 5850 5851 /* 5852 * if this inode hasn't been logged and the directory we're renaming it 5853 * from hasn't been logged, we don't need to log it 5854 */ 5855 if (inode->logged_trans <= fs_info->last_trans_committed && 5856 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) 5857 return 0; 5858 5859 return btrfs_log_inode_parent(trans, root, inode, parent, 0, 5860 LLONG_MAX, 1, NULL); 5861 } 5862 5863