/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"
#include "compression.h"
#include "qgroup.h"

/*
 * magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 * LOG_OTHER_INODE means to log an inode other than the one fsync was
 * called on, with LOG_INODE_EXISTS semantics
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  With the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3
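/*
 * Illustrative sketch only: replay is expected to walk the whole log tree
 * once per stage, bumping the stage between passes, roughly like
 *
 *	struct walk_control wc = { .stage = LOG_WALK_PIN_ONLY, ... };
 *
 *	while (wc.stage <= LOG_WALK_REPLAY_ALL) {
 *		ret = walk_log_tree(trans, log, &wc);
 *		...
 *		wc.stage++;
 *	}
 *
 * (walk_log_tree() lives later in this file; the exact driver loop is an
 * assumption based on the stage constants above.)
 */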
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */
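/*
 * Illustrative example (an assumption based on the comment above, not a
 * quote of the fsync path): a file fsync is expected to boil down to
 *
 *	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx);
 *
 * with a full btrfs_commit_transaction() only as the fallback when the
 * log cannot be used (e.g. -EAGAIN from start_log_trans() below).
 */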
/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;	/* note: never set to 0 below */

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/*
		 * Implicit memory barrier after atomic_dec_and_test
		 */
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}
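/*
 * Illustrative sketch (an assumption about callers elsewhere in btrfs):
 * unlink/rename paths are expected to bracket their work with the pair
 * above so that a concurrent log sync waits for them:
 *
 *	btrfs_pin_log_trans(root);
 *	... unlink/rename work that must not race with a log sync ...
 *	btrfs_end_log_trans(root);
 */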
/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(fs_info, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
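/*
 * Illustrative examples (assumptions, based on the flag descriptions in
 * walk_control above) of how the flags combine for the two main walks:
 *
 *	// flushing the log during a sync
 *	struct walk_control wc = {
 *		.write = 1, .wait = 1,
 *		.process_func = process_one_buffer,
 *	};
 *
 *	// pinning extents at the start of replay
 *	struct walk_control wc = {
 *		.pin = 1, .stage = LOG_WALK_PIN_ONLY,
 *		.process_func = process_one_buffer,
 *	};
 */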
/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(fs_info, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(fs_info, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}
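/*
 * Illustrative example (grounded in the callers below): every replay helper
 * uses the same pattern around read_one_inode():
 *
 *	inode = read_one_inode(root, objectid);
 *	if (!inode)
 *		return -EIO;	// or -ENOENT for optional parent dirs
 *	...
 *	iput(inode);
 */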
/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the dirty extent: here we did a shallow
		 * file extent item copy and skipped the normal backref update,
		 * modifying the extent tree all by ourselves.
		 * So we need to record the dirty extent for qgroup ourselves,
		 * as the owner of the file extent changed from the log tree
		 * (doesn't affect qgroup) to the fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans, fs_info,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, fs_info,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						fs_info,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range.  We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls).  In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other.  For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.  Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_del_csums(trans, fs_info,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, fs_info);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}
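/*
 * Illustrative example (mirrors the caller add_inode_ref() further down):
 * inode_in_dir() is used as a boolean gate before conflict resolution:
 *
 *	if (!inode_in_dir(root, path, dirid, ino, ref_index,
 *			  name, namelen)) {
 *		// resolve conflicting names, then btrfs_add_link(...)
 *	}
 */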
/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, fs_info);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans,
								fs_info);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, int slot,
			     unsigned long ref_ptr, u32 *namelen, char **name,
			     u64 *index, u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
				     *namelen))
		return -EIO;

	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, int slot,
			  unsigned long ref_ptr, u32 *namelen, char **name,
			  u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
				     *namelen))
		return -EIO;

	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
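/*
 * Illustrative sketch (grounded in add_inode_ref() below): both ref item
 * formats pack [struct][name] records back to back inside one item, so
 * callers walk them like
 *
 *	ptr = btrfs_item_ptr_offset(eb, slot);
 *	end = ptr + btrfs_item_size_nr(eb, slot);
 *	while (ptr < end) {
 *		ret = ref_get_fields(eb, slot, ptr, &namelen, &name, &index);
 *		...
 *		ptr += sizeof(struct btrfs_inode_ref) + namelen;
 *	}
 */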
/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
					&name, &ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
					     &name, &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				  btrfs_ino(BTRFS_I(inode)), ref_index,
				  name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata.  if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, BTRFS_I(dir),
					     BTRFS_I(inode),
					     name, namelen, 0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}
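/*
 * Note (an inference from the -EEXIST handling above): replay can try to
 * orphan the same zero-nlink inode more than once, so the helper is kept
 * idempotent:
 *
 *	ret = insert_orphan_item(trans, root, ino);	// -EEXIST mapped to 0
 */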
static int count_inode_extrefs(struct btrfs_root *root,
			       struct btrfs_inode *inode,
			       struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode,
			    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}
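/*
 * Illustrative example (matches fixup_inode_link_count() below): the two
 * counters above are combined to compute the real link count after replay:
 *
 *	nlink = count_inode_refs(root, inode, path);	// < 0 on error
 *	nlink += count_inode_extrefs(root, inode, path);
 *
 * (count_inode_extrefs() swallows -ENOENT itself, so "no extrefs" simply
 * contributes zero.)
 */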
/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}
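/*
 * Note on the fixup items created above (a restatement, not a new
 * mechanism): they live under a reserved objectid so they can never
 * collide with real directory data:
 *
 *	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
 *	key.type     = BTRFS_ORPHAN_ITEM_KEY;
 *	key.offset   = objectid;	// the inode to re-check
 *
 * fixup_inode_link_counts() above walks exactly these keys after replay.
 */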
/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}
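/*
 * Illustrative example (this is how replay_one_name() below uses it): the
 * check avoids inserting a dentry that a later INODE_REF/INODE_EXTREF
 * replay will add anyway:
 *
 *	if (name_in_log_ref(root->log_root, name, name_len,
 *			    key->objectid, log_key.objectid)) {
 *		// the dentry will be added when the ref item is replayed
 *		return 0;
 *	}
 */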
/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = (btrfs_lookup_inode(trans, root, path, &log_key, 0) == 0);
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}
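/*
 * Illustrative example (mirrors replay_one_dir_item() below): callers rely
 * on the tri-state return documented above:
 *
 *	ret = replay_one_name(trans, root, path, eb, di, key);
 *	if (ret < 0)
 *		return ret;	// error
 *	if (ret == 1)
 *		...		// name replayed: consider a link count fixup
 *	// ret == 0: name skipped, nothing else to do
 */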
/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(fs_info, eb, slot, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fix up the link count of
		 * the inode the entry points to.  Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for this
		 * dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir.  This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}
/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	path->slots[0]++;
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
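/*
 * Illustrative sketch (an assumption matching replay_dir_deletes(), which
 * is only forward-declared in this section): deletion replay walks the
 * authoritative ranges like
 *
 *	range_start = 0;
 *	while (find_dir_range(log, path, dirid, key_type,
 *			      &range_start, &range_end) == 0) {
 *		// check every subvolume dir item in [range_start, range_end]
 *		range_start = range_end + 1;
 *	}
 */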
If the directory 1994 * item is not in the log, the item is removed and the inode it points 1995 * to is unlinked 1996 */ 1997 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1998 struct btrfs_root *root, 1999 struct btrfs_root *log, 2000 struct btrfs_path *path, 2001 struct btrfs_path *log_path, 2002 struct inode *dir, 2003 struct btrfs_key *dir_key) 2004 { 2005 struct btrfs_fs_info *fs_info = root->fs_info; 2006 int ret; 2007 struct extent_buffer *eb; 2008 int slot; 2009 u32 item_size; 2010 struct btrfs_dir_item *di; 2011 struct btrfs_dir_item *log_di; 2012 int name_len; 2013 unsigned long ptr; 2014 unsigned long ptr_end; 2015 char *name; 2016 struct inode *inode; 2017 struct btrfs_key location; 2018 2019 again: 2020 eb = path->nodes[0]; 2021 slot = path->slots[0]; 2022 item_size = btrfs_item_size_nr(eb, slot); 2023 ptr = btrfs_item_ptr_offset(eb, slot); 2024 ptr_end = ptr + item_size; 2025 while (ptr < ptr_end) { 2026 di = (struct btrfs_dir_item *)ptr; 2027 if (verify_dir_item(fs_info, eb, slot, di)) { 2028 ret = -EIO; 2029 goto out; 2030 } 2031 2032 name_len = btrfs_dir_name_len(eb, di); 2033 name = kmalloc(name_len, GFP_NOFS); 2034 if (!name) { 2035 ret = -ENOMEM; 2036 goto out; 2037 } 2038 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2039 name_len); 2040 log_di = NULL; 2041 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2042 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2043 dir_key->objectid, 2044 name, name_len, 0); 2045 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2046 log_di = btrfs_lookup_dir_index_item(trans, log, 2047 log_path, 2048 dir_key->objectid, 2049 dir_key->offset, 2050 name, name_len, 0); 2051 } 2052 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2053 btrfs_dir_item_key_to_cpu(eb, di, &location); 2054 btrfs_release_path(path); 2055 btrfs_release_path(log_path); 2056 inode = read_one_inode(root, location.objectid); 2057 if (!inode) { 2058 kfree(name); 2059 return -EIO; 2060 } 2061 2062 ret = link_to_fixup_dir(trans, root, 2063 path, location.objectid); 2064 if (ret) { 2065 kfree(name); 2066 iput(inode); 2067 goto out; 2068 } 2069 2070 inc_nlink(inode); 2071 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2072 BTRFS_I(inode), name, name_len); 2073 if (!ret) 2074 ret = btrfs_run_delayed_items(trans, fs_info); 2075 kfree(name); 2076 iput(inode); 2077 if (ret) 2078 goto out; 2079 2080 /* there might still be more names under this key 2081 * check and repeat if required 2082 */ 2083 ret = btrfs_search_slot(NULL, root, dir_key, path, 2084 0, 0); 2085 if (ret == 0) 2086 goto again; 2087 ret = 0; 2088 goto out; 2089 } else if (IS_ERR(log_di)) { 2090 kfree(name); 2091 return PTR_ERR(log_di); 2092 } 2093 btrfs_release_path(log_path); 2094 kfree(name); 2095 2096 ptr = (unsigned long)(di + 1); 2097 ptr += name_len; 2098 } 2099 ret = 0; 2100 out: 2101 btrfs_release_path(path); 2102 btrfs_release_path(log_path); 2103 return ret; 2104 } 2105 2106 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2107 struct btrfs_root *root, 2108 struct btrfs_root *log, 2109 struct btrfs_path *path, 2110 const u64 ino) 2111 { 2112 struct btrfs_fs_info *fs_info = root->fs_info; 2113 struct btrfs_key search_key; 2114 struct btrfs_path *log_path; 2115 int i; 2116 int nritems; 2117 int ret; 2118 2119 log_path = btrfs_alloc_path(); 2120 if (!log_path) 2121 return -ENOMEM; 2122 2123 search_key.objectid = ino; 2124 search_key.type = BTRFS_XATTR_ITEM_KEY; 2125 search_key.offset = 0; 2126 again: 2127 ret = 
btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2128 if (ret < 0) 2129 goto out; 2130 process_leaf: 2131 nritems = btrfs_header_nritems(path->nodes[0]); 2132 for (i = path->slots[0]; i < nritems; i++) { 2133 struct btrfs_key key; 2134 struct btrfs_dir_item *di; 2135 struct btrfs_dir_item *log_di; 2136 u32 total_size; 2137 u32 cur; 2138 2139 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2140 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2141 ret = 0; 2142 goto out; 2143 } 2144 2145 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2146 total_size = btrfs_item_size_nr(path->nodes[0], i); 2147 cur = 0; 2148 while (cur < total_size) { 2149 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2150 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2151 u32 this_len = sizeof(*di) + name_len + data_len; 2152 char *name; 2153 2154 ret = verify_dir_item(fs_info, path->nodes[0], i, di); 2155 if (ret) { 2156 ret = -EIO; 2157 goto out; 2158 } 2159 name = kmalloc(name_len, GFP_NOFS); 2160 if (!name) { 2161 ret = -ENOMEM; 2162 goto out; 2163 } 2164 read_extent_buffer(path->nodes[0], name, 2165 (unsigned long)(di + 1), name_len); 2166 2167 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2168 name, name_len, 0); 2169 btrfs_release_path(log_path); 2170 if (!log_di) { 2171 /* Doesn't exist in log tree, so delete it. */ 2172 btrfs_release_path(path); 2173 di = btrfs_lookup_xattr(trans, root, path, ino, 2174 name, name_len, -1); 2175 kfree(name); 2176 if (IS_ERR(di)) { 2177 ret = PTR_ERR(di); 2178 goto out; 2179 } 2180 ASSERT(di); 2181 ret = btrfs_delete_one_dir_name(trans, root, 2182 path, di); 2183 if (ret) 2184 goto out; 2185 btrfs_release_path(path); 2186 search_key = key; 2187 goto again; 2188 } 2189 kfree(name); 2190 if (IS_ERR(log_di)) { 2191 ret = PTR_ERR(log_di); 2192 goto out; 2193 } 2194 cur += this_len; 2195 di = (struct btrfs_dir_item *)((char *)di + this_len); 2196 } 2197 } 2198 ret = btrfs_next_leaf(root, path); 2199 if (ret > 0) 2200 ret = 0; 2201 else if (ret == 0) 2202 goto process_leaf; 2203 out: 2204 btrfs_free_path(log_path); 2205 btrfs_release_path(path); 2206 return ret; 2207 } 2208 2209 2210 /* 2211 * deletion replay happens before we copy any new directory items 2212 * out of the log or out of backreferences from inodes. It 2213 * scans the log to find ranges of keys that the log is authoritative for, 2214 * and then scans the directory to find items in those ranges that are 2215 * not present in the log. 2216 * 2217 * Anything we don't find in the log is unlinked and removed from the 2218 * directory.
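 *
 * A minimal sketch of the resulting range walk (simplified from
 * replay_dir_deletes() below, error handling omitted):
 *
 *	start = 0;
 *	while (find_dir_range(log, path, dirid, key_type,
 *			      &start, &end) == 0) {
 *		... check each subvolume dir item with an offset in
 *		... [start, end] against the log (check_item_in_log())
 *		if (end == (u64)-1)
 *			break;
 *		start = end + 1;
 *	}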
2219 */ 2220 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2221 struct btrfs_root *root, 2222 struct btrfs_root *log, 2223 struct btrfs_path *path, 2224 u64 dirid, int del_all) 2225 { 2226 u64 range_start; 2227 u64 range_end; 2228 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2229 int ret = 0; 2230 struct btrfs_key dir_key; 2231 struct btrfs_key found_key; 2232 struct btrfs_path *log_path; 2233 struct inode *dir; 2234 2235 dir_key.objectid = dirid; 2236 dir_key.type = BTRFS_DIR_ITEM_KEY; 2237 log_path = btrfs_alloc_path(); 2238 if (!log_path) 2239 return -ENOMEM; 2240 2241 dir = read_one_inode(root, dirid); 2242 /* it isn't an error if the inode isn't there, that can happen 2243 * because we replay the deletes before we copy in the inode item 2244 * from the log 2245 */ 2246 if (!dir) { 2247 btrfs_free_path(log_path); 2248 return 0; 2249 } 2250 again: 2251 range_start = 0; 2252 range_end = 0; 2253 while (1) { 2254 if (del_all) 2255 range_end = (u64)-1; 2256 else { 2257 ret = find_dir_range(log, path, dirid, key_type, 2258 &range_start, &range_end); 2259 if (ret != 0) 2260 break; 2261 } 2262 2263 dir_key.offset = range_start; 2264 while (1) { 2265 int nritems; 2266 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2267 0, 0); 2268 if (ret < 0) 2269 goto out; 2270 2271 nritems = btrfs_header_nritems(path->nodes[0]); 2272 if (path->slots[0] >= nritems) { 2273 ret = btrfs_next_leaf(root, path); 2274 if (ret) 2275 break; 2276 } 2277 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2278 path->slots[0]); 2279 if (found_key.objectid != dirid || 2280 found_key.type != dir_key.type) 2281 goto next_type; 2282 2283 if (found_key.offset > range_end) 2284 break; 2285 2286 ret = check_item_in_log(trans, root, log, path, 2287 log_path, dir, 2288 &found_key); 2289 if (ret) 2290 goto out; 2291 if (found_key.offset == (u64)-1) 2292 break; 2293 dir_key.offset = found_key.offset + 1; 2294 } 2295 btrfs_release_path(path); 2296 if (range_end == (u64)-1) 2297 break; 2298 range_start = range_end + 1; 2299 } 2300 2301 next_type: 2302 ret = 0; 2303 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2304 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2305 dir_key.type = BTRFS_DIR_INDEX_KEY; 2306 btrfs_release_path(path); 2307 goto again; 2308 } 2309 out: 2310 btrfs_release_path(path); 2311 btrfs_free_path(log_path); 2312 iput(dir); 2313 return ret; 2314 } 2315 2316 /* 2317 * the process_func used to replay items from the log tree. This 2318 * gets called in two different stages. The first stage just looks 2319 * for inodes and makes sure they are all copied into the subvolume. 2320 * 2321 * The second stage copies all the other item types from the log into 2322 * the subvolume. The two stage approach is slower, but gets rid of 2323 * lots of complexity around inodes referencing other inodes that exist 2324 * only in the log (references come from either directory items or inode 2325 * back refs). 
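 *
 * Roughly how the recovery code is assumed to drive the stages (a
 * simplified sketch; the real caller also fills in wc.trans and
 * wc.replay_dest, and walks the LOG_WALK_REPLAY_DIR_INDEX stage in
 * between):
 *
 *	struct walk_control wc = {
 *		.process_func = replay_one_buffer,
 *		.stage = LOG_WALK_REPLAY_INODES,
 *	};
 *
 *	ret = walk_log_tree(trans, log, &wc);	... recreate inodes first
 *	wc.stage = LOG_WALK_REPLAY_ALL;
 *	ret = walk_log_tree(trans, log, &wc);	... then copy everything else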
2326 */ 2327 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2328 struct walk_control *wc, u64 gen) 2329 { 2330 int nritems; 2331 struct btrfs_path *path; 2332 struct btrfs_root *root = wc->replay_dest; 2333 struct btrfs_key key; 2334 int level; 2335 int i; 2336 int ret; 2337 2338 ret = btrfs_read_buffer(eb, gen); 2339 if (ret) 2340 return ret; 2341 2342 level = btrfs_header_level(eb); 2343 2344 if (level != 0) 2345 return 0; 2346 2347 path = btrfs_alloc_path(); 2348 if (!path) 2349 return -ENOMEM; 2350 2351 nritems = btrfs_header_nritems(eb); 2352 for (i = 0; i < nritems; i++) { 2353 btrfs_item_key_to_cpu(eb, &key, i); 2354 2355 /* inode keys are done during the first stage */ 2356 if (key.type == BTRFS_INODE_ITEM_KEY && 2357 wc->stage == LOG_WALK_REPLAY_INODES) { 2358 struct btrfs_inode_item *inode_item; 2359 u32 mode; 2360 2361 inode_item = btrfs_item_ptr(eb, i, 2362 struct btrfs_inode_item); 2363 ret = replay_xattr_deletes(wc->trans, root, log, 2364 path, key.objectid); 2365 if (ret) 2366 break; 2367 mode = btrfs_inode_mode(eb, inode_item); 2368 if (S_ISDIR(mode)) { 2369 ret = replay_dir_deletes(wc->trans, 2370 root, log, path, key.objectid, 0); 2371 if (ret) 2372 break; 2373 } 2374 ret = overwrite_item(wc->trans, root, path, 2375 eb, i, &key); 2376 if (ret) 2377 break; 2378 2379 /* for regular files, make sure the corresponding 2380 * orphan item exists. Extents past the new EOF 2381 * will be truncated later by orphan cleanup. 2382 */ 2383 if (S_ISREG(mode)) { 2384 ret = insert_orphan_item(wc->trans, root, 2385 key.objectid); 2386 if (ret) 2387 break; 2388 } 2389 2390 ret = link_to_fixup_dir(wc->trans, root, 2391 path, key.objectid); 2392 if (ret) 2393 break; 2394 } 2395 2396 if (key.type == BTRFS_DIR_INDEX_KEY && 2397 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2398 ret = replay_one_dir_item(wc->trans, root, path, 2399 eb, i, &key); 2400 if (ret) 2401 break; 2402 } 2403 2404 if (wc->stage < LOG_WALK_REPLAY_ALL) 2405 continue; 2406 2407 /* these keys are simply copied */ 2408 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2409 ret = overwrite_item(wc->trans, root, path, 2410 eb, i, &key); 2411 if (ret) 2412 break; 2413 } else if (key.type == BTRFS_INODE_REF_KEY || 2414 key.type == BTRFS_INODE_EXTREF_KEY) { 2415 ret = add_inode_ref(wc->trans, root, log, path, 2416 eb, i, &key); 2417 if (ret && ret != -ENOENT) 2418 break; 2419 ret = 0; 2420 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2421 ret = replay_one_extent(wc->trans, root, path, 2422 eb, i, &key); 2423 if (ret) 2424 break; 2425 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2426 ret = replay_one_dir_item(wc->trans, root, path, 2427 eb, i, &key); 2428 if (ret) 2429 break; 2430 } 2431 } 2432 btrfs_free_path(path); 2433 return ret; 2434 } 2435 2436 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2437 struct btrfs_root *root, 2438 struct btrfs_path *path, int *level, 2439 struct walk_control *wc) 2440 { 2441 struct btrfs_fs_info *fs_info = root->fs_info; 2442 u64 root_owner; 2443 u64 bytenr; 2444 u64 ptr_gen; 2445 struct extent_buffer *next; 2446 struct extent_buffer *cur; 2447 struct extent_buffer *parent; 2448 u32 blocksize; 2449 int ret = 0; 2450 2451 WARN_ON(*level < 0); 2452 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2453 2454 while (*level > 0) { 2455 WARN_ON(*level < 0); 2456 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2457 cur = path->nodes[*level]; 2458 2459 WARN_ON(btrfs_header_level(cur) != *level); 2460 2461 if (path->slots[*level] >= 2462 btrfs_header_nritems(cur)) 2463 break; 2464 2465
bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2466 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2467 blocksize = fs_info->nodesize; 2468 2469 parent = path->nodes[*level]; 2470 root_owner = btrfs_header_owner(parent); 2471 2472 next = btrfs_find_create_tree_block(fs_info, bytenr); 2473 if (IS_ERR(next)) 2474 return PTR_ERR(next); 2475 2476 if (*level == 1) { 2477 ret = wc->process_func(root, next, wc, ptr_gen); 2478 if (ret) { 2479 free_extent_buffer(next); 2480 return ret; 2481 } 2482 2483 path->slots[*level]++; 2484 if (wc->free) { 2485 ret = btrfs_read_buffer(next, ptr_gen); 2486 if (ret) { 2487 free_extent_buffer(next); 2488 return ret; 2489 } 2490 2491 if (trans) { 2492 btrfs_tree_lock(next); 2493 btrfs_set_lock_blocking(next); 2494 clean_tree_block(fs_info, next); 2495 btrfs_wait_tree_block_writeback(next); 2496 btrfs_tree_unlock(next); 2497 } 2498 2499 WARN_ON(root_owner != 2500 BTRFS_TREE_LOG_OBJECTID); 2501 ret = btrfs_free_and_pin_reserved_extent( 2502 fs_info, bytenr, 2503 blocksize); 2504 if (ret) { 2505 free_extent_buffer(next); 2506 return ret; 2507 } 2508 } 2509 free_extent_buffer(next); 2510 continue; 2511 } 2512 ret = btrfs_read_buffer(next, ptr_gen); 2513 if (ret) { 2514 free_extent_buffer(next); 2515 return ret; 2516 } 2517 2518 WARN_ON(*level <= 0); 2519 if (path->nodes[*level-1]) 2520 free_extent_buffer(path->nodes[*level-1]); 2521 path->nodes[*level-1] = next; 2522 *level = btrfs_header_level(next); 2523 path->slots[*level] = 0; 2524 cond_resched(); 2525 } 2526 WARN_ON(*level < 0); 2527 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2528 2529 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2530 2531 cond_resched(); 2532 return 0; 2533 } 2534 2535 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2536 struct btrfs_root *root, 2537 struct btrfs_path *path, int *level, 2538 struct walk_control *wc) 2539 { 2540 struct btrfs_fs_info *fs_info = root->fs_info; 2541 u64 root_owner; 2542 int i; 2543 int slot; 2544 int ret; 2545 2546 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2547 slot = path->slots[i]; 2548 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2549 path->slots[i]++; 2550 *level = i; 2551 WARN_ON(*level == 0); 2552 return 0; 2553 } else { 2554 struct extent_buffer *parent; 2555 if (path->nodes[*level] == root->node) 2556 parent = path->nodes[*level]; 2557 else 2558 parent = path->nodes[*level + 1]; 2559 2560 root_owner = btrfs_header_owner(parent); 2561 ret = wc->process_func(root, path->nodes[*level], wc, 2562 btrfs_header_generation(path->nodes[*level])); 2563 if (ret) 2564 return ret; 2565 2566 if (wc->free) { 2567 struct extent_buffer *next; 2568 2569 next = path->nodes[*level]; 2570 2571 if (trans) { 2572 btrfs_tree_lock(next); 2573 btrfs_set_lock_blocking(next); 2574 clean_tree_block(fs_info, next); 2575 btrfs_wait_tree_block_writeback(next); 2576 btrfs_tree_unlock(next); 2577 } 2578 2579 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2580 ret = btrfs_free_and_pin_reserved_extent( 2581 fs_info, 2582 path->nodes[*level]->start, 2583 path->nodes[*level]->len); 2584 if (ret) 2585 return ret; 2586 } 2587 free_extent_buffer(path->nodes[*level]); 2588 path->nodes[*level] = NULL; 2589 *level = i + 1; 2590 } 2591 } 2592 return 1; 2593 } 2594 2595 /* 2596 * drop the reference count on the tree rooted at 'log'. This traverses 2597 * the tree freeing any blocks that have a ref count of zero after being 2598 * decremented.
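 *
 * free_log_tree() below drives this freeing walk with:
 *
 *	struct walk_control wc = {
 *		.free = 1,
 *		.process_func = process_one_buffer
 *	};
 *
 *	ret = walk_log_tree(trans, log, &wc);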
2599 */ 2600 static int walk_log_tree(struct btrfs_trans_handle *trans, 2601 struct btrfs_root *log, struct walk_control *wc) 2602 { 2603 struct btrfs_fs_info *fs_info = log->fs_info; 2604 int ret = 0; 2605 int wret; 2606 int level; 2607 struct btrfs_path *path; 2608 int orig_level; 2609 2610 path = btrfs_alloc_path(); 2611 if (!path) 2612 return -ENOMEM; 2613 2614 level = btrfs_header_level(log->node); 2615 orig_level = level; 2616 path->nodes[level] = log->node; 2617 extent_buffer_get(log->node); 2618 path->slots[level] = 0; 2619 2620 while (1) { 2621 wret = walk_down_log_tree(trans, log, path, &level, wc); 2622 if (wret > 0) 2623 break; 2624 if (wret < 0) { 2625 ret = wret; 2626 goto out; 2627 } 2628 2629 wret = walk_up_log_tree(trans, log, path, &level, wc); 2630 if (wret > 0) 2631 break; 2632 if (wret < 0) { 2633 ret = wret; 2634 goto out; 2635 } 2636 } 2637 2638 /* was the root node processed? if not, catch it here */ 2639 if (path->nodes[orig_level]) { 2640 ret = wc->process_func(log, path->nodes[orig_level], wc, 2641 btrfs_header_generation(path->nodes[orig_level])); 2642 if (ret) 2643 goto out; 2644 if (wc->free) { 2645 struct extent_buffer *next; 2646 2647 next = path->nodes[orig_level]; 2648 2649 if (trans) { 2650 btrfs_tree_lock(next); 2651 btrfs_set_lock_blocking(next); 2652 clean_tree_block(fs_info, next); 2653 btrfs_wait_tree_block_writeback(next); 2654 btrfs_tree_unlock(next); 2655 } 2656 2657 WARN_ON(log->root_key.objectid != 2658 BTRFS_TREE_LOG_OBJECTID); 2659 ret = btrfs_free_and_pin_reserved_extent(fs_info, 2660 next->start, next->len); 2661 if (ret) 2662 goto out; 2663 } 2664 } 2665 2666 out: 2667 btrfs_free_path(path); 2668 return ret; 2669 } 2670 2671 /* 2672 * helper function to update the item for a given subvolume's log root 2673 * in the tree of log roots 2674 */ 2675 static int update_log_root(struct btrfs_trans_handle *trans, 2676 struct btrfs_root *log) 2677 { 2678 struct btrfs_fs_info *fs_info = log->fs_info; 2679 int ret; 2680 2681 if (log->log_transid == 1) { 2682 /* insert root item on the first sync */ 2683 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2684 &log->root_key, &log->root_item); 2685 } else { 2686 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2687 &log->root_key, &log->root_item); 2688 } 2689 return ret; 2690 } 2691 2692 static void wait_log_commit(struct btrfs_root *root, int transid) 2693 { 2694 DEFINE_WAIT(wait); 2695 int index = transid % 2; 2696 2697 /* 2698 * we only allow two pending log transactions at a time, 2699 * so we know that if ours is more than 2 older than the 2700 * current transaction, we're done 2701 */ 2702 do { 2703 prepare_to_wait(&root->log_commit_wait[index], 2704 &wait, TASK_UNINTERRUPTIBLE); 2705 mutex_unlock(&root->log_mutex); 2706 2707 if (root->log_transid_committed < transid && 2708 atomic_read(&root->log_commit[index])) 2709 schedule(); 2710 2711 finish_wait(&root->log_commit_wait[index], &wait); 2712 mutex_lock(&root->log_mutex); 2713 } while (root->log_transid_committed < transid && 2714 atomic_read(&root->log_commit[index])); 2715 } 2716 2717 static void wait_for_writer(struct btrfs_root *root) 2718 { 2719 DEFINE_WAIT(wait); 2720 2721 while (atomic_read(&root->log_writers)) { 2722 prepare_to_wait(&root->log_writer_wait, 2723 &wait, TASK_UNINTERRUPTIBLE); 2724 mutex_unlock(&root->log_mutex); 2725 if (atomic_read(&root->log_writers)) 2726 schedule(); 2727 finish_wait(&root->log_writer_wait, &wait); 2728 mutex_lock(&root->log_mutex); 2729 } 2730 } 2731 2732 static inline void
btrfs_remove_log_ctx(struct btrfs_root *root, 2733 struct btrfs_log_ctx *ctx) 2734 { 2735 if (!ctx) 2736 return; 2737 2738 mutex_lock(&root->log_mutex); 2739 list_del_init(&ctx->list); 2740 mutex_unlock(&root->log_mutex); 2741 } 2742 2743 /* 2744 * Invoked in log mutex context, or otherwise make sure there is no other task 2745 * which can access the list. 2746 */ 2747 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2748 int index, int error) 2749 { 2750 struct btrfs_log_ctx *ctx; 2751 struct btrfs_log_ctx *safe; 2752 2753 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2754 list_del_init(&ctx->list); 2755 ctx->log_ret = error; 2756 } 2757 2758 INIT_LIST_HEAD(&root->log_ctxs[index]); 2759 } 2760 2761 /* 2762 * btrfs_sync_log sends a given tree log down to the disk and 2763 * updates the super blocks to record it. When this call is done, 2764 * you know that any inodes previously logged are safely on disk only 2765 * if it returns 0. 2766 * 2767 * Any other return value means you need to call btrfs_commit_transaction. 2768 * Some of the edge cases for fsyncing directories that have had unlinks 2769 * or renames done in the past mean that sometimes the only safe 2770 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2771 * that has happened. 2772 */ 2773 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2774 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2775 { 2776 int index1; 2777 int index2; 2778 int mark; 2779 int ret; 2780 struct btrfs_fs_info *fs_info = root->fs_info; 2781 struct btrfs_root *log = root->log_root; 2782 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2783 int log_transid = 0; 2784 struct btrfs_log_ctx root_log_ctx; 2785 struct blk_plug plug; 2786 2787 mutex_lock(&root->log_mutex); 2788 log_transid = ctx->log_transid; 2789 if (root->log_transid_committed >= log_transid) { 2790 mutex_unlock(&root->log_mutex); 2791 return ctx->log_ret; 2792 } 2793 2794 index1 = log_transid % 2; 2795 if (atomic_read(&root->log_commit[index1])) { 2796 wait_log_commit(root, log_transid); 2797 mutex_unlock(&root->log_mutex); 2798 return ctx->log_ret; 2799 } 2800 ASSERT(log_transid == root->log_transid); 2801 atomic_set(&root->log_commit[index1], 1); 2802 2803 /* wait for previous tree log sync to complete */ 2804 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2805 wait_log_commit(root, log_transid - 1); 2806 2807 while (1) { 2808 int batch = atomic_read(&root->log_batch); 2809 /* when we're on an ssd, just kick the log commit out */ 2810 if (!btrfs_test_opt(fs_info, SSD) && 2811 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2812 mutex_unlock(&root->log_mutex); 2813 schedule_timeout_uninterruptible(1); 2814 mutex_lock(&root->log_mutex); 2815 } 2816 wait_for_writer(root); 2817 if (batch == atomic_read(&root->log_batch)) 2818 break; 2819 } 2820 2821 /* bail out if we need to do a full commit */ 2822 if (btrfs_need_log_full_commit(fs_info, trans)) { 2823 ret = -EAGAIN; 2824 btrfs_free_logged_extents(log, log_transid); 2825 mutex_unlock(&root->log_mutex); 2826 goto out; 2827 } 2828 2829 if (log_transid % 2 == 0) 2830 mark = EXTENT_DIRTY; 2831 else 2832 mark = EXTENT_NEW; 2833 2834 /* we start IO on all the marked extents here, but we don't actually 2835 * wait for them until later.
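 *
 * The matching wait happens further down, once the log root tree has
 * been updated as well; roughly:
 *
 *	blk_start_plug(&plug);
 *	btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
 *	...
 *	blk_finish_plug(&plug);
 *	btrfs_wait_tree_log_extents(log, mark);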
2836 */ 2837 blk_start_plug(&plug); 2838 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2839 if (ret) { 2840 blk_finish_plug(&plug); 2841 btrfs_abort_transaction(trans, ret); 2842 btrfs_free_logged_extents(log, log_transid); 2843 btrfs_set_log_full_commit(fs_info, trans); 2844 mutex_unlock(&root->log_mutex); 2845 goto out; 2846 } 2847 2848 btrfs_set_root_node(&log->root_item, log->node); 2849 2850 root->log_transid++; 2851 log->log_transid = root->log_transid; 2852 root->log_start_pid = 0; 2853 /* 2854 * IO has been started, blocks of the log tree have WRITTEN flag set 2855 * in their headers. new modifications of the log will be written to 2856 * new positions. so it's safe to allow log writers to go in. 2857 */ 2858 mutex_unlock(&root->log_mutex); 2859 2860 btrfs_init_log_ctx(&root_log_ctx, NULL); 2861 2862 mutex_lock(&log_root_tree->log_mutex); 2863 atomic_inc(&log_root_tree->log_batch); 2864 atomic_inc(&log_root_tree->log_writers); 2865 2866 index2 = log_root_tree->log_transid % 2; 2867 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2868 root_log_ctx.log_transid = log_root_tree->log_transid; 2869 2870 mutex_unlock(&log_root_tree->log_mutex); 2871 2872 ret = update_log_root(trans, log); 2873 2874 mutex_lock(&log_root_tree->log_mutex); 2875 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2876 /* 2877 * Implicit memory barrier after atomic_dec_and_test 2878 */ 2879 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2880 wake_up(&log_root_tree->log_writer_wait); 2881 } 2882 2883 if (ret) { 2884 if (!list_empty(&root_log_ctx.list)) 2885 list_del_init(&root_log_ctx.list); 2886 2887 blk_finish_plug(&plug); 2888 btrfs_set_log_full_commit(fs_info, trans); 2889 2890 if (ret != -ENOSPC) { 2891 btrfs_abort_transaction(trans, ret); 2892 mutex_unlock(&log_root_tree->log_mutex); 2893 goto out; 2894 } 2895 btrfs_wait_tree_log_extents(log, mark); 2896 btrfs_free_logged_extents(log, log_transid); 2897 mutex_unlock(&log_root_tree->log_mutex); 2898 ret = -EAGAIN; 2899 goto out; 2900 } 2901 2902 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2903 blk_finish_plug(&plug); 2904 list_del_init(&root_log_ctx.list); 2905 mutex_unlock(&log_root_tree->log_mutex); 2906 ret = root_log_ctx.log_ret; 2907 goto out; 2908 } 2909 2910 index2 = root_log_ctx.log_transid % 2; 2911 if (atomic_read(&log_root_tree->log_commit[index2])) { 2912 blk_finish_plug(&plug); 2913 ret = btrfs_wait_tree_log_extents(log, mark); 2914 btrfs_wait_logged_extents(trans, log, log_transid); 2915 wait_log_commit(log_root_tree, 2916 root_log_ctx.log_transid); 2917 mutex_unlock(&log_root_tree->log_mutex); 2918 if (!ret) 2919 ret = root_log_ctx.log_ret; 2920 goto out; 2921 } 2922 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2923 atomic_set(&log_root_tree->log_commit[index2], 1); 2924 2925 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2926 wait_log_commit(log_root_tree, 2927 root_log_ctx.log_transid - 1); 2928 } 2929 2930 wait_for_writer(log_root_tree); 2931 2932 /* 2933 * now that we've moved on to the tree of log tree roots, 2934 * check the full commit flag again 2935 */ 2936 if (btrfs_need_log_full_commit(fs_info, trans)) { 2937 blk_finish_plug(&plug); 2938 btrfs_wait_tree_log_extents(log, mark); 2939 btrfs_free_logged_extents(log, log_transid); 2940 mutex_unlock(&log_root_tree->log_mutex); 2941 ret = -EAGAIN; 2942 goto out_wake_log_root; 2943 } 2944 2945 ret = btrfs_write_marked_extents(fs_info, 2946 
&log_root_tree->dirty_log_pages, 2947 EXTENT_DIRTY | EXTENT_NEW); 2948 blk_finish_plug(&plug); 2949 if (ret) { 2950 btrfs_set_log_full_commit(fs_info, trans); 2951 btrfs_abort_transaction(trans, ret); 2952 btrfs_free_logged_extents(log, log_transid); 2953 mutex_unlock(&log_root_tree->log_mutex); 2954 goto out_wake_log_root; 2955 } 2956 ret = btrfs_wait_tree_log_extents(log, mark); 2957 if (!ret) 2958 ret = btrfs_wait_tree_log_extents(log_root_tree, 2959 EXTENT_NEW | EXTENT_DIRTY); 2960 if (ret) { 2961 btrfs_set_log_full_commit(fs_info, trans); 2962 btrfs_free_logged_extents(log, log_transid); 2963 mutex_unlock(&log_root_tree->log_mutex); 2964 goto out_wake_log_root; 2965 } 2966 btrfs_wait_logged_extents(trans, log, log_transid); 2967 2968 btrfs_set_super_log_root(fs_info->super_for_commit, 2969 log_root_tree->node->start); 2970 btrfs_set_super_log_root_level(fs_info->super_for_commit, 2971 btrfs_header_level(log_root_tree->node)); 2972 2973 log_root_tree->log_transid++; 2974 mutex_unlock(&log_root_tree->log_mutex); 2975 2976 /* 2977 * nobody else is going to jump in and write the ctree 2978 * super here because the log_commit atomic below is protecting 2979 * us. We must be called with a transaction handle pinning 2980 * the running transaction open, so a full commit can't hop 2981 * in and cause problems either. 2982 */ 2983 ret = write_all_supers(fs_info, 1); 2984 if (ret) { 2985 btrfs_set_log_full_commit(fs_info, trans); 2986 btrfs_abort_transaction(trans, ret); 2987 goto out_wake_log_root; 2988 } 2989 2990 mutex_lock(&root->log_mutex); 2991 if (root->last_log_commit < log_transid) 2992 root->last_log_commit = log_transid; 2993 mutex_unlock(&root->log_mutex); 2994 2995 out_wake_log_root: 2996 mutex_lock(&log_root_tree->log_mutex); 2997 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 2998 2999 log_root_tree->log_transid_committed++; 3000 atomic_set(&log_root_tree->log_commit[index2], 0); 3001 mutex_unlock(&log_root_tree->log_mutex); 3002 3003 /* 3004 * The barrier before waitqueue_active is implied by mutex_unlock 3005 */ 3006 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 3007 wake_up(&log_root_tree->log_commit_wait[index2]); 3008 out: 3009 mutex_lock(&root->log_mutex); 3010 btrfs_remove_all_log_ctxs(root, index1, ret); 3011 root->log_transid_committed++; 3012 atomic_set(&root->log_commit[index1], 0); 3013 mutex_unlock(&root->log_mutex); 3014 3015 /* 3016 * The barrier before waitqueue_active is implied by mutex_unlock 3017 */ 3018 if (waitqueue_active(&root->log_commit_wait[index1])) 3019 wake_up(&root->log_commit_wait[index1]); 3020 return ret; 3021 } 3022 3023 static void free_log_tree(struct btrfs_trans_handle *trans, 3024 struct btrfs_root *log) 3025 { 3026 int ret; 3027 u64 start; 3028 u64 end; 3029 struct walk_control wc = { 3030 .free = 1, 3031 .process_func = process_one_buffer 3032 }; 3033 3034 ret = walk_log_tree(trans, log, &wc); 3035 /* I don't think this can happen but just in case */ 3036 if (ret) 3037 btrfs_abort_transaction(trans, ret); 3038 3039 while (1) { 3040 ret = find_first_extent_bit(&log->dirty_log_pages, 3041 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 3042 NULL); 3043 if (ret) 3044 break; 3045 3046 clear_extent_bits(&log->dirty_log_pages, start, end, 3047 EXTENT_DIRTY | EXTENT_NEW); 3048 } 3049 3050 /* 3051 * We may have short-circuited the log tree with the full commit logic 3052 * and left ordered extents on our list, so clear these out to keep us 3053 * from leaking inodes and memory.
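 *
 * Note that both log transid slots are flushed below
 * (btrfs_free_logged_extents() with transid 0 and then 1), matching the
 * two log transactions that may be pending at any given time.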
3054 */ 3055 btrfs_free_logged_extents(log, 0); 3056 btrfs_free_logged_extents(log, 1); 3057 3058 free_extent_buffer(log->node); 3059 kfree(log); 3060 } 3061 3062 /* 3063 * free all the extents used by the tree log. This should be called 3064 * at commit time of the full transaction 3065 */ 3066 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3067 { 3068 if (root->log_root) { 3069 free_log_tree(trans, root->log_root); 3070 root->log_root = NULL; 3071 } 3072 return 0; 3073 } 3074 3075 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3076 struct btrfs_fs_info *fs_info) 3077 { 3078 if (fs_info->log_root_tree) { 3079 free_log_tree(trans, fs_info->log_root_tree); 3080 fs_info->log_root_tree = NULL; 3081 } 3082 return 0; 3083 } 3084 3085 /* 3086 * If both a file and directory are logged, and unlinks or renames are 3087 * mixed in, we have a few interesting corners: 3088 * 3089 * create file X in dir Y 3090 * link file X to X.link in dir Y 3091 * fsync file X 3092 * unlink file X but leave X.link 3093 * fsync dir Y 3094 * 3095 * After a crash we would expect only X.link to exist. But file X 3096 * didn't get fsync'd again so the log has back refs for X and X.link. 3097 * 3098 * We solve this by removing directory entries and inode backrefs from the 3099 * log when a file that was logged in the current transaction is 3100 * unlinked. Any later fsync will include the updated log entries, and 3101 * we'll be able to reconstruct the proper directory items from backrefs. 3102 * 3103 * This optimization allows us to avoid relogging the entire inode 3104 * or the entire directory. 3105 */ 3106 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3107 struct btrfs_root *root, 3108 const char *name, int name_len, 3109 struct btrfs_inode *dir, u64 index) 3110 { 3111 struct btrfs_root *log; 3112 struct btrfs_dir_item *di; 3113 struct btrfs_path *path; 3114 int ret; 3115 int err = 0; 3116 int bytes_del = 0; 3117 u64 dir_ino = btrfs_ino(dir); 3118 3119 if (dir->logged_trans < trans->transid) 3120 return 0; 3121 3122 ret = join_running_log_trans(root); 3123 if (ret) 3124 return 0; 3125 3126 mutex_lock(&dir->log_mutex); 3127 3128 log = root->log_root; 3129 path = btrfs_alloc_path(); 3130 if (!path) { 3131 err = -ENOMEM; 3132 goto out_unlock; 3133 } 3134 3135 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3136 name, name_len, -1); 3137 if (IS_ERR(di)) { 3138 err = PTR_ERR(di); 3139 goto fail; 3140 } 3141 if (di) { 3142 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3143 bytes_del += name_len; 3144 if (ret) { 3145 err = ret; 3146 goto fail; 3147 } 3148 } 3149 btrfs_release_path(path); 3150 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3151 index, name, name_len, -1); 3152 if (IS_ERR(di)) { 3153 err = PTR_ERR(di); 3154 goto fail; 3155 } 3156 if (di) { 3157 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3158 bytes_del += name_len; 3159 if (ret) { 3160 err = ret; 3161 goto fail; 3162 } 3163 } 3164 3165 /* update the directory size in the log to reflect the names 3166 * we have removed 3167 */ 3168 if (bytes_del) { 3169 struct btrfs_key key; 3170 3171 key.objectid = dir_ino; 3172 key.offset = 0; 3173 key.type = BTRFS_INODE_ITEM_KEY; 3174 btrfs_release_path(path); 3175 3176 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3177 if (ret < 0) { 3178 err = ret; 3179 goto fail; 3180 } 3181 if (ret == 0) { 3182 struct btrfs_inode_item *item; 3183 u64 i_size; 3184 3185 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3186 struct btrfs_inode_item); 3187 i_size = btrfs_inode_size(path->nodes[0], item); 3188 if (i_size > bytes_del) 3189 i_size -= bytes_del; 3190 else 3191 i_size = 0; 3192 btrfs_set_inode_size(path->nodes[0], item, i_size); 3193 btrfs_mark_buffer_dirty(path->nodes[0]); 3194 } else 3195 ret = 0; 3196 btrfs_release_path(path); 3197 } 3198 fail: 3199 btrfs_free_path(path); 3200 out_unlock: 3201 mutex_unlock(&dir->log_mutex); 3202 if (ret == -ENOSPC) { 3203 btrfs_set_log_full_commit(root->fs_info, trans); 3204 ret = 0; 3205 } else if (ret < 0) 3206 btrfs_abort_transaction(trans, ret); 3207 3208 btrfs_end_log_trans(root); 3209 3210 return err; 3211 } 3212 3213 /* see comments for btrfs_del_dir_entries_in_log */ 3214 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3215 struct btrfs_root *root, 3216 const char *name, int name_len, 3217 struct btrfs_inode *inode, u64 dirid) 3218 { 3219 struct btrfs_fs_info *fs_info = root->fs_info; 3220 struct btrfs_root *log; 3221 u64 index; 3222 int ret; 3223 3224 if (inode->logged_trans < trans->transid) 3225 return 0; 3226 3227 ret = join_running_log_trans(root); 3228 if (ret) 3229 return 0; 3230 log = root->log_root; 3231 mutex_lock(&inode->log_mutex); 3232 3233 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3234 dirid, &index); 3235 mutex_unlock(&inode->log_mutex); 3236 if (ret == -ENOSPC) { 3237 btrfs_set_log_full_commit(fs_info, trans); 3238 ret = 0; 3239 } else if (ret < 0 && ret != -ENOENT) 3240 btrfs_abort_transaction(trans, ret); 3241 btrfs_end_log_trans(root); 3242 3243 return ret; 3244 } 3245 3246 /* 3247 * creates a range item in the log for 'dirid'. first_offset and 3248 * last_offset tell us which parts of the key space the log should 3249 * be considered authoritative for. 3250 */ 3251 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3252 struct btrfs_root *log, 3253 struct btrfs_path *path, 3254 int key_type, u64 dirid, 3255 u64 first_offset, u64 last_offset) 3256 { 3257 int ret; 3258 struct btrfs_key key; 3259 struct btrfs_dir_log_item *item; 3260 3261 key.objectid = dirid; 3262 key.offset = first_offset; 3263 if (key_type == BTRFS_DIR_ITEM_KEY) 3264 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3265 else 3266 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3267 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3268 if (ret) 3269 return ret; 3270 3271 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3272 struct btrfs_dir_log_item); 3273 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3274 btrfs_mark_buffer_dirty(path->nodes[0]); 3275 btrfs_release_path(path); 3276 return 0; 3277 } 3278 3279 /* 3280 * log all the items included in the current transaction for a given 3281 * directory. 
This also creates the range items in the log tree required 3282 * to replay anything deleted before the fsync 3283 */ 3284 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3285 struct btrfs_root *root, struct btrfs_inode *inode, 3286 struct btrfs_path *path, 3287 struct btrfs_path *dst_path, int key_type, 3288 struct btrfs_log_ctx *ctx, 3289 u64 min_offset, u64 *last_offset_ret) 3290 { 3291 struct btrfs_key min_key; 3292 struct btrfs_root *log = root->log_root; 3293 struct extent_buffer *src; 3294 int err = 0; 3295 int ret; 3296 int i; 3297 int nritems; 3298 u64 first_offset = min_offset; 3299 u64 last_offset = (u64)-1; 3300 u64 ino = btrfs_ino(inode); 3301 3302 log = root->log_root; 3303 3304 min_key.objectid = ino; 3305 min_key.type = key_type; 3306 min_key.offset = min_offset; 3307 3308 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3309 3310 /* 3311 * we didn't find anything from this transaction, see if there 3312 * is anything at all 3313 */ 3314 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3315 min_key.objectid = ino; 3316 min_key.type = key_type; 3317 min_key.offset = (u64)-1; 3318 btrfs_release_path(path); 3319 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3320 if (ret < 0) { 3321 btrfs_release_path(path); 3322 return ret; 3323 } 3324 ret = btrfs_previous_item(root, path, ino, key_type); 3325 3326 /* if ret == 0 there are items for this type, 3327 * create a range to tell us the last key of this type. 3328 * otherwise, there are no items in this directory after 3329 * *min_offset, and we create a range to indicate that. 3330 */ 3331 if (ret == 0) { 3332 struct btrfs_key tmp; 3333 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3334 path->slots[0]); 3335 if (key_type == tmp.type) 3336 first_offset = max(min_offset, tmp.offset) + 1; 3337 } 3338 goto done; 3339 } 3340 3341 /* go backward to find any previous key */ 3342 ret = btrfs_previous_item(root, path, ino, key_type); 3343 if (ret == 0) { 3344 struct btrfs_key tmp; 3345 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3346 if (key_type == tmp.type) { 3347 first_offset = tmp.offset; 3348 ret = overwrite_item(trans, log, dst_path, 3349 path->nodes[0], path->slots[0], 3350 &tmp); 3351 if (ret) { 3352 err = ret; 3353 goto done; 3354 } 3355 } 3356 } 3357 btrfs_release_path(path); 3358 3359 /* find the first key from this transaction again */ 3360 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3361 if (WARN_ON(ret != 0)) 3362 goto done; 3363 3364 /* 3365 * we have a block from this transaction, log every item in it 3366 * from our directory 3367 */ 3368 while (1) { 3369 struct btrfs_key tmp; 3370 src = path->nodes[0]; 3371 nritems = btrfs_header_nritems(src); 3372 for (i = path->slots[0]; i < nritems; i++) { 3373 struct btrfs_dir_item *di; 3374 3375 btrfs_item_key_to_cpu(src, &min_key, i); 3376 3377 if (min_key.objectid != ino || min_key.type != key_type) 3378 goto done; 3379 ret = overwrite_item(trans, log, dst_path, src, i, 3380 &min_key); 3381 if (ret) { 3382 err = ret; 3383 goto done; 3384 } 3385 3386 /* 3387 * We must make sure that when we log a directory entry, 3388 * the corresponding inode, after log replay, has a 3389 * matching link count. 
For example: 3390 * 3391 * touch foo 3392 * mkdir mydir 3393 * sync 3394 * ln foo mydir/bar 3395 * xfs_io -c "fsync" mydir 3396 * <crash> 3397 * <mount fs and log replay> 3398 * 3399 * Would result in an fsync log that, when replayed, leaves our 3400 * file inode with a link count of 1, but we get 3401 * two directory entries pointing to the same inode. 3402 * After removing one of the names, it would not be 3403 * possible to remove the other name, which would always 3404 * result in stale file handle errors, and it would not 3405 * be possible to rmdir the parent directory, since 3406 * its i_size could never decrement to the value 3407 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 3408 */ 3409 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3410 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3411 if (ctx && 3412 (btrfs_dir_transid(src, di) == trans->transid || 3413 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3414 tmp.type != BTRFS_ROOT_ITEM_KEY) 3415 ctx->log_new_dentries = true; 3416 } 3417 path->slots[0] = nritems; 3418 3419 /* 3420 * look ahead to the next item and see if it is also 3421 * from this directory and from this transaction 3422 */ 3423 ret = btrfs_next_leaf(root, path); 3424 if (ret == 1) { 3425 last_offset = (u64)-1; 3426 goto done; 3427 } 3428 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3429 if (tmp.objectid != ino || tmp.type != key_type) { 3430 last_offset = (u64)-1; 3431 goto done; 3432 } 3433 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3434 ret = overwrite_item(trans, log, dst_path, 3435 path->nodes[0], path->slots[0], 3436 &tmp); 3437 if (ret) 3438 err = ret; 3439 else 3440 last_offset = tmp.offset; 3441 goto done; 3442 } 3443 } 3444 done: 3445 btrfs_release_path(path); 3446 btrfs_release_path(dst_path); 3447 3448 if (err == 0) { 3449 *last_offset_ret = last_offset; 3450 /* 3451 * insert the log range keys to indicate where the log 3452 * is valid 3453 */ 3454 ret = insert_dir_log_key(trans, log, path, key_type, 3455 ino, first_offset, last_offset); 3456 if (ret) 3457 err = ret; 3458 } 3459 return err; 3460 } 3461 3462 /* 3463 * logging directories is very similar to logging inodes. We find all the items 3464 * from the current transaction and write them to the log. 3465 * 3466 * The recovery code scans the directory in the subvolume, and if it finds a 3467 * key in the range logged that is not present in the log tree, then it means 3468 * that dir entry was unlinked during the transaction. 3469 * 3470 * In order for that scan to work, we must include one key smaller than 3471 * the smallest logged by this transaction and one key larger than the largest 3472 * key logged by this transaction. 3473 */ 3474 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3475 struct btrfs_root *root, struct btrfs_inode *inode, 3476 struct btrfs_path *path, 3477 struct btrfs_path *dst_path, 3478 struct btrfs_log_ctx *ctx) 3479 { 3480 u64 min_key; 3481 u64 max_key; 3482 int ret; 3483 int key_type = BTRFS_DIR_ITEM_KEY; 3484 3485 again: 3486 min_key = 0; 3487 max_key = 0; 3488 while (1) { 3489 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3490 ctx, min_key, &max_key); 3491 if (ret) 3492 return ret; 3493 if (max_key == (u64)-1) 3494 break; 3495 min_key = max_key + 1; 3496 } 3497 3498 if (key_type == BTRFS_DIR_ITEM_KEY) { 3499 key_type = BTRFS_DIR_INDEX_KEY; 3500 goto again; 3501 } 3502 return 0; 3503 } 3504 3505 /* 3506 * a helper function to drop items from the log before we relog an 3507 * inode.
max_key_type indicates the highest item type to remove. 3508 * This cannot be run for file data extents because it does not 3509 * free the extents they point to. 3510 */ 3511 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3512 struct btrfs_root *log, 3513 struct btrfs_path *path, 3514 u64 objectid, int max_key_type) 3515 { 3516 int ret; 3517 struct btrfs_key key; 3518 struct btrfs_key found_key; 3519 int start_slot; 3520 3521 key.objectid = objectid; 3522 key.type = max_key_type; 3523 key.offset = (u64)-1; 3524 3525 while (1) { 3526 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3527 BUG_ON(ret == 0); /* Logic error */ 3528 if (ret < 0) 3529 break; 3530 3531 if (path->slots[0] == 0) 3532 break; 3533 3534 path->slots[0]--; 3535 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3536 path->slots[0]); 3537 3538 if (found_key.objectid != objectid) 3539 break; 3540 3541 found_key.offset = 0; 3542 found_key.type = 0; 3543 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3544 &start_slot); 3545 3546 ret = btrfs_del_items(trans, log, path, start_slot, 3547 path->slots[0] - start_slot + 1); 3548 /* 3549 * If start slot isn't 0 then we don't need to re-search, we've 3550 * found the last guy with the objectid in this tree. 3551 */ 3552 if (ret || start_slot != 0) 3553 break; 3554 btrfs_release_path(path); 3555 } 3556 btrfs_release_path(path); 3557 if (ret > 0) 3558 ret = 0; 3559 return ret; 3560 } 3561 3562 static void fill_inode_item(struct btrfs_trans_handle *trans, 3563 struct extent_buffer *leaf, 3564 struct btrfs_inode_item *item, 3565 struct inode *inode, int log_inode_only, 3566 u64 logged_isize) 3567 { 3568 struct btrfs_map_token token; 3569 3570 btrfs_init_map_token(&token); 3571 3572 if (log_inode_only) { 3573 /* set the generation to zero so the recovery code 3574 * can tell the difference between a logging 3575 * just to say 'this inode exists' and a logging 3576 * to say 'update this inode with these values' 3577 */ 3578 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3579 btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 3580 } else { 3581 btrfs_set_token_inode_generation(leaf, item, 3582 BTRFS_I(inode)->generation, 3583 &token); 3584 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3585 } 3586 3587 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3588 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3589 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3590 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3591 3592 btrfs_set_token_timespec_sec(leaf, &item->atime, 3593 inode->i_atime.tv_sec, &token); 3594 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3595 inode->i_atime.tv_nsec, &token); 3596 3597 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3598 inode->i_mtime.tv_sec, &token); 3599 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3600 inode->i_mtime.tv_nsec, &token); 3601 3602 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3603 inode->i_ctime.tv_sec, &token); 3604 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3605 inode->i_ctime.tv_nsec, &token); 3606 3607 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3608 &token); 3609 3610 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3611 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3612 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3613 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3614
btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3615 } 3616 3617 static int log_inode_item(struct btrfs_trans_handle *trans, 3618 struct btrfs_root *log, struct btrfs_path *path, 3619 struct btrfs_inode *inode) 3620 { 3621 struct btrfs_inode_item *inode_item; 3622 int ret; 3623 3624 ret = btrfs_insert_empty_item(trans, log, path, 3625 &inode->location, sizeof(*inode_item)); 3626 if (ret && ret != -EEXIST) 3627 return ret; 3628 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3629 struct btrfs_inode_item); 3630 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3631 0, 0); 3632 btrfs_release_path(path); 3633 return 0; 3634 } 3635 3636 static noinline int copy_items(struct btrfs_trans_handle *trans, 3637 struct btrfs_inode *inode, 3638 struct btrfs_path *dst_path, 3639 struct btrfs_path *src_path, u64 *last_extent, 3640 int start_slot, int nr, int inode_only, 3641 u64 logged_isize) 3642 { 3643 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 3644 unsigned long src_offset; 3645 unsigned long dst_offset; 3646 struct btrfs_root *log = inode->root->log_root; 3647 struct btrfs_file_extent_item *extent; 3648 struct btrfs_inode_item *inode_item; 3649 struct extent_buffer *src = src_path->nodes[0]; 3650 struct btrfs_key first_key, last_key, key; 3651 int ret; 3652 struct btrfs_key *ins_keys; 3653 u32 *ins_sizes; 3654 char *ins_data; 3655 int i; 3656 struct list_head ordered_sums; 3657 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3658 bool has_extents = false; 3659 bool need_find_last_extent = true; 3660 bool done = false; 3661 3662 INIT_LIST_HEAD(&ordered_sums); 3663 3664 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3665 nr * sizeof(u32), GFP_NOFS); 3666 if (!ins_data) 3667 return -ENOMEM; 3668 3669 first_key.objectid = (u64)-1; 3670 3671 ins_sizes = (u32 *)ins_data; 3672 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3673 3674 for (i = 0; i < nr; i++) { 3675 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3676 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3677 } 3678 ret = btrfs_insert_empty_items(trans, log, dst_path, 3679 ins_keys, ins_sizes, nr); 3680 if (ret) { 3681 kfree(ins_data); 3682 return ret; 3683 } 3684 3685 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3686 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3687 dst_path->slots[0]); 3688 3689 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3690 3691 if (i == nr - 1) 3692 last_key = ins_keys[i]; 3693 3694 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3695 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3696 dst_path->slots[0], 3697 struct btrfs_inode_item); 3698 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3699 &inode->vfs_inode, 3700 inode_only == LOG_INODE_EXISTS, 3701 logged_isize); 3702 } else { 3703 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3704 src_offset, ins_sizes[i]); 3705 } 3706 3707 /* 3708 * We set need_find_last_extent here in case we know we were 3709 * processing other items and then walk into the first extent in 3710 * the inode. If we don't hit an extent then nothing changes, 3711 * we'll do the last search the next time around. 
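 *
 * As a made-up illustration: a batch of [inode item, xattr item, extent
 * at offset 0] clears need_find_last_extent as soon as a non-extent
 * item is processed, so no backwards leaf search is needed; only a
 * batch consisting purely of extent items keeps need_find_last_extent
 * set and triggers the look-back at the previous leaf.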
3712 */ 3713 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3714 has_extents = true; 3715 if (first_key.objectid == (u64)-1) 3716 first_key = ins_keys[i]; 3717 } else { 3718 need_find_last_extent = false; 3719 } 3720 3721 /* take a reference on file data extents so that truncates 3722 * or deletes of this inode don't have to relog the inode 3723 * again 3724 */ 3725 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3726 !skip_csum) { 3727 int found_type; 3728 extent = btrfs_item_ptr(src, start_slot + i, 3729 struct btrfs_file_extent_item); 3730 3731 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3732 continue; 3733 3734 found_type = btrfs_file_extent_type(src, extent); 3735 if (found_type == BTRFS_FILE_EXTENT_REG) { 3736 u64 ds, dl, cs, cl; 3737 ds = btrfs_file_extent_disk_bytenr(src, 3738 extent); 3739 /* ds == 0 is a hole */ 3740 if (ds == 0) 3741 continue; 3742 3743 dl = btrfs_file_extent_disk_num_bytes(src, 3744 extent); 3745 cs = btrfs_file_extent_offset(src, extent); 3746 cl = btrfs_file_extent_num_bytes(src, 3747 extent); 3748 if (btrfs_file_extent_compression(src, 3749 extent)) { 3750 cs = 0; 3751 cl = dl; 3752 } 3753 3754 ret = btrfs_lookup_csums_range( 3755 fs_info->csum_root, 3756 ds + cs, ds + cs + cl - 1, 3757 &ordered_sums, 0); 3758 if (ret) { 3759 btrfs_release_path(dst_path); 3760 kfree(ins_data); 3761 return ret; 3762 } 3763 } 3764 } 3765 } 3766 3767 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3768 btrfs_release_path(dst_path); 3769 kfree(ins_data); 3770 3771 /* 3772 * we have to do this after the loop above to avoid changing the 3773 * log tree while trying to change the log tree. 3774 */ 3775 ret = 0; 3776 while (!list_empty(&ordered_sums)) { 3777 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3778 struct btrfs_ordered_sum, 3779 list); 3780 if (!ret) 3781 ret = btrfs_csum_file_blocks(trans, log, sums); 3782 list_del(&sums->list); 3783 kfree(sums); 3784 } 3785 3786 if (!has_extents) 3787 return ret; 3788 3789 if (need_find_last_extent && *last_extent == first_key.offset) { 3790 /* 3791 * We don't have any leafs between our current one and the one 3792 * we processed before that can have file extent items for our 3793 * inode (and have a generation number smaller than our current 3794 * transaction id). 3795 */ 3796 need_find_last_extent = false; 3797 } 3798 3799 /* 3800 * Because we use btrfs_search_forward we could skip leaves that were 3801 * not modified and then assume *last_extent is valid when it really 3802 * isn't. So back up to the previous leaf and read the end of the last 3803 * extent before we go and fill in holes. 
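 *
 * With invented numbers: if the previous leaf ends in a regular extent
 * at file offset 4096 that spans 8192 bytes, *last_extent becomes
 * 12288, and any gap between 12288 and the first extent key copied
 * above is later punched as a hole.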
3804 */ 3805 if (need_find_last_extent) { 3806 u64 len; 3807 3808 ret = btrfs_prev_leaf(inode->root, src_path); 3809 if (ret < 0) 3810 return ret; 3811 if (ret) 3812 goto fill_holes; 3813 if (src_path->slots[0]) 3814 src_path->slots[0]--; 3815 src = src_path->nodes[0]; 3816 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3817 if (key.objectid != btrfs_ino(inode) || 3818 key.type != BTRFS_EXTENT_DATA_KEY) 3819 goto fill_holes; 3820 extent = btrfs_item_ptr(src, src_path->slots[0], 3821 struct btrfs_file_extent_item); 3822 if (btrfs_file_extent_type(src, extent) == 3823 BTRFS_FILE_EXTENT_INLINE) { 3824 len = btrfs_file_extent_inline_len(src, 3825 src_path->slots[0], 3826 extent); 3827 *last_extent = ALIGN(key.offset + len, 3828 fs_info->sectorsize); 3829 } else { 3830 len = btrfs_file_extent_num_bytes(src, extent); 3831 *last_extent = key.offset + len; 3832 } 3833 } 3834 fill_holes: 3835 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3836 * things could have happened 3837 * 3838 * 1) A merge could have happened, so we could currently be on a leaf 3839 * that holds what we were copying in the first place. 3840 * 2) A split could have happened, and now not all of the items we want 3841 * are on the same leaf. 3842 * 3843 * So we need to adjust how we search for holes, we need to drop the 3844 * path and re-search for the first extent key we found, and then walk 3845 * forward until we hit the last one we copied. 3846 */ 3847 if (need_find_last_extent) { 3848 /* btrfs_prev_leaf could return 1 without releasing the path */ 3849 btrfs_release_path(src_path); 3850 ret = btrfs_search_slot(NULL, inode->root, &first_key, 3851 src_path, 0, 0); 3852 if (ret < 0) 3853 return ret; 3854 ASSERT(ret == 0); 3855 src = src_path->nodes[0]; 3856 i = src_path->slots[0]; 3857 } else { 3858 i = start_slot; 3859 } 3860 3861 /* 3862 * Ok so here we need to go through and fill in any holes we may have 3863 * to make sure that holes are punched for those areas in case they had 3864 * extents previously. 3865 */ 3866 while (!done) { 3867 u64 offset, len; 3868 u64 extent_end; 3869 3870 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3871 ret = btrfs_next_leaf(inode->root, src_path); 3872 if (ret < 0) 3873 return ret; 3874 ASSERT(ret == 0); 3875 src = src_path->nodes[0]; 3876 i = 0; 3877 } 3878 3879 btrfs_item_key_to_cpu(src, &key, i); 3880 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3881 done = true; 3882 if (key.objectid != btrfs_ino(inode) || 3883 key.type != BTRFS_EXTENT_DATA_KEY) { 3884 i++; 3885 continue; 3886 } 3887 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3888 if (btrfs_file_extent_type(src, extent) == 3889 BTRFS_FILE_EXTENT_INLINE) { 3890 len = btrfs_file_extent_inline_len(src, i, extent); 3891 extent_end = ALIGN(key.offset + len, 3892 fs_info->sectorsize); 3893 } else { 3894 len = btrfs_file_extent_num_bytes(src, extent); 3895 extent_end = key.offset + len; 3896 } 3897 i++; 3898 3899 if (*last_extent == key.offset) { 3900 *last_extent = extent_end; 3901 continue; 3902 } 3903 offset = *last_extent; 3904 len = key.offset - *last_extent; 3905 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3906 offset, 0, 0, len, 0, len, 0, 0, 0); 3907 if (ret) 3908 break; 3909 *last_extent = extent_end; 3910 } 3911 /* 3912 * Need to let the callers know we dropped the path so they should 3913 * re-search. 
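 *
 * Returning 1 (instead of 0) is that signal: a caller that sees a
 * positive return from copy_items() must treat its search path as
 * released and re-search before using it again.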
3914 */ 3915 if (!ret && need_find_last_extent) 3916 ret = 1; 3917 return ret; 3918 } 3919 3920 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3921 { 3922 struct extent_map *em1, *em2; 3923 3924 em1 = list_entry(a, struct extent_map, list); 3925 em2 = list_entry(b, struct extent_map, list); 3926 3927 if (em1->start < em2->start) 3928 return -1; 3929 else if (em1->start > em2->start) 3930 return 1; 3931 return 0; 3932 } 3933 3934 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 3935 struct inode *inode, 3936 struct btrfs_root *root, 3937 const struct extent_map *em, 3938 const struct list_head *logged_list, 3939 bool *ordered_io_error) 3940 { 3941 struct btrfs_fs_info *fs_info = root->fs_info; 3942 struct btrfs_ordered_extent *ordered; 3943 struct btrfs_root *log = root->log_root; 3944 u64 mod_start = em->mod_start; 3945 u64 mod_len = em->mod_len; 3946 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3947 u64 csum_offset; 3948 u64 csum_len; 3949 LIST_HEAD(ordered_sums); 3950 int ret = 0; 3951 3952 *ordered_io_error = false; 3953 3954 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 3955 em->block_start == EXTENT_MAP_HOLE) 3956 return 0; 3957 3958 /* 3959 * Wait for any ordered extent that covers our extent map. If it 3960 * finishes without an error, first check and see if our csums are on 3961 * our outstanding ordered extents. 3962 */ 3963 list_for_each_entry(ordered, logged_list, log_list) { 3964 struct btrfs_ordered_sum *sum; 3965 3966 if (!mod_len) 3967 break; 3968 3969 if (ordered->file_offset + ordered->len <= mod_start || 3970 mod_start + mod_len <= ordered->file_offset) 3971 continue; 3972 3973 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 3974 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 3975 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 3976 const u64 start = ordered->file_offset; 3977 const u64 end = ordered->file_offset + ordered->len - 1; 3978 3979 WARN_ON(ordered->inode != inode); 3980 filemap_fdatawrite_range(inode->i_mapping, start, end); 3981 } 3982 3983 wait_event(ordered->wait, 3984 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 3985 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3986 3987 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3988 /* 3989 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 3990 * i_mapping flags, so that the next fsync won't get 3991 * an outdated io error too. 3992 */ 3993 filemap_check_errors(inode->i_mapping); 3994 *ordered_io_error = true; 3995 break; 3996 } 3997 /* 3998 * We are going to copy all the csums on this ordered extent, so 3999 * go ahead and adjust mod_start and mod_len in case this 4000 * ordered extent has already been logged. 4001 */ 4002 if (ordered->file_offset > mod_start) { 4003 if (ordered->file_offset + ordered->len >= 4004 mod_start + mod_len) 4005 mod_len = ordered->file_offset - mod_start; 4006 /* 4007 * If we have this case 4008 * 4009 * |--------- logged extent ---------| 4010 * |----- ordered extent ----| 4011 * 4012 * Just don't mess with mod_start and mod_len, we'll 4013 * just end up logging more csums than we need and it 4014 * will be ok.
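 *
 * With made-up numbers: a logged extent covering [0, 16K) and an
 * ordered extent covering [4K, 8K) leave mod_start and mod_len
 * untouched, so the csums for [4K, 8K) may simply end up in the log
 * twice, which is harmless.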
4015 */ 4016 } else { 4017 if (ordered->file_offset + ordered->len < 4018 mod_start + mod_len) { 4019 mod_len = (mod_start + mod_len) - 4020 (ordered->file_offset + ordered->len); 4021 mod_start = ordered->file_offset + 4022 ordered->len; 4023 } else { 4024 mod_len = 0; 4025 } 4026 } 4027 4028 if (skip_csum) 4029 continue; 4030 4031 /* 4032 * To keep us from looping for the above case of an ordered 4033 * extent that falls inside of the logged extent. 4034 */ 4035 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 4036 &ordered->flags)) 4037 continue; 4038 4039 list_for_each_entry(sum, &ordered->list, list) { 4040 ret = btrfs_csum_file_blocks(trans, log, sum); 4041 if (ret) 4042 break; 4043 } 4044 } 4045 4046 if (*ordered_io_error || !mod_len || ret || skip_csum) 4047 return ret; 4048 4049 if (em->compress_type) { 4050 csum_offset = 0; 4051 csum_len = max(em->block_len, em->orig_block_len); 4052 } else { 4053 csum_offset = mod_start - em->start; 4054 csum_len = mod_len; 4055 } 4056 4057 /* block start is already adjusted for the file extent offset. */ 4058 ret = btrfs_lookup_csums_range(fs_info->csum_root, 4059 em->block_start + csum_offset, 4060 em->block_start + csum_offset + 4061 csum_len - 1, &ordered_sums, 0); 4062 if (ret) 4063 return ret; 4064 4065 while (!list_empty(&ordered_sums)) { 4066 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4067 struct btrfs_ordered_sum, 4068 list); 4069 if (!ret) 4070 ret = btrfs_csum_file_blocks(trans, log, sums); 4071 list_del(&sums->list); 4072 kfree(sums); 4073 } 4074 4075 return ret; 4076 } 4077 4078 static int log_one_extent(struct btrfs_trans_handle *trans, 4079 struct btrfs_inode *inode, struct btrfs_root *root, 4080 const struct extent_map *em, 4081 struct btrfs_path *path, 4082 const struct list_head *logged_list, 4083 struct btrfs_log_ctx *ctx) 4084 { 4085 struct btrfs_root *log = root->log_root; 4086 struct btrfs_file_extent_item *fi; 4087 struct extent_buffer *leaf; 4088 struct btrfs_map_token token; 4089 struct btrfs_key key; 4090 u64 extent_offset = em->start - em->orig_start; 4091 u64 block_len; 4092 int ret; 4093 int extent_inserted = 0; 4094 bool ordered_io_err = false; 4095 4096 ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, 4097 logged_list, &ordered_io_err); 4098 if (ret) 4099 return ret; 4100 4101 if (ordered_io_err) { 4102 ctx->io_err = -EIO; 4103 return 0; 4104 } 4105 4106 btrfs_init_map_token(&token); 4107 4108 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4109 em->start + em->len, NULL, 0, 1, 4110 sizeof(*fi), &extent_inserted); 4111 if (ret) 4112 return ret; 4113 4114 if (!extent_inserted) { 4115 key.objectid = btrfs_ino(inode); 4116 key.type = BTRFS_EXTENT_DATA_KEY; 4117 key.offset = em->start; 4118 4119 ret = btrfs_insert_empty_item(trans, log, path, &key, 4120 sizeof(*fi)); 4121 if (ret) 4122 return ret; 4123 } 4124 leaf = path->nodes[0]; 4125 fi = btrfs_item_ptr(leaf, path->slots[0], 4126 struct btrfs_file_extent_item); 4127 4128 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4129 &token); 4130 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4131 btrfs_set_token_file_extent_type(leaf, fi, 4132 BTRFS_FILE_EXTENT_PREALLOC, 4133 &token); 4134 else 4135 btrfs_set_token_file_extent_type(leaf, fi, 4136 BTRFS_FILE_EXTENT_REG, 4137 &token); 4138 4139 block_len = max(em->block_len, em->orig_block_len); 4140 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4141 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4142 em->block_start, 4143 &token); 4144 
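/*
 * (Note added for clarity: for compressed extents the disk bytenr is
 * not adjusted by extent_offset, since the on-disk range covers the
 * whole compressed extent; the full block_len is logged as the number
 * of disk bytes below.)
 */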
btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4145 &token); 4146 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4147 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4148 em->block_start - 4149 extent_offset, &token); 4150 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4151 &token); 4152 } else { 4153 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4154 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4155 &token); 4156 } 4157 4158 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4159 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4160 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4161 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4162 &token); 4163 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4164 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4165 btrfs_mark_buffer_dirty(leaf); 4166 4167 btrfs_release_path(path); 4168 4169 return ret; 4170 } 4171 4172 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4173 struct btrfs_root *root, 4174 struct btrfs_inode *inode, 4175 struct btrfs_path *path, 4176 struct list_head *logged_list, 4177 struct btrfs_log_ctx *ctx, 4178 const u64 start, 4179 const u64 end) 4180 { 4181 struct extent_map *em, *n; 4182 struct list_head extents; 4183 struct extent_map_tree *tree = &inode->extent_tree; 4184 u64 test_gen; 4185 int ret = 0; 4186 int num = 0; 4187 4188 INIT_LIST_HEAD(&extents); 4189 4190 down_write(&inode->dio_sem); 4191 write_lock(&tree->lock); 4192 test_gen = root->fs_info->last_trans_committed; 4193 4194 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4195 list_del_init(&em->list); 4196 4197 /* 4198 * Just an arbitrary number; this can get really CPU intensive 4199 * once we start getting a lot of extents, and once we have a 4200 * bunch of extents we just want to commit since it will 4201 * be faster. 4202 */ 4203 if (++num > 32768) { 4204 list_del_init(&tree->modified_extents); 4205 ret = -EFBIG; 4206 goto process; 4207 } 4208 4209 if (em->generation <= test_gen) 4210 continue; 4211 /* Need a ref to keep it from getting evicted from cache */ 4212 refcount_inc(&em->refs); 4213 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4214 list_add_tail(&em->list, &extents); 4215 num++; 4216 } 4217 4218 list_sort(NULL, &extents, extent_cmp); 4219 btrfs_get_logged_extents(inode, logged_list, start, end); 4220 /* 4221 * Some ordered extents started by fsync might have completed 4222 * before we could collect them into the list logged_list, which 4223 * means they're gone, not in our logged_list nor in the inode's 4224 * ordered tree. We want the application/user space to know an 4225 * error happened while attempting to persist file data so that 4226 * it can take proper action. If such error happened, we leave 4227 * without writing to the log tree and the fsync must report the 4228 * file data write error and not commit the current transaction. 4229 */ 4230 ret = filemap_check_errors(inode->vfs_inode.i_mapping); 4231 if (ret) 4232 ctx->io_err = ret; 4233 process: 4234 while (!list_empty(&extents)) { 4235 em = list_entry(extents.next, struct extent_map, list); 4236 4237 list_del_init(&em->list); 4238 4239 /* 4240 * If we had an error we just need to delete everybody from our 4241 * private list.
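 * (clear_em_logging() below clears the EXTENT_FLAG_LOGGING bit that was
 * set when the em was collected, and free_extent_map() drops the extra
 * reference taken to keep the em from being evicted from the cache.)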
4242 */ 4243 if (ret) { 4244 clear_em_logging(tree, em); 4245 free_extent_map(em); 4246 continue; 4247 } 4248 4249 write_unlock(&tree->lock); 4250 4251 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4252 ctx); 4253 write_lock(&tree->lock); 4254 clear_em_logging(tree, em); 4255 free_extent_map(em); 4256 } 4257 WARN_ON(!list_empty(&extents)); 4258 write_unlock(&tree->lock); 4259 up_write(&inode->dio_sem); 4260 4261 btrfs_release_path(path); 4262 return ret; 4263 } 4264 4265 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4266 struct btrfs_path *path, u64 *size_ret) 4267 { 4268 struct btrfs_key key; 4269 int ret; 4270 4271 key.objectid = btrfs_ino(inode); 4272 key.type = BTRFS_INODE_ITEM_KEY; 4273 key.offset = 0; 4274 4275 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4276 if (ret < 0) { 4277 return ret; 4278 } else if (ret > 0) { 4279 *size_ret = 0; 4280 } else { 4281 struct btrfs_inode_item *item; 4282 4283 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4284 struct btrfs_inode_item); 4285 *size_ret = btrfs_inode_size(path->nodes[0], item); 4286 } 4287 4288 btrfs_release_path(path); 4289 return 0; 4290 } 4291 4292 /* 4293 * At the moment we always log all xattrs. This is to figure out at log replay 4294 * time which xattrs must have their deletion replayed. If an xattr is missing 4295 * in the log tree but exists in the fs/subvol tree, we delete it. This is 4296 * because if an xattr is deleted, then the inode is fsynced and a power failure 4297 * happens, the log will be replayed the next time the fs is mounted and 4298 * we want the xattr to not exist anymore (same behaviour as other filesystems 4299 * with a journal: ext3/4, xfs, f2fs, etc). 4300 */ 4301 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4302 struct btrfs_root *root, 4303 struct btrfs_inode *inode, 4304 struct btrfs_path *path, 4305 struct btrfs_path *dst_path) 4306 { 4307 int ret; 4308 struct btrfs_key key; 4309 const u64 ino = btrfs_ino(inode); 4310 int ins_nr = 0; 4311 int start_slot = 0; 4312 4313 key.objectid = ino; 4314 key.type = BTRFS_XATTR_ITEM_KEY; 4315 key.offset = 0; 4316 4317 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4318 if (ret < 0) 4319 return ret; 4320 4321 while (true) { 4322 int slot = path->slots[0]; 4323 struct extent_buffer *leaf = path->nodes[0]; 4324 int nritems = btrfs_header_nritems(leaf); 4325 4326 if (slot >= nritems) { 4327 if (ins_nr > 0) { 4328 u64 last_extent = 0; 4329 4330 ret = copy_items(trans, inode, dst_path, path, 4331 &last_extent, start_slot, 4332 ins_nr, 1, 0); 4333 /* can't be 1, extent items aren't processed */ 4334 ASSERT(ret <= 0); 4335 if (ret < 0) 4336 return ret; 4337 ins_nr = 0; 4338 } 4339 ret = btrfs_next_leaf(root, path); 4340 if (ret < 0) 4341 return ret; 4342 else if (ret > 0) 4343 break; 4344 continue; 4345 } 4346 4347 btrfs_item_key_to_cpu(leaf, &key, slot); 4348 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4349 break; 4350 4351 if (ins_nr == 0) 4352 start_slot = slot; 4353 ins_nr++; 4354 path->slots[0]++; 4355 cond_resched(); 4356 } 4357 if (ins_nr > 0) { 4358 u64 last_extent = 0; 4359 4360 ret = copy_items(trans, inode, dst_path, path, 4361 &last_extent, start_slot, 4362 ins_nr, 1, 0); 4363 /* can't be 1, extent items aren't processed */ 4364 ASSERT(ret <= 0); 4365 if (ret < 0) 4366 return ret; 4367 } 4368 4369 return 0; 4370 } 4371 4372 /* 4373 * If the no holes feature is enabled, we need to make sure any hole between the 4374 * last extent and the i_size of our
inode is explicitly marked in the log. This 4375 * is to make sure that doing something like: 4376 * 4377 * 1) create file with 128Kb of data 4378 * 2) truncate file to 64Kb 4379 * 3) truncate file to 256Kb 4380 * 4) fsync file 4381 * 5) <crash/power failure> 4382 * 6) mount fs and trigger log replay 4383 * 4384 * will give us a file with a size of 256Kb, where the first 64Kb of data match 4385 * what the file had at step 1 and the last 192Kb of the 4386 * file correspond to a hole. The presence of explicit holes in a log tree is 4387 * what guarantees that log replay will remove/adjust file extent items in the 4388 * fs/subvol tree. 4389 * 4390 * Here we do not need to care about holes between extents, as that is already 4391 * done by copy_items(). We also only need to do this in the full sync path, 4392 * where we look up extents in the fs/subvol tree only. In the fast path case, 4393 * we walk the list of modified extent maps and, if any represents a hole, we 4394 * insert a corresponding extent representing a hole in the log tree. 4395 */ 4396 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4397 struct btrfs_root *root, 4398 struct btrfs_inode *inode, 4399 struct btrfs_path *path) 4400 { 4401 struct btrfs_fs_info *fs_info = root->fs_info; 4402 int ret; 4403 struct btrfs_key key; 4404 u64 hole_start; 4405 u64 hole_size; 4406 struct extent_buffer *leaf; 4407 struct btrfs_root *log = root->log_root; 4408 const u64 ino = btrfs_ino(inode); 4409 const u64 i_size = i_size_read(&inode->vfs_inode); 4410 4411 if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4412 return 0; 4413 4414 key.objectid = ino; 4415 key.type = BTRFS_EXTENT_DATA_KEY; 4416 key.offset = (u64)-1; 4417 4418 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4419 ASSERT(ret != 0); 4420 if (ret < 0) 4421 return ret; 4422 4423 ASSERT(path->slots[0] > 0); 4424 path->slots[0]--; 4425 leaf = path->nodes[0]; 4426 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4427 4428 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4429 /* inode does not have any extents */ 4430 hole_start = 0; 4431 hole_size = i_size; 4432 } else { 4433 struct btrfs_file_extent_item *extent; 4434 u64 len; 4435 4436 /* 4437 * If there's an extent beyond i_size, an explicit hole was 4438 * already inserted by copy_items(). 4439 */ 4440 if (key.offset >= i_size) 4441 return 0; 4442 4443 extent = btrfs_item_ptr(leaf, path->slots[0], 4444 struct btrfs_file_extent_item); 4445 4446 if (btrfs_file_extent_type(leaf, extent) == 4447 BTRFS_FILE_EXTENT_INLINE) { 4448 len = btrfs_file_extent_inline_len(leaf, 4449 path->slots[0], 4450 extent); 4451 ASSERT(len == i_size || 4452 (len == fs_info->sectorsize && 4453 btrfs_file_extent_compression(leaf, extent) != 4454 BTRFS_COMPRESS_NONE)); 4455 return 0; 4456 } 4457 4458 len = btrfs_file_extent_num_bytes(leaf, extent); 4459 /* Last extent goes beyond i_size, no need to log a hole. */ 4460 if (key.offset + len > i_size) 4461 return 0; 4462 hole_start = key.offset + len; 4463 hole_size = i_size - hole_start; 4464 } 4465 btrfs_release_path(path); 4466 4467 /* Last extent ends at i_size.
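 * (Worked example with illustrative numbers, not from the original
 * comment: for the 256Kb case described above, the last extent covers
 * the file range [0, 64Kb), so hole_start = 64Kb and hole_size =
 * 256Kb - 64Kb = 192Kb, and a 192Kb hole extent item is inserted
 * below.)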
*/ 4468 if (hole_size == 0) 4469 return 0; 4470 4471 hole_size = ALIGN(hole_size, fs_info->sectorsize); 4472 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4473 hole_size, 0, hole_size, 0, 0, 0); 4474 return ret; 4475 } 4476 4477 /* 4478 * When we are logging a new inode X, check if it doesn't have a reference that 4479 * matches the reference from some other inode Y created in a past transaction 4480 * and that was renamed in the current transaction. If we don't do this, then at 4481 * log replay time we can lose inode Y (and all its files if it's a directory): 4482 * 4483 * mkdir /mnt/x 4484 * echo "hello world" > /mnt/x/foobar 4485 * sync 4486 * mv /mnt/x /mnt/y 4487 * mkdir /mnt/x # or touch /mnt/x 4488 * xfs_io -c fsync /mnt/x 4489 * <power fail> 4490 * mount fs, trigger log replay 4491 * 4492 * After the log replay procedure, we would lose the first directory and all its 4493 * files (file foobar). 4494 * For the case where inode Y is not a directory we simply end up losing it: 4495 * 4496 * echo "123" > /mnt/foo 4497 * sync 4498 * mv /mnt/foo /mnt/bar 4499 * echo "abc" > /mnt/foo 4500 * xfs_io -c fsync /mnt/foo 4501 * <power fail> 4502 * 4503 * We also need this for cases where a snapshot entry is replaced by some other 4504 * entry (file or directory) otherwise we end up with an unreplayable log due to 4505 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4506 * if it were a regular entry: 4507 * 4508 * mkdir /mnt/x 4509 * btrfs subvolume snapshot /mnt /mnt/x/snap 4510 * btrfs subvolume delete /mnt/x/snap 4511 * rmdir /mnt/x 4512 * mkdir /mnt/x 4513 * fsync /mnt/x or fsync some new file inside it 4514 * <power fail> 4515 * 4516 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4517 * the same transaction. 
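 *
 * (Summary added for clarity, not part of the original comment: the
 * helper below returns 1 and sets *other_ino when one of our inode's
 * names is already used by a different inode item, -EAGAIN when the
 * conflicting name points to a root entry such as a snapshot, 0 when
 * there is no conflict, and a negative errno on error.)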
4518 */ 4519 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4520 const int slot, 4521 const struct btrfs_key *key, 4522 struct btrfs_inode *inode, 4523 u64 *other_ino) 4524 { 4525 int ret; 4526 struct btrfs_path *search_path; 4527 char *name = NULL; 4528 u32 name_len = 0; 4529 u32 item_size = btrfs_item_size_nr(eb, slot); 4530 u32 cur_offset = 0; 4531 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4532 4533 search_path = btrfs_alloc_path(); 4534 if (!search_path) 4535 return -ENOMEM; 4536 search_path->search_commit_root = 1; 4537 search_path->skip_locking = 1; 4538 4539 while (cur_offset < item_size) { 4540 u64 parent; 4541 u32 this_name_len; 4542 u32 this_len; 4543 unsigned long name_ptr; 4544 struct btrfs_dir_item *di; 4545 4546 if (key->type == BTRFS_INODE_REF_KEY) { 4547 struct btrfs_inode_ref *iref; 4548 4549 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4550 parent = key->offset; 4551 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4552 name_ptr = (unsigned long)(iref + 1); 4553 this_len = sizeof(*iref) + this_name_len; 4554 } else { 4555 struct btrfs_inode_extref *extref; 4556 4557 extref = (struct btrfs_inode_extref *)(ptr + 4558 cur_offset); 4559 parent = btrfs_inode_extref_parent(eb, extref); 4560 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4561 name_ptr = (unsigned long)&extref->name; 4562 this_len = sizeof(*extref) + this_name_len; 4563 } 4564 4565 ret = btrfs_is_name_len_valid(eb, slot, name_ptr, 4566 this_name_len); 4567 if (!ret) { 4568 ret = -EIO; 4569 goto out; 4570 } 4571 if (this_name_len > name_len) { 4572 char *new_name; 4573 4574 new_name = krealloc(name, this_name_len, GFP_NOFS); 4575 if (!new_name) { 4576 ret = -ENOMEM; 4577 goto out; 4578 } 4579 name_len = this_name_len; 4580 name = new_name; 4581 } 4582 4583 read_extent_buffer(eb, name, name_ptr, this_name_len); 4584 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4585 parent, name, this_name_len, 0); 4586 if (di && !IS_ERR(di)) { 4587 struct btrfs_key di_key; 4588 4589 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4590 di, &di_key); 4591 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4592 ret = 1; 4593 *other_ino = di_key.objectid; 4594 } else { 4595 ret = -EAGAIN; 4596 } 4597 goto out; 4598 } else if (IS_ERR(di)) { 4599 ret = PTR_ERR(di); 4600 goto out; 4601 } 4602 btrfs_release_path(search_path); 4603 4604 cur_offset += this_len; 4605 } 4606 ret = 0; 4607 out: 4608 btrfs_free_path(search_path); 4609 kfree(name); 4610 return ret; 4611 } 4612 4613 /* log a single inode in the tree log. 4614 * At least one parent directory for this inode must exist in the tree 4615 * or be logged already. 4616 * 4617 * Any items from this inode changed by the current transaction are copied 4618 * to the log tree. An extra reference is taken on any extents in this 4619 * file, allowing us to avoid a whole pile of corner cases around logging 4620 * blocks that have been removed from the tree. 4621 * 4622 * See LOG_INODE_ALL and related defines for a description of what inode_only 4623 * does. 4624 * 4625 * This handles both files and directories. 
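 *
 * (A rough roadmap of the steps below, added for orientation: commit or
 * flush the inode's delayed items, drop stale items from the log, copy
 * the changed items from the fs/subvol tree, log all xattrs, log an
 * explicit trailing hole if needed, log changed extents in the fast
 * path, log directory items for directories, and finally record
 * logged_trans/last_log_commit on the inode.)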
4626 */ 4627 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4628 struct btrfs_root *root, struct btrfs_inode *inode, 4629 int inode_only, 4630 const loff_t start, 4631 const loff_t end, 4632 struct btrfs_log_ctx *ctx) 4633 { 4634 struct btrfs_fs_info *fs_info = root->fs_info; 4635 struct btrfs_path *path; 4636 struct btrfs_path *dst_path; 4637 struct btrfs_key min_key; 4638 struct btrfs_key max_key; 4639 struct btrfs_root *log = root->log_root; 4640 struct extent_buffer *src = NULL; 4641 LIST_HEAD(logged_list); 4642 u64 last_extent = 0; 4643 int err = 0; 4644 int ret; 4645 int nritems; 4646 int ins_start_slot = 0; 4647 int ins_nr; 4648 bool fast_search = false; 4649 u64 ino = btrfs_ino(inode); 4650 struct extent_map_tree *em_tree = &inode->extent_tree; 4651 u64 logged_isize = 0; 4652 bool need_log_inode_item = true; 4653 4654 path = btrfs_alloc_path(); 4655 if (!path) 4656 return -ENOMEM; 4657 dst_path = btrfs_alloc_path(); 4658 if (!dst_path) { 4659 btrfs_free_path(path); 4660 return -ENOMEM; 4661 } 4662 4663 min_key.objectid = ino; 4664 min_key.type = BTRFS_INODE_ITEM_KEY; 4665 min_key.offset = 0; 4666 4667 max_key.objectid = ino; 4668 4669 4670 /* today the code can only do partial logging of directories */ 4671 if (S_ISDIR(inode->vfs_inode.i_mode) || 4672 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4673 &inode->runtime_flags) && 4674 inode_only >= LOG_INODE_EXISTS)) 4675 max_key.type = BTRFS_XATTR_ITEM_KEY; 4676 else 4677 max_key.type = (u8)-1; 4678 max_key.offset = (u64)-1; 4679 4680 /* 4681 * Only run delayed items if we are a dir or a new file. 4682 * Otherwise commit the delayed inode only, which is needed in 4683 * order for the log replay code to mark inodes for link count 4684 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4685 */ 4686 if (S_ISDIR(inode->vfs_inode.i_mode) || 4687 inode->generation > fs_info->last_trans_committed) 4688 ret = btrfs_commit_inode_delayed_items(trans, inode); 4689 else 4690 ret = btrfs_commit_inode_delayed_inode(inode); 4691 4692 if (ret) { 4693 btrfs_free_path(path); 4694 btrfs_free_path(dst_path); 4695 return ret; 4696 } 4697 4698 if (inode_only == LOG_OTHER_INODE) { 4699 inode_only = LOG_INODE_EXISTS; 4700 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4701 } else { 4702 mutex_lock(&inode->log_mutex); 4703 } 4704 4705 /* 4706 * a brute force approach to making sure we get the most uptodate 4707 * copies of everything. 4708 */ 4709 if (S_ISDIR(inode->vfs_inode.i_mode)) { 4710 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4711 4712 if (inode_only == LOG_INODE_EXISTS) 4713 max_key_type = BTRFS_XATTR_ITEM_KEY; 4714 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4715 } else { 4716 if (inode_only == LOG_INODE_EXISTS) { 4717 /* 4718 * Make sure the new inode item we write to the log has 4719 * the same isize as the current one (if it exists). 4720 * This is necessary to prevent data loss after log 4721 * replay, and also to prevent doing a wrong expanding 4722 * truncate - e.g.: create file, write 4K into offset 4723 * 0, fsync, write 4K into offset 4096, add hard link, 4724 * fsync some other file (to sync the log), power fail - if 4725 * we use the inode's current i_size, after log replay 4726 * we get an 8Kb file, with the last 4Kb extent as a hole 4727 * (zeroes), as if an expanding truncate happened, 4728 * instead of getting a file of 4Kb only.
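 * (In that example the logged inode item keeps the 4Kb isize that was
 * last persisted rather than the in-memory 8Kb, which is exactly the
 * value logged_inode_size() below reads back from the log tree.)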
4729 */ 4730 err = logged_inode_size(log, inode, path, &logged_isize); 4731 if (err) 4732 goto out_unlock; 4733 } 4734 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4735 &inode->runtime_flags)) { 4736 if (inode_only == LOG_INODE_EXISTS) { 4737 max_key.type = BTRFS_XATTR_ITEM_KEY; 4738 ret = drop_objectid_items(trans, log, path, ino, 4739 max_key.type); 4740 } else { 4741 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4742 &inode->runtime_flags); 4743 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4744 &inode->runtime_flags); 4745 while (1) { 4746 ret = btrfs_truncate_inode_items(trans, 4747 log, &inode->vfs_inode, 0, 0); 4748 if (ret != -EAGAIN) 4749 break; 4750 } 4751 } 4752 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4753 &inode->runtime_flags) || 4754 inode_only == LOG_INODE_EXISTS) { 4755 if (inode_only == LOG_INODE_ALL) 4756 fast_search = true; 4757 max_key.type = BTRFS_XATTR_ITEM_KEY; 4758 ret = drop_objectid_items(trans, log, path, ino, 4759 max_key.type); 4760 } else { 4761 if (inode_only == LOG_INODE_ALL) 4762 fast_search = true; 4763 goto log_extents; 4764 } 4765 4766 } 4767 if (ret) { 4768 err = ret; 4769 goto out_unlock; 4770 } 4771 4772 while (1) { 4773 ins_nr = 0; 4774 ret = btrfs_search_forward(root, &min_key, 4775 path, trans->transid); 4776 if (ret < 0) { 4777 err = ret; 4778 goto out_unlock; 4779 } 4780 if (ret != 0) 4781 break; 4782 again: 4783 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4784 if (min_key.objectid != ino) 4785 break; 4786 if (min_key.type > max_key.type) 4787 break; 4788 4789 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4790 need_log_inode_item = false; 4791 4792 if ((min_key.type == BTRFS_INODE_REF_KEY || 4793 min_key.type == BTRFS_INODE_EXTREF_KEY) && 4794 inode->generation == trans->transid) { 4795 u64 other_ino = 0; 4796 4797 ret = btrfs_check_ref_name_override(path->nodes[0], 4798 path->slots[0], &min_key, inode, 4799 &other_ino); 4800 if (ret < 0) { 4801 err = ret; 4802 goto out_unlock; 4803 } else if (ret > 0 && ctx && 4804 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 4805 struct btrfs_key inode_key; 4806 struct inode *other_inode; 4807 4808 if (ins_nr > 0) { 4809 ins_nr++; 4810 } else { 4811 ins_nr = 1; 4812 ins_start_slot = path->slots[0]; 4813 } 4814 ret = copy_items(trans, inode, dst_path, path, 4815 &last_extent, ins_start_slot, 4816 ins_nr, inode_only, 4817 logged_isize); 4818 if (ret < 0) { 4819 err = ret; 4820 goto out_unlock; 4821 } 4822 ins_nr = 0; 4823 btrfs_release_path(path); 4824 inode_key.objectid = other_ino; 4825 inode_key.type = BTRFS_INODE_ITEM_KEY; 4826 inode_key.offset = 0; 4827 other_inode = btrfs_iget(fs_info->sb, 4828 &inode_key, root, 4829 NULL); 4830 /* 4831 * If the other inode that had a conflicting dir 4832 * entry was deleted in the current transaction, 4833 * we don't need to do more work nor fall back to 4834 * a transaction commit. 4835 */ 4836 if (IS_ERR(other_inode) && 4837 PTR_ERR(other_inode) == -ENOENT) { 4838 goto next_key; 4839 } else if (IS_ERR(other_inode)) { 4840 err = PTR_ERR(other_inode); 4841 goto out_unlock; 4842 } 4843 /* 4844 * We are safe logging the other inode without 4845 * acquiring its i_mutex as long as we log with 4846 * the LOG_INODE_EXISTS mode. We're safe against 4847 * concurrent renames of the other inode as well 4848 * because during a rename we pin the log and 4849 * update the log with the new name before we 4850 * unpin it.
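 * (The recursive call below uses LOG_OTHER_INODE, which makes
 * btrfs_log_inode() take the other inode's log_mutex with
 * SINGLE_DEPTH_NESTING, so lockdep does not complain about the
 * nested lock.)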
4851 */ 4852 err = btrfs_log_inode(trans, root, 4853 BTRFS_I(other_inode), 4854 LOG_OTHER_INODE, 0, LLONG_MAX, 4855 ctx); 4856 iput(other_inode); 4857 if (err) 4858 goto out_unlock; 4859 else 4860 goto next_key; 4861 } 4862 } 4863 4864 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4865 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4866 if (ins_nr == 0) 4867 goto next_slot; 4868 ret = copy_items(trans, inode, dst_path, path, 4869 &last_extent, ins_start_slot, 4870 ins_nr, inode_only, logged_isize); 4871 if (ret < 0) { 4872 err = ret; 4873 goto out_unlock; 4874 } 4875 ins_nr = 0; 4876 if (ret) { 4877 btrfs_release_path(path); 4878 continue; 4879 } 4880 goto next_slot; 4881 } 4882 4883 src = path->nodes[0]; 4884 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4885 ins_nr++; 4886 goto next_slot; 4887 } else if (!ins_nr) { 4888 ins_start_slot = path->slots[0]; 4889 ins_nr = 1; 4890 goto next_slot; 4891 } 4892 4893 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4894 ins_start_slot, ins_nr, inode_only, 4895 logged_isize); 4896 if (ret < 0) { 4897 err = ret; 4898 goto out_unlock; 4899 } 4900 if (ret) { 4901 ins_nr = 0; 4902 btrfs_release_path(path); 4903 continue; 4904 } 4905 ins_nr = 1; 4906 ins_start_slot = path->slots[0]; 4907 next_slot: 4908 4909 nritems = btrfs_header_nritems(path->nodes[0]); 4910 path->slots[0]++; 4911 if (path->slots[0] < nritems) { 4912 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 4913 path->slots[0]); 4914 goto again; 4915 } 4916 if (ins_nr) { 4917 ret = copy_items(trans, inode, dst_path, path, 4918 &last_extent, ins_start_slot, 4919 ins_nr, inode_only, logged_isize); 4920 if (ret < 0) { 4921 err = ret; 4922 goto out_unlock; 4923 } 4924 ret = 0; 4925 ins_nr = 0; 4926 } 4927 btrfs_release_path(path); 4928 next_key: 4929 if (min_key.offset < (u64)-1) { 4930 min_key.offset++; 4931 } else if (min_key.type < max_key.type) { 4932 min_key.type++; 4933 min_key.offset = 0; 4934 } else { 4935 break; 4936 } 4937 } 4938 if (ins_nr) { 4939 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4940 ins_start_slot, ins_nr, inode_only, 4941 logged_isize); 4942 if (ret < 0) { 4943 err = ret; 4944 goto out_unlock; 4945 } 4946 ret = 0; 4947 ins_nr = 0; 4948 } 4949 4950 btrfs_release_path(path); 4951 btrfs_release_path(dst_path); 4952 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 4953 if (err) 4954 goto out_unlock; 4955 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 4956 btrfs_release_path(path); 4957 btrfs_release_path(dst_path); 4958 err = btrfs_log_trailing_hole(trans, root, inode, path); 4959 if (err) 4960 goto out_unlock; 4961 } 4962 log_extents: 4963 btrfs_release_path(path); 4964 btrfs_release_path(dst_path); 4965 if (need_log_inode_item) { 4966 err = log_inode_item(trans, log, dst_path, inode); 4967 if (err) 4968 goto out_unlock; 4969 } 4970 if (fast_search) { 4971 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4972 &logged_list, ctx, start, end); 4973 if (ret) { 4974 err = ret; 4975 goto out_unlock; 4976 } 4977 } else if (inode_only == LOG_INODE_ALL) { 4978 struct extent_map *em, *n; 4979 4980 write_lock(&em_tree->lock); 4981 /* 4982 * We can't just remove every em if we're called for a ranged 4983 * fsync - that is, one that doesn't cover the whole possible 4984 * file range (0 to LLONG_MAX). 
This is because we can have 4985 * em's that fall outside the range we're logging and therefore 4986 * their ordered operations haven't completed yet 4987 * (btrfs_finish_ordered_io() not invoked yet). This means we 4988 * didn't get their respective file extent item in the fs/subvol 4989 * tree yet, and need to let the next fast fsync (one which 4990 * consults the list of modified extent maps) find the em so 4991 * that it logs a matching file extent item and waits for the 4992 * respective ordered operation to complete (if it's still 4993 * running). 4994 * 4995 * Removing every em outside the range we're logging would make 4996 * the next fast fsync not log their matching file extent items, 4997 * therefore making us lose data after a log replay. 4998 */ 4999 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5000 list) { 5001 const u64 mod_end = em->mod_start + em->mod_len - 1; 5002 5003 if (em->mod_start >= start && mod_end <= end) 5004 list_del_init(&em->list); 5005 } 5006 write_unlock(&em_tree->lock); 5007 } 5008 5009 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5010 ret = log_directory_changes(trans, root, inode, path, dst_path, 5011 ctx); 5012 if (ret) { 5013 err = ret; 5014 goto out_unlock; 5015 } 5016 } 5017 5018 spin_lock(&inode->lock); 5019 inode->logged_trans = trans->transid; 5020 inode->last_log_commit = inode->last_sub_trans; 5021 spin_unlock(&inode->lock); 5022 out_unlock: 5023 if (unlikely(err)) 5024 btrfs_put_logged_extents(&logged_list); 5025 else 5026 btrfs_submit_logged_extents(&logged_list, log); 5027 mutex_unlock(&inode->log_mutex); 5028 5029 btrfs_free_path(path); 5030 btrfs_free_path(dst_path); 5031 return err; 5032 } 5033 5034 /* 5035 * Check if we must fall back to a transaction commit when logging an inode. 5036 * This must be called after logging the inode and is used only in the context 5037 * when fsyncing an inode requires logging some other inode - in which 5038 * case we can't lock the i_mutex of each other inode we need to log as that 5039 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5040 * log inodes up or down in the hierarchy) or rename operations for example. So 5041 * we take the log_mutex of the inode after we have logged it and then check for 5042 * its last_unlink_trans value - this is safe because any task setting 5043 * last_unlink_trans must take the log_mutex and it must do this before it does 5044 * the actual unlink operation, so if we do this check before a concurrent task 5045 * sets last_unlink_trans it means we've logged a consistent version/state of 5046 * all the inode items, otherwise we are not sure and must do a transaction 5047 * commit (the concurrent task might have only updated last_unlink_trans before 5048 * we logged the inode or it might have also done the unlink). 5049 */ 5050 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5051 struct btrfs_inode *inode) 5052 { 5053 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5054 bool ret = false; 5055 5056 mutex_lock(&inode->log_mutex); 5057 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5058 /* 5059 * Make sure any commits to the log are forced to be full 5060 * commits.
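 *
 * (For reference, a note on the mechanics rather than part of the
 * original comment: btrfs_set_log_full_commit() records trans->transid
 * in fs_info->last_trans_log_full_commit, which btrfs_log_inode_parent()
 * compares against last_trans_committed before attempting to log.)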
5061 */ 5062 btrfs_set_log_full_commit(fs_info, trans); 5063 ret = true; 5064 } 5065 mutex_unlock(&inode->log_mutex); 5066 5067 return ret; 5068 } 5069 5070 /* 5071 * follow the dentry parent pointers up the chain and see if any 5072 * of the directories in it require a full commit before they can 5073 * be logged. Returns zero if nothing special needs to be done or 1 if 5074 * a full commit is required. 5075 */ 5076 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5077 struct btrfs_inode *inode, 5078 struct dentry *parent, 5079 struct super_block *sb, 5080 u64 last_committed) 5081 { 5082 int ret = 0; 5083 struct dentry *old_parent = NULL; 5084 struct btrfs_inode *orig_inode = inode; 5085 5086 /* 5087 * for a regular file, if its inode is already on disk, we don't 5088 * have to worry about the parents at all. This is because 5089 * we can use the last_unlink_trans field to record renames 5090 * and other fun in this file. 5091 */ 5092 if (S_ISREG(inode->vfs_inode.i_mode) && 5093 inode->generation <= last_committed && 5094 inode->last_unlink_trans <= last_committed) 5095 goto out; 5096 5097 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5098 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5099 goto out; 5100 inode = BTRFS_I(d_inode(parent)); 5101 } 5102 5103 while (1) { 5104 /* 5105 * If we are logging a directory then we start with our inode, 5106 * not our parent's inode, so we need to skip setting the 5107 * logged_trans so that further down in the log code we don't 5108 * think this inode has already been logged. 5109 */ 5110 if (inode != orig_inode) 5111 inode->logged_trans = trans->transid; 5112 smp_mb(); 5113 5114 if (btrfs_must_commit_transaction(trans, inode)) { 5115 ret = 1; 5116 break; 5117 } 5118 5119 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5120 break; 5121 5122 if (IS_ROOT(parent)) { 5123 inode = BTRFS_I(d_inode(parent)); 5124 if (btrfs_must_commit_transaction(trans, inode)) 5125 ret = 1; 5126 break; 5127 } 5128 5129 parent = dget_parent(parent); 5130 dput(old_parent); 5131 old_parent = parent; 5132 inode = BTRFS_I(d_inode(parent)); 5133 5134 } 5135 dput(old_parent); 5136 out: 5137 return ret; 5138 } 5139 5140 struct btrfs_dir_list { 5141 u64 ino; 5142 struct list_head list; 5143 }; 5144 5145 /* 5146 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5147 * details about why it is needed. 5148 * This is a recursive operation - if an existing dentry corresponds to a 5149 * directory, that directory's new entries are logged too (same behaviour as 5150 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5151 * the dentries point to, we do not lock their i_mutex, otherwise lockdep 5152 * complains about the following circular lock dependency / possible deadlock: 5153 * 5154 * CPU0 CPU1 5155 * ---- ---- 5156 * lock(&type->i_mutex_dir_key#3/2); 5157 * lock(sb_internal#2); 5158 * lock(&type->i_mutex_dir_key#3/2); 5159 * lock(&sb->s_type->i_mutex_key#14); 5160 * 5161 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5162 * sb_start_intwrite() in btrfs_start_transaction(). 5163 * Not locking i_mutex of the inodes is still safe because: 5164 * 5165 * 1) For regular files we log with a mode of LOG_INODE_EXISTS.
It's possible 5166 * that while logging the inode new references (names) are added or removed 5167 * from the inode, leaving the logged inode item with a link count that does 5168 * not match the number of logged inode reference items. This is fine because 5169 * at log replay time we compute the real number of links and correct the 5170 * link count in the inode item (see replay_one_buffer() and 5171 * link_to_fixup_dir()); 5172 * 5173 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5174 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5175 * BTRFS_DIR_INDEX_KEY are added to the fs/subvol tree and the logged inode item 5176 * has a size that doesn't match the sum of the lengths of all the logged 5177 * names. This does not result in a problem because if a dir_item key is 5178 * logged but its matching dir_index key is not logged, at log replay time we 5179 * don't use it to replay the respective name (see replay_one_name()). On the 5180 * other hand if only the dir_index key ends up being logged, the respective 5181 * name is added to the fs/subvol tree with both the dir_item and dir_index 5182 * keys created (see replay_one_name()). 5183 * The directory's inode item with a wrong i_size is not a problem either, 5184 * since we don't use it at log replay time to set the i_size in the inode 5185 * item of the fs/subvol tree (see overwrite_item()). 5186 */ 5187 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5188 struct btrfs_root *root, 5189 struct btrfs_inode *start_inode, 5190 struct btrfs_log_ctx *ctx) 5191 { 5192 struct btrfs_fs_info *fs_info = root->fs_info; 5193 struct btrfs_root *log = root->log_root; 5194 struct btrfs_path *path; 5195 LIST_HEAD(dir_list); 5196 struct btrfs_dir_list *dir_elem; 5197 int ret = 0; 5198 5199 path = btrfs_alloc_path(); 5200 if (!path) 5201 return -ENOMEM; 5202 5203 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5204 if (!dir_elem) { 5205 btrfs_free_path(path); 5206 return -ENOMEM; 5207 } 5208 dir_elem->ino = btrfs_ino(start_inode); 5209 list_add_tail(&dir_elem->list, &dir_list); 5210 5211 while (!list_empty(&dir_list)) { 5212 struct extent_buffer *leaf; 5213 struct btrfs_key min_key; 5214 int nritems; 5215 int i; 5216 5217 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5218 list); 5219 if (ret) 5220 goto next_dir_inode; 5221 5222 min_key.objectid = dir_elem->ino; 5223 min_key.type = BTRFS_DIR_ITEM_KEY; 5224 min_key.offset = 0; 5225 again: 5226 btrfs_release_path(path); 5227 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5228 if (ret < 0) { 5229 goto next_dir_inode; 5230 } else if (ret > 0) { 5231 ret = 0; 5232 goto next_dir_inode; 5233 } 5234 5235 process_leaf: 5236 leaf = path->nodes[0]; 5237 nritems = btrfs_header_nritems(leaf); 5238 for (i = path->slots[0]; i < nritems; i++) { 5239 struct btrfs_dir_item *di; 5240 struct btrfs_key di_key; 5241 struct inode *di_inode; 5242 struct btrfs_dir_list *new_dir_elem; 5243 int log_mode = LOG_INODE_EXISTS; 5244 int type; 5245 5246 btrfs_item_key_to_cpu(leaf, &min_key, i); 5247 if (min_key.objectid != dir_elem->ino || 5248 min_key.type != BTRFS_DIR_ITEM_KEY) 5249 goto next_dir_inode; 5250 5251 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5252 type = btrfs_dir_type(leaf, di); 5253 if (btrfs_dir_transid(leaf, di) < trans->transid && 5254 type != BTRFS_FT_DIR) 5255 continue; 5256 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5257 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5258 continue; 5259 5260
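/*
 * (Note added for clarity: the two checks above skip dentries created
 * in past transactions that point to non-directories, and dentries
 * that point to subvolume/snapshot roots, which are never logged.)
 */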
btrfs_release_path(path); 5261 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5262 if (IS_ERR(di_inode)) { 5263 ret = PTR_ERR(di_inode); 5264 goto next_dir_inode; 5265 } 5266 5267 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5268 iput(di_inode); 5269 break; 5270 } 5271 5272 ctx->log_new_dentries = false; 5273 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5274 log_mode = LOG_INODE_ALL; 5275 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5276 log_mode, 0, LLONG_MAX, ctx); 5277 if (!ret && 5278 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5279 ret = 1; 5280 iput(di_inode); 5281 if (ret) 5282 goto next_dir_inode; 5283 if (ctx->log_new_dentries) { 5284 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5285 GFP_NOFS); 5286 if (!new_dir_elem) { 5287 ret = -ENOMEM; 5288 goto next_dir_inode; 5289 } 5290 new_dir_elem->ino = di_key.objectid; 5291 list_add_tail(&new_dir_elem->list, &dir_list); 5292 } 5293 break; 5294 } 5295 if (i == nritems) { 5296 ret = btrfs_next_leaf(log, path); 5297 if (ret < 0) { 5298 goto next_dir_inode; 5299 } else if (ret > 0) { 5300 ret = 0; 5301 goto next_dir_inode; 5302 } 5303 goto process_leaf; 5304 } 5305 if (min_key.offset < (u64)-1) { 5306 min_key.offset++; 5307 goto again; 5308 } 5309 next_dir_inode: 5310 list_del(&dir_elem->list); 5311 kfree(dir_elem); 5312 } 5313 5314 btrfs_free_path(path); 5315 return ret; 5316 } 5317 5318 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5319 struct btrfs_inode *inode, 5320 struct btrfs_log_ctx *ctx) 5321 { 5322 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5323 int ret; 5324 struct btrfs_path *path; 5325 struct btrfs_key key; 5326 struct btrfs_root *root = inode->root; 5327 const u64 ino = btrfs_ino(inode); 5328 5329 path = btrfs_alloc_path(); 5330 if (!path) 5331 return -ENOMEM; 5332 path->skip_locking = 1; 5333 path->search_commit_root = 1; 5334 5335 key.objectid = ino; 5336 key.type = BTRFS_INODE_REF_KEY; 5337 key.offset = 0; 5338 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5339 if (ret < 0) 5340 goto out; 5341 5342 while (true) { 5343 struct extent_buffer *leaf = path->nodes[0]; 5344 int slot = path->slots[0]; 5345 u32 cur_offset = 0; 5346 u32 item_size; 5347 unsigned long ptr; 5348 5349 if (slot >= btrfs_header_nritems(leaf)) { 5350 ret = btrfs_next_leaf(root, path); 5351 if (ret < 0) 5352 goto out; 5353 else if (ret > 0) 5354 break; 5355 continue; 5356 } 5357 5358 btrfs_item_key_to_cpu(leaf, &key, slot); 5359 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5360 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5361 break; 5362 5363 item_size = btrfs_item_size_nr(leaf, slot); 5364 ptr = btrfs_item_ptr_offset(leaf, slot); 5365 while (cur_offset < item_size) { 5366 struct btrfs_key inode_key; 5367 struct inode *dir_inode; 5368 5369 inode_key.type = BTRFS_INODE_ITEM_KEY; 5370 inode_key.offset = 0; 5371 5372 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5373 struct btrfs_inode_extref *extref; 5374 5375 extref = (struct btrfs_inode_extref *) 5376 (ptr + cur_offset); 5377 inode_key.objectid = btrfs_inode_extref_parent( 5378 leaf, extref); 5379 cur_offset += sizeof(*extref); 5380 cur_offset += btrfs_inode_extref_name_len(leaf, 5381 extref); 5382 } else { 5383 inode_key.objectid = key.offset; 5384 cur_offset = item_size; 5385 } 5386 5387 dir_inode = btrfs_iget(fs_info->sb, &inode_key, 5388 root, NULL); 5389 /* If parent inode was deleted, skip it. 
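 * (Note: any btrfs_iget() error, not only -ENOENT, is treated here as
 * a deleted parent and skipped.)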
*/ 5390 if (IS_ERR(dir_inode)) 5391 continue; 5392 5393 if (ctx) 5394 ctx->log_new_dentries = false; 5395 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5396 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5397 if (!ret && 5398 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5399 ret = 1; 5400 if (!ret && ctx && ctx->log_new_dentries) 5401 ret = log_new_dir_dentries(trans, root, 5402 BTRFS_I(dir_inode), ctx); 5403 iput(dir_inode); 5404 if (ret) 5405 goto out; 5406 } 5407 path->slots[0]++; 5408 } 5409 ret = 0; 5410 out: 5411 btrfs_free_path(path); 5412 return ret; 5413 } 5414 5415 /* 5416 * helper function around btrfs_log_inode to make sure newly created 5417 * parent directories also end up in the log. Minimal inode-and-backref-only 5418 * logging is done for any parent directories that are older than 5419 * the last committed transaction 5420 */ 5421 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5422 struct btrfs_root *root, 5423 struct btrfs_inode *inode, 5424 struct dentry *parent, 5425 const loff_t start, 5426 const loff_t end, 5427 int exists_only, 5428 struct btrfs_log_ctx *ctx) 5429 { 5430 struct btrfs_fs_info *fs_info = root->fs_info; 5431 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 5432 struct super_block *sb; 5433 struct dentry *old_parent = NULL; 5434 int ret = 0; 5435 u64 last_committed = fs_info->last_trans_committed; 5436 bool log_dentries = false; 5437 struct btrfs_inode *orig_inode = inode; 5438 5439 sb = inode->vfs_inode.i_sb; 5440 5441 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5442 ret = 1; 5443 goto end_no_trans; 5444 } 5445 5446 /* 5447 * If the previous transaction commit didn't complete, we need to do 5448 * a full commit ourselves. 5449 */ 5450 if (fs_info->last_trans_log_full_commit > 5451 fs_info->last_trans_committed) { 5452 ret = 1; 5453 goto end_no_trans; 5454 } 5455 5456 if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { 5457 ret = 1; 5458 goto end_no_trans; 5459 } 5460 5461 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5462 last_committed); 5463 if (ret) 5464 goto end_no_trans; 5465 5466 if (btrfs_inode_in_log(inode, trans->transid)) { 5467 ret = BTRFS_NO_LOG_SYNC; 5468 goto end_no_trans; 5469 } 5470 5471 ret = start_log_trans(trans, root, ctx); 5472 if (ret) 5473 goto end_no_trans; 5474 5475 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5476 if (ret) 5477 goto end_trans; 5478 5479 /* 5480 * for a regular file, if its inode is already on disk, we don't 5481 * have to worry about the parents at all. This is because 5482 * we can use the last_unlink_trans field to record renames 5483 * and other fun in this file. 5484 */ 5485 if (S_ISREG(inode->vfs_inode.i_mode) && 5486 inode->generation <= last_committed && 5487 inode->last_unlink_trans <= last_committed) { 5488 ret = 0; 5489 goto end_trans; 5490 } 5491 5492 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 5493 log_dentries = true; 5494 5495 /* 5496 * On unlink we must make sure all our current and old parent directory 5497 * inodes are fully logged. This is to prevent leaving dangling 5498 * directory index entries in directories that were our parents but are 5499 * not anymore. Not doing this results in the old parent directory being 5500 * impossible to delete after log replay (rmdir will always fail with 5501 * error -ENOTEMPTY).
5502 * 5503 * Example 1: 5504 * 5505 * mkdir testdir 5506 * touch testdir/foo 5507 * ln testdir/foo testdir/bar 5508 * sync 5509 * unlink testdir/bar 5510 * xfs_io -c fsync testdir/foo 5511 * <power failure> 5512 * mount fs, triggers log replay 5513 * 5514 * If we don't log the parent directory (testdir), after log replay the 5515 * directory still has an entry pointing to the file inode using the bar 5516 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5517 * the file inode has a link count of 1. 5518 * 5519 * Example 2: 5520 * 5521 * mkdir testdir 5522 * touch foo 5523 * ln foo testdir/foo2 5524 * ln foo testdir/foo3 5525 * sync 5526 * unlink testdir/foo3 5527 * xfs_io -c fsync foo 5528 * <power failure> 5529 * mount fs, triggers log replay 5530 * 5531 * Similar to the first example, after log replay the parent directory 5532 * testdir still has an entry pointing to the file inode using the foo3 5533 * name, but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5534 * and has a link count of 2. 5535 */ 5536 if (inode->last_unlink_trans > last_committed) { 5537 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5538 if (ret) 5539 goto end_trans; 5540 } 5541 5542 while (1) { 5543 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5544 break; 5545 5546 inode = BTRFS_I(d_inode(parent)); 5547 if (root != inode->root) 5548 break; 5549 5550 if (inode->generation > last_committed) { 5551 ret = btrfs_log_inode(trans, root, inode, 5552 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5553 if (ret) 5554 goto end_trans; 5555 } 5556 if (IS_ROOT(parent)) 5557 break; 5558 5559 parent = dget_parent(parent); 5560 dput(old_parent); 5561 old_parent = parent; 5562 } 5563 if (log_dentries) 5564 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5565 else 5566 ret = 0; 5567 end_trans: 5568 dput(old_parent); 5569 if (ret < 0) { 5570 btrfs_set_log_full_commit(fs_info, trans); 5571 ret = 1; 5572 } 5573 5574 if (ret) 5575 btrfs_remove_log_ctx(root, ctx); 5576 btrfs_end_log_trans(root); 5577 end_no_trans: 5578 return ret; 5579 } 5580 5581 /* 5582 * it is not safe to log a dentry if the chunk root has added new 5583 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5584 * If this returns 1, you must commit the transaction to safely get your 5585 * data on disk.
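 *
 * (A simplified sketch of the expected caller flow, assuming the
 * btrfs_sync_file() fsync path: call btrfs_log_dentry_safe(); on a
 * return of 0, sync the log with btrfs_sync_log(); otherwise fall
 * back to btrfs_commit_transaction().)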
5586 */ 5587 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5588 struct btrfs_root *root, struct dentry *dentry, 5589 const loff_t start, 5590 const loff_t end, 5591 struct btrfs_log_ctx *ctx) 5592 { 5593 struct dentry *parent = dget_parent(dentry); 5594 int ret; 5595 5596 ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), 5597 parent, start, end, 0, ctx); 5598 dput(parent); 5599 5600 return ret; 5601 } 5602 5603 /* 5604 * should be called during mount to recover and replay any log trees 5605 * from the FS 5606 */ 5607 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5608 { 5609 int ret; 5610 struct btrfs_path *path; 5611 struct btrfs_trans_handle *trans; 5612 struct btrfs_key key; 5613 struct btrfs_key found_key; 5614 struct btrfs_key tmp_key; 5615 struct btrfs_root *log; 5616 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5617 struct walk_control wc = { 5618 .process_func = process_one_buffer, 5619 .stage = 0, 5620 }; 5621 5622 path = btrfs_alloc_path(); 5623 if (!path) 5624 return -ENOMEM; 5625 5626 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5627 5628 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5629 if (IS_ERR(trans)) { 5630 ret = PTR_ERR(trans); 5631 goto error; 5632 } 5633 5634 wc.trans = trans; 5635 wc.pin = 1; 5636 5637 ret = walk_log_tree(trans, log_root_tree, &wc); 5638 if (ret) { 5639 btrfs_handle_fs_error(fs_info, ret, 5640 "Failed to pin buffers while recovering log root tree."); 5641 goto error; 5642 } 5643 5644 again: 5645 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5646 key.offset = (u64)-1; 5647 key.type = BTRFS_ROOT_ITEM_KEY; 5648 5649 while (1) { 5650 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5651 5652 if (ret < 0) { 5653 btrfs_handle_fs_error(fs_info, ret, 5654 "Couldn't find tree log root."); 5655 goto error; 5656 } 5657 if (ret > 0) { 5658 if (path->slots[0] == 0) 5659 break; 5660 path->slots[0]--; 5661 } 5662 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5663 path->slots[0]); 5664 btrfs_release_path(path); 5665 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5666 break; 5667 5668 log = btrfs_read_fs_root(log_root_tree, &found_key); 5669 if (IS_ERR(log)) { 5670 ret = PTR_ERR(log); 5671 btrfs_handle_fs_error(fs_info, ret, 5672 "Couldn't read tree log root."); 5673 goto error; 5674 } 5675 5676 tmp_key.objectid = found_key.offset; 5677 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5678 tmp_key.offset = (u64)-1; 5679 5680 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5681 if (IS_ERR(wc.replay_dest)) { 5682 ret = PTR_ERR(wc.replay_dest); 5683 free_extent_buffer(log->node); 5684 free_extent_buffer(log->commit_root); 5685 kfree(log); 5686 btrfs_handle_fs_error(fs_info, ret, 5687 "Couldn't read target root for tree log recovery."); 5688 goto error; 5689 } 5690 5691 wc.replay_dest->log_root = log; 5692 btrfs_record_root_in_trans(trans, wc.replay_dest); 5693 ret = walk_log_tree(trans, log, &wc); 5694 5695 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5696 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5697 path); 5698 } 5699 5700 key.offset = found_key.offset - 1; 5701 wc.replay_dest->log_root = NULL; 5702 free_extent_buffer(log->node); 5703 free_extent_buffer(log->commit_root); 5704 kfree(log); 5705 5706 if (ret) 5707 goto error; 5708 5709 if (found_key.offset == 0) 5710 break; 5711 } 5712 btrfs_release_path(path); 5713 5714 /* step one is to pin it all, step two is to replay just inodes */ 5715 if (wc.pin) { 5716 wc.pin = 0; 5717 wc.process_func = replay_one_buffer;
5718 wc.stage = LOG_WALK_REPLAY_INODES; 5719 goto again; 5720 } 5721 /* step three is to replay everything */ 5722 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5723 wc.stage++; 5724 goto again; 5725 } 5726 5727 btrfs_free_path(path); 5728 5729 /* step 4: commit the transaction, which also unpins the blocks */ 5730 ret = btrfs_commit_transaction(trans); 5731 if (ret) 5732 return ret; 5733 5734 free_extent_buffer(log_root_tree->node); 5735 log_root_tree->log_root = NULL; 5736 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5737 kfree(log_root_tree); 5738 5739 return 0; 5740 error: 5741 if (wc.trans) 5742 btrfs_end_transaction(wc.trans); 5743 btrfs_free_path(path); 5744 return ret; 5745 } 5746 5747 /* 5748 * there are some corner cases where we want to force a full 5749 * commit instead of allowing a directory to be logged. 5750 * 5751 * They revolve around files that were unlinked from the directory, and 5752 * this function updates the parent directory so that a full commit is 5753 * properly done if it is fsync'd later after the unlinks are done. 5754 * 5755 * Must be called before the unlink operations (updates to the subvolume tree, 5756 * inodes, etc) are done. 5757 */ 5758 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5759 struct btrfs_inode *dir, struct btrfs_inode *inode, 5760 int for_rename) 5761 { 5762 /* 5763 * when we're logging a file, if it hasn't been renamed 5764 * or unlinked, and its inode is fully committed on disk, 5765 * we don't have to worry about walking up the directory chain 5766 * to log its parents. 5767 * 5768 * So, we use the last_unlink_trans field to put this transid 5769 * into the file. When the file is logged we check it and 5770 * don't log the parents if the file is fully on disk. 5771 */ 5772 mutex_lock(&inode->log_mutex); 5773 inode->last_unlink_trans = trans->transid; 5774 mutex_unlock(&inode->log_mutex); 5775 5776 /* 5777 * if this directory was already logged, any new 5778 * names for this file/dir will get recorded 5779 */ 5780 smp_mb(); 5781 if (dir->logged_trans == trans->transid) 5782 return; 5783 5784 /* 5785 * if the inode we're about to unlink was logged, 5786 * the log will be properly updated for any new names 5787 */ 5788 if (inode->logged_trans == trans->transid) 5789 return; 5790 5791 /* 5792 * when renaming files across directories, if the directory 5793 * we're unlinking from gets fsync'd later on, there's 5794 * no way to find the destination directory later and fsync it 5795 * properly. So, we have to be conservative and force commits 5796 * so the new name gets discovered. 5797 */ 5798 if (for_rename) 5799 goto record; 5800 5801 /* we can safely do the unlink without any special recording */ 5802 return; 5803 5804 record: 5805 mutex_lock(&dir->log_mutex); 5806 dir->last_unlink_trans = trans->transid; 5807 mutex_unlock(&dir->log_mutex); 5808 } 5809 5810 /* 5811 * Make sure that if someone attempts to fsync the parent directory of a deleted 5812 * snapshot, it ends up triggering a transaction commit. This is to guarantee 5813 * that after replaying the log tree of the parent directory's root we will not 5814 * see the snapshot anymore and at log replay time we will not see any log tree 5815 * corresponding to the deleted snapshot's root, which could lead to replaying 5816 * it after replaying the log tree of the parent directory (which would replay 5817 * the snapshot delete operation).
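 *
 * An illustrative sequence (not from the original comment):
 *
 *   btrfs subvolume snapshot /mnt /mnt/snap
 *   sync
 *   btrfs subvolume delete /mnt/snap
 *   xfs_io -c fsync /mnt
 *   <power fail>
 *
 * The fsync of the parent directory must fall back to a transaction
 * commit, which the last_unlink_trans update below guarantees.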
5818 * 5819 * Must be called before the actual snapshot destroy operation (updates to the 5820 * parent root and the tree of tree roots, etc) are done. 5821 */ 5822 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 5823 struct btrfs_inode *dir) 5824 { 5825 mutex_lock(&dir->log_mutex); 5826 dir->last_unlink_trans = trans->transid; 5827 mutex_unlock(&dir->log_mutex); 5828 } 5829 5830 /* 5831 * Call this after adding a new name for a file and it will properly 5832 * update the log to reflect the new name. 5833 * 5834 * It will return zero if all goes well, and it will return 1 if a 5835 * full transaction commit is required. 5836 */ 5837 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5838 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 5839 struct dentry *parent) 5840 { 5841 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5842 struct btrfs_root *root = inode->root; 5843 5844 /* 5845 * this will force the logging code to walk the dentry chain 5846 * up for the file 5847 */ 5848 if (S_ISREG(inode->vfs_inode.i_mode)) 5849 inode->last_unlink_trans = trans->transid; 5850 5851 /* 5852 * if this inode hasn't been logged and the directory we're renaming it 5853 * from hasn't been logged, we don't need to log it 5854 */ 5855 if (inode->logged_trans <= fs_info->last_trans_committed && 5856 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) 5857 return 0; 5858 5859 return btrfs_log_inode_parent(trans, root, inode, parent, 0, 5860 LLONG_MAX, 1, NULL); 5861 } 5862 5863