/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  With the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */
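/*
 * Editor's illustrative sketch (not part of the original file): a
 * stand-alone user-space program reproducing the rename sequence from
 * trouble case 1 above, assuming it runs on a freshly mounted btrfs.
 * Kept inside #if 0 so it cannot affect the kernel build.
 */
#if 0
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd;

        mkdir("foo", 0755);
        mkdir("foo2", 0755);
        mkdir("foo/some_dir", 0755);
        sync();                         /* the "normal commit" step */
        rename("foo/some_dir", "foo2/some_dir");
        mkdir("foo/some_dir", 0755);    /* recreate the old name */
        fd = open("foo/some_dir/some_file", O_CREAT | O_WRONLY, 0644);
        if (fd < 0)
                return 1;
        fsync(fd);      /* must not lose foo2/some_dir after a crash */
        close(fd);
        return 0;
}
#endif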
/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3
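/*
 * Editor's illustrative sketch (not part of the original file): roughly
 * how a replay driver can advance through the stages above.  The names
 * walk_log_tree() and replay_one_buffer() are assumptions here; they are
 * not shown in this excerpt, and only the stage constants and the
 * walk_control fields (defined below) are authoritative.
 */
#if 0
        struct walk_control wc = {
                .process_func = process_one_buffer,
                .stage = LOG_WALK_PIN_ONLY,
                .pin = 1,
        };

again:
        /* walk every log tree found with the current stage ... */

        if (wc.pin) {
                /* stage 0 done: stop pinning, start replaying inodes */
                wc.pin = 0;
                wc.process_func = replay_one_buffer;
                wc.stage = LOG_WALK_REPLAY_INODES;
                goto again;
        }
        if (wc.stage < LOG_WALK_REPLAY_ALL) {
                wc.stage++;     /* then dir indexes, then everything else */
                goto again;
        }
#endif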
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, struct btrfs_inode *inode,
                           int inode_only,
                           const loff_t start,
                           const loff_t end,
                           struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct btrfs_root *log,
                                       struct btrfs_path *path,
                                       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it
 * is using in ram, once to create all the inodes logged in the tree, and
 * once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_log_ctx *ctx)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;

        mutex_lock(&root->log_mutex);

        if (root->log_root) {
                if (btrfs_need_log_full_commit(fs_info, trans)) {
                        ret = -EAGAIN;
                        goto out;
                }

                if (!root->log_start_pid) {
                        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                        root->log_start_pid = current->pid;
                } else if (root->log_start_pid != current->pid) {
                        set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                }
        } else {
                mutex_lock(&fs_info->tree_log_mutex);
                if (!fs_info->log_root_tree)
                        ret = btrfs_init_log_root_tree(trans, fs_info);
                mutex_unlock(&fs_info->tree_log_mutex);
                if (ret)
                        goto out;

                ret = btrfs_add_log_tree(trans, root);
                if (ret)
                        goto out;

                clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                root->log_start_pid = current->pid;
        }

        atomic_inc(&root->log_batch);
        atomic_inc(&root->log_writers);
        if (ctx) {
                int index = root->log_transid % 2;
                list_add_tail(&ctx->list, &root->log_ctxs[index]);
                ctx->log_transid = root->log_transid;
        }

out:
        mutex_unlock(&root->log_mutex);
        return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
        int ret = -ENOENT;

        smp_mb();
        if (!root->log_root)
                return -ENOENT;

        mutex_lock(&root->log_mutex);
        if (root->log_root) {
                ret = 0;
                atomic_inc(&root->log_writers);
        }
        mutex_unlock(&root->log_mutex);
        return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
        int ret = -ENOENT;

        mutex_lock(&root->log_mutex);
        atomic_inc(&root->log_writers);
        mutex_unlock(&root->log_mutex);
        return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
        if (atomic_dec_and_test(&root->log_writers)) {
                /*
                 * Implicit memory barrier after atomic_dec_and_test
                 */
                if (waitqueue_active(&root->log_writer_wait))
                        wake_up(&root->log_writer_wait);
        }
}
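/*
 * Editor's illustrative sketch (not part of the original file): the
 * writer-count protocol around the helpers above.  A task that wants to
 * add items to an existing log joins it, does its work, then drops its
 * writer reference so a pending log sync can proceed.
 */
#if 0
        if (join_running_log_trans(root) == 0) {
                /* ... insert or delete items in root->log_root ... */
                btrfs_end_log_trans(root);
        }

        /* or, to hold up log syncs unconditionally: */
        btrfs_pin_log_trans(root);
        /* ... */
        btrfs_end_log_trans(root);
#endif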
/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
        /* should we free the extent on disk when done?  This is used
         * at transaction commit time while freeing a log tree
         */
        int free;

        /* should we write out the extent buffer?  This is used
         * while flushing the log tree to disk during a sync
         */
        int write;

        /* should we wait for the extent buffer io to finish?  Also used
         * while flushing the log tree to disk for a sync
         */
        int wait;

        /* pin only walk, we record which extents on disk belong to the
         * log trees
         */
        int pin;

        /* what stage of the replay code we're currently in */
        int stage;

        /* the root we are currently replaying */
        struct btrfs_root *replay_dest;

        /* the trans handle for the current replay */
        struct btrfs_trans_handle *trans;

        /* the function that gets used to process blocks we find in the
         * tree.  Note the extent_buffer might not be up to date when it is
         * passed in, and it must be checked or read if you need the data
         * inside it
         */
        int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
                            struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
                              struct extent_buffer *eb,
                              struct walk_control *wc, u64 gen)
{
        struct btrfs_fs_info *fs_info = log->fs_info;
        int ret = 0;

        /*
         * If this fs is mixed then we need to be able to process the leaves to
         * pin down any logged extents, so we have to read the block.
         */
        if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
                ret = btrfs_read_buffer(eb, gen);
                if (ret)
                        return ret;
        }

        if (wc->pin)
                ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
                                                      eb->len);

        if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
                if (wc->pin && btrfs_header_level(eb) == 0)
                        ret = btrfs_exclude_logged_extents(fs_info, eb);
                if (wc->write)
                        btrfs_write_tree_block(eb);
                if (wc->wait)
                        btrfs_wait_tree_block_writeback(eb);
        }
        return ret;
}
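/*
 * Editor's illustrative sketch (not part of the original file): two
 * typical walk_control setups driven through process_one_buffer(), one
 * for the pin-only replay stage and one for freeing a log tree at
 * transaction commit time.  The callers implied here are assumptions;
 * only the struct fields above are authoritative.
 */
#if 0
        /* replay, stage 0: just pin down the extents the log uses */
        struct walk_control pin_wc = {
                .pin = 1,
                .process_func = process_one_buffer,
                .stage = LOG_WALK_PIN_ONLY,
        };

        /* commit time: walk the log tree and free its blocks */
        struct walk_control free_wc = {
                .free = 1,
                .process_func = process_one_buffer,
        };
#endif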
/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct btrfs_path *path,
                                   struct extent_buffer *eb, int slot,
                                   struct btrfs_key *key)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
        u32 item_size;
        u64 saved_i_size = 0;
        int save_old_i_size = 0;
        unsigned long src_ptr;
        unsigned long dst_ptr;
        int overwrite_root = 0;
        bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                overwrite_root = 1;

        item_size = btrfs_item_size_nr(eb, slot);
        src_ptr = btrfs_item_ptr_offset(eb, slot);

        /* look for the key in the destination tree */
        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
        if (ret < 0)
                return ret;

        if (ret == 0) {
                char *src_copy;
                char *dst_copy;
                u32 dst_size = btrfs_item_size_nr(path->nodes[0],
                                                  path->slots[0]);
                if (dst_size != item_size)
                        goto insert;

                if (item_size == 0) {
                        btrfs_release_path(path);
                        return 0;
                }
                dst_copy = kmalloc(item_size, GFP_NOFS);
                src_copy = kmalloc(item_size, GFP_NOFS);
                if (!dst_copy || !src_copy) {
                        btrfs_release_path(path);
                        kfree(dst_copy);
                        kfree(src_copy);
                        return -ENOMEM;
                }

                read_extent_buffer(eb, src_copy, src_ptr, item_size);

                dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
                read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
                                   item_size);
                ret = memcmp(dst_copy, src_copy, item_size);

                kfree(dst_copy);
                kfree(src_copy);
                /*
                 * they have the same contents, just return, this saves
                 * us from cowing blocks in the destination tree and doing
                 * extra writes that may not have been done by a previous
                 * sync
                 */
                if (ret == 0) {
                        btrfs_release_path(path);
                        return 0;
                }

                /*
                 * We need to load the old nbytes into the inode so when we
                 * replay the extents we've logged we get the right nbytes.
                 */
                if (inode_item) {
                        struct btrfs_inode_item *item;
                        u64 nbytes;
                        u32 mode;

                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                              struct btrfs_inode_item);
                        nbytes = btrfs_inode_nbytes(path->nodes[0], item);
                        item = btrfs_item_ptr(eb, slot,
                                              struct btrfs_inode_item);
                        btrfs_set_inode_nbytes(eb, item, nbytes);

                        /*
                         * If this is a directory we need to reset the i_size to
                         * 0 so that we can set it up properly when replaying
                         * the rest of the items in this log.
                         */
                        mode = btrfs_inode_mode(eb, item);
                        if (S_ISDIR(mode))
                                btrfs_set_inode_size(eb, item, 0);
                }
        } else if (inode_item) {
                struct btrfs_inode_item *item;
                u32 mode;

                /*
                 * New inode, set nbytes to 0 so that the nbytes comes out
                 * properly when we replay the extents.
                 */
                item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
                btrfs_set_inode_nbytes(eb, item, 0);

                /*
                 * If this is a directory we need to reset the i_size to 0 so
                 * that we can set it up properly when replaying the rest of
                 * the items in this log.
                 */
                mode = btrfs_inode_mode(eb, item);
                if (S_ISDIR(mode))
                        btrfs_set_inode_size(eb, item, 0);
        }
insert:
        btrfs_release_path(path);
        /* try to insert the key into the destination tree */
        path->skip_release_on_error = 1;
        ret = btrfs_insert_empty_item(trans, root, path,
                                      key, item_size);
        path->skip_release_on_error = 0;

        /* make sure any existing item is the correct size */
        if (ret == -EEXIST || ret == -EOVERFLOW) {
                u32 found_size;
                found_size = btrfs_item_size_nr(path->nodes[0],
                                                path->slots[0]);
                if (found_size > item_size)
                        btrfs_truncate_item(fs_info, path, item_size, 1);
                else if (found_size < item_size)
                        btrfs_extend_item(fs_info, path,
                                          item_size - found_size);
        } else if (ret) {
                return ret;
        }
        dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
                                        path->slots[0]);

        /* don't overwrite an existing inode if the generation number
         * was logged as zero.  This is done when the tree logging code
         * is just logging an inode to make sure it exists after recovery.
         *
         * Also, don't overwrite i_size on directories during replay.
         * log replay inserts and removes directory items based on the
         * state of the tree found in the subvolume, and i_size is modified
         * as it goes
         */
        if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
                struct btrfs_inode_item *src_item;
                struct btrfs_inode_item *dst_item;

                src_item = (struct btrfs_inode_item *)src_ptr;
                dst_item = (struct btrfs_inode_item *)dst_ptr;

                if (btrfs_inode_generation(eb, src_item) == 0) {
                        struct extent_buffer *dst_eb = path->nodes[0];
                        const u64 ino_size = btrfs_inode_size(eb, src_item);

                        /*
                         * For regular files an ino_size == 0 is used only when
                         * logging that an inode exists, as part of a directory
                         * fsync, and the inode wasn't fsynced before.  In this
                         * case don't set the size of the inode in the fs/subvol
                         * tree, otherwise we would be throwing valid data away.
                         */
                        if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
                            S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
                            ino_size != 0) {
                                struct btrfs_map_token token;

                                btrfs_init_map_token(&token);
                                btrfs_set_token_inode_size(dst_eb, dst_item,
                                                           ino_size, &token);
                        }
                        goto no_copy;
                }

                if (overwrite_root &&
                    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
                    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
                        save_old_i_size = 1;
                        saved_i_size = btrfs_inode_size(path->nodes[0],
                                                        dst_item);
                }
        }

        copy_extent_buffer(path->nodes[0], eb, dst_ptr,
                           src_ptr, item_size);

        if (save_old_i_size) {
                struct btrfs_inode_item *dst_item;
                dst_item = (struct btrfs_inode_item *)dst_ptr;
                btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
        }

        /* make sure the generation is filled in */
        if (key->type == BTRFS_INODE_ITEM_KEY) {
                struct btrfs_inode_item *dst_item;
                dst_item = (struct btrfs_inode_item *)dst_ptr;
                if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
                        btrfs_set_inode_generation(path->nodes[0], dst_item,
                                                   trans->transid);
                }
        }
no_copy:
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_release_path(path);
        return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
                                             u64 objectid)
{
        struct btrfs_key key;
        struct inode *inode;

        key.objectid = objectid;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
        inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
        if (IS_ERR(inode)) {
                inode = NULL;
        } else if (is_bad_inode(inode)) {
                iput(inode);
                inode = NULL;
        }
        return inode;
}
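/*
 * Editor's illustrative sketch (not part of the original file): the
 * usage pattern for read_one_inode() that the replay helpers below
 * follow.  NULL means "not found or bad inode", so callers map it to an
 * error, and a successful lookup must be balanced with iput().
 */
#if 0
        struct inode *inode;

        inode = read_one_inode(root, objectid);
        if (!inode)
                return -EIO;    /* replay helpers treat this as fatal */
        /* ... use the inode ... */
        iput(inode);
#endif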
/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      struct extent_buffer *eb, int slot,
                                      struct btrfs_key *key)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int found_type;
        u64 extent_end;
        u64 start = key->offset;
        u64 nbytes = 0;
        struct btrfs_file_extent_item *item;
        struct inode *inode = NULL;
        unsigned long size;
        int ret = 0;

        item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
        found_type = btrfs_file_extent_type(eb, item);

        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                nbytes = btrfs_file_extent_num_bytes(eb, item);
                extent_end = start + nbytes;

                /*
                 * We don't add to the inodes nbytes if we are prealloc or a
                 * hole.
                 */
                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
                        nbytes = 0;
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                size = btrfs_file_extent_inline_len(eb, slot, item);
                nbytes = btrfs_file_extent_ram_bytes(eb, item);
                extent_end = ALIGN(start + size,
                                   fs_info->sectorsize);
        } else {
                ret = 0;
                goto out;
        }

        inode = read_one_inode(root, key->objectid);
        if (!inode) {
                ret = -EIO;
                goto out;
        }

        /*
         * first check to see if we already have this extent in the
         * file.  This must be done before the btrfs_drop_extents run
         * so we don't try to drop this extent.
         */
        ret = btrfs_lookup_file_extent(trans, root, path,
                                       btrfs_ino(BTRFS_I(inode)), start, 0);

        if (ret == 0 &&
            (found_type == BTRFS_FILE_EXTENT_REG ||
             found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
                struct btrfs_file_extent_item cmp1;
                struct btrfs_file_extent_item cmp2;
                struct btrfs_file_extent_item *existing;
                struct extent_buffer *leaf;

                leaf = path->nodes[0];
                existing = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_file_extent_item);

                read_extent_buffer(eb, &cmp1, (unsigned long)item,
                                   sizeof(cmp1));
                read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
                                   sizeof(cmp2));

                /*
                 * we already have a pointer to this exact extent,
                 * we don't have to do anything
                 */
                if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
                        btrfs_release_path(path);
                        goto out;
                }
        }
        btrfs_release_path(path);

        /* drop any overlapping extents */
        ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
        if (ret)
                goto out;

        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                u64 offset;
                unsigned long dest_offset;
                struct btrfs_key ins;

                if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
                    btrfs_fs_incompat(fs_info, NO_HOLES))
                        goto update_inode;

                ret = btrfs_insert_empty_item(trans, root, path, key,
                                              sizeof(*item));
                if (ret)
                        goto out;
                dest_offset = btrfs_item_ptr_offset(path->nodes[0],
                                                    path->slots[0]);
                copy_extent_buffer(path->nodes[0], eb, dest_offset,
                                   (unsigned long)item, sizeof(*item));

                ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
                ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
                ins.type = BTRFS_EXTENT_ITEM_KEY;
                offset = key->offset - btrfs_file_extent_offset(eb, item);

                /*
                 * Manually record the dirty extent, as here we did a shallow
                 * file extent item copy and skipped the normal backref update,
                 * modifying the extent tree all by ourselves.  So we need to
                 * manually record the dirty extent for qgroup, as the owner
                 * of the file extent changed from the log tree (doesn't
                 * affect qgroup) to the fs/file tree (affects qgroup).
                 */
                ret = btrfs_qgroup_trace_extent(trans, fs_info,
                                btrfs_file_extent_disk_bytenr(eb, item),
                                btrfs_file_extent_disk_num_bytes(eb, item),
                                GFP_NOFS);
                if (ret < 0)
                        goto out;

                if (ins.objectid > 0) {
                        u64 csum_start;
                        u64 csum_end;
                        LIST_HEAD(ordered_sums);
                        /*
                         * is this extent already allocated in the extent
                         * allocation tree?  If so, just add a reference
                         */
                        ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
                                                       ins.offset);
                        if (ret == 0) {
                                ret = btrfs_inc_extent_ref(trans, root,
                                                ins.objectid, ins.offset,
                                                0, root->root_key.objectid,
                                                key->objectid, offset);
                                if (ret)
                                        goto out;
                        } else {
                                /*
                                 * insert the extent pointer in the extent
                                 * allocation tree
                                 */
                                ret = btrfs_alloc_logged_file_extent(trans,
                                                fs_info,
                                                root->root_key.objectid,
                                                key->objectid, offset, &ins);
                                if (ret)
                                        goto out;
                        }
                        btrfs_release_path(path);

                        if (btrfs_file_extent_compression(eb, item)) {
                                csum_start = ins.objectid;
                                csum_end = csum_start + ins.offset;
                        } else {
                                csum_start = ins.objectid +
                                        btrfs_file_extent_offset(eb, item);
                                csum_end = csum_start +
                                        btrfs_file_extent_num_bytes(eb, item);
                        }

                        ret = btrfs_lookup_csums_range(root->log_root,
                                                csum_start, csum_end - 1,
                                                &ordered_sums, 0);
                        if (ret)
                                goto out;
                        /*
                         * Now delete all existing csums in the csum root that
                         * cover our range.  We do this because we can have an
                         * extent that is completely referenced by one file
                         * extent item and partially referenced by another
                         * file extent item (like after using the clone or
                         * extent_same ioctls).  In this case if we end up doing
                         * the replay of the one that partially references the
                         * extent first, and we do not do the csum deletion
                         * below, we can get 2 csum items in the csum tree that
                         * overlap each other.  For example, imagine our log has
                         * the two following file extent items:
                         *
                         * key (257 EXTENT_DATA 409600)
                         *     extent data disk byte 12845056 nr 102400
                         *     extent data offset 20480 nr 20480 ram 102400
                         *
                         * key (257 EXTENT_DATA 819200)
                         *     extent data disk byte 12845056 nr 102400
                         *     extent data offset 0 nr 102400 ram 102400
                         *
                         * Where the second one fully references the 100K extent
                         * that starts at disk byte 12845056, and the log tree
                         * has a single csum item that covers the entire range
                         * of the extent:
                         *
                         * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
                         *
                         * After the first file extent item is replayed, the
                         * csum tree gets the following csum item:
                         *
                         * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
                         *
                         * Which covers the 20K sub-range starting at offset 20K
                         * of our extent.  Now when we replay the second file
                         * extent item, if we do not delete existing csum items
                         * that cover any of its blocks, we end up getting two
                         * csum items in our csum tree that overlap each other:
                         *
                         * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
                         * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
                         *
                         * Which is a problem, because after this anyone trying
                         * to look up the checksum of any block of our extent
                         * starting at an offset of 40K or higher will end up
                         * looking at the second csum item only, which does not
                         * contain the checksum for any block starting at
                         * offset 40K or higher of our extent.
                         */
                        while (!list_empty(&ordered_sums)) {
                                struct btrfs_ordered_sum *sums;
                                sums = list_entry(ordered_sums.next,
                                                struct btrfs_ordered_sum,
                                                list);
                                if (!ret)
                                        ret = btrfs_del_csums(trans, fs_info,
                                                              sums->bytenr,
                                                              sums->len);
                                if (!ret)
                                        ret = btrfs_csum_file_blocks(trans,
                                                fs_info->csum_root, sums);
                                list_del(&sums->list);
                                kfree(sums);
                        }
                        if (ret)
                                goto out;
                } else {
                        btrfs_release_path(path);
                }
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                /* inline extents are easy, we just overwrite them */
                ret = overwrite_item(trans, root, path, eb, slot, key);
                if (ret)
                        goto out;
        }

        inode_add_bytes(inode, nbytes);
update_inode:
        ret = btrfs_update_inode(trans, root, inode);
out:
        if (inode)
                iput(inode);
        return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      struct btrfs_inode *dir,
                                      struct btrfs_dir_item *di)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct inode *inode;
        char *name;
        int name_len;
        struct extent_buffer *leaf;
        struct btrfs_key location;
        int ret;

        leaf = path->nodes[0];

        btrfs_dir_item_key_to_cpu(leaf, di, &location);
        name_len = btrfs_dir_name_len(leaf, di);
        name = kmalloc(name_len, GFP_NOFS);
        if (!name)
                return -ENOMEM;

        read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
        btrfs_release_path(path);

        inode = read_one_inode(root, location.objectid);
        if (!inode) {
                ret = -EIO;
                goto out;
        }

        ret = link_to_fixup_dir(trans, root, path, location.objectid);
        if (ret)
                goto out;

        ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
                                 name_len);
        if (ret)
                goto out;
        else
                ret = btrfs_run_delayed_items(trans, fs_info);
out:
        kfree(name);
        iput(inode);
        return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 u64 dirid, u64 objectid, u64 index,
                                 const char *name, int name_len)
{
        struct btrfs_dir_item *di;
        struct btrfs_key location;
        int match = 0;

        di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
                                         index, name, name_len, 0);
        if (di && !IS_ERR(di)) {
                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
                if (location.objectid != objectid)
                        goto out;
        } else
                goto out;
        btrfs_release_path(path);

        di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
        if (di && !IS_ERR(di)) {
                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
                if (location.objectid != objectid)
                        goto out;
        } else
                goto out;
        match = 1;
out:
        btrfs_release_path(path);
        return match;
}
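/*
 * Editor's illustrative sketch (not part of the original file): how ref
 * replay (add_inode_ref() below) uses inode_in_dir().  Both the dir
 * index key and the dir item key must point at the inode for the link
 * to count as already present.
 */
#if 0
        if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
                          btrfs_ino(BTRFS_I(inode)), ref_index,
                          name, namelen)) {
                /* resolve conflicting names, then btrfs_add_link() */
        }
#endif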
/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
                                   struct btrfs_key *key,
                                   u64 ref_objectid,
                                   const char *name, int namelen)
{
        struct btrfs_path *path;
        struct btrfs_inode_ref *ref;
        unsigned long ptr;
        unsigned long ptr_end;
        unsigned long name_ptr;
        int found_name_len;
        int item_size;
        int ret;
        int match = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
        if (ret != 0)
                goto out;

        ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

        if (key->type == BTRFS_INODE_EXTREF_KEY) {
                if (btrfs_find_name_in_ext_backref(path->nodes[0],
                                                   path->slots[0],
                                                   ref_objectid,
                                                   name, namelen, NULL))
                        match = 1;

                goto out;
        }

        item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
        ptr_end = ptr + item_size;
        while (ptr < ptr_end) {
                ref = (struct btrfs_inode_ref *)ptr;
                found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
                if (found_name_len == namelen) {
                        name_ptr = (unsigned long)(ref + 1);
                        ret = memcmp_extent_buffer(path->nodes[0], name,
                                                   name_ptr, namelen);
                        if (ret == 0) {
                                match = 1;
                                goto out;
                        }
                }
                ptr = (unsigned long)(ref + 1) + found_name_len;
        }
out:
        btrfs_free_path(path);
        return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  struct btrfs_root *log_root,
                                  struct btrfs_inode *dir,
                                  struct btrfs_inode *inode,
                                  u64 inode_objectid, u64 parent_objectid,
                                  u64 ref_index, char *name, int namelen,
                                  int *search_done)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
        char *victim_name;
        int victim_name_len;
        struct extent_buffer *leaf;
        struct btrfs_dir_item *di;
        struct btrfs_key search_key;
        struct btrfs_inode_extref *extref;

again:
        /* Search old style refs */
        search_key.objectid = inode_objectid;
        search_key.type = BTRFS_INODE_REF_KEY;
        search_key.offset = parent_objectid;
        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
        if (ret == 0) {
                struct btrfs_inode_ref *victim_ref;
                unsigned long ptr;
                unsigned long ptr_end;

                leaf = path->nodes[0];

                /* are we trying to overwrite a back ref for the root directory
                 * if so, just jump out, we're done
                 */
                if (search_key.objectid == search_key.offset)
                        return 1;

                /* check all the names in this back reference to see
                 * if they are in the log.  if so, we allow them to stay
                 * otherwise they must be unlinked as a conflict
                 */
                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
                ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
                while (ptr < ptr_end) {
                        victim_ref = (struct btrfs_inode_ref *)ptr;
                        victim_name_len = btrfs_inode_ref_name_len(leaf,
                                                                   victim_ref);
                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
                        if (!victim_name)
                                return -ENOMEM;

                        read_extent_buffer(leaf, victim_name,
                                           (unsigned long)(victim_ref + 1),
                                           victim_name_len);

                        if (!backref_in_log(log_root, &search_key,
                                            parent_objectid,
                                            victim_name,
                                            victim_name_len)) {
                                inc_nlink(&inode->vfs_inode);
                                btrfs_release_path(path);

                                ret = btrfs_unlink_inode(trans, root, dir, inode,
                                                victim_name, victim_name_len);
                                kfree(victim_name);
                                if (ret)
                                        return ret;
                                ret = btrfs_run_delayed_items(trans, fs_info);
                                if (ret)
                                        return ret;
                                *search_done = 1;
                                goto again;
                        }
                        kfree(victim_name);

                        ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
                }

                /*
                 * NOTE: we have searched root tree and checked the
                 * corresponding ref, it does not need to check again.
                 */
                *search_done = 1;
        }
        btrfs_release_path(path);

        /* Same search but for extended refs */
        extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
                                           inode_objectid, parent_objectid, 0,
                                           0);
        if (!IS_ERR_OR_NULL(extref)) {
                u32 item_size;
                u32 cur_offset = 0;
                unsigned long base;
                struct inode *victim_parent;

                leaf = path->nodes[0];

                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                base = btrfs_item_ptr_offset(leaf, path->slots[0]);

                while (cur_offset < item_size) {
                        extref = (struct btrfs_inode_extref *)(base + cur_offset);

                        victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

                        if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
                                goto next;

                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
                        if (!victim_name)
                                return -ENOMEM;
                        read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
                                           victim_name_len);

                        search_key.objectid = inode_objectid;
                        search_key.type = BTRFS_INODE_EXTREF_KEY;
                        search_key.offset = btrfs_extref_hash(parent_objectid,
                                                              victim_name,
                                                              victim_name_len);
                        ret = 0;
                        if (!backref_in_log(log_root, &search_key,
                                            parent_objectid, victim_name,
                                            victim_name_len)) {
                                ret = -ENOENT;
                                victim_parent = read_one_inode(root,
                                                parent_objectid);
                                if (victim_parent) {
                                        inc_nlink(&inode->vfs_inode);
                                        btrfs_release_path(path);

                                        ret = btrfs_unlink_inode(trans, root,
                                                        BTRFS_I(victim_parent),
                                                        inode,
                                                        victim_name,
                                                        victim_name_len);
                                        if (!ret)
                                                ret = btrfs_run_delayed_items(
                                                                trans,
                                                                fs_info);
                                }
                                iput(victim_parent);
                                kfree(victim_name);
                                if (ret)
                                        return ret;
                                *search_done = 1;
                                goto again;
                        }
                        kfree(victim_name);
next:
                        cur_offset += victim_name_len + sizeof(*extref);
                }
                *search_done = 1;
        }
        btrfs_release_path(path);

        /* look for a conflicting sequence number */
        di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
                                         ref_index, name, namelen, 0);
        if (di && !IS_ERR(di)) {
                ret = drop_one_dir_item(trans, root, path, dir, di);
                if (ret)
                        return ret;
        }
        btrfs_release_path(path);

        /* look for a conflicting name */
        di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
                                   name, namelen, 0);
        if (di && !IS_ERR(di)) {
                ret = drop_one_dir_item(trans, root, path, dir, di);
                if (ret)
                        return ret;
        }
        btrfs_release_path(path);

        return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
                             u32 *namelen, char **name, u64 *index,
                             u64 *parent_objectid)
{
        struct btrfs_inode_extref *extref;

        extref = (struct btrfs_inode_extref *)ref_ptr;

        *namelen = btrfs_inode_extref_name_len(eb, extref);
        *name = kmalloc(*namelen, GFP_NOFS);
        if (*name == NULL)
                return -ENOMEM;

        read_extent_buffer(eb, *name, (unsigned long)&extref->name,
                           *namelen);

        if (index)
                *index = btrfs_inode_extref_index(eb, extref);
        if (parent_objectid)
                *parent_objectid = btrfs_inode_extref_parent(eb, extref);

        return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
                          u32 *namelen, char **name, u64 *index)
{
        struct btrfs_inode_ref *ref;

        ref = (struct btrfs_inode_ref *)ref_ptr;

        *namelen = btrfs_inode_ref_name_len(eb, ref);
        *name = kmalloc(*namelen, GFP_NOFS);
        if (*name == NULL)
                return -ENOMEM;

        read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

        if (index)
                *index = btrfs_inode_ref_index(eb, ref);

        return 0;
}
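/*
 * Editor's illustrative sketch (not part of the original file): walking
 * the packed names of an INODE_REF item with ref_get_fields(), the same
 * pattern the replay code below uses.  Each entry is a btrfs_inode_ref
 * header followed by the name bytes; the callee kmallocs *name.
 */
#if 0
        while (ref_ptr < ref_end) {
                char *name;
                u32 namelen;
                u64 index;

                if (ref_get_fields(eb, ref_ptr, &namelen, &name, &index))
                        break;          /* -ENOMEM */
                /* ... use name/namelen/index ... */
                kfree(name);
                ref_ptr += sizeof(struct btrfs_inode_ref) + namelen;
        }
#endif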
/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists).  For any name that is not in the inode reference item from the
 * log tree, do a proper unlink of that name (that is, remove its entry from
 * the inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_inode *inode,
                                 struct extent_buffer *log_eb,
                                 int log_slot,
                                 struct btrfs_key *key)
{
        int ret;
        unsigned long ref_ptr;
        unsigned long ref_end;
        struct extent_buffer *eb;

again:
        btrfs_release_path(path);
        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
        if (ret > 0) {
                ret = 0;
                goto out;
        }
        if (ret < 0)
                goto out;

        eb = path->nodes[0];
        ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
        ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
        while (ref_ptr < ref_end) {
                char *name = NULL;
                int namelen;
                u64 parent_id;

                if (key->type == BTRFS_INODE_EXTREF_KEY) {
                        ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
                                                NULL, &parent_id);
                } else {
                        parent_id = key->offset;
                        ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
                                             NULL);
                }
                if (ret)
                        goto out;

                if (key->type == BTRFS_INODE_EXTREF_KEY)
                        ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
                                                             parent_id, name,
                                                             namelen, NULL);
                else
                        ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
                                                         namelen, NULL);

                if (!ret) {
                        struct inode *dir;

                        btrfs_release_path(path);
                        dir = read_one_inode(root, parent_id);
                        if (!dir) {
                                ret = -ENOENT;
                                kfree(name);
                                goto out;
                        }
                        ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
                                                 inode, name, namelen);
                        kfree(name);
                        iput(dir);
                        if (ret)
                                goto out;
                        goto again;
                }

                kfree(name);
                ref_ptr += namelen;
                if (key->type == BTRFS_INODE_EXTREF_KEY)
                        ref_ptr += sizeof(struct btrfs_inode_extref);
                else
                        ref_ptr += sizeof(struct btrfs_inode_ref);
        }
        ret = 0;
out:
        btrfs_release_path(path);
        return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_root *log,
                                  struct btrfs_path *path,
                                  struct extent_buffer *eb, int slot,
                                  struct btrfs_key *key)
{
        struct inode *dir = NULL;
        struct inode *inode = NULL;
        unsigned long ref_ptr;
        unsigned long ref_end;
        char *name = NULL;
        int namelen;
        int ret;
        int search_done = 0;
        int log_ref_ver = 0;
        u64 parent_objectid;
        u64 inode_objectid;
        u64 ref_index = 0;
        int ref_struct_size;

        ref_ptr = btrfs_item_ptr_offset(eb, slot);
        ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

        if (key->type == BTRFS_INODE_EXTREF_KEY) {
                struct btrfs_inode_extref *r;

                ref_struct_size = sizeof(struct btrfs_inode_extref);
                log_ref_ver = 1;
                r = (struct btrfs_inode_extref *)ref_ptr;
                parent_objectid = btrfs_inode_extref_parent(eb, r);
        } else {
                ref_struct_size = sizeof(struct btrfs_inode_ref);
                parent_objectid = key->offset;
        }
        inode_objectid = key->objectid;

        /*
         * it is possible that we didn't log all the parent directories
         * for a given inode.  If we don't find the dir, just don't
         * copy the back ref in.  The link count fixup code will take
         * care of the rest
         */
        dir = read_one_inode(root, parent_objectid);
        if (!dir) {
                ret = -ENOENT;
                goto out;
        }

        inode = read_one_inode(root, inode_objectid);
        if (!inode) {
                ret = -EIO;
                goto out;
        }

        while (ref_ptr < ref_end) {
                if (log_ref_ver) {
                        ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
                                                &ref_index, &parent_objectid);
                        /*
                         * parent object can change from one array
                         * item to another.
                         */
                        if (!dir)
                                dir = read_one_inode(root, parent_objectid);
                        if (!dir) {
                                ret = -ENOENT;
                                goto out;
                        }
                } else {
                        ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
                                             &ref_index);
                }
                if (ret)
                        goto out;

                /* if we already have a perfect match, we're done */
                if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
                                  btrfs_ino(BTRFS_I(inode)), ref_index,
                                  name, namelen)) {
                        /*
                         * look for a conflicting back reference in the
                         * metadata.  if we find one we have to unlink that name
                         * of the file before we add our new link.  Later on, we
                         * overwrite any existing back reference, and we don't
                         * want to create dangling pointers in the directory.
                         */

                        if (!search_done) {
                                ret = __add_inode_ref(trans, root, path, log,
                                                      BTRFS_I(dir),
                                                      BTRFS_I(inode),
                                                      inode_objectid,
                                                      parent_objectid,
                                                      ref_index, name, namelen,
                                                      &search_done);
                                if (ret) {
                                        if (ret == 1)
                                                ret = 0;
                                        goto out;
                                }
                        }

                        /* insert our name */
                        ret = btrfs_add_link(trans, BTRFS_I(dir),
                                             BTRFS_I(inode),
                                             name, namelen, 0, ref_index);
                        if (ret)
                                goto out;

                        btrfs_update_inode(trans, root, inode);
                }

                ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
                kfree(name);
                name = NULL;
                if (log_ref_ver) {
                        iput(dir);
                        dir = NULL;
                }
        }

        /*
         * Before we overwrite the inode reference item in the subvolume tree
         * with the item from the log tree, we must unlink all names from the
         * parent directory that are in the subvolume's tree inode reference
         * item, otherwise we end up with an inconsistent subvolume tree where
         * dir index entries exist for a name but there is no inode reference
         * item with the same name.
         */
        ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
                                    key);
        if (ret)
                goto out;

        /* finally write the back reference in the inode */
        ret = overwrite_item(trans, root, path, eb, slot, key);
out:
        btrfs_release_path(path);
        kfree(name);
        iput(dir);
        iput(inode);
        return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root, u64 ino)
{
        int ret;

        ret = btrfs_insert_orphan_item(trans, root, ino);
        if (ret == -EEXIST)
                ret = 0;

        return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
                struct btrfs_inode *inode, struct btrfs_path *path)
{
        int ret = 0;
        int name_len;
        unsigned int nlink = 0;
        u32 item_size;
        u32 cur_offset = 0;
        u64 inode_objectid = btrfs_ino(inode);
        u64 offset = 0;
        unsigned long ptr;
        struct btrfs_inode_extref *extref;
        struct extent_buffer *leaf;

        while (1) {
                ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
                                            &extref, &offset);
                if (ret)
                        break;

                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
                cur_offset = 0;

                while (cur_offset < item_size) {
                        extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
                        name_len = btrfs_inode_extref_name_len(leaf, extref);

                        nlink++;

                        cur_offset += name_len + sizeof(*extref);
                }

                offset++;
                btrfs_release_path(path);
        }
        btrfs_release_path(path);

        if (ret < 0 && ret != -ENOENT)
                return ret;
        return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
                        struct btrfs_inode *inode, struct btrfs_path *path)
{
        int ret;
        struct btrfs_key key;
        unsigned int nlink = 0;
        unsigned long ptr;
        unsigned long ptr_end;
        int name_len;
        u64 ino = btrfs_ino(inode);

        key.objectid = ino;
        key.type = BTRFS_INODE_REF_KEY;
        key.offset = (u64)-1;

        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
                        break;
                if (ret > 0) {
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }
process_slot:
                btrfs_item_key_to_cpu(path->nodes[0], &key,
                                      path->slots[0]);
                if (key.objectid != ino ||
                    key.type != BTRFS_INODE_REF_KEY)
                        break;
                ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
                ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
                                                   path->slots[0]);
                while (ptr < ptr_end) {
                        struct btrfs_inode_ref *ref;

                        ref = (struct btrfs_inode_ref *)ptr;
                        name_len = btrfs_inode_ref_name_len(path->nodes[0],
                                                            ref);
                        ptr = (unsigned long)(ref + 1) + name_len;
                        nlink++;
                }

                if (key.offset == 0)
                        break;
                if (path->slots[0] > 0) {
                        path->slots[0]--;
                        goto process_slot;
                }
                key.offset--;
                btrfs_release_path(path);
        }
        btrfs_release_path(path);

        return nlink;
}
/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct inode *inode)
{
        struct btrfs_path *path;
        int ret;
        u64 nlink = 0;
        u64 ino = btrfs_ino(BTRFS_I(inode));

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = count_inode_refs(root, BTRFS_I(inode), path);
        if (ret < 0)
                goto out;

        nlink = ret;

        ret = count_inode_extrefs(root, BTRFS_I(inode), path);
        if (ret < 0)
                goto out;

        nlink += ret;

        ret = 0;

        if (nlink != inode->i_nlink) {
                set_nlink(inode, nlink);
                btrfs_update_inode(trans, root, inode);
        }
        BTRFS_I(inode)->index_cnt = (u64)-1;

        if (inode->i_nlink == 0) {
                if (S_ISDIR(inode->i_mode)) {
                        ret = replay_dir_deletes(trans, root, NULL, path,
                                                 ino, 1);
                        if (ret)
                                goto out;
                }
                ret = insert_orphan_item(trans, root, ino);
        }

out:
        btrfs_free_path(path);
        return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            struct btrfs_path *path)
{
        int ret;
        struct btrfs_key key;
        struct inode *inode;

        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
        key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = (u64)-1;
        while (1) {
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0)
                        break;

                if (ret == 1) {
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }

                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
                    key.type != BTRFS_ORPHAN_ITEM_KEY)
                        break;

                ret = btrfs_del_item(trans, root, path);
                if (ret)
                        goto out;

                btrfs_release_path(path);
                inode = read_one_inode(root, key.offset);
                if (!inode)
                        return -EIO;

                ret = fixup_inode_link_count(trans, root, inode);
                iput(inode);
                if (ret)
                        goto out;

                /*
                 * fixup on a directory may create new entries,
                 * make sure we always look for the highest possible
                 * offset
                 */
                key.offset = (u64)-1;
        }
        ret = 0;
out:
        btrfs_release_path(path);
        return ret;
}
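/*
 * Editor's illustrative sketch (not part of the original file): the key
 * shape that fixup_inode_link_counts() scans for and that
 * link_to_fixup_dir() (below) inserts.  One zero-size orphan item per
 * inode, keyed by the inode's objectid in the offset field; the
 * variable name inode_objectid is hypothetical.
 */
#if 0
        struct btrfs_key key = {
                .objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID,
                .type = BTRFS_ORPHAN_ITEM_KEY,
                .offset = inode_objectid,
        };
#endif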
/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      u64 objectid)
{
        struct btrfs_key key;
        int ret = 0;
        struct inode *inode;

        inode = read_one_inode(root, objectid);
        if (!inode)
                return -EIO;

        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
        key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = objectid;

        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

        btrfs_release_path(path);
        if (ret == 0) {
                if (!inode->i_nlink)
                        set_nlink(inode, 1);
                else
                        inc_nlink(inode);
                ret = btrfs_update_inode(trans, root, inode);
        } else if (ret == -EEXIST) {
                ret = 0;
        } else {
                BUG(); /* Logic Error */
        }
        iput(inode);

        return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    u64 dirid, u64 index,
                                    char *name, int name_len,
                                    struct btrfs_key *location)
{
        struct inode *inode;
        struct inode *dir;
        int ret;

        inode = read_one_inode(root, location->objectid);
        if (!inode)
                return -ENOENT;

        dir = read_one_inode(root, dirid);
        if (!dir) {
                iput(inode);
                return -EIO;
        }

        ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
                             name_len, 1, index);

        /* FIXME, put inode into FIXUP list */

        iput(inode);
        iput(dir);
        return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
                            const char *name, const int name_len,
                            const u64 dirid, const u64 ino)
{
        struct btrfs_key search_key;

        search_key.objectid = ino;
        search_key.type = BTRFS_INODE_REF_KEY;
        search_key.offset = dirid;
        if (backref_in_log(log_root, &search_key, dirid, name, name_len))
                return true;

        search_key.type = BTRFS_INODE_EXTREF_KEY;
        search_key.offset = btrfs_extref_hash(dirid, name, name_len);
        if (backref_in_log(log_root, &search_key, dirid, name, name_len))
                return true;

        return false;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct btrfs_path *path,
                                    struct extent_buffer *eb,
                                    struct btrfs_dir_item *di,
                                    struct btrfs_key *key)
{
        char *name;
        int name_len;
        struct btrfs_dir_item *dst_di;
        struct btrfs_key found_key;
        struct btrfs_key log_key;
        struct inode *dir;
        u8 log_type;
        int exists;
        int ret = 0;
        bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
        bool name_added = false;

        dir = read_one_inode(root, key->objectid);
        if (!dir)
                return -EIO;

        name_len = btrfs_dir_name_len(eb, di);
        name = kmalloc(name_len, GFP_NOFS);
        if (!name) {
                ret = -ENOMEM;
                goto out;
        }

        log_type = btrfs_dir_type(eb, di);
        read_extent_buffer(eb, name, (unsigned long)(di + 1),
                           name_len);

        btrfs_dir_item_key_to_cpu(eb, di, &log_key);
        exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
        if (exists == 0)
                exists = 1;
        else
                exists = 0;
        btrfs_release_path(path);

        if (key->type == BTRFS_DIR_ITEM_KEY) {
                dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
                                               name, name_len, 1);
        } else if (key->type == BTRFS_DIR_INDEX_KEY) {
                dst_di = btrfs_lookup_dir_index_item(trans, root, path,
                                                     key->objectid,
                                                     key->offset, name,
                                                     name_len, 1);
        } else {
                /* Corruption */
                ret = -EINVAL;
                goto out;
        }
        if (IS_ERR_OR_NULL(dst_di)) {
                /* we need a sequence number to insert, so we only
                 * do inserts for the BTRFS_DIR_INDEX_KEY types
                 */
                if (key->type != BTRFS_DIR_INDEX_KEY)
                        goto out;
                goto insert;
        }

        btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
        /* the existing item matches the logged item */
        if (found_key.objectid == log_key.objectid &&
            found_key.type == log_key.type &&
            found_key.offset == log_key.offset &&
            btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
                update_size = false;
                goto out;
        }

        /*
         * don't drop the conflicting directory entry if the inode
         * for the new entry doesn't exist
         */
        if (!exists)
                goto out;

        ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
        if (ret)
                goto out;

        if (key->type == BTRFS_DIR_INDEX_KEY)
                goto insert;
out:
        btrfs_release_path(path);
        if (!ret && update_size) {
                btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
                ret = btrfs_update_inode(trans, root, dir);
        }
        kfree(name);
        iput(dir);
        if (!ret && name_added)
                ret = 1;
        return ret;

insert:
        if (name_in_log_ref(root->log_root, name, name_len,
                            key->objectid, log_key.objectid)) {
                /* The dentry will be added later. */
                ret = 0;
                update_size = false;
                goto out;
        }
        btrfs_release_path(path);
        ret = insert_one_name(trans, root, key->objectid, key->offset,
                              name, name_len, &log_key);
        if (ret && ret != -ENOENT && ret != -EEXIST)
                goto out;
        if (!ret)
                name_added = true;
        update_size = false;
        ret = 0;
        goto out;
}
/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root,
                                        struct btrfs_path *path,
                                        struct extent_buffer *eb, int slot,
                                        struct btrfs_key *key)
{
        int ret = 0;
        u32 item_size = btrfs_item_size_nr(eb, slot);
        struct btrfs_dir_item *di;
        int name_len;
        unsigned long ptr;
        unsigned long ptr_end;
        struct btrfs_path *fixup_path = NULL;

        ptr = btrfs_item_ptr_offset(eb, slot);
        ptr_end = ptr + item_size;
        while (ptr < ptr_end) {
                di = (struct btrfs_dir_item *)ptr;
                name_len = btrfs_dir_name_len(eb, di);
                ret = replay_one_name(trans, root, path, eb, di, key);
                if (ret < 0)
                        break;
                ptr = (unsigned long)(di + 1);
                ptr += name_len;

                /*
                 * If this entry refers to a non-directory (directories can not
                 * have a link count > 1) and it was added in the transaction
                 * that was not committed, make sure we fixup the link count of
                 * the inode the entry points to.  Otherwise something like
                 * the following would result in a directory pointing to an
                 * inode with a wrong link count that does not account for
                 * this dir entry:
                 *
                 * mkdir testdir
                 * touch testdir/foo
                 * touch testdir/bar
                 * sync
                 *
                 * ln testdir/bar testdir/bar_link
                 * ln testdir/foo testdir/foo_link
                 * xfs_io -c "fsync" testdir/bar
                 *
                 * <power failure>
                 *
                 * mount fs, log replay happens
                 *
                 * File foo would remain with a link count of 1 when it has two
                 * entries pointing to it in the directory testdir.  This would
                 * make it impossible to ever delete the parent directory, as
                 * it would result in stale dentries that can never be deleted.
                 */
                if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
                        struct btrfs_key di_key;

                        if (!fixup_path) {
                                fixup_path = btrfs_alloc_path();
                                if (!fixup_path) {
                                        ret = -ENOMEM;
                                        break;
                                }
                        }

                        btrfs_dir_item_key_to_cpu(eb, di, &di_key);
                        ret = link_to_fixup_dir(trans, root, fixup_path,
                                                di_key.objectid);
                        if (ret)
                                break;
                }
                ret = 0;
        }
        btrfs_free_path(fixup_path);
        return ret;
}
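/*
 * Editor's illustrative sketch (not part of the original file): a
 * stand-alone user-space reproduction of the hard link scenario in the
 * comment above, using link(2) and fsync(2) in place of ln and xfs_io.
 * Assumes the current directory is on a freshly mounted btrfs.
 */
#if 0
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd;

        mkdir("testdir", 0755);
        close(open("testdir/foo", O_CREAT | O_WRONLY, 0644));
        close(open("testdir/bar", O_CREAT | O_WRONLY, 0644));
        sync();

        link("testdir/bar", "testdir/bar_link");
        link("testdir/foo", "testdir/foo_link");
        fd = open("testdir/bar", O_WRONLY);
        fsync(fd);      /* log replay must fix foo's link count */
        close(fd);
        return 0;       /* <power failure> would happen here */
}
#endif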
2016 */ 2017 static noinline int find_dir_range(struct btrfs_root *root, 2018 struct btrfs_path *path, 2019 u64 dirid, int key_type, 2020 u64 *start_ret, u64 *end_ret) 2021 { 2022 struct btrfs_key key; 2023 u64 found_end; 2024 struct btrfs_dir_log_item *item; 2025 int ret; 2026 int nritems; 2027 2028 if (*start_ret == (u64)-1) 2029 return 1; 2030 2031 key.objectid = dirid; 2032 key.type = key_type; 2033 key.offset = *start_ret; 2034 2035 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2036 if (ret < 0) 2037 goto out; 2038 if (ret > 0) { 2039 if (path->slots[0] == 0) 2040 goto out; 2041 path->slots[0]--; 2042 } 2043 if (ret != 0) 2044 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2045 2046 if (key.type != key_type || key.objectid != dirid) { 2047 ret = 1; 2048 goto next; 2049 } 2050 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2051 struct btrfs_dir_log_item); 2052 found_end = btrfs_dir_log_end(path->nodes[0], item); 2053 2054 if (*start_ret >= key.offset && *start_ret <= found_end) { 2055 ret = 0; 2056 *start_ret = key.offset; 2057 *end_ret = found_end; 2058 goto out; 2059 } 2060 ret = 1; 2061 next: 2062 /* check the next slot in the tree to see if it is a valid item */ 2063 nritems = btrfs_header_nritems(path->nodes[0]); 2064 path->slots[0]++; 2065 if (path->slots[0] >= nritems) { 2066 ret = btrfs_next_leaf(root, path); 2067 if (ret) 2068 goto out; 2069 } 2070 2071 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2072 2073 if (key.type != key_type || key.objectid != dirid) { 2074 ret = 1; 2075 goto out; 2076 } 2077 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2078 struct btrfs_dir_log_item); 2079 found_end = btrfs_dir_log_end(path->nodes[0], item); 2080 *start_ret = key.offset; 2081 *end_ret = found_end; 2082 ret = 0; 2083 out: 2084 btrfs_release_path(path); 2085 return ret; 2086 } 2087 2088 /* 2089 * this looks for a given directory item in the log. 
If the directory 2090 * item is not in the log, the item is removed and the inode it points 2091 * to is unlinked 2092 */ 2093 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2094 struct btrfs_root *root, 2095 struct btrfs_root *log, 2096 struct btrfs_path *path, 2097 struct btrfs_path *log_path, 2098 struct inode *dir, 2099 struct btrfs_key *dir_key) 2100 { 2101 struct btrfs_fs_info *fs_info = root->fs_info; 2102 int ret; 2103 struct extent_buffer *eb; 2104 int slot; 2105 u32 item_size; 2106 struct btrfs_dir_item *di; 2107 struct btrfs_dir_item *log_di; 2108 int name_len; 2109 unsigned long ptr; 2110 unsigned long ptr_end; 2111 char *name; 2112 struct inode *inode; 2113 struct btrfs_key location; 2114 2115 again: 2116 eb = path->nodes[0]; 2117 slot = path->slots[0]; 2118 item_size = btrfs_item_size_nr(eb, slot); 2119 ptr = btrfs_item_ptr_offset(eb, slot); 2120 ptr_end = ptr + item_size; 2121 while (ptr < ptr_end) { 2122 di = (struct btrfs_dir_item *)ptr; 2123 name_len = btrfs_dir_name_len(eb, di); 2124 name = kmalloc(name_len, GFP_NOFS); 2125 if (!name) { 2126 ret = -ENOMEM; 2127 goto out; 2128 } 2129 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2130 name_len); 2131 log_di = NULL; 2132 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2133 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2134 dir_key->objectid, 2135 name, name_len, 0); 2136 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2137 log_di = btrfs_lookup_dir_index_item(trans, log, 2138 log_path, 2139 dir_key->objectid, 2140 dir_key->offset, 2141 name, name_len, 0); 2142 } 2143 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2144 btrfs_dir_item_key_to_cpu(eb, di, &location); 2145 btrfs_release_path(path); 2146 btrfs_release_path(log_path); 2147 inode = read_one_inode(root, location.objectid); 2148 if (!inode) { 2149 kfree(name); 2150 return -EIO; 2151 } 2152 2153 ret = link_to_fixup_dir(trans, root, 2154 path, location.objectid); 2155 if (ret) { 2156 kfree(name); 2157 iput(inode); 2158 goto out; 2159 } 2160 2161 inc_nlink(inode); 2162 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2163 BTRFS_I(inode), name, name_len); 2164 if (!ret) 2165 ret = btrfs_run_delayed_items(trans, fs_info); 2166 kfree(name); 2167 iput(inode); 2168 if (ret) 2169 goto out; 2170 2171 /* there might still be more names under this key 2172 * check and repeat if required 2173 */ 2174 ret = btrfs_search_slot(NULL, root, dir_key, path, 2175 0, 0); 2176 if (ret == 0) 2177 goto again; 2178 ret = 0; 2179 goto out; 2180 } else if (IS_ERR(log_di)) { 2181 kfree(name); 2182 return PTR_ERR(log_di); 2183 } 2184 btrfs_release_path(log_path); 2185 kfree(name); 2186 2187 ptr = (unsigned long)(di + 1); 2188 ptr += name_len; 2189 } 2190 ret = 0; 2191 out: 2192 btrfs_release_path(path); 2193 btrfs_release_path(log_path); 2194 return ret; 2195 } 2196 2197 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2198 struct btrfs_root *root, 2199 struct btrfs_root *log, 2200 struct btrfs_path *path, 2201 const u64 ino) 2202 { 2203 struct btrfs_key search_key; 2204 struct btrfs_path *log_path; 2205 int i; 2206 int nritems; 2207 int ret; 2208 2209 log_path = btrfs_alloc_path(); 2210 if (!log_path) 2211 return -ENOMEM; 2212 2213 search_key.objectid = ino; 2214 search_key.type = BTRFS_XATTR_ITEM_KEY; 2215 search_key.offset = 0; 2216 again: 2217 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2218 if (ret < 0) 2219 goto out; 2220 process_leaf: 2221 nritems = 
btrfs_header_nritems(path->nodes[0]); 2222 for (i = path->slots[0]; i < nritems; i++) { 2223 struct btrfs_key key; 2224 struct btrfs_dir_item *di; 2225 struct btrfs_dir_item *log_di; 2226 u32 total_size; 2227 u32 cur; 2228 2229 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2230 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2231 ret = 0; 2232 goto out; 2233 } 2234 2235 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2236 total_size = btrfs_item_size_nr(path->nodes[0], i); 2237 cur = 0; 2238 while (cur < total_size) { 2239 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2240 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2241 u32 this_len = sizeof(*di) + name_len + data_len; 2242 char *name; 2243 2244 name = kmalloc(name_len, GFP_NOFS); 2245 if (!name) { 2246 ret = -ENOMEM; 2247 goto out; 2248 } 2249 read_extent_buffer(path->nodes[0], name, 2250 (unsigned long)(di + 1), name_len); 2251 2252 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2253 name, name_len, 0); 2254 btrfs_release_path(log_path); 2255 if (!log_di) { 2256 /* Doesn't exist in log tree, so delete it. */ 2257 btrfs_release_path(path); 2258 di = btrfs_lookup_xattr(trans, root, path, ino, 2259 name, name_len, -1); 2260 kfree(name); 2261 if (IS_ERR(di)) { 2262 ret = PTR_ERR(di); 2263 goto out; 2264 } 2265 ASSERT(di); 2266 ret = btrfs_delete_one_dir_name(trans, root, 2267 path, di); 2268 if (ret) 2269 goto out; 2270 btrfs_release_path(path); 2271 search_key = key; 2272 goto again; 2273 } 2274 kfree(name); 2275 if (IS_ERR(log_di)) { 2276 ret = PTR_ERR(log_di); 2277 goto out; 2278 } 2279 cur += this_len; 2280 di = (struct btrfs_dir_item *)((char *)di + this_len); 2281 } 2282 } 2283 ret = btrfs_next_leaf(root, path); 2284 if (ret > 0) 2285 ret = 0; 2286 else if (ret == 0) 2287 goto process_leaf; 2288 out: 2289 btrfs_free_path(log_path); 2290 btrfs_release_path(path); 2291 return ret; 2292 } 2293 2294 2295 /* 2296 * deletion replay happens before we copy any new directory items 2297 * out of the log or out of backreferences from inodes. It 2298 * scans the log to find ranges of keys that log is authoritative for, 2299 * and then scans the directory to find items in those ranges that are 2300 * not present in the log. 2301 * 2302 * Anything we don't find in the log is unlinked and removed from the 2303 * directory. 
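 * An illustrative sequence:
 *
 * mkdir testdir
 * touch testdir/foo
 * touch testdir/bar
 * sync
 * rm testdir/foo
 * xfs_io -c "fsync" testdir
 * <crash>
 *
 * The log is authoritative for the key range that used to contain
 * foo's entries, but foo itself is not in the log, so replay unlinks
 * foo while leaving bar alone.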
2304 */ 2305 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2306 struct btrfs_root *root, 2307 struct btrfs_root *log, 2308 struct btrfs_path *path, 2309 u64 dirid, int del_all) 2310 { 2311 u64 range_start; 2312 u64 range_end; 2313 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2314 int ret = 0; 2315 struct btrfs_key dir_key; 2316 struct btrfs_key found_key; 2317 struct btrfs_path *log_path; 2318 struct inode *dir; 2319 2320 dir_key.objectid = dirid; 2321 dir_key.type = BTRFS_DIR_ITEM_KEY; 2322 log_path = btrfs_alloc_path(); 2323 if (!log_path) 2324 return -ENOMEM; 2325 2326 dir = read_one_inode(root, dirid); 2327 /* it isn't an error if the inode isn't there, that can happen 2328 * because we replay the deletes before we copy in the inode item 2329 * from the log 2330 */ 2331 if (!dir) { 2332 btrfs_free_path(log_path); 2333 return 0; 2334 } 2335 again: 2336 range_start = 0; 2337 range_end = 0; 2338 while (1) { 2339 if (del_all) 2340 range_end = (u64)-1; 2341 else { 2342 ret = find_dir_range(log, path, dirid, key_type, 2343 &range_start, &range_end); 2344 if (ret != 0) 2345 break; 2346 } 2347 2348 dir_key.offset = range_start; 2349 while (1) { 2350 int nritems; 2351 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2352 0, 0); 2353 if (ret < 0) 2354 goto out; 2355 2356 nritems = btrfs_header_nritems(path->nodes[0]); 2357 if (path->slots[0] >= nritems) { 2358 ret = btrfs_next_leaf(root, path); 2359 if (ret) 2360 break; 2361 } 2362 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2363 path->slots[0]); 2364 if (found_key.objectid != dirid || 2365 found_key.type != dir_key.type) 2366 goto next_type; 2367 2368 if (found_key.offset > range_end) 2369 break; 2370 2371 ret = check_item_in_log(trans, root, log, path, 2372 log_path, dir, 2373 &found_key); 2374 if (ret) 2375 goto out; 2376 if (found_key.offset == (u64)-1) 2377 break; 2378 dir_key.offset = found_key.offset + 1; 2379 } 2380 btrfs_release_path(path); 2381 if (range_end == (u64)-1) 2382 break; 2383 range_start = range_end + 1; 2384 } 2385 2386 next_type: 2387 ret = 0; 2388 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2389 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2390 dir_key.type = BTRFS_DIR_INDEX_KEY; 2391 btrfs_release_path(path); 2392 goto again; 2393 } 2394 out: 2395 btrfs_release_path(path); 2396 btrfs_free_path(log_path); 2397 iput(dir); 2398 return ret; 2399 } 2400 2401 /* 2402 * the process_func used to replay items from the log tree. This 2403 * gets called in two different stages. The first stage just looks 2404 * for inodes and makes sure they are all copied into the subvolume. 2405 * 2406 * The second stage copies all the other item types from the log into 2407 * the subvolume. The two stage approach is slower, but gets rid of 2408 * lots of complexity around inodes referencing other inodes that exist 2409 * only in the log (references come from either directory items or inode 2410 * back refs). 
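 * For example, a leaf in the log can carry a dir entry whose target
 * inode item sits in a later leaf of the same log. Wiring up that
 * entry before the inode exists in the subvolume would fail, so the
 * first pass creates every logged inode and the later passes add the
 * directory entries and refs.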
2411 */ 2412 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2413 struct walk_control *wc, u64 gen) 2414 { 2415 int nritems; 2416 struct btrfs_path *path; 2417 struct btrfs_root *root = wc->replay_dest; 2418 struct btrfs_key key; 2419 int level; 2420 int i; 2421 int ret; 2422 2423 ret = btrfs_read_buffer(eb, gen); 2424 if (ret) 2425 return ret; 2426 2427 level = btrfs_header_level(eb); 2428 2429 if (level != 0) 2430 return 0; 2431 2432 path = btrfs_alloc_path(); 2433 if (!path) 2434 return -ENOMEM; 2435 2436 nritems = btrfs_header_nritems(eb); 2437 for (i = 0; i < nritems; i++) { 2438 btrfs_item_key_to_cpu(eb, &key, i); 2439 2440 /* inode keys are done during the first stage */ 2441 if (key.type == BTRFS_INODE_ITEM_KEY && 2442 wc->stage == LOG_WALK_REPLAY_INODES) { 2443 struct btrfs_inode_item *inode_item; 2444 u32 mode; 2445 2446 inode_item = btrfs_item_ptr(eb, i, 2447 struct btrfs_inode_item); 2448 ret = replay_xattr_deletes(wc->trans, root, log, 2449 path, key.objectid); 2450 if (ret) 2451 break; 2452 mode = btrfs_inode_mode(eb, inode_item); 2453 if (S_ISDIR(mode)) { 2454 ret = replay_dir_deletes(wc->trans, 2455 root, log, path, key.objectid, 0); 2456 if (ret) 2457 break; 2458 } 2459 ret = overwrite_item(wc->trans, root, path, 2460 eb, i, &key); 2461 if (ret) 2462 break; 2463 2464 /* for regular files, make sure the corresponding 2465 * orphan item exists. extents past the new EOF 2466 * will be truncated later by orphan cleanup. 2467 */ 2468 if (S_ISREG(mode)) { 2469 ret = insert_orphan_item(wc->trans, root, 2470 key.objectid); 2471 if (ret) 2472 break; 2473 } 2474 2475 ret = link_to_fixup_dir(wc->trans, root, 2476 path, key.objectid); 2477 if (ret) 2478 break; 2479 } 2480 2481 if (key.type == BTRFS_DIR_INDEX_KEY && 2482 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2483 ret = replay_one_dir_item(wc->trans, root, path, 2484 eb, i, &key); 2485 if (ret) 2486 break; 2487 } 2488 2489 if (wc->stage < LOG_WALK_REPLAY_ALL) 2490 continue; 2491 2492 /* these keys are simply copied */ 2493 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2494 ret = overwrite_item(wc->trans, root, path, 2495 eb, i, &key); 2496 if (ret) 2497 break; 2498 } else if (key.type == BTRFS_INODE_REF_KEY || 2499 key.type == BTRFS_INODE_EXTREF_KEY) { 2500 ret = add_inode_ref(wc->trans, root, log, path, 2501 eb, i, &key); 2502 if (ret && ret != -ENOENT) 2503 break; 2504 ret = 0; 2505 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2506 ret = replay_one_extent(wc->trans, root, path, 2507 eb, i, &key); 2508 if (ret) 2509 break; 2510 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2511 ret = replay_one_dir_item(wc->trans, root, path, 2512 eb, i, &key); 2513 if (ret) 2514 break; 2515 } 2516 } 2517 btrfs_free_path(path); 2518 return ret; 2519 } 2520 2521 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2522 struct btrfs_root *root, 2523 struct btrfs_path *path, int *level, 2524 struct walk_control *wc) 2525 { 2526 struct btrfs_fs_info *fs_info = root->fs_info; 2527 u64 root_owner; 2528 u64 bytenr; 2529 u64 ptr_gen; 2530 struct extent_buffer *next; 2531 struct extent_buffer *cur; 2532 struct extent_buffer *parent; 2533 u32 blocksize; 2534 int ret = 0; 2535 2536 WARN_ON(*level < 0); 2537 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2538 2539 while (*level > 0) { 2540 WARN_ON(*level < 0); 2541 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2542 cur = path->nodes[*level]; 2543 2544 WARN_ON(btrfs_header_level(cur) != *level); 2545 2546 if (path->slots[*level] >= 2547 btrfs_header_nritems(cur)) 2548 break; 2549 2550
bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2551 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2552 blocksize = fs_info->nodesize; 2553 2554 parent = path->nodes[*level]; 2555 root_owner = btrfs_header_owner(parent); 2556 2557 next = btrfs_find_create_tree_block(fs_info, bytenr); 2558 if (IS_ERR(next)) 2559 return PTR_ERR(next); 2560 2561 if (*level == 1) { 2562 ret = wc->process_func(root, next, wc, ptr_gen); 2563 if (ret) { 2564 free_extent_buffer(next); 2565 return ret; 2566 } 2567 2568 path->slots[*level]++; 2569 if (wc->free) { 2570 ret = btrfs_read_buffer(next, ptr_gen); 2571 if (ret) { 2572 free_extent_buffer(next); 2573 return ret; 2574 } 2575 2576 if (trans) { 2577 btrfs_tree_lock(next); 2578 btrfs_set_lock_blocking(next); 2579 clean_tree_block(fs_info, next); 2580 btrfs_wait_tree_block_writeback(next); 2581 btrfs_tree_unlock(next); 2582 } else { 2583 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2584 clear_extent_buffer_dirty(next); 2585 } 2586 2587 WARN_ON(root_owner != 2588 BTRFS_TREE_LOG_OBJECTID); 2589 ret = btrfs_free_and_pin_reserved_extent( 2590 fs_info, bytenr, 2591 blocksize); 2592 if (ret) { 2593 free_extent_buffer(next); 2594 return ret; 2595 } 2596 } 2597 free_extent_buffer(next); 2598 continue; 2599 } 2600 ret = btrfs_read_buffer(next, ptr_gen); 2601 if (ret) { 2602 free_extent_buffer(next); 2603 return ret; 2604 } 2605 2606 WARN_ON(*level <= 0); 2607 if (path->nodes[*level-1]) 2608 free_extent_buffer(path->nodes[*level-1]); 2609 path->nodes[*level-1] = next; 2610 *level = btrfs_header_level(next); 2611 path->slots[*level] = 0; 2612 cond_resched(); 2613 } 2614 WARN_ON(*level < 0); 2615 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2616 2617 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2618 2619 cond_resched(); 2620 return 0; 2621 } 2622 2623 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2624 struct btrfs_root *root, 2625 struct btrfs_path *path, int *level, 2626 struct walk_control *wc) 2627 { 2628 struct btrfs_fs_info *fs_info = root->fs_info; 2629 u64 root_owner; 2630 int i; 2631 int slot; 2632 int ret; 2633 2634 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2635 slot = path->slots[i]; 2636 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2637 path->slots[i]++; 2638 *level = i; 2639 WARN_ON(*level == 0); 2640 return 0; 2641 } else { 2642 struct extent_buffer *parent; 2643 if (path->nodes[*level] == root->node) 2644 parent = path->nodes[*level]; 2645 else 2646 parent = path->nodes[*level + 1]; 2647 2648 root_owner = btrfs_header_owner(parent); 2649 ret = wc->process_func(root, path->nodes[*level], wc, 2650 btrfs_header_generation(path->nodes[*level])); 2651 if (ret) 2652 return ret; 2653 2654 if (wc->free) { 2655 struct extent_buffer *next; 2656 2657 next = path->nodes[*level]; 2658 2659 if (trans) { 2660 btrfs_tree_lock(next); 2661 btrfs_set_lock_blocking(next); 2662 clean_tree_block(fs_info, next); 2663 btrfs_wait_tree_block_writeback(next); 2664 btrfs_tree_unlock(next); 2665 } else { 2666 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2667 clear_extent_buffer_dirty(next); 2668 } 2669 2670 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2671 ret = btrfs_free_and_pin_reserved_extent( 2672 fs_info, 2673 path->nodes[*level]->start, 2674 path->nodes[*level]->len); 2675 if (ret) 2676 return ret; 2677 } 2678 free_extent_buffer(path->nodes[*level]); 2679 path->nodes[*level] = NULL; 2680 *level = i + 1; 2681 } 2682 } 2683 return 1; 2684 } 2685 2686 /* 2687 
* drop the reference count on the tree rooted at 'log'. This traverses 2688 * the tree freeing any blocks that have a ref count of zero after being 2689 * decremented. 2690 */ 2691 static int walk_log_tree(struct btrfs_trans_handle *trans, 2692 struct btrfs_root *log, struct walk_control *wc) 2693 { 2694 struct btrfs_fs_info *fs_info = log->fs_info; 2695 int ret = 0; 2696 int wret; 2697 int level; 2698 struct btrfs_path *path; 2699 int orig_level; 2700 2701 path = btrfs_alloc_path(); 2702 if (!path) 2703 return -ENOMEM; 2704 2705 level = btrfs_header_level(log->node); 2706 orig_level = level; 2707 path->nodes[level] = log->node; 2708 extent_buffer_get(log->node); 2709 path->slots[level] = 0; 2710 2711 while (1) { 2712 wret = walk_down_log_tree(trans, log, path, &level, wc); 2713 if (wret > 0) 2714 break; 2715 if (wret < 0) { 2716 ret = wret; 2717 goto out; 2718 } 2719 2720 wret = walk_up_log_tree(trans, log, path, &level, wc); 2721 if (wret > 0) 2722 break; 2723 if (wret < 0) { 2724 ret = wret; 2725 goto out; 2726 } 2727 } 2728 2729 /* was the root node processed? if not, catch it here */ 2730 if (path->nodes[orig_level]) { 2731 ret = wc->process_func(log, path->nodes[orig_level], wc, 2732 btrfs_header_generation(path->nodes[orig_level])); 2733 if (ret) 2734 goto out; 2735 if (wc->free) { 2736 struct extent_buffer *next; 2737 2738 next = path->nodes[orig_level]; 2739 2740 if (trans) { 2741 btrfs_tree_lock(next); 2742 btrfs_set_lock_blocking(next); 2743 clean_tree_block(fs_info, next); 2744 btrfs_wait_tree_block_writeback(next); 2745 btrfs_tree_unlock(next); 2746 } else { 2747 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2748 clear_extent_buffer_dirty(next); 2749 } 2750 2751 WARN_ON(log->root_key.objectid != 2752 BTRFS_TREE_LOG_OBJECTID); 2753 ret = btrfs_free_and_pin_reserved_extent(fs_info, 2754 next->start, next->len); 2755 if (ret) 2756 goto out; 2757 } 2758 } 2759 2760 out: 2761 btrfs_free_path(path); 2762 return ret; 2763 } 2764 2765 /* 2766 * helper function to update the item for a given subvolume's log root 2767 * in the tree of log roots 2768 */ 2769 static int update_log_root(struct btrfs_trans_handle *trans, 2770 struct btrfs_root *log) 2771 { 2772 struct btrfs_fs_info *fs_info = log->fs_info; 2773 int ret; 2774 2775 if (log->log_transid == 1) { 2776 /* insert root item on the first sync */ 2777 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2778 &log->root_key, &log->root_item); 2779 } else { 2780 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2781 &log->root_key, &log->root_item); 2782 } 2783 return ret; 2784 } 2785 2786 static void wait_log_commit(struct btrfs_root *root, int transid) 2787 { 2788 DEFINE_WAIT(wait); 2789 int index = transid % 2; 2790 2791 /* 2792 * we only allow two pending log transactions at a time, 2793 * so we know that if ours is more than 2 older than the 2794 * current transaction, we're done 2795 */ 2796 for (;;) { 2797 prepare_to_wait(&root->log_commit_wait[index], 2798 &wait, TASK_UNINTERRUPTIBLE); 2799 2800 if (!(root->log_transid_committed < transid && 2801 atomic_read(&root->log_commit[index]))) 2802 break; 2803 2804 mutex_unlock(&root->log_mutex); 2805 schedule(); 2806 mutex_lock(&root->log_mutex); 2807 } 2808 finish_wait(&root->log_commit_wait[index], &wait); 2809 } 2810 2811 static void wait_for_writer(struct btrfs_root *root) 2812 { 2813 DEFINE_WAIT(wait); 2814 2815 for (;;) { 2816 prepare_to_wait(&root->log_writer_wait, &wait, 2817 TASK_UNINTERRUPTIBLE); 2818 if (!atomic_read(&root->log_writers)) 2819 break;
2820 2821 mutex_unlock(&root->log_mutex); 2822 schedule(); 2823 mutex_lock(&root->log_mutex); 2824 } 2825 finish_wait(&root->log_writer_wait, &wait); 2826 } 2827 2828 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2829 struct btrfs_log_ctx *ctx) 2830 { 2831 if (!ctx) 2832 return; 2833 2834 mutex_lock(&root->log_mutex); 2835 list_del_init(&ctx->list); 2836 mutex_unlock(&root->log_mutex); 2837 } 2838 2839 /* 2840 * Invoked with the log mutex held, or from a context where no other task 2841 * can access the list. 2842 */ 2843 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2844 int index, int error) 2845 { 2846 struct btrfs_log_ctx *ctx; 2847 struct btrfs_log_ctx *safe; 2848 2849 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2850 list_del_init(&ctx->list); 2851 ctx->log_ret = error; 2852 } 2853 2854 INIT_LIST_HEAD(&root->log_ctxs[index]); 2855 } 2856 2857 /* 2858 * btrfs_sync_log sends a given tree log down to the disk and 2859 * updates the super blocks to record it. When this call is done, 2860 * you know that any inodes previously logged are safely on disk only 2861 * if it returns 0. 2862 * 2863 * Any other return value means you need to call btrfs_commit_transaction. 2864 * Some of the edge cases for fsyncing directories that have had unlinks 2865 * or renames done in the past mean that sometimes the only safe 2866 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2867 * that has happened. 2868 */ 2869 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2870 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2871 { 2872 int index1; 2873 int index2; 2874 int mark; 2875 int ret; 2876 struct btrfs_fs_info *fs_info = root->fs_info; 2877 struct btrfs_root *log = root->log_root; 2878 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2879 int log_transid = 0; 2880 struct btrfs_log_ctx root_log_ctx; 2881 struct blk_plug plug; 2882 2883 mutex_lock(&root->log_mutex); 2884 log_transid = ctx->log_transid; 2885 if (root->log_transid_committed >= log_transid) { 2886 mutex_unlock(&root->log_mutex); 2887 return ctx->log_ret; 2888 } 2889 2890 index1 = log_transid % 2; 2891 if (atomic_read(&root->log_commit[index1])) { 2892 wait_log_commit(root, log_transid); 2893 mutex_unlock(&root->log_mutex); 2894 return ctx->log_ret; 2895 } 2896 ASSERT(log_transid == root->log_transid); 2897 atomic_set(&root->log_commit[index1], 1); 2898 2899 /* wait for previous tree log sync to complete */ 2900 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2901 wait_log_commit(root, log_transid - 1); 2902 2903 while (1) { 2904 int batch = atomic_read(&root->log_batch); 2905 /* when we're on an ssd, just kick the log commit out */ 2906 if (!btrfs_test_opt(fs_info, SSD) && 2907 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2908 mutex_unlock(&root->log_mutex); 2909 schedule_timeout_uninterruptible(1); 2910 mutex_lock(&root->log_mutex); 2911 } 2912 wait_for_writer(root); 2913 if (batch == atomic_read(&root->log_batch)) 2914 break; 2915 } 2916 2917 /* bail out if we need to do a full commit */ 2918 if (btrfs_need_log_full_commit(fs_info, trans)) { 2919 ret = -EAGAIN; 2920 btrfs_free_logged_extents(log, log_transid); 2921 mutex_unlock(&root->log_mutex); 2922 goto out; 2923 } 2924 2925 if (log_transid % 2 == 0) 2926 mark = EXTENT_DIRTY; 2927 else 2928 mark = EXTENT_NEW; 2929 2930 /* we start IO on all the marked extents here, but we don't actually 2931 * wait for them until later.
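 * (the plug lets the block layer batch and merge these writes into
 * larger requests before they are submitted)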
2932 */ 2933 blk_start_plug(&plug); 2934 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2935 if (ret) { 2936 blk_finish_plug(&plug); 2937 btrfs_abort_transaction(trans, ret); 2938 btrfs_free_logged_extents(log, log_transid); 2939 btrfs_set_log_full_commit(fs_info, trans); 2940 mutex_unlock(&root->log_mutex); 2941 goto out; 2942 } 2943 2944 btrfs_set_root_node(&log->root_item, log->node); 2945 2946 root->log_transid++; 2947 log->log_transid = root->log_transid; 2948 root->log_start_pid = 0; 2949 /* 2950 * IO has been started, blocks of the log tree have WRITTEN flag set 2951 * in their headers. new modifications of the log will be written to 2952 * new positions. so it's safe to allow log writers to go in. 2953 */ 2954 mutex_unlock(&root->log_mutex); 2955 2956 btrfs_init_log_ctx(&root_log_ctx, NULL); 2957 2958 mutex_lock(&log_root_tree->log_mutex); 2959 atomic_inc(&log_root_tree->log_batch); 2960 atomic_inc(&log_root_tree->log_writers); 2961 2962 index2 = log_root_tree->log_transid % 2; 2963 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2964 root_log_ctx.log_transid = log_root_tree->log_transid; 2965 2966 mutex_unlock(&log_root_tree->log_mutex); 2967 2968 ret = update_log_root(trans, log); 2969 2970 mutex_lock(&log_root_tree->log_mutex); 2971 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2972 /* 2973 * Implicit memory barrier after atomic_dec_and_test 2974 */ 2975 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2976 wake_up(&log_root_tree->log_writer_wait); 2977 } 2978 2979 if (ret) { 2980 if (!list_empty(&root_log_ctx.list)) 2981 list_del_init(&root_log_ctx.list); 2982 2983 blk_finish_plug(&plug); 2984 btrfs_set_log_full_commit(fs_info, trans); 2985 2986 if (ret != -ENOSPC) { 2987 btrfs_abort_transaction(trans, ret); 2988 mutex_unlock(&log_root_tree->log_mutex); 2989 goto out; 2990 } 2991 btrfs_wait_tree_log_extents(log, mark); 2992 btrfs_free_logged_extents(log, log_transid); 2993 mutex_unlock(&log_root_tree->log_mutex); 2994 ret = -EAGAIN; 2995 goto out; 2996 } 2997 2998 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2999 blk_finish_plug(&plug); 3000 list_del_init(&root_log_ctx.list); 3001 mutex_unlock(&log_root_tree->log_mutex); 3002 ret = root_log_ctx.log_ret; 3003 goto out; 3004 } 3005 3006 index2 = root_log_ctx.log_transid % 2; 3007 if (atomic_read(&log_root_tree->log_commit[index2])) { 3008 blk_finish_plug(&plug); 3009 ret = btrfs_wait_tree_log_extents(log, mark); 3010 btrfs_wait_logged_extents(trans, log, log_transid); 3011 wait_log_commit(log_root_tree, 3012 root_log_ctx.log_transid); 3013 mutex_unlock(&log_root_tree->log_mutex); 3014 if (!ret) 3015 ret = root_log_ctx.log_ret; 3016 goto out; 3017 } 3018 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3019 atomic_set(&log_root_tree->log_commit[index2], 1); 3020 3021 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3022 wait_log_commit(log_root_tree, 3023 root_log_ctx.log_transid - 1); 3024 } 3025 3026 wait_for_writer(log_root_tree); 3027 3028 /* 3029 * now that we've moved on to the tree of log tree roots, 3030 * check the full commit flag again 3031 */ 3032 if (btrfs_need_log_full_commit(fs_info, trans)) { 3033 blk_finish_plug(&plug); 3034 btrfs_wait_tree_log_extents(log, mark); 3035 btrfs_free_logged_extents(log, log_transid); 3036 mutex_unlock(&log_root_tree->log_mutex); 3037 ret = -EAGAIN; 3038 goto out_wake_log_root; 3039 } 3040 3041 ret = btrfs_write_marked_extents(fs_info, 3042 
&log_root_tree->dirty_log_pages, 3043 EXTENT_DIRTY | EXTENT_NEW); 3044 blk_finish_plug(&plug); 3045 if (ret) { 3046 btrfs_set_log_full_commit(fs_info, trans); 3047 btrfs_abort_transaction(trans, ret); 3048 btrfs_free_logged_extents(log, log_transid); 3049 mutex_unlock(&log_root_tree->log_mutex); 3050 goto out_wake_log_root; 3051 } 3052 ret = btrfs_wait_tree_log_extents(log, mark); 3053 if (!ret) 3054 ret = btrfs_wait_tree_log_extents(log_root_tree, 3055 EXTENT_NEW | EXTENT_DIRTY); 3056 if (ret) { 3057 btrfs_set_log_full_commit(fs_info, trans); 3058 btrfs_free_logged_extents(log, log_transid); 3059 mutex_unlock(&log_root_tree->log_mutex); 3060 goto out_wake_log_root; 3061 } 3062 btrfs_wait_logged_extents(trans, log, log_transid); 3063 3064 btrfs_set_super_log_root(fs_info->super_for_commit, 3065 log_root_tree->node->start); 3066 btrfs_set_super_log_root_level(fs_info->super_for_commit, 3067 btrfs_header_level(log_root_tree->node)); 3068 3069 log_root_tree->log_transid++; 3070 mutex_unlock(&log_root_tree->log_mutex); 3071 3072 /* 3073 * nobody else is going to jump in and write the ctree 3074 * super here because the log_commit atomic below is protecting 3075 * us. We must be called with a transaction handle pinning 3076 * the running transaction open, so a full commit can't hop 3077 * in and cause problems either. 3078 */ 3079 ret = write_all_supers(fs_info, 1); 3080 if (ret) { 3081 btrfs_set_log_full_commit(fs_info, trans); 3082 btrfs_abort_transaction(trans, ret); 3083 goto out_wake_log_root; 3084 } 3085 3086 mutex_lock(&root->log_mutex); 3087 if (root->last_log_commit < log_transid) 3088 root->last_log_commit = log_transid; 3089 mutex_unlock(&root->log_mutex); 3090 3091 out_wake_log_root: 3092 mutex_lock(&log_root_tree->log_mutex); 3093 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3094 3095 log_root_tree->log_transid_committed++; 3096 atomic_set(&log_root_tree->log_commit[index2], 0); 3097 mutex_unlock(&log_root_tree->log_mutex); 3098 3099 /* 3100 * The barrier before waitqueue_active is implied by mutex_unlock 3101 */ 3102 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 3103 wake_up(&log_root_tree->log_commit_wait[index2]); 3104 out: 3105 mutex_lock(&root->log_mutex); 3106 btrfs_remove_all_log_ctxs(root, index1, ret); 3107 root->log_transid_committed++; 3108 atomic_set(&root->log_commit[index1], 0); 3109 mutex_unlock(&root->log_mutex); 3110 3111 /* 3112 * The barrier before waitqueue_active is implied by mutex_unlock 3113 */ 3114 if (waitqueue_active(&root->log_commit_wait[index1])) 3115 wake_up(&root->log_commit_wait[index1]); 3116 return ret; 3117 } 3118 3119 static void free_log_tree(struct btrfs_trans_handle *trans, 3120 struct btrfs_root *log) 3121 { 3122 int ret; 3123 u64 start; 3124 u64 end; 3125 struct walk_control wc = { 3126 .free = 1, 3127 .process_func = process_one_buffer 3128 }; 3129 3130 ret = walk_log_tree(trans, log, &wc); 3131 /* I don't think this can happen but just in case */ 3132 if (ret) 3133 btrfs_abort_transaction(trans, ret); 3134 3135 while (1) { 3136 ret = find_first_extent_bit(&log->dirty_log_pages, 3137 0, &start, &end, 3138 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, 3139 NULL); 3140 if (ret) 3141 break; 3142 3143 clear_extent_bits(&log->dirty_log_pages, start, end, 3144 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3145 } 3146 3147 /* 3148 * We may have short-circuited the log tree with the full commit logic 3149 * and left ordered extents on our list, so clear these out to keep us 3150 * from leaking inodes and memory.
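 * Each of the two in-flight log transids (0 and 1) keeps its own list
 * of logged extents, hence the two calls below.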
3151 */ 3152 btrfs_free_logged_extents(log, 0); 3153 btrfs_free_logged_extents(log, 1); 3154 3155 free_extent_buffer(log->node); 3156 kfree(log); 3157 } 3158 3159 /* 3160 * free all the extents used by the tree log. This should be called 3161 * at commit time of the full transaction 3162 */ 3163 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3164 { 3165 if (root->log_root) { 3166 free_log_tree(trans, root->log_root); 3167 root->log_root = NULL; 3168 } 3169 return 0; 3170 } 3171 3172 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3173 struct btrfs_fs_info *fs_info) 3174 { 3175 if (fs_info->log_root_tree) { 3176 free_log_tree(trans, fs_info->log_root_tree); 3177 fs_info->log_root_tree = NULL; 3178 } 3179 return 0; 3180 } 3181 3182 /* 3183 * If both a file and directory are logged, and unlinks or renames are 3184 * mixed in, we have a few interesting corners: 3185 * 3186 * create file X in dir Y 3187 * link file X to X.link in dir Y 3188 * fsync file X 3189 * unlink file X but leave X.link 3190 * fsync dir Y 3191 * 3192 * After a crash we would expect only X.link to exist. But file X 3193 * didn't get fsync'd again so the log has back refs for X and X.link. 3194 * 3195 * We solve this by removing directory entries and inode backrefs from the 3196 * log when a file that was logged in the current transaction is 3197 * unlinked. Any later fsync will include the updated log entries, and 3198 * we'll be able to reconstruct the proper directory items from backrefs. 3199 * 3200 * This optimization allows us to avoid relogging the entire inode 3201 * or the entire directory. 3202 */ 3203 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3204 struct btrfs_root *root, 3205 const char *name, int name_len, 3206 struct btrfs_inode *dir, u64 index) 3207 { 3208 struct btrfs_root *log; 3209 struct btrfs_dir_item *di; 3210 struct btrfs_path *path; 3211 int ret; 3212 int err = 0; 3213 int bytes_del = 0; 3214 u64 dir_ino = btrfs_ino(dir); 3215 3216 if (dir->logged_trans < trans->transid) 3217 return 0; 3218 3219 ret = join_running_log_trans(root); 3220 if (ret) 3221 return 0; 3222 3223 mutex_lock(&dir->log_mutex); 3224 3225 log = root->log_root; 3226 path = btrfs_alloc_path(); 3227 if (!path) { 3228 err = -ENOMEM; 3229 goto out_unlock; 3230 } 3231 3232 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3233 name, name_len, -1); 3234 if (IS_ERR(di)) { 3235 err = PTR_ERR(di); 3236 goto fail; 3237 } 3238 if (di) { 3239 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3240 bytes_del += name_len; 3241 if (ret) { 3242 err = ret; 3243 goto fail; 3244 } 3245 } 3246 btrfs_release_path(path); 3247 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3248 index, name, name_len, -1); 3249 if (IS_ERR(di)) { 3250 err = PTR_ERR(di); 3251 goto fail; 3252 } 3253 if (di) { 3254 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3255 bytes_del += name_len; 3256 if (ret) { 3257 err = ret; 3258 goto fail; 3259 } 3260 } 3261 3262 /* update the directory size in the log to reflect the names 3263 * we have removed 3264 */ 3265 if (bytes_del) { 3266 struct btrfs_key key; 3267 3268 key.objectid = dir_ino; 3269 key.offset = 0; 3270 key.type = BTRFS_INODE_ITEM_KEY; 3271 btrfs_release_path(path); 3272 3273 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3274 if (ret < 0) { 3275 err = ret; 3276 goto fail; 3277 } 3278 if (ret == 0) { 3279 struct btrfs_inode_item *item; 3280 u64 i_size; 3281 3282 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3283 struct btrfs_inode_item); 3284 i_size = btrfs_inode_size(path->nodes[0], item); 3285 if (i_size > bytes_del) 3286 i_size -= bytes_del; 3287 else 3288 i_size = 0; 3289 btrfs_set_inode_size(path->nodes[0], item, i_size); 3290 btrfs_mark_buffer_dirty(path->nodes[0]); 3291 } else 3292 ret = 0; 3293 btrfs_release_path(path); 3294 } 3295 fail: 3296 btrfs_free_path(path); 3297 out_unlock: 3298 mutex_unlock(&dir->log_mutex); 3299 if (ret == -ENOSPC) { 3300 btrfs_set_log_full_commit(root->fs_info, trans); 3301 ret = 0; 3302 } else if (ret < 0) 3303 btrfs_abort_transaction(trans, ret); 3304 3305 btrfs_end_log_trans(root); 3306 3307 return err; 3308 } 3309 3310 /* see comments for btrfs_del_dir_entries_in_log */ 3311 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3312 struct btrfs_root *root, 3313 const char *name, int name_len, 3314 struct btrfs_inode *inode, u64 dirid) 3315 { 3316 struct btrfs_fs_info *fs_info = root->fs_info; 3317 struct btrfs_root *log; 3318 u64 index; 3319 int ret; 3320 3321 if (inode->logged_trans < trans->transid) 3322 return 0; 3323 3324 ret = join_running_log_trans(root); 3325 if (ret) 3326 return 0; 3327 log = root->log_root; 3328 mutex_lock(&inode->log_mutex); 3329 3330 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3331 dirid, &index); 3332 mutex_unlock(&inode->log_mutex); 3333 if (ret == -ENOSPC) { 3334 btrfs_set_log_full_commit(fs_info, trans); 3335 ret = 0; 3336 } else if (ret < 0 && ret != -ENOENT) 3337 btrfs_abort_transaction(trans, ret); 3338 btrfs_end_log_trans(root); 3339 3340 return ret; 3341 } 3342 3343 /* 3344 * creates a range item in the log for 'dirid'. first_offset and 3345 * last_offset tell us which parts of the key space the log should 3346 * be considered authoritative for. 3347 */ 3348 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3349 struct btrfs_root *log, 3350 struct btrfs_path *path, 3351 int key_type, u64 dirid, 3352 u64 first_offset, u64 last_offset) 3353 { 3354 int ret; 3355 struct btrfs_key key; 3356 struct btrfs_dir_log_item *item; 3357 3358 key.objectid = dirid; 3359 key.offset = first_offset; 3360 if (key_type == BTRFS_DIR_ITEM_KEY) 3361 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3362 else 3363 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3364 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3365 if (ret) 3366 return ret; 3367 3368 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3369 struct btrfs_dir_log_item); 3370 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3371 btrfs_mark_buffer_dirty(path->nodes[0]); 3372 btrfs_release_path(path); 3373 return 0; 3374 } 3375 3376 /* 3377 * log all the items included in the current transaction for a given 3378 * directory. 
This also creates the range items in the log tree required 3379 * to replay anything deleted before the fsync 3380 */ 3381 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3382 struct btrfs_root *root, struct btrfs_inode *inode, 3383 struct btrfs_path *path, 3384 struct btrfs_path *dst_path, int key_type, 3385 struct btrfs_log_ctx *ctx, 3386 u64 min_offset, u64 *last_offset_ret) 3387 { 3388 struct btrfs_key min_key; 3389 struct btrfs_root *log = root->log_root; 3390 struct extent_buffer *src; 3391 int err = 0; 3392 int ret; 3393 int i; 3394 int nritems; 3395 u64 first_offset = min_offset; 3396 u64 last_offset = (u64)-1; 3397 u64 ino = btrfs_ino(inode); 3398 3399 log = root->log_root; 3400 3401 min_key.objectid = ino; 3402 min_key.type = key_type; 3403 min_key.offset = min_offset; 3404 3405 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3406 3407 /* 3408 * we didn't find anything from this transaction, see if there 3409 * is anything at all 3410 */ 3411 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3412 min_key.objectid = ino; 3413 min_key.type = key_type; 3414 min_key.offset = (u64)-1; 3415 btrfs_release_path(path); 3416 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3417 if (ret < 0) { 3418 btrfs_release_path(path); 3419 return ret; 3420 } 3421 ret = btrfs_previous_item(root, path, ino, key_type); 3422 3423 /* if ret == 0 there are items for this type, 3424 * create a range to tell us the last key of this type. 3425 * otherwise, there are no items in this directory after 3426 * *min_offset, and we create a range to indicate that. 3427 */ 3428 if (ret == 0) { 3429 struct btrfs_key tmp; 3430 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3431 path->slots[0]); 3432 if (key_type == tmp.type) 3433 first_offset = max(min_offset, tmp.offset) + 1; 3434 } 3435 goto done; 3436 } 3437 3438 /* go backward to find any previous key */ 3439 ret = btrfs_previous_item(root, path, ino, key_type); 3440 if (ret == 0) { 3441 struct btrfs_key tmp; 3442 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3443 if (key_type == tmp.type) { 3444 first_offset = tmp.offset; 3445 ret = overwrite_item(trans, log, dst_path, 3446 path->nodes[0], path->slots[0], 3447 &tmp); 3448 if (ret) { 3449 err = ret; 3450 goto done; 3451 } 3452 } 3453 } 3454 btrfs_release_path(path); 3455 3456 /* find the first key from this transaction again */ 3457 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3458 if (WARN_ON(ret != 0)) 3459 goto done; 3460 3461 /* 3462 * we have a block from this transaction, log every item in it 3463 * from our directory 3464 */ 3465 while (1) { 3466 struct btrfs_key tmp; 3467 src = path->nodes[0]; 3468 nritems = btrfs_header_nritems(src); 3469 for (i = path->slots[0]; i < nritems; i++) { 3470 struct btrfs_dir_item *di; 3471 3472 btrfs_item_key_to_cpu(src, &min_key, i); 3473 3474 if (min_key.objectid != ino || min_key.type != key_type) 3475 goto done; 3476 ret = overwrite_item(trans, log, dst_path, src, i, 3477 &min_key); 3478 if (ret) { 3479 err = ret; 3480 goto done; 3481 } 3482 3483 /* 3484 * We must make sure that when we log a directory entry, 3485 * the corresponding inode, after log replay, has a 3486 * matching link count. 
For example: 3487 * 3488 * touch foo 3489 * mkdir mydir 3490 * sync 3491 * ln foo mydir/bar 3492 * xfs_io -c "fsync" mydir 3493 * <crash> 3494 * <mount fs and log replay> 3495 * 3496 * Would result in an fsync log that, when replayed, leaves our 3497 * file inode with a link count of 1, but with two directory entries 3498 * pointing to the same inode. 3499 * After removing one of the names, it would not be 3500 * possible to remove the other name, which would always 3501 * result in stale file handle errors, and it would not 3502 * be possible to rmdir the parent directory, since 3503 * its i_size could never decrement to the value 3504 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 3505 */ 3506 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3507 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3508 if (ctx && 3509 (btrfs_dir_transid(src, di) == trans->transid || 3510 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3511 tmp.type != BTRFS_ROOT_ITEM_KEY) 3512 ctx->log_new_dentries = true; 3513 } 3514 path->slots[0] = nritems; 3515 3516 /* 3517 * look ahead to the next item and see if it is also 3518 * from this directory and from this transaction 3519 */ 3520 ret = btrfs_next_leaf(root, path); 3521 if (ret == 1) { 3522 last_offset = (u64)-1; 3523 goto done; 3524 } 3525 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3526 if (tmp.objectid != ino || tmp.type != key_type) { 3527 last_offset = (u64)-1; 3528 goto done; 3529 } 3530 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3531 ret = overwrite_item(trans, log, dst_path, 3532 path->nodes[0], path->slots[0], 3533 &tmp); 3534 if (ret) 3535 err = ret; 3536 else 3537 last_offset = tmp.offset; 3538 goto done; 3539 } 3540 } 3541 done: 3542 btrfs_release_path(path); 3543 btrfs_release_path(dst_path); 3544 3545 if (err == 0) { 3546 *last_offset_ret = last_offset; 3547 /* 3548 * insert the log range keys to indicate where the log 3549 * is valid 3550 */ 3551 ret = insert_dir_log_key(trans, log, path, key_type, 3552 ino, first_offset, last_offset); 3553 if (ret) 3554 err = ret; 3555 } 3556 return err; 3557 } 3558 3559 /* 3560 * logging directories is very similar to logging inodes. We find all the items 3561 * from the current transaction and write them to the log. 3562 * 3563 * The recovery code scans the directory in the subvolume, and if it finds a 3564 * key in the range logged that is not present in the log tree, then it means 3565 * that dir entry was unlinked during the transaction. 3566 * 3567 * In order for that scan to work, we must include one key smaller than 3568 * the smallest key logged by this transaction and one key larger than the largest 3569 * key logged by this transaction. 3570 */ 3571 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3572 struct btrfs_root *root, struct btrfs_inode *inode, 3573 struct btrfs_path *path, 3574 struct btrfs_path *dst_path, 3575 struct btrfs_log_ctx *ctx) 3576 { 3577 u64 min_key; 3578 u64 max_key; 3579 int ret; 3580 int key_type = BTRFS_DIR_ITEM_KEY; 3581 3582 again: 3583 min_key = 0; 3584 max_key = 0; 3585 while (1) { 3586 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3587 ctx, min_key, &max_key); 3588 if (ret) 3589 return ret; 3590 if (max_key == (u64)-1) 3591 break; 3592 min_key = max_key + 1; 3593 } 3594 3595 if (key_type == BTRFS_DIR_ITEM_KEY) { 3596 key_type = BTRFS_DIR_INDEX_KEY; 3597 goto again; 3598 } 3599 return 0; 3600 } 3601 3602 /* 3603 * a helper function to drop items from the log before we relog an 3604 * inode.
max_key_type indicates the highest item type to remove. 3605 * This cannot be run for file data extents because it does not 3606 * free the extents they point to. 3607 */ 3608 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3609 struct btrfs_root *log, 3610 struct btrfs_path *path, 3611 u64 objectid, int max_key_type) 3612 { 3613 int ret; 3614 struct btrfs_key key; 3615 struct btrfs_key found_key; 3616 int start_slot; 3617 3618 key.objectid = objectid; 3619 key.type = max_key_type; 3620 key.offset = (u64)-1; 3621 3622 while (1) { 3623 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3624 BUG_ON(ret == 0); /* Logic error */ 3625 if (ret < 0) 3626 break; 3627 3628 if (path->slots[0] == 0) 3629 break; 3630 3631 path->slots[0]--; 3632 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3633 path->slots[0]); 3634 3635 if (found_key.objectid != objectid) 3636 break; 3637 3638 found_key.offset = 0; 3639 found_key.type = 0; 3640 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3641 &start_slot); 3642 3643 ret = btrfs_del_items(trans, log, path, start_slot, 3644 path->slots[0] - start_slot + 1); 3645 /* 3646 * If start slot isn't 0 then we don't need to re-search, we've 3647 * found the last guy with the objectid in this tree. 3648 */ 3649 if (ret || start_slot != 0) 3650 break; 3651 btrfs_release_path(path); 3652 } 3653 btrfs_release_path(path); 3654 if (ret > 0) 3655 ret = 0; 3656 return ret; 3657 } 3658 3659 static void fill_inode_item(struct btrfs_trans_handle *trans, 3660 struct extent_buffer *leaf, 3661 struct btrfs_inode_item *item, 3662 struct inode *inode, int log_inode_only, 3663 u64 logged_isize) 3664 { 3665 struct btrfs_map_token token; 3666 3667 btrfs_init_map_token(&token); 3668 3669 if (log_inode_only) { 3670 /* set the generation to zero so the recovery code 3671 * can tell the difference between logging 3672 * just to say 'this inode exists' and logging 3673 * to say 'update this inode with these values' 3674 */ 3675 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3676 btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 3677 } else { 3678 btrfs_set_token_inode_generation(leaf, item, 3679 BTRFS_I(inode)->generation, 3680 &token); 3681 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3682 } 3683 3684 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3685 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3686 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3687 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3688 3689 btrfs_set_token_timespec_sec(leaf, &item->atime, 3690 inode->i_atime.tv_sec, &token); 3691 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3692 inode->i_atime.tv_nsec, &token); 3693 3694 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3695 inode->i_mtime.tv_sec, &token); 3696 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3697 inode->i_mtime.tv_nsec, &token); 3698 3699 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3700 inode->i_ctime.tv_sec, &token); 3701 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3702 inode->i_ctime.tv_nsec, &token); 3703 3704 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3705 &token); 3706 3707 btrfs_set_token_inode_sequence(leaf, item, 3708 inode_peek_iversion(inode), &token); 3709 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3710 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3711 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3712 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3713 } 3714 3715 static int log_inode_item(struct btrfs_trans_handle *trans, 3716 struct btrfs_root *log, struct btrfs_path *path, 3717 struct btrfs_inode *inode) 3718 { 3719 struct btrfs_inode_item *inode_item; 3720 int ret; 3721 3722 ret = btrfs_insert_empty_item(trans, log, path, 3723 &inode->location, sizeof(*inode_item)); 3724 if (ret && ret != -EEXIST) 3725 return ret; 3726 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3727 struct btrfs_inode_item); 3728 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3729 0, 0); 3730 btrfs_release_path(path); 3731 return 0; 3732 } 3733 3734 static noinline int copy_items(struct btrfs_trans_handle *trans, 3735 struct btrfs_inode *inode, 3736 struct btrfs_path *dst_path, 3737 struct btrfs_path *src_path, u64 *last_extent, 3738 int start_slot, int nr, int inode_only, 3739 u64 logged_isize) 3740 { 3741 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 3742 unsigned long src_offset; 3743 unsigned long dst_offset; 3744 struct btrfs_root *log = inode->root->log_root; 3745 struct btrfs_file_extent_item *extent; 3746 struct btrfs_inode_item *inode_item; 3747 struct extent_buffer *src = src_path->nodes[0]; 3748 struct btrfs_key first_key, last_key, key; 3749 int ret; 3750 struct btrfs_key *ins_keys; 3751 u32 *ins_sizes; 3752 char *ins_data; 3753 int i; 3754 struct list_head ordered_sums; 3755 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3756 bool has_extents = false; 3757 bool need_find_last_extent = true; 3758 bool done = false; 3759 3760 INIT_LIST_HEAD(&ordered_sums); 3761 3762 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3763 nr * sizeof(u32), GFP_NOFS); 3764 if (!ins_data) 3765 return -ENOMEM; 3766 3767 first_key.objectid = (u64)-1; 3768 3769 ins_sizes = (u32 *)ins_data; 3770 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3771 3772 for (i = 0; i < nr; i++) { 3773 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3774 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3775 } 3776 ret = btrfs_insert_empty_items(trans, log, dst_path, 3777 ins_keys, ins_sizes, nr); 3778 if (ret) { 3779 kfree(ins_data); 3780 return ret; 3781 } 3782 3783 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3784 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3785 dst_path->slots[0]); 3786 3787 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3788 3789 if (i == nr - 1) 3790 last_key = ins_keys[i]; 3791 3792 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3793 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3794 dst_path->slots[0], 3795 struct btrfs_inode_item); 3796 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3797 &inode->vfs_inode, 3798 inode_only == LOG_INODE_EXISTS, 3799 logged_isize); 3800 } else { 3801 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3802 src_offset, ins_sizes[i]); 3803 } 3804 3805 /* 3806 * We set need_find_last_extent here in case we know we were 3807 * processing other items and then walk into the first extent in 3808 * the inode. If we don't hit an extent then nothing changes, 3809 * we'll do the last search the next time around. 
3810 */ 3811 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3812 has_extents = true; 3813 if (first_key.objectid == (u64)-1) 3814 first_key = ins_keys[i]; 3815 } else { 3816 need_find_last_extent = false; 3817 } 3818 3819 /* take a reference on file data extents so that truncates 3820 * or deletes of this inode don't have to relog the inode 3821 * again 3822 */ 3823 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3824 !skip_csum) { 3825 int found_type; 3826 extent = btrfs_item_ptr(src, start_slot + i, 3827 struct btrfs_file_extent_item); 3828 3829 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3830 continue; 3831 3832 found_type = btrfs_file_extent_type(src, extent); 3833 if (found_type == BTRFS_FILE_EXTENT_REG) { 3834 u64 ds, dl, cs, cl; 3835 ds = btrfs_file_extent_disk_bytenr(src, 3836 extent); 3837 /* ds == 0 is a hole */ 3838 if (ds == 0) 3839 continue; 3840 3841 dl = btrfs_file_extent_disk_num_bytes(src, 3842 extent); 3843 cs = btrfs_file_extent_offset(src, extent); 3844 cl = btrfs_file_extent_num_bytes(src, 3845 extent); 3846 if (btrfs_file_extent_compression(src, 3847 extent)) { 3848 cs = 0; 3849 cl = dl; 3850 } 3851 3852 ret = btrfs_lookup_csums_range( 3853 fs_info->csum_root, 3854 ds + cs, ds + cs + cl - 1, 3855 &ordered_sums, 0); 3856 if (ret) { 3857 btrfs_release_path(dst_path); 3858 kfree(ins_data); 3859 return ret; 3860 } 3861 } 3862 } 3863 } 3864 3865 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3866 btrfs_release_path(dst_path); 3867 kfree(ins_data); 3868 3869 /* 3870 * we have to do this after the loop above to avoid changing the 3871 * log tree while trying to change the log tree. 3872 */ 3873 ret = 0; 3874 while (!list_empty(&ordered_sums)) { 3875 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3876 struct btrfs_ordered_sum, 3877 list); 3878 if (!ret) 3879 ret = btrfs_csum_file_blocks(trans, log, sums); 3880 list_del(&sums->list); 3881 kfree(sums); 3882 } 3883 3884 if (!has_extents) 3885 return ret; 3886 3887 if (need_find_last_extent && *last_extent == first_key.offset) { 3888 /* 3889 * We don't have any leafs between our current one and the one 3890 * we processed before that can have file extent items for our 3891 * inode (and have a generation number smaller than our current 3892 * transaction id). 3893 */ 3894 need_find_last_extent = false; 3895 } 3896 3897 /* 3898 * Because we use btrfs_search_forward we could skip leaves that were 3899 * not modified and then assume *last_extent is valid when it really 3900 * isn't. So back up to the previous leaf and read the end of the last 3901 * extent before we go and fill in holes. 
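 * As an example (offsets picked arbitrarily): if a skipped, unmodified
 * leaf ends with an extent covering up to 64K and the leaf we landed on
 * starts with an extent at 128K, backing up lets us start hole filling
 * at 64K instead of at a stale *last_extent.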
3902 */ 3903 if (need_find_last_extent) { 3904 u64 len; 3905 3906 ret = btrfs_prev_leaf(inode->root, src_path); 3907 if (ret < 0) 3908 return ret; 3909 if (ret) 3910 goto fill_holes; 3911 if (src_path->slots[0]) 3912 src_path->slots[0]--; 3913 src = src_path->nodes[0]; 3914 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3915 if (key.objectid != btrfs_ino(inode) || 3916 key.type != BTRFS_EXTENT_DATA_KEY) 3917 goto fill_holes; 3918 extent = btrfs_item_ptr(src, src_path->slots[0], 3919 struct btrfs_file_extent_item); 3920 if (btrfs_file_extent_type(src, extent) == 3921 BTRFS_FILE_EXTENT_INLINE) { 3922 len = btrfs_file_extent_inline_len(src, 3923 src_path->slots[0], 3924 extent); 3925 *last_extent = ALIGN(key.offset + len, 3926 fs_info->sectorsize); 3927 } else { 3928 len = btrfs_file_extent_num_bytes(src, extent); 3929 *last_extent = key.offset + len; 3930 } 3931 } 3932 fill_holes: 3933 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3934 * things could have happened 3935 * 3936 * 1) A merge could have happened, so we could currently be on a leaf 3937 * that holds what we were copying in the first place. 3938 * 2) A split could have happened, and now not all of the items we want 3939 * are on the same leaf. 3940 * 3941 * So we need to adjust how we search for holes, we need to drop the 3942 * path and re-search for the first extent key we found, and then walk 3943 * forward until we hit the last one we copied. 3944 */ 3945 if (need_find_last_extent) { 3946 /* btrfs_prev_leaf could return 1 without releasing the path */ 3947 btrfs_release_path(src_path); 3948 ret = btrfs_search_slot(NULL, inode->root, &first_key, 3949 src_path, 0, 0); 3950 if (ret < 0) 3951 return ret; 3952 ASSERT(ret == 0); 3953 src = src_path->nodes[0]; 3954 i = src_path->slots[0]; 3955 } else { 3956 i = start_slot; 3957 } 3958 3959 /* 3960 * Ok so here we need to go through and fill in any holes we may have 3961 * to make sure that holes are punched for those areas in case they had 3962 * extents previously. 3963 */ 3964 while (!done) { 3965 u64 offset, len; 3966 u64 extent_end; 3967 3968 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3969 ret = btrfs_next_leaf(inode->root, src_path); 3970 if (ret < 0) 3971 return ret; 3972 ASSERT(ret == 0); 3973 src = src_path->nodes[0]; 3974 i = 0; 3975 } 3976 3977 btrfs_item_key_to_cpu(src, &key, i); 3978 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3979 done = true; 3980 if (key.objectid != btrfs_ino(inode) || 3981 key.type != BTRFS_EXTENT_DATA_KEY) { 3982 i++; 3983 continue; 3984 } 3985 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3986 if (btrfs_file_extent_type(src, extent) == 3987 BTRFS_FILE_EXTENT_INLINE) { 3988 len = btrfs_file_extent_inline_len(src, i, extent); 3989 extent_end = ALIGN(key.offset + len, 3990 fs_info->sectorsize); 3991 } else { 3992 len = btrfs_file_extent_num_bytes(src, extent); 3993 extent_end = key.offset + len; 3994 } 3995 i++; 3996 3997 if (*last_extent == key.offset) { 3998 *last_extent = extent_end; 3999 continue; 4000 } 4001 offset = *last_extent; 4002 len = key.offset - *last_extent; 4003 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 4004 offset, 0, 0, len, 0, len, 0, 0, 0); 4005 if (ret) 4006 break; 4007 *last_extent = extent_end; 4008 } 4009 /* 4010 * Need to let the callers know we dropped the path so they should 4011 * re-search. 
4012 */ 4013 if (!ret && need_find_last_extent) 4014 ret = 1; 4015 return ret; 4016 } 4017 4018 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 4019 { 4020 struct extent_map *em1, *em2; 4021 4022 em1 = list_entry(a, struct extent_map, list); 4023 em2 = list_entry(b, struct extent_map, list); 4024 4025 if (em1->start < em2->start) 4026 return -1; 4027 else if (em1->start > em2->start) 4028 return 1; 4029 return 0; 4030 } 4031 4032 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 4033 struct inode *inode, 4034 struct btrfs_root *root, 4035 const struct extent_map *em, 4036 const struct list_head *logged_list, 4037 bool *ordered_io_error) 4038 { 4039 struct btrfs_fs_info *fs_info = root->fs_info; 4040 struct btrfs_ordered_extent *ordered; 4041 struct btrfs_root *log = root->log_root; 4042 u64 mod_start = em->mod_start; 4043 u64 mod_len = em->mod_len; 4044 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 4045 u64 csum_offset; 4046 u64 csum_len; 4047 LIST_HEAD(ordered_sums); 4048 int ret = 0; 4049 4050 *ordered_io_error = false; 4051 4052 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4053 em->block_start == EXTENT_MAP_HOLE) 4054 return 0; 4055 4056 /* 4057 * Wait for any ordered extent that covers our extent map. If it 4058 * finishes without an error, first check and see if our csums are on 4059 * our outstanding ordered extents. 4060 */ 4061 list_for_each_entry(ordered, logged_list, log_list) { 4062 struct btrfs_ordered_sum *sum; 4063 4064 if (!mod_len) 4065 break; 4066 4067 if (ordered->file_offset + ordered->len <= mod_start || 4068 mod_start + mod_len <= ordered->file_offset) 4069 continue; 4070 4071 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 4072 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 4073 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 4074 const u64 start = ordered->file_offset; 4075 const u64 end = ordered->file_offset + ordered->len - 1; 4076 4077 WARN_ON(ordered->inode != inode); 4078 filemap_fdatawrite_range(inode->i_mapping, start, end); 4079 } 4080 4081 wait_event(ordered->wait, 4082 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 4083 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 4084 4085 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 4086 /* 4087 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 4088 * i_mapping flags, so that the next fsync won't get 4089 * an outdated io error too. 4090 */ 4091 filemap_check_errors(inode->i_mapping); 4092 *ordered_io_error = true; 4093 break; 4094 } 4095 /* 4096 * We are going to copy all the csums on this ordered extent, so 4097 * go ahead and adjust mod_start and mod_len in case this 4098 * ordered extent has already been logged. 4099 */ 4100 if (ordered->file_offset > mod_start) { 4101 if (ordered->file_offset + ordered->len >= 4102 mod_start + mod_len) 4103 mod_len = ordered->file_offset - mod_start; 4104 /* 4105 * If we have this case 4106 * 4107 * |--------- logged extent ---------| 4108 * |----- ordered extent ----| 4109 * 4110 * Just don't mess with mod_start and mod_len, we'll 4111 * just end up logging more csums than we need and it 4112 * will be ok.
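 * The mirrored case, where the ordered extent starts at or before the logged extent, is what the else branch below handles: there mod_start is advanced past the end of the ordered extent, or mod_len becomes zero when the ordered extent covers the whole logged range.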
4113 */ 4114 } else { 4115 if (ordered->file_offset + ordered->len < 4116 mod_start + mod_len) { 4117 mod_len = (mod_start + mod_len) - 4118 (ordered->file_offset + ordered->len); 4119 mod_start = ordered->file_offset + 4120 ordered->len; 4121 } else { 4122 mod_len = 0; 4123 } 4124 } 4125 4126 if (skip_csum) 4127 continue; 4128 4129 /* 4130 * To keep us from looping for the above case of an ordered 4131 * extent that falls inside of the logged extent. 4132 */ 4133 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 4134 &ordered->flags)) 4135 continue; 4136 4137 list_for_each_entry(sum, &ordered->list, list) { 4138 ret = btrfs_csum_file_blocks(trans, log, sum); 4139 if (ret) 4140 break; 4141 } 4142 } 4143 4144 if (*ordered_io_error || !mod_len || ret || skip_csum) 4145 return ret; 4146 4147 if (em->compress_type) { 4148 csum_offset = 0; 4149 csum_len = max(em->block_len, em->orig_block_len); 4150 } else { 4151 csum_offset = mod_start - em->start; 4152 csum_len = mod_len; 4153 } 4154 4155 /* block start is already adjusted for the file extent offset. */ 4156 ret = btrfs_lookup_csums_range(fs_info->csum_root, 4157 em->block_start + csum_offset, 4158 em->block_start + csum_offset + 4159 csum_len - 1, &ordered_sums, 0); 4160 if (ret) 4161 return ret; 4162 4163 while (!list_empty(&ordered_sums)) { 4164 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4165 struct btrfs_ordered_sum, 4166 list); 4167 if (!ret) 4168 ret = btrfs_csum_file_blocks(trans, log, sums); 4169 list_del(&sums->list); 4170 kfree(sums); 4171 } 4172 4173 return ret; 4174 } 4175 4176 static int log_one_extent(struct btrfs_trans_handle *trans, 4177 struct btrfs_inode *inode, struct btrfs_root *root, 4178 const struct extent_map *em, 4179 struct btrfs_path *path, 4180 const struct list_head *logged_list, 4181 struct btrfs_log_ctx *ctx) 4182 { 4183 struct btrfs_root *log = root->log_root; 4184 struct btrfs_file_extent_item *fi; 4185 struct extent_buffer *leaf; 4186 struct btrfs_map_token token; 4187 struct btrfs_key key; 4188 u64 extent_offset = em->start - em->orig_start; 4189 u64 block_len; 4190 int ret; 4191 int extent_inserted = 0; 4192 bool ordered_io_err = false; 4193 4194 ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, 4195 logged_list, &ordered_io_err); 4196 if (ret) 4197 return ret; 4198 4199 if (ordered_io_err) { 4200 ctx->io_err = -EIO; 4201 return ctx->io_err; 4202 } 4203 4204 btrfs_init_map_token(&token); 4205 4206 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4207 em->start + em->len, NULL, 0, 1, 4208 sizeof(*fi), &extent_inserted); 4209 if (ret) 4210 return ret; 4211 4212 if (!extent_inserted) { 4213 key.objectid = btrfs_ino(inode); 4214 key.type = BTRFS_EXTENT_DATA_KEY; 4215 key.offset = em->start; 4216 4217 ret = btrfs_insert_empty_item(trans, log, path, &key, 4218 sizeof(*fi)); 4219 if (ret) 4220 return ret; 4221 } 4222 leaf = path->nodes[0]; 4223 fi = btrfs_item_ptr(leaf, path->slots[0], 4224 struct btrfs_file_extent_item); 4225 4226 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4227 &token); 4228 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4229 btrfs_set_token_file_extent_type(leaf, fi, 4230 BTRFS_FILE_EXTENT_PREALLOC, 4231 &token); 4232 else 4233 btrfs_set_token_file_extent_type(leaf, fi, 4234 BTRFS_FILE_EXTENT_REG, 4235 &token); 4236 4237 block_len = max(em->block_len, em->orig_block_len); 4238 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4239 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4240 em->block_start, 4241 &token); 
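/* Descriptive note: for compressed extents the item references the whole on-disk extent, i.e. its start (set above) and its full compressed length (set below); the offset into the uncompressed data is carried by the extent_offset field set further down. */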
4242 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4243 &token); 4244 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4245 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4246 em->block_start - 4247 extent_offset, &token); 4248 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4249 &token); 4250 } else { 4251 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4252 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4253 &token); 4254 } 4255 4256 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4257 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4258 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4259 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4260 &token); 4261 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4262 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4263 btrfs_mark_buffer_dirty(leaf); 4264 4265 btrfs_release_path(path); 4266 4267 return ret; 4268 } 4269 4270 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4271 struct btrfs_root *root, 4272 struct btrfs_inode *inode, 4273 struct btrfs_path *path, 4274 struct list_head *logged_list, 4275 struct btrfs_log_ctx *ctx, 4276 const u64 start, 4277 const u64 end) 4278 { 4279 struct extent_map *em, *n; 4280 struct list_head extents; 4281 struct extent_map_tree *tree = &inode->extent_tree; 4282 u64 logged_start, logged_end; 4283 u64 test_gen; 4284 int ret = 0; 4285 int num = 0; 4286 4287 INIT_LIST_HEAD(&extents); 4288 4289 down_write(&inode->dio_sem); 4290 write_lock(&tree->lock); 4291 test_gen = root->fs_info->last_trans_committed; 4292 logged_start = start; 4293 logged_end = end; 4294 4295 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4296 list_del_init(&em->list); 4297 /* 4298 * Just an arbitrary number, this can be really CPU intensive 4299 * once we start getting a lot of extents, and really once we 4300 * have a bunch of extents we just want to commit since it will 4301 * be faster. 4302 */ 4303 if (++num > 32768) { 4304 list_del_init(&tree->modified_extents); 4305 ret = -EFBIG; 4306 goto process; 4307 } 4308 4309 if (em->generation <= test_gen) 4310 continue; 4311 4312 if (em->start < logged_start) 4313 logged_start = em->start; 4314 if ((em->start + em->len - 1) > logged_end) 4315 logged_end = em->start + em->len - 1; 4316 4317 /* Need a ref to keep it from getting evicted from cache */ 4318 refcount_inc(&em->refs); 4319 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4320 list_add_tail(&em->list, &extents); 4321 num++; 4322 } 4323 4324 list_sort(NULL, &extents, extent_cmp); 4325 btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); 4326 /* 4327 * Some ordered extents started by fsync might have completed 4328 * before we could collect them into the list logged_list, which 4329 * means they're gone, not in our logged_list nor in the inode's 4330 * ordered tree. We want the application/user space to know an 4331 * error happened while attempting to persist file data so that 4332 * it can take proper action. If such error happened, we leave 4333 * without writing to the log tree and the fsync must report the 4334 * file data write error and not commit the current transaction. 
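 * That detection is what the filemap_check_errors() call below is for: it returns (and clears) any AS_EIO/AS_ENOSPC error recorded on the inode's mapping, and we store it in the log context so the fsync fails instead of silently losing data.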
4335 */ 4336 ret = filemap_check_errors(inode->vfs_inode.i_mapping); 4337 if (ret) 4338 ctx->io_err = ret; 4339 process: 4340 while (!list_empty(&extents)) { 4341 em = list_entry(extents.next, struct extent_map, list); 4342 4343 list_del_init(&em->list); 4344 4345 /* 4346 * If we had an error we just need to delete everybody from our 4347 * private list. 4348 */ 4349 if (ret) { 4350 clear_em_logging(tree, em); 4351 free_extent_map(em); 4352 continue; 4353 } 4354 4355 write_unlock(&tree->lock); 4356 4357 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4358 ctx); 4359 write_lock(&tree->lock); 4360 clear_em_logging(tree, em); 4361 free_extent_map(em); 4362 } 4363 WARN_ON(!list_empty(&extents)); 4364 write_unlock(&tree->lock); 4365 up_write(&inode->dio_sem); 4366 4367 btrfs_release_path(path); 4368 return ret; 4369 } 4370 4371 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4372 struct btrfs_path *path, u64 *size_ret) 4373 { 4374 struct btrfs_key key; 4375 int ret; 4376 4377 key.objectid = btrfs_ino(inode); 4378 key.type = BTRFS_INODE_ITEM_KEY; 4379 key.offset = 0; 4380 4381 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4382 if (ret < 0) { 4383 return ret; 4384 } else if (ret > 0) { 4385 *size_ret = 0; 4386 } else { 4387 struct btrfs_inode_item *item; 4388 4389 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4390 struct btrfs_inode_item); 4391 *size_ret = btrfs_inode_size(path->nodes[0], item); 4392 } 4393 4394 btrfs_release_path(path); 4395 return 0; 4396 } 4397 4398 /* 4399 * At the moment we always log all xattrs. This is to figure out at log replay 4400 * time which xattrs must have their deletion replayed. If a xattr is missing 4401 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4402 * because if a xattr is deleted and the inode is then fsynced before a power 4403 * failure happens, we want the xattr to no longer exist after the log is 4404 * replayed on the next mount (same behaviour as other filesystems 4405 * with a journal: ext3/4, xfs, f2fs, etc).
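 * An illustrative sequence (setfattr being just one of several ways to manipulate xattrs): setfattr -n user.foo -v bar file; sync; setfattr -x user.foo file; xfs_io -c fsync file; <power fail>; mount fs, trigger log replay. After replay the xattr user.foo must not exist anymore, which is only possible because the log holds the inode's complete xattr set, letting replay delete whatever is missing from it.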
4406 */ 4407 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4408 struct btrfs_root *root, 4409 struct btrfs_inode *inode, 4410 struct btrfs_path *path, 4411 struct btrfs_path *dst_path) 4412 { 4413 int ret; 4414 struct btrfs_key key; 4415 const u64 ino = btrfs_ino(inode); 4416 int ins_nr = 0; 4417 int start_slot = 0; 4418 4419 key.objectid = ino; 4420 key.type = BTRFS_XATTR_ITEM_KEY; 4421 key.offset = 0; 4422 4423 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4424 if (ret < 0) 4425 return ret; 4426 4427 while (true) { 4428 int slot = path->slots[0]; 4429 struct extent_buffer *leaf = path->nodes[0]; 4430 int nritems = btrfs_header_nritems(leaf); 4431 4432 if (slot >= nritems) { 4433 if (ins_nr > 0) { 4434 u64 last_extent = 0; 4435 4436 ret = copy_items(trans, inode, dst_path, path, 4437 &last_extent, start_slot, 4438 ins_nr, 1, 0); 4439 /* can't be 1, extent items aren't processed */ 4440 ASSERT(ret <= 0); 4441 if (ret < 0) 4442 return ret; 4443 ins_nr = 0; 4444 } 4445 ret = btrfs_next_leaf(root, path); 4446 if (ret < 0) 4447 return ret; 4448 else if (ret > 0) 4449 break; 4450 continue; 4451 } 4452 4453 btrfs_item_key_to_cpu(leaf, &key, slot); 4454 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4455 break; 4456 4457 if (ins_nr == 0) 4458 start_slot = slot; 4459 ins_nr++; 4460 path->slots[0]++; 4461 cond_resched(); 4462 } 4463 if (ins_nr > 0) { 4464 u64 last_extent = 0; 4465 4466 ret = copy_items(trans, inode, dst_path, path, 4467 &last_extent, start_slot, 4468 ins_nr, 1, 0); 4469 /* can't be 1, extent items aren't processed */ 4470 ASSERT(ret <= 0); 4471 if (ret < 0) 4472 return ret; 4473 } 4474 4475 return 0; 4476 } 4477 4478 /* 4479 * If the no holes feature is enabled we need to make sure any hole between the 4480 * last extent and the i_size of our inode is explicitly marked in the log. This 4481 * is to make sure that doing something like: 4482 * 4483 * 1) create file with 128Kb of data 4484 * 2) truncate file to 64Kb 4485 * 3) truncate file to 256Kb 4486 * 4) fsync file 4487 * 5) <crash/power failure> 4488 * 6) mount fs and trigger log replay 4489 * 4490 * Will give us a file with a size of 256Kb, the first 64Kb of data match what 4491 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4492 * file correspond to a hole. The presence of explicit holes in a log tree is 4493 * what guarantees that log replay will remove/adjust file extent items in the 4494 * fs/subvol tree. 4495 * 4496 * Here we do not need to care about holes between extents, that is already done 4497 * by copy_items(). We also only need to do this in the full sync path, where we 4498 * lookup for extents from the fs/subvol tree only. In the fast path case, we 4499 * lookup the list of modified extent maps and if any represents a hole, we 4500 * insert a corresponding extent representing a hole in the log tree. 
4501 */ 4502 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4503 struct btrfs_root *root, 4504 struct btrfs_inode *inode, 4505 struct btrfs_path *path) 4506 { 4507 struct btrfs_fs_info *fs_info = root->fs_info; 4508 int ret; 4509 struct btrfs_key key; 4510 u64 hole_start; 4511 u64 hole_size; 4512 struct extent_buffer *leaf; 4513 struct btrfs_root *log = root->log_root; 4514 const u64 ino = btrfs_ino(inode); 4515 const u64 i_size = i_size_read(&inode->vfs_inode); 4516 4517 if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4518 return 0; 4519 4520 key.objectid = ino; 4521 key.type = BTRFS_EXTENT_DATA_KEY; 4522 key.offset = (u64)-1; 4523 4524 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4525 ASSERT(ret != 0); 4526 if (ret < 0) 4527 return ret; 4528 4529 ASSERT(path->slots[0] > 0); 4530 path->slots[0]--; 4531 leaf = path->nodes[0]; 4532 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4533 4534 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4535 /* inode does not have any extents */ 4536 hole_start = 0; 4537 hole_size = i_size; 4538 } else { 4539 struct btrfs_file_extent_item *extent; 4540 u64 len; 4541 4542 /* 4543 * If there's an extent beyond i_size, an explicit hole was 4544 * already inserted by copy_items(). 4545 */ 4546 if (key.offset >= i_size) 4547 return 0; 4548 4549 extent = btrfs_item_ptr(leaf, path->slots[0], 4550 struct btrfs_file_extent_item); 4551 4552 if (btrfs_file_extent_type(leaf, extent) == 4553 BTRFS_FILE_EXTENT_INLINE) { 4554 len = btrfs_file_extent_inline_len(leaf, 4555 path->slots[0], 4556 extent); 4557 ASSERT(len == i_size || 4558 (len == fs_info->sectorsize && 4559 btrfs_file_extent_compression(leaf, extent) != 4560 BTRFS_COMPRESS_NONE)); 4561 return 0; 4562 } 4563 4564 len = btrfs_file_extent_num_bytes(leaf, extent); 4565 /* Last extent goes beyond i_size, no need to log a hole. */ 4566 if (key.offset + len > i_size) 4567 return 0; 4568 hole_start = key.offset + len; 4569 hole_size = i_size - hole_start; 4570 } 4571 btrfs_release_path(path); 4572 4573 /* Last extent ends at i_size. */ 4574 if (hole_size == 0) 4575 return 0; 4576 4577 hole_size = ALIGN(hole_size, fs_info->sectorsize); 4578 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4579 hole_size, 0, hole_size, 0, 0, 0); 4580 return ret; 4581 } 4582 4583 /* 4584 * When we are logging a new inode X, check if it doesn't have a reference that 4585 * matches the reference from some other inode Y created in a past transaction 4586 * and that was renamed in the current transaction. If we don't do this, then at 4587 * log replay time we can lose inode Y (and all its files if it's a directory): 4588 * 4589 * mkdir /mnt/x 4590 * echo "hello world" > /mnt/x/foobar 4591 * sync 4592 * mv /mnt/x /mnt/y 4593 * mkdir /mnt/x # or touch /mnt/x 4594 * xfs_io -c fsync /mnt/x 4595 * <power fail> 4596 * mount fs, trigger log replay 4597 * 4598 * After the log replay procedure, we would lose the first directory and all its 4599 * files (file foobar). 
4600 * For the case where inode Y is not a directory we simply end up losing it: 4601 * 4602 * echo "123" > /mnt/foo 4603 * sync 4604 * mv /mnt/foo /mnt/bar 4605 * echo "abc" > /mnt/foo 4606 * xfs_io -c fsync /mnt/foo 4607 * <power fail> 4608 * 4609 * We also need this for cases where a snapshot entry is replaced by some other 4610 * entry (file or directory) otherwise we end up with an unreplayable log due to 4611 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4612 * if it were a regular entry: 4613 * 4614 * mkdir /mnt/x 4615 * btrfs subvolume snapshot /mnt /mnt/x/snap 4616 * btrfs subvolume delete /mnt/x/snap 4617 * rmdir /mnt/x 4618 * mkdir /mnt/x 4619 * fsync /mnt/x or fsync some new file inside it 4620 * <power fail> 4621 * 4622 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4623 * the same transaction. 4624 */ 4625 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4626 const int slot, 4627 const struct btrfs_key *key, 4628 struct btrfs_inode *inode, 4629 u64 *other_ino) 4630 { 4631 int ret; 4632 struct btrfs_path *search_path; 4633 char *name = NULL; 4634 u32 name_len = 0; 4635 u32 item_size = btrfs_item_size_nr(eb, slot); 4636 u32 cur_offset = 0; 4637 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4638 4639 search_path = btrfs_alloc_path(); 4640 if (!search_path) 4641 return -ENOMEM; 4642 search_path->search_commit_root = 1; 4643 search_path->skip_locking = 1; 4644 4645 while (cur_offset < item_size) { 4646 u64 parent; 4647 u32 this_name_len; 4648 u32 this_len; 4649 unsigned long name_ptr; 4650 struct btrfs_dir_item *di; 4651 4652 if (key->type == BTRFS_INODE_REF_KEY) { 4653 struct btrfs_inode_ref *iref; 4654 4655 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4656 parent = key->offset; 4657 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4658 name_ptr = (unsigned long)(iref + 1); 4659 this_len = sizeof(*iref) + this_name_len; 4660 } else { 4661 struct btrfs_inode_extref *extref; 4662 4663 extref = (struct btrfs_inode_extref *)(ptr + 4664 cur_offset); 4665 parent = btrfs_inode_extref_parent(eb, extref); 4666 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4667 name_ptr = (unsigned long)&extref->name; 4668 this_len = sizeof(*extref) + this_name_len; 4669 } 4670 4671 if (this_name_len > name_len) { 4672 char *new_name; 4673 4674 new_name = krealloc(name, this_name_len, GFP_NOFS); 4675 if (!new_name) { 4676 ret = -ENOMEM; 4677 goto out; 4678 } 4679 name_len = this_name_len; 4680 name = new_name; 4681 } 4682 4683 read_extent_buffer(eb, name, name_ptr, this_name_len); 4684 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4685 parent, name, this_name_len, 0); 4686 if (di && !IS_ERR(di)) { 4687 struct btrfs_key di_key; 4688 4689 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4690 di, &di_key); 4691 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4692 ret = 1; 4693 *other_ino = di_key.objectid; 4694 } else { 4695 ret = -EAGAIN; 4696 } 4697 goto out; 4698 } else if (IS_ERR(di)) { 4699 ret = PTR_ERR(di); 4700 goto out; 4701 } 4702 btrfs_release_path(search_path); 4703 4704 cur_offset += this_len; 4705 } 4706 ret = 0; 4707 out: 4708 btrfs_free_path(search_path); 4709 kfree(name); 4710 return ret; 4711 } 4712 4713 /* log a single inode in the tree log. 4714 * At least one parent directory for this inode must exist in the tree 4715 * or be logged already. 4716 * 4717 * Any items from this inode changed by the current transaction are copied 4718 * to the log tree. 
An extra reference is taken on any extents in this 4719 * file, allowing us to avoid a whole pile of corner cases around logging 4720 * blocks that have been removed from the tree. 4721 * 4722 * See LOG_INODE_ALL and related defines for a description of what inode_only 4723 * does. 4724 * 4725 * This handles both files and directories. 4726 */ 4727 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4728 struct btrfs_root *root, struct btrfs_inode *inode, 4729 int inode_only, 4730 const loff_t start, 4731 const loff_t end, 4732 struct btrfs_log_ctx *ctx) 4733 { 4734 struct btrfs_fs_info *fs_info = root->fs_info; 4735 struct btrfs_path *path; 4736 struct btrfs_path *dst_path; 4737 struct btrfs_key min_key; 4738 struct btrfs_key max_key; 4739 struct btrfs_root *log = root->log_root; 4740 LIST_HEAD(logged_list); 4741 u64 last_extent = 0; 4742 int err = 0; 4743 int ret; 4744 int nritems; 4745 int ins_start_slot = 0; 4746 int ins_nr; 4747 bool fast_search = false; 4748 u64 ino = btrfs_ino(inode); 4749 struct extent_map_tree *em_tree = &inode->extent_tree; 4750 u64 logged_isize = 0; 4751 bool need_log_inode_item = true; 4752 4753 path = btrfs_alloc_path(); 4754 if (!path) 4755 return -ENOMEM; 4756 dst_path = btrfs_alloc_path(); 4757 if (!dst_path) { 4758 btrfs_free_path(path); 4759 return -ENOMEM; 4760 } 4761 4762 min_key.objectid = ino; 4763 min_key.type = BTRFS_INODE_ITEM_KEY; 4764 min_key.offset = 0; 4765 4766 max_key.objectid = ino; 4767 4768 4769 /* today the code can only do partial logging of directories */ 4770 if (S_ISDIR(inode->vfs_inode.i_mode) || 4771 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4772 &inode->runtime_flags) && 4773 inode_only >= LOG_INODE_EXISTS)) 4774 max_key.type = BTRFS_XATTR_ITEM_KEY; 4775 else 4776 max_key.type = (u8)-1; 4777 max_key.offset = (u64)-1; 4778 4779 /* 4780 * Only run delayed items if we are a dir or a new file. 4781 * Otherwise commit the delayed inode only, which is needed in 4782 * order for the log replay code to mark inodes for link count 4783 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4784 */ 4785 if (S_ISDIR(inode->vfs_inode.i_mode) || 4786 inode->generation > fs_info->last_trans_committed) 4787 ret = btrfs_commit_inode_delayed_items(trans, inode); 4788 else 4789 ret = btrfs_commit_inode_delayed_inode(inode); 4790 4791 if (ret) { 4792 btrfs_free_path(path); 4793 btrfs_free_path(dst_path); 4794 return ret; 4795 } 4796 4797 if (inode_only == LOG_OTHER_INODE) { 4798 inode_only = LOG_INODE_EXISTS; 4799 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4800 } else { 4801 mutex_lock(&inode->log_mutex); 4802 } 4803 4804 /* 4805 * a brute force approach to making sure we get the most uptodate 4806 * copies of everything. 4807 */ 4808 if (S_ISDIR(inode->vfs_inode.i_mode)) { 4809 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4810 4811 if (inode_only == LOG_INODE_EXISTS) 4812 max_key_type = BTRFS_XATTR_ITEM_KEY; 4813 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4814 } else { 4815 if (inode_only == LOG_INODE_EXISTS) { 4816 /* 4817 * Make sure the new inode item we write to the log has 4818 * the same isize as the current one (if it exists). 4819 * This is necessary to prevent data loss after log 4820 * replay, and also to prevent doing a wrong expanding 4821 * truncate - e.g.
create file, write 4K into offset 4822 * 0, fsync, write 4K into offset 4096, add hard link, 4823 * fsync some other file (to sync log), power fail - if 4824 * we use the inode's current i_size, after log replay 4825 * we get an 8Kb file, with the last 4Kb extent as a hole 4826 * (zeroes), as if an expanding truncate happened, 4827 * instead of getting a file of 4Kb only. 4828 */ 4829 err = logged_inode_size(log, inode, path, &logged_isize); 4830 if (err) 4831 goto out_unlock; 4832 } 4833 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4834 &inode->runtime_flags)) { 4835 if (inode_only == LOG_INODE_EXISTS) { 4836 max_key.type = BTRFS_XATTR_ITEM_KEY; 4837 ret = drop_objectid_items(trans, log, path, ino, 4838 max_key.type); 4839 } else { 4840 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4841 &inode->runtime_flags); 4842 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4843 &inode->runtime_flags); 4844 while (1) { 4845 ret = btrfs_truncate_inode_items(trans, 4846 log, &inode->vfs_inode, 0, 0); 4847 if (ret != -EAGAIN) 4848 break; 4849 } 4850 } 4851 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4852 &inode->runtime_flags) || 4853 inode_only == LOG_INODE_EXISTS) { 4854 if (inode_only == LOG_INODE_ALL) 4855 fast_search = true; 4856 max_key.type = BTRFS_XATTR_ITEM_KEY; 4857 ret = drop_objectid_items(trans, log, path, ino, 4858 max_key.type); 4859 } else { 4860 if (inode_only == LOG_INODE_ALL) 4861 fast_search = true; 4862 goto log_extents; 4863 } 4864 4865 } 4866 if (ret) { 4867 err = ret; 4868 goto out_unlock; 4869 } 4870 4871 while (1) { 4872 ins_nr = 0; 4873 ret = btrfs_search_forward(root, &min_key, 4874 path, trans->transid); 4875 if (ret < 0) { 4876 err = ret; 4877 goto out_unlock; 4878 } 4879 if (ret != 0) 4880 break; 4881 again: 4882 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4883 if (min_key.objectid != ino) 4884 break; 4885 if (min_key.type > max_key.type) 4886 break; 4887 4888 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4889 need_log_inode_item = false; 4890 4891 if ((min_key.type == BTRFS_INODE_REF_KEY || 4892 min_key.type == BTRFS_INODE_EXTREF_KEY) && 4893 inode->generation == trans->transid) { 4894 u64 other_ino = 0; 4895 4896 ret = btrfs_check_ref_name_override(path->nodes[0], 4897 path->slots[0], &min_key, inode, 4898 &other_ino); 4899 if (ret < 0) { 4900 err = ret; 4901 goto out_unlock; 4902 } else if (ret > 0 && ctx && 4903 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 4904 struct btrfs_key inode_key; 4905 struct inode *other_inode; 4906 4907 if (ins_nr > 0) { 4908 ins_nr++; 4909 } else { 4910 ins_nr = 1; 4911 ins_start_slot = path->slots[0]; 4912 } 4913 ret = copy_items(trans, inode, dst_path, path, 4914 &last_extent, ins_start_slot, 4915 ins_nr, inode_only, 4916 logged_isize); 4917 if (ret < 0) { 4918 err = ret; 4919 goto out_unlock; 4920 } 4921 ins_nr = 0; 4922 btrfs_release_path(path); 4923 inode_key.objectid = other_ino; 4924 inode_key.type = BTRFS_INODE_ITEM_KEY; 4925 inode_key.offset = 0; 4926 other_inode = btrfs_iget(fs_info->sb, 4927 &inode_key, root, 4928 NULL); 4929 /* 4930 * If the other inode that had a conflicting dir 4931 * entry was deleted in the current transaction, 4932 * we don't need to do more work nor fall back to 4933 * a transaction commit.
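 * (That situation shows up as btrfs_iget() returning -ENOENT, which is why the -ENOENT case below simply moves on to the next key.)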
4934 */ 4935 if (IS_ERR(other_inode) && 4936 PTR_ERR(other_inode) == -ENOENT) { 4937 goto next_key; 4938 } else if (IS_ERR(other_inode)) { 4939 err = PTR_ERR(other_inode); 4940 goto out_unlock; 4941 } 4942 /* 4943 * We are safe logging the other inode without 4944 * acquiring its i_mutex as long as we log with 4945 * the LOG_INODE_EXISTS mode. We're safe against 4946 * concurrent renames of the other inode as well 4947 * because during a rename we pin the log and 4948 * update the log with the new name before we 4949 * unpin it. 4950 */ 4951 err = btrfs_log_inode(trans, root, 4952 BTRFS_I(other_inode), 4953 LOG_OTHER_INODE, 0, LLONG_MAX, 4954 ctx); 4955 iput(other_inode); 4956 if (err) 4957 goto out_unlock; 4958 else 4959 goto next_key; 4960 } 4961 } 4962 4963 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4964 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4965 if (ins_nr == 0) 4966 goto next_slot; 4967 ret = copy_items(trans, inode, dst_path, path, 4968 &last_extent, ins_start_slot, 4969 ins_nr, inode_only, logged_isize); 4970 if (ret < 0) { 4971 err = ret; 4972 goto out_unlock; 4973 } 4974 ins_nr = 0; 4975 if (ret) { 4976 btrfs_release_path(path); 4977 continue; 4978 } 4979 goto next_slot; 4980 } 4981 4982 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4983 ins_nr++; 4984 goto next_slot; 4985 } else if (!ins_nr) { 4986 ins_start_slot = path->slots[0]; 4987 ins_nr = 1; 4988 goto next_slot; 4989 } 4990 4991 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4992 ins_start_slot, ins_nr, inode_only, 4993 logged_isize); 4994 if (ret < 0) { 4995 err = ret; 4996 goto out_unlock; 4997 } 4998 if (ret) { 4999 ins_nr = 0; 5000 btrfs_release_path(path); 5001 continue; 5002 } 5003 ins_nr = 1; 5004 ins_start_slot = path->slots[0]; 5005 next_slot: 5006 5007 nritems = btrfs_header_nritems(path->nodes[0]); 5008 path->slots[0]++; 5009 if (path->slots[0] < nritems) { 5010 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 5011 path->slots[0]); 5012 goto again; 5013 } 5014 if (ins_nr) { 5015 ret = copy_items(trans, inode, dst_path, path, 5016 &last_extent, ins_start_slot, 5017 ins_nr, inode_only, logged_isize); 5018 if (ret < 0) { 5019 err = ret; 5020 goto out_unlock; 5021 } 5022 ret = 0; 5023 ins_nr = 0; 5024 } 5025 btrfs_release_path(path); 5026 next_key: 5027 if (min_key.offset < (u64)-1) { 5028 min_key.offset++; 5029 } else if (min_key.type < max_key.type) { 5030 min_key.type++; 5031 min_key.offset = 0; 5032 } else { 5033 break; 5034 } 5035 } 5036 if (ins_nr) { 5037 ret = copy_items(trans, inode, dst_path, path, &last_extent, 5038 ins_start_slot, ins_nr, inode_only, 5039 logged_isize); 5040 if (ret < 0) { 5041 err = ret; 5042 goto out_unlock; 5043 } 5044 ret = 0; 5045 ins_nr = 0; 5046 } 5047 5048 btrfs_release_path(path); 5049 btrfs_release_path(dst_path); 5050 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5051 if (err) 5052 goto out_unlock; 5053 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5054 btrfs_release_path(path); 5055 btrfs_release_path(dst_path); 5056 err = btrfs_log_trailing_hole(trans, root, inode, path); 5057 if (err) 5058 goto out_unlock; 5059 } 5060 log_extents: 5061 btrfs_release_path(path); 5062 btrfs_release_path(dst_path); 5063 if (need_log_inode_item) { 5064 err = log_inode_item(trans, log, dst_path, inode); 5065 if (err) 5066 goto out_unlock; 5067 } 5068 if (fast_search) { 5069 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5070 &logged_list, ctx, start, end); 5071 if (ret) { 5072 err = ret; 5073 goto 
out_unlock; 5074 } 5075 } else if (inode_only == LOG_INODE_ALL) { 5076 struct extent_map *em, *n; 5077 5078 write_lock(&em_tree->lock); 5079 /* 5080 * We can't just remove every em if we're called for a ranged 5081 * fsync - that is, one that doesn't cover the whole possible 5082 * file range (0 to LLONG_MAX). This is because we can have 5083 * em's that fall outside the range we're logging and therefore 5084 * their ordered operations haven't completed yet 5085 * (btrfs_finish_ordered_io() not invoked yet). This means we 5086 * didn't get their respective file extent item in the fs/subvol 5087 * tree yet, and need to let the next fast fsync (one which 5088 * consults the list of modified extent maps) find the em so 5089 * that it logs a matching file extent item and waits for the 5090 * respective ordered operation to complete (if it's still 5091 * running). 5092 * 5093 * Removing every em outside the range we're logging would make 5094 * the next fast fsync not log their matching file extent items, 5095 * therefore making us lose data after a log replay. 5096 */ 5097 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5098 list) { 5099 const u64 mod_end = em->mod_start + em->mod_len - 1; 5100 5101 if (em->mod_start >= start && mod_end <= end) 5102 list_del_init(&em->list); 5103 } 5104 write_unlock(&em_tree->lock); 5105 } 5106 5107 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5108 ret = log_directory_changes(trans, root, inode, path, dst_path, 5109 ctx); 5110 if (ret) { 5111 err = ret; 5112 goto out_unlock; 5113 } 5114 } 5115 5116 spin_lock(&inode->lock); 5117 inode->logged_trans = trans->transid; 5118 inode->last_log_commit = inode->last_sub_trans; 5119 spin_unlock(&inode->lock); 5120 out_unlock: 5121 if (unlikely(err)) 5122 btrfs_put_logged_extents(&logged_list); 5123 else 5124 btrfs_submit_logged_extents(&logged_list, log); 5125 mutex_unlock(&inode->log_mutex); 5126 5127 btrfs_free_path(path); 5128 btrfs_free_path(dst_path); 5129 return err; 5130 } 5131 5132 /* 5133 * Check if we must fall back to a transaction commit when logging an inode. 5134 * This must be called after logging the inode and is used only in the context 5135 * when fsyncing an inode requires logging some other inode - in which 5136 * case we can't lock the i_mutex of each other inode we need to log as that 5137 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5138 * log inodes up or down in the hierarchy) or rename operations for example. So 5139 * we take the log_mutex of the inode after we have logged it and then check for 5140 * its last_unlink_trans value - this is safe because any task setting 5141 * last_unlink_trans must take the log_mutex and it must do this before it does 5142 * the actual unlink operation, so if we do this check before a concurrent task 5143 * sets last_unlink_trans it means we've logged a consistent version/state of 5144 * all the inode items, otherwise we are not sure and must do a transaction 5145 * commit (the concurrent task might have only updated last_unlink_trans before 5146 * we logged the inode or it might have also done the unlink).
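 * In short, a restatement of the above: last_unlink_trans is always updated under the log_mutex before the unlink itself is done, so sampling it under the log_mutex after logging either proves the logged state is consistent or makes us take the safe path of a full transaction commit.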
5147 */ 5148 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5149 struct btrfs_inode *inode) 5150 { 5151 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5152 bool ret = false; 5153 5154 mutex_lock(&inode->log_mutex); 5155 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5156 /* 5157 * Make sure any commits to the log are forced to be full 5158 * commits. 5159 */ 5160 btrfs_set_log_full_commit(fs_info, trans); 5161 ret = true; 5162 } 5163 mutex_unlock(&inode->log_mutex); 5164 5165 return ret; 5166 } 5167 5168 /* 5169 * follow the dentry parent pointers up the chain and see if any 5170 * of the directories in it require a full commit before they can 5171 * be logged. Returns zero if nothing special needs to be done or 1 if 5172 * a full commit is required. 5173 */ 5174 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5175 struct btrfs_inode *inode, 5176 struct dentry *parent, 5177 struct super_block *sb, 5178 u64 last_committed) 5179 { 5180 int ret = 0; 5181 struct dentry *old_parent = NULL; 5182 struct btrfs_inode *orig_inode = inode; 5183 5184 /* 5185 * for regular files, if the inode is already on disk, we don't 5186 * have to worry about the parents at all. This is because 5187 * we can use the last_unlink_trans field to record renames 5188 * and other fun in this file. 5189 */ 5190 if (S_ISREG(inode->vfs_inode.i_mode) && 5191 inode->generation <= last_committed && 5192 inode->last_unlink_trans <= last_committed) 5193 goto out; 5194 5195 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5196 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5197 goto out; 5198 inode = BTRFS_I(d_inode(parent)); 5199 } 5200 5201 while (1) { 5202 /* 5203 * If we are logging a directory then we start with our inode, 5204 * not our parent's inode, so we need to skip setting the 5205 * logged_trans so that further down in the log code we don't 5206 * think this inode has already been logged. 5207 */ 5208 if (inode != orig_inode) 5209 inode->logged_trans = trans->transid; 5210 smp_mb(); 5211 5212 if (btrfs_must_commit_transaction(trans, inode)) { 5213 ret = 1; 5214 break; 5215 } 5216 5217 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5218 break; 5219 5220 if (IS_ROOT(parent)) { 5221 inode = BTRFS_I(d_inode(parent)); 5222 if (btrfs_must_commit_transaction(trans, inode)) 5223 ret = 1; 5224 break; 5225 } 5226 5227 parent = dget_parent(parent); 5228 dput(old_parent); 5229 old_parent = parent; 5230 inode = BTRFS_I(d_inode(parent)); 5231 5232 } 5233 dput(old_parent); 5234 out: 5235 return ret; 5236 } 5237 5238 struct btrfs_dir_list { 5239 u64 ino; 5240 struct list_head list; 5241 }; 5242 5243 /* 5244 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5245 * details about why it is needed. 5246 * This is a recursive operation - if an existing dentry corresponds to a 5247 * directory, that directory's new entries are logged too (same behaviour as 5248 * ext3/4, xfs, f2fs, reiserfs, nilfs2).
Note that when logging the inodes 5249 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5250 * complains about the following circular lock dependency / possible deadlock: 5251 * 5252 * CPU0 CPU1 5253 * ---- ---- 5254 * lock(&type->i_mutex_dir_key#3/2); 5255 * lock(sb_internal#2); 5256 * lock(&type->i_mutex_dir_key#3/2); 5257 * lock(&sb->s_type->i_mutex_key#14); 5258 * 5259 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5260 * sb_start_intwrite() in btrfs_start_transaction(). 5261 * Not locking i_mutex of the inodes is still safe because: 5262 * 5263 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5264 * that while logging the inode new references (names) are added or removed 5265 * from the inode, leaving the logged inode item with a link count that does 5266 * not match the number of logged inode reference items. This is fine because 5267 * at log replay time we compute the real number of links and correct the 5268 * link count in the inode item (see replay_one_buffer() and 5269 * link_to_fixup_dir()); 5270 * 5271 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 5272 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5273 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5274 * has a size that doesn't match the sum of the lengths of all the logged 5275 * names. This does not result in a problem because if a dir_item key is 5276 * logged but its matching dir_index key is not logged, at log replay time we 5277 * don't use it to replay the respective name (see replay_one_name()). On the 5278 * other hand if only the dir_index key ends up being logged, the respective 5279 * name is added to the fs/subvol tree with both the dir_item and dir_index 5280 * keys created (see replay_one_name()). 5281 * The directory's inode item with a wrong i_size is not a problem as well, 5282 * since we don't use it at log replay time to set the i_size in the inode 5283 * item of the fs/subvol tree (see overwrite_item()). 
5284 */ 5285 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5286 struct btrfs_root *root, 5287 struct btrfs_inode *start_inode, 5288 struct btrfs_log_ctx *ctx) 5289 { 5290 struct btrfs_fs_info *fs_info = root->fs_info; 5291 struct btrfs_root *log = root->log_root; 5292 struct btrfs_path *path; 5293 LIST_HEAD(dir_list); 5294 struct btrfs_dir_list *dir_elem; 5295 int ret = 0; 5296 5297 path = btrfs_alloc_path(); 5298 if (!path) 5299 return -ENOMEM; 5300 5301 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5302 if (!dir_elem) { 5303 btrfs_free_path(path); 5304 return -ENOMEM; 5305 } 5306 dir_elem->ino = btrfs_ino(start_inode); 5307 list_add_tail(&dir_elem->list, &dir_list); 5308 5309 while (!list_empty(&dir_list)) { 5310 struct extent_buffer *leaf; 5311 struct btrfs_key min_key; 5312 int nritems; 5313 int i; 5314 5315 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5316 list); 5317 if (ret) 5318 goto next_dir_inode; 5319 5320 min_key.objectid = dir_elem->ino; 5321 min_key.type = BTRFS_DIR_ITEM_KEY; 5322 min_key.offset = 0; 5323 again: 5324 btrfs_release_path(path); 5325 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5326 if (ret < 0) { 5327 goto next_dir_inode; 5328 } else if (ret > 0) { 5329 ret = 0; 5330 goto next_dir_inode; 5331 } 5332 5333 process_leaf: 5334 leaf = path->nodes[0]; 5335 nritems = btrfs_header_nritems(leaf); 5336 for (i = path->slots[0]; i < nritems; i++) { 5337 struct btrfs_dir_item *di; 5338 struct btrfs_key di_key; 5339 struct inode *di_inode; 5340 struct btrfs_dir_list *new_dir_elem; 5341 int log_mode = LOG_INODE_EXISTS; 5342 int type; 5343 5344 btrfs_item_key_to_cpu(leaf, &min_key, i); 5345 if (min_key.objectid != dir_elem->ino || 5346 min_key.type != BTRFS_DIR_ITEM_KEY) 5347 goto next_dir_inode; 5348 5349 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5350 type = btrfs_dir_type(leaf, di); 5351 if (btrfs_dir_transid(leaf, di) < trans->transid && 5352 type != BTRFS_FT_DIR) 5353 continue; 5354 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5355 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5356 continue; 5357 5358 btrfs_release_path(path); 5359 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5360 if (IS_ERR(di_inode)) { 5361 ret = PTR_ERR(di_inode); 5362 goto next_dir_inode; 5363 } 5364 5365 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5366 iput(di_inode); 5367 break; 5368 } 5369 5370 ctx->log_new_dentries = false; 5371 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5372 log_mode = LOG_INODE_ALL; 5373 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5374 log_mode, 0, LLONG_MAX, ctx); 5375 if (!ret && 5376 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5377 ret = 1; 5378 iput(di_inode); 5379 if (ret) 5380 goto next_dir_inode; 5381 if (ctx->log_new_dentries) { 5382 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5383 GFP_NOFS); 5384 if (!new_dir_elem) { 5385 ret = -ENOMEM; 5386 goto next_dir_inode; 5387 } 5388 new_dir_elem->ino = di_key.objectid; 5389 list_add_tail(&new_dir_elem->list, &dir_list); 5390 } 5391 break; 5392 } 5393 if (i == nritems) { 5394 ret = btrfs_next_leaf(log, path); 5395 if (ret < 0) { 5396 goto next_dir_inode; 5397 } else if (ret > 0) { 5398 ret = 0; 5399 goto next_dir_inode; 5400 } 5401 goto process_leaf; 5402 } 5403 if (min_key.offset < (u64)-1) { 5404 min_key.offset++; 5405 goto again; 5406 } 5407 next_dir_inode: 5408 list_del(&dir_elem->list); 5409 kfree(dir_elem); 5410 } 5411 5412 btrfs_free_path(path); 5413 return ret; 5414 } 5415 5416 static int 
btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5417 struct btrfs_inode *inode, 5418 struct btrfs_log_ctx *ctx) 5419 { 5420 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5421 int ret; 5422 struct btrfs_path *path; 5423 struct btrfs_key key; 5424 struct btrfs_root *root = inode->root; 5425 const u64 ino = btrfs_ino(inode); 5426 5427 path = btrfs_alloc_path(); 5428 if (!path) 5429 return -ENOMEM; 5430 path->skip_locking = 1; 5431 path->search_commit_root = 1; 5432 5433 key.objectid = ino; 5434 key.type = BTRFS_INODE_REF_KEY; 5435 key.offset = 0; 5436 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5437 if (ret < 0) 5438 goto out; 5439 5440 while (true) { 5441 struct extent_buffer *leaf = path->nodes[0]; 5442 int slot = path->slots[0]; 5443 u32 cur_offset = 0; 5444 u32 item_size; 5445 unsigned long ptr; 5446 5447 if (slot >= btrfs_header_nritems(leaf)) { 5448 ret = btrfs_next_leaf(root, path); 5449 if (ret < 0) 5450 goto out; 5451 else if (ret > 0) 5452 break; 5453 continue; 5454 } 5455 5456 btrfs_item_key_to_cpu(leaf, &key, slot); 5457 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5458 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5459 break; 5460 5461 item_size = btrfs_item_size_nr(leaf, slot); 5462 ptr = btrfs_item_ptr_offset(leaf, slot); 5463 while (cur_offset < item_size) { 5464 struct btrfs_key inode_key; 5465 struct inode *dir_inode; 5466 5467 inode_key.type = BTRFS_INODE_ITEM_KEY; 5468 inode_key.offset = 0; 5469 5470 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5471 struct btrfs_inode_extref *extref; 5472 5473 extref = (struct btrfs_inode_extref *) 5474 (ptr + cur_offset); 5475 inode_key.objectid = btrfs_inode_extref_parent( 5476 leaf, extref); 5477 cur_offset += sizeof(*extref); 5478 cur_offset += btrfs_inode_extref_name_len(leaf, 5479 extref); 5480 } else { 5481 inode_key.objectid = key.offset; 5482 cur_offset = item_size; 5483 } 5484 5485 dir_inode = btrfs_iget(fs_info->sb, &inode_key, 5486 root, NULL); 5487 /* If parent inode was deleted, skip it. */ 5488 if (IS_ERR(dir_inode)) 5489 continue; 5490 5491 if (ctx) 5492 ctx->log_new_dentries = false; 5493 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5494 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5495 if (!ret && 5496 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5497 ret = 1; 5498 if (!ret && ctx && ctx->log_new_dentries) 5499 ret = log_new_dir_dentries(trans, root, 5500 BTRFS_I(dir_inode), ctx); 5501 iput(dir_inode); 5502 if (ret) 5503 goto out; 5504 } 5505 path->slots[0]++; 5506 } 5507 ret = 0; 5508 out: 5509 btrfs_free_path(path); 5510 return ret; 5511 } 5512 5513 /* 5514 * helper function around btrfs_log_inode to make sure newly created 5515 * parent directories also end up in the log. 
Only minimal logging (the inode item and 5516 * back references) is done for any parent directories that are older than 5517 * the last committed transaction 5518 */ 5519 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5520 struct btrfs_root *root, 5521 struct btrfs_inode *inode, 5522 struct dentry *parent, 5523 const loff_t start, 5524 const loff_t end, 5525 int inode_only, 5526 struct btrfs_log_ctx *ctx) 5527 { 5528 struct btrfs_fs_info *fs_info = root->fs_info; 5529 struct super_block *sb; 5530 struct dentry *old_parent = NULL; 5531 int ret = 0; 5532 u64 last_committed = fs_info->last_trans_committed; 5533 bool log_dentries = false; 5534 struct btrfs_inode *orig_inode = inode; 5535 5536 sb = inode->vfs_inode.i_sb; 5537 5538 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5539 ret = 1; 5540 goto end_no_trans; 5541 } 5542 5543 /* 5544 * If the previous transaction commit didn't complete, we have to do a 5545 * full commit by ourselves. 5546 */ 5547 if (fs_info->last_trans_log_full_commit > 5548 fs_info->last_trans_committed) { 5549 ret = 1; 5550 goto end_no_trans; 5551 } 5552 5553 if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { 5554 ret = 1; 5555 goto end_no_trans; 5556 } 5557 5558 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5559 last_committed); 5560 if (ret) 5561 goto end_no_trans; 5562 5563 if (btrfs_inode_in_log(inode, trans->transid)) { 5564 ret = BTRFS_NO_LOG_SYNC; 5565 goto end_no_trans; 5566 } 5567 5568 ret = start_log_trans(trans, root, ctx); 5569 if (ret) 5570 goto end_no_trans; 5571 5572 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5573 if (ret) 5574 goto end_trans; 5575 5576 /* 5577 * for regular files, if the inode is already on disk, we don't 5578 * have to worry about the parents at all. This is because 5579 * we can use the last_unlink_trans field to record renames 5580 * and other fun in this file. 5581 */ 5582 if (S_ISREG(inode->vfs_inode.i_mode) && 5583 inode->generation <= last_committed && 5584 inode->last_unlink_trans <= last_committed) { 5585 ret = 0; 5586 goto end_trans; 5587 } 5588 5589 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 5590 log_dentries = true; 5591 5592 /* 5593 * On unlink we must make sure all our current and old parent directory 5594 * inodes are fully logged. This is to prevent leaving dangling 5595 * directory index entries in directories that were our parents but are 5596 * not anymore. Not doing this results in the old parent directory being 5597 * impossible to delete after log replay (rmdir will always fail with 5598 * error -ENOTEMPTY). 5599 * 5600 * Example 1: 5601 * 5602 * mkdir testdir 5603 * touch testdir/foo 5604 * ln testdir/foo testdir/bar 5605 * sync 5606 * unlink testdir/bar 5607 * xfs_io -c fsync testdir/foo 5608 * <power failure> 5609 * mount fs, triggers log replay 5610 * 5611 * If we don't log the parent directory (testdir), after log replay the 5612 * directory still has an entry pointing to the file inode using the bar 5613 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5614 * the file inode has a link count of 1.
5615 * 5616 * Example 2: 5617 * 5618 * mkdir testdir 5619 * touch foo 5620 * ln foo testdir/foo2 5621 * ln foo testdir/foo3 5622 * sync 5623 * unlink testdir/foo3 5624 * xfs_io -c fsync foo 5625 * <power failure> 5626 * mount fs, triggers log replay 5627 * 5628 * Similar to the first example, after log replay the parent directory 5629 * testdir still has an entry pointing to the file inode with name foo3 5630 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5631 * and has a link count of 2. 5632 */ 5633 if (inode->last_unlink_trans > last_committed) { 5634 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5635 if (ret) 5636 goto end_trans; 5637 } 5638 5639 while (1) { 5640 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5641 break; 5642 5643 inode = BTRFS_I(d_inode(parent)); 5644 if (root != inode->root) 5645 break; 5646 5647 if (inode->generation > last_committed) { 5648 ret = btrfs_log_inode(trans, root, inode, 5649 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5650 if (ret) 5651 goto end_trans; 5652 } 5653 if (IS_ROOT(parent)) 5654 break; 5655 5656 parent = dget_parent(parent); 5657 dput(old_parent); 5658 old_parent = parent; 5659 } 5660 if (log_dentries) 5661 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5662 else 5663 ret = 0; 5664 end_trans: 5665 dput(old_parent); 5666 if (ret < 0) { 5667 btrfs_set_log_full_commit(fs_info, trans); 5668 ret = 1; 5669 } 5670 5671 if (ret) 5672 btrfs_remove_log_ctx(root, ctx); 5673 btrfs_end_log_trans(root); 5674 end_no_trans: 5675 return ret; 5676 } 5677 5678 /* 5679 * it is not safe to log a dentry if the chunk root has added new 5680 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5681 * If this returns 1, you must commit the transaction to safely get your 5682 * data on disk.
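 * A simplified sketch of the caller's side (the real fsync path in file.c also handles BTRFS_NO_LOG_SYNC and error cleanup): ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret == 0) ret = btrfs_sync_log(trans, root, &ctx); else ret = btrfs_commit_transaction(trans);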
5683 */ 5684 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5685 struct btrfs_root *root, struct dentry *dentry, 5686 const loff_t start, 5687 const loff_t end, 5688 struct btrfs_log_ctx *ctx) 5689 { 5690 struct dentry *parent = dget_parent(dentry); 5691 int ret; 5692 5693 ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), 5694 parent, start, end, LOG_INODE_ALL, ctx); 5695 dput(parent); 5696 5697 return ret; 5698 } 5699 5700 /* 5701 * should be called during mount to recover and replay any log trees 5702 * from the FS 5703 */ 5704 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5705 { 5706 int ret; 5707 struct btrfs_path *path; 5708 struct btrfs_trans_handle *trans; 5709 struct btrfs_key key; 5710 struct btrfs_key found_key; 5711 struct btrfs_key tmp_key; 5712 struct btrfs_root *log; 5713 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5714 struct walk_control wc = { 5715 .process_func = process_one_buffer, 5716 .stage = 0, 5717 }; 5718 5719 path = btrfs_alloc_path(); 5720 if (!path) 5721 return -ENOMEM; 5722 5723 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5724 5725 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5726 if (IS_ERR(trans)) { 5727 ret = PTR_ERR(trans); 5728 goto error; 5729 } 5730 5731 wc.trans = trans; 5732 wc.pin = 1; 5733 5734 ret = walk_log_tree(trans, log_root_tree, &wc); 5735 if (ret) { 5736 btrfs_handle_fs_error(fs_info, ret, 5737 "Failed to pin buffers while recovering log root tree."); 5738 goto error; 5739 } 5740 5741 again: 5742 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5743 key.offset = (u64)-1; 5744 key.type = BTRFS_ROOT_ITEM_KEY; 5745 5746 while (1) { 5747 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5748 5749 if (ret < 0) { 5750 btrfs_handle_fs_error(fs_info, ret, 5751 "Couldn't find tree log root."); 5752 goto error; 5753 } 5754 if (ret > 0) { 5755 if (path->slots[0] == 0) 5756 break; 5757 path->slots[0]--; 5758 } 5759 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5760 path->slots[0]); 5761 btrfs_release_path(path); 5762 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5763 break; 5764 5765 log = btrfs_read_fs_root(log_root_tree, &found_key); 5766 if (IS_ERR(log)) { 5767 ret = PTR_ERR(log); 5768 btrfs_handle_fs_error(fs_info, ret, 5769 "Couldn't read tree log root."); 5770 goto error; 5771 } 5772 5773 tmp_key.objectid = found_key.offset; 5774 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5775 tmp_key.offset = (u64)-1; 5776 5777 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5778 if (IS_ERR(wc.replay_dest)) { 5779 ret = PTR_ERR(wc.replay_dest); 5780 free_extent_buffer(log->node); 5781 free_extent_buffer(log->commit_root); 5782 kfree(log); 5783 btrfs_handle_fs_error(fs_info, ret, 5784 "Couldn't read target root for tree log recovery."); 5785 goto error; 5786 } 5787 5788 wc.replay_dest->log_root = log; 5789 btrfs_record_root_in_trans(trans, wc.replay_dest); 5790 ret = walk_log_tree(trans, log, &wc); 5791 5792 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5793 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5794 path); 5795 } 5796 5797 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5798 struct btrfs_root *root = wc.replay_dest; 5799 5800 btrfs_release_path(path); 5801 5802 /* 5803 * We have just replayed everything, and the highest 5804 * objectid of fs roots probably has changed in case 5805 * some inode_item's got replayed. 5806 * 5807 * root->objectid_mutex is not acquired as log replay 5808 * could only happen during mount.
5809 */ 5810 ret = btrfs_find_highest_objectid(root, 5811 &root->highest_objectid); 5812 } 5813 5814 key.offset = found_key.offset - 1; 5815 wc.replay_dest->log_root = NULL; 5816 free_extent_buffer(log->node); 5817 free_extent_buffer(log->commit_root); 5818 kfree(log); 5819 5820 if (ret) 5821 goto error; 5822 5823 if (found_key.offset == 0) 5824 break; 5825 } 5826 btrfs_release_path(path); 5827 5828 /* step one is to pin it all, step two is to replay just inodes */ 5829 if (wc.pin) { 5830 wc.pin = 0; 5831 wc.process_func = replay_one_buffer; 5832 wc.stage = LOG_WALK_REPLAY_INODES; 5833 goto again; 5834 } 5835 /* step three is to replay everything */ 5836 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5837 wc.stage++; 5838 goto again; 5839 } 5840 5841 btrfs_free_path(path); 5842 5843 /* step 4: commit the transaction, which also unpins the blocks */ 5844 ret = btrfs_commit_transaction(trans); 5845 if (ret) 5846 return ret; 5847 5848 free_extent_buffer(log_root_tree->node); 5849 log_root_tree->log_root = NULL; 5850 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5851 kfree(log_root_tree); 5852 5853 return 0; 5854 error: 5855 if (wc.trans) 5856 btrfs_end_transaction(wc.trans); 5857 btrfs_free_path(path); 5858 return ret; 5859 } 5860 5861 /* 5862 * there are some corner cases where we want to force a full 5863 * commit instead of allowing a directory to be logged. 5864 * 5865 * They revolve around files that were unlinked from the directory, and 5866 * this function updates the parent directory so that a full commit is 5867 * properly done if it is fsync'd later after the unlinks are done. 5868 * 5869 * Must be called before the unlink operations (updates to the subvolume tree, 5870 * inodes, etc) are done. 5871 */ 5872 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5873 struct btrfs_inode *dir, struct btrfs_inode *inode, 5874 int for_rename) 5875 { 5876 /* 5877 * when we're logging a file, if it hasn't been renamed 5878 * or unlinked, and its inode is fully committed on disk, 5879 * we don't have to worry about walking up the directory chain 5880 * to log its parents. 5881 * 5882 * So, we use the last_unlink_trans field to put this transid 5883 * into the file. When the file is logged we check it and 5884 * don't log the parents if the file is fully on disk. 5885 */ 5886 mutex_lock(&inode->log_mutex); 5887 inode->last_unlink_trans = trans->transid; 5888 mutex_unlock(&inode->log_mutex); 5889 5890 /* 5891 * if this directory was already logged any new 5892 * names for this file/dir will get recorded 5893 */ 5894 smp_mb(); 5895 if (dir->logged_trans == trans->transid) 5896 return; 5897 5898 /* 5899 * if the inode we're about to unlink was logged, 5900 * the log will be properly updated for any new names 5901 */ 5902 if (inode->logged_trans == trans->transid) 5903 return; 5904 5905 /* 5906 * when renaming files across directories, if the directory 5907 * we're unlinking from gets fsync'd later on, there's 5908 * no way to find the destination directory later and fsync it 5909 * properly. So, we have to be conservative and force commits 5910 * so the new name gets discovered.
5911 */ 5912 if (for_rename) 5913 goto record; 5914 5915 /* we can safely do the unlink without any special recording */ 5916 return; 5917 5918 record: 5919 mutex_lock(&dir->log_mutex); 5920 dir->last_unlink_trans = trans->transid; 5921 mutex_unlock(&dir->log_mutex); 5922 } 5923 5924 /* 5925 * Make sure that if someone attempts to fsync the parent directory of a deleted 5926 * snapshot, it ends up triggering a transaction commit. This is to guarantee 5927 * that after replaying the log tree of the parent directory's root we will not 5928 * see the snapshot anymore and at log replay time we will not see any log tree 5929 * corresponding to the deleted snapshot's root, which could lead to replaying 5930 * it after replaying the log tree of the parent directory (which would replay 5931 * the snapshot delete operation). 5932 * 5933 * Must be called before the actual snapshot destroy operation (updates to the 5934 * parent root and tree of tree roots trees, etc) are done. 5935 */ 5936 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 5937 struct btrfs_inode *dir) 5938 { 5939 mutex_lock(&dir->log_mutex); 5940 dir->last_unlink_trans = trans->transid; 5941 mutex_unlock(&dir->log_mutex); 5942 } 5943 5944 /* 5945 * Call this after adding a new name for a file and it will properly 5946 * update the log to reflect the new name. 5947 * 5948 * It will return zero if all goes well, and it will return 1 if a 5949 * full transaction commit is required. 5950 */ 5951 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5952 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 5953 struct dentry *parent) 5954 { 5955 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5956 struct btrfs_root *root = inode->root; 5957 5958 /* 5959 * this will force the logging code to walk the dentry chain 5960 * up for the file 5961 */ 5962 if (!S_ISDIR(inode->vfs_inode.i_mode)) 5963 inode->last_unlink_trans = trans->transid; 5964 5965 /* 5966 * if this inode hasn't been logged and the directory we're renaming it 5967 * from hasn't been logged, we don't need to log it 5968 */ 5969 if (inode->logged_trans <= fs_info->last_trans_committed && 5970 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) 5971 return 0; 5972 5973 return btrfs_log_inode_parent(trans, root, inode, parent, 0, 5974 LLONG_MAX, LOG_INODE_EXISTS, NULL); 5975 } 5976 5977