/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * Replaying the fsync above would unlink the original some_dir without
 * recording it in its new location (foo2).  After a crash, some_dir will
 * be gone unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */
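/*
 * Illustrative sketch (not part of the original file): the check implied
 * by trouble case 1 above.  The field name last_unlink_trans and this
 * helper are assumptions for illustration only; the idea is simply that
 * a directory remembers the transid of its last unlink/rename, and an
 * fsync must fall back to a full transaction commit when that transid is
 * newer than the last committed transaction.
 */
static bool __maybe_unused example_dir_needs_full_commit(struct inode *dir,
							 u64 last_committed)
{
	/* assumed field, updated whenever a name is removed from 'dir' */
	return BTRFS_I(dir)->last_unlink_trans > last_committed;
}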
/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(root->fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&root->fs_info->tree_log_mutex);
		if (!root->fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, root->fs_info);
		mutex_unlock(&root->fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/*
		 * Implicit memory barrier after atomic_dec_and_test
		 */
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}
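/*
 * Illustrative sketch (not part of the original file): how a log
 * transaction is typically bracketed around logging one inode, with
 * error handling trimmed.  An -EAGAIN from start_log_trans() tells the
 * caller the log cannot be used and a full transaction commit is
 * required instead.
 */
static int __maybe_unused example_log_one_inode(struct btrfs_trans_handle *trans,
						struct btrfs_root *root,
						struct inode *inode,
						struct btrfs_log_ctx *ctx)
{
	int ret;

	ret = start_log_trans(trans, root, ctx);	/* join or create the log */
	if (ret)
		return ret;
	ret = btrfs_log_inode(trans, root, inode, LOG_INODE_ALL,
			      0, i_size_read(inode), ctx);
	btrfs_end_log_trans(root);			/* let log syncers proceed */
	return ret;
}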
/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
						      eb->start, eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(log, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
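/*
 * Illustrative sketch (not part of the original file): the walk_control
 * setup for stage 0 of log recovery, where extents are only pinned.  The
 * tree walker that consumes this struct appears later in the file.
 */
static void __maybe_unused example_init_pin_only_walk(struct walk_control *wc,
						      struct btrfs_trans_handle *trans)
{
	memset(wc, 0, sizeof(*wc));
	wc->pin = 1;				/* record extents, replay nothing yet */
	wc->stage = LOG_WALK_PIN_ONLY;
	wc->trans = trans;
	wc->process_func = process_one_buffer;
}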
/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(root, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(root, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}
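/*
 * Illustrative sketch (not part of the original file): the minimal
 * replay step for a single leaf slot.  The real replay stages below do
 * this with additional type-specific handling.
 */
static int __maybe_unused example_replay_one_slot(struct btrfs_trans_handle *trans,
						  struct btrfs_root *root,
						  struct btrfs_path *path,
						  struct extent_buffer *eb,
						  int slot)
{
	struct btrfs_key key;

	btrfs_item_key_to_cpu(eb, &key, slot);
	/* copy the logged item over whatever the subvolume currently has */
	return overwrite_item(trans, root, path, eb, slot, &key);
}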
/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size, root->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(root, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root, root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range.  We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls).  In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other.  For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.  Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will
			 * end up looking at the second csum item only, which
			 * does not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_del_csums(trans,
						      root->fs_info->csum_root,
						      sums->bytenr,
						      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						      root->fs_info->csum_root,
						      sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, root);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}
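/*
 * Illustrative sketch (not part of the original file): how replay uses
 * inode_in_dir() to decide whether a logged back reference still needs
 * work, mirroring the check at the top of add_inode_ref() below.
 */
static int __maybe_unused example_ref_needs_replay(struct btrfs_root *root,
						   struct btrfs_path *path,
						   u64 dirid, u64 ino, u64 index,
						   const char *name, int name_len)
{
	/*
	 * only when both the DIR_INDEX and DIR_ITEM entries exist and point
	 * at 'ino' is the name considered fully present already
	 */
	return !inode_in_dir(root, path, dirid, ino, index, name, name_len);
}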
/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}
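/*
 * Illustrative sketch (not part of the original file): the packed item
 * layout walked above.  A single INODE_REF item can hold several back
 * references:
 *
 *   [ btrfs_inode_ref | name bytes ][ btrfs_inode_ref | name bytes ] ...
 *
 * so consumers advance by sizeof(ref) plus the name length until the
 * item is exhausted.
 */
static u32 __maybe_unused example_count_refs_in_item(struct extent_buffer *eb,
						     int slot)
{
	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
	unsigned long end = ptr + btrfs_item_size_nr(eb, slot);
	u32 count = 0;

	while (ptr < end) {
		struct btrfs_inode_ref *ref = (struct btrfs_inode_ref *)ptr;

		ptr += sizeof(*ref) + btrfs_inode_ref_name_len(eb, ref);
		count++;
	}
	return count;
}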
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct inode *dir, struct inode *inode,
				  struct extent_buffer *eb,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir,
							 inode, victim_name,
							 victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, root);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched the root tree and checked the
		 * corresponding ref, so we do not need to check it again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					inc_nlink(inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
								 victim_parent,
								 inode,
								 victim_name,
								 victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans, root);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
			if (ret)
				return ret;
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
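/*
 * Illustrative sketch (not part of the original file): how the two back
 * reference flavours parsed above are keyed.  Classic refs are addressed
 * directly by the parent's objectid, while extended refs hash the parent
 * and name into the key offset, which is why callers must still compare
 * the name after an extref lookup.
 */
static void __maybe_unused example_build_ref_keys(struct btrfs_key *ref_key,
						  struct btrfs_key *extref_key,
						  u64 ino, u64 parent,
						  const char *name, int name_len)
{
	ref_key->objectid = ino;
	ref_key->type = BTRFS_INODE_REF_KEY;
	ref_key->offset = parent;		/* parent directory inode number */

	extref_key->objectid = ino;
	extref_key->type = BTRFS_INODE_EXTREF_KEY;
	extref_key->offset = btrfs_extref_hash(parent, name, name_len);
}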
/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
				  ref_index, name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata.  if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      dir, inode, eb,
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, dir, inode, name, namelen,
					     0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}
static int count_inode_refs(struct btrfs_root *root,
			    struct inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}
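/*
 * Illustrative sketch (not part of the original file): the key shape
 * used by link_to_fixup_dir() above.  All pending link count checks live
 * under one well known objectid with the inode number as the offset, so
 * fixup_inode_link_counts() can find them with a single descending scan.
 */
static void __maybe_unused example_build_fixup_key(struct btrfs_key *key, u64 ino)
{
	key->objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key->type = BTRFS_ORPHAN_ITEM_KEY;
	key->offset = ino;			/* one fixup item per inode */
}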
/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}
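/*
 * Illustrative sketch (not part of the original file): the two directory
 * key flavours handled by replay_one_name() above.  DIR_ITEM keys are
 * addressed by a hash of the name, while DIR_INDEX keys carry the unique
 * sequence number needed for inserts, which is why new names are only
 * inserted for BTRFS_DIR_INDEX_KEY.
 */
static void __maybe_unused example_build_dir_keys(struct btrfs_key *item_key,
						  struct btrfs_key *index_key,
						  u64 dirid, u64 index,
						  const char *name, int name_len)
{
	item_key->objectid = dirid;
	item_key->type = BTRFS_DIR_ITEM_KEY;
	item_key->offset = btrfs_name_hash(name, name_len);

	index_key->objectid = dirid;
	index_key->type = BTRFS_DIR_INDEX_KEY;
	index_key->offset = index;		/* unique within the directory */
}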
/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY items
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fixup the link count of
		 * the inode the entry points to.  Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for this
		 * dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir.  This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}
/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
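/*
 * Illustrative sketch (not part of the original file): walking every
 * range the log claims for a directory, in the style of
 * replay_dir_deletes().  Each successful call yields [start, end]; the
 * caller checks the subvolume keys in that range and then continues from
 * end + 1 until find_dir_range() reports there are no more ranges.
 */
static int __maybe_unused example_walk_dir_ranges(struct btrfs_root *log,
						  struct btrfs_path *path,
						  u64 dirid, int key_type)
{
	u64 start = 0;
	u64 end = 0;
	int ret;

	while (1) {
		ret = find_dir_range(log, path, dirid, key_type, &start, &end);
		if (ret != 0)
			break;
		/* ... examine every subvolume key in [start, end] here ... */
		if (end == (u64)-1)
			break;
		start = end + 1;
	}
	return ret < 0 ? ret : 0;
}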
If the directory 1953 * item is not in the log, the item is removed and the inode it points 1954 * to is unlinked 1955 */ 1956 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1957 struct btrfs_root *root, 1958 struct btrfs_root *log, 1959 struct btrfs_path *path, 1960 struct btrfs_path *log_path, 1961 struct inode *dir, 1962 struct btrfs_key *dir_key) 1963 { 1964 int ret; 1965 struct extent_buffer *eb; 1966 int slot; 1967 u32 item_size; 1968 struct btrfs_dir_item *di; 1969 struct btrfs_dir_item *log_di; 1970 int name_len; 1971 unsigned long ptr; 1972 unsigned long ptr_end; 1973 char *name; 1974 struct inode *inode; 1975 struct btrfs_key location; 1976 1977 again: 1978 eb = path->nodes[0]; 1979 slot = path->slots[0]; 1980 item_size = btrfs_item_size_nr(eb, slot); 1981 ptr = btrfs_item_ptr_offset(eb, slot); 1982 ptr_end = ptr + item_size; 1983 while (ptr < ptr_end) { 1984 di = (struct btrfs_dir_item *)ptr; 1985 if (verify_dir_item(root, eb, di)) { 1986 ret = -EIO; 1987 goto out; 1988 } 1989 1990 name_len = btrfs_dir_name_len(eb, di); 1991 name = kmalloc(name_len, GFP_NOFS); 1992 if (!name) { 1993 ret = -ENOMEM; 1994 goto out; 1995 } 1996 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1997 name_len); 1998 log_di = NULL; 1999 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2000 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2001 dir_key->objectid, 2002 name, name_len, 0); 2003 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2004 log_di = btrfs_lookup_dir_index_item(trans, log, 2005 log_path, 2006 dir_key->objectid, 2007 dir_key->offset, 2008 name, name_len, 0); 2009 } 2010 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2011 btrfs_dir_item_key_to_cpu(eb, di, &location); 2012 btrfs_release_path(path); 2013 btrfs_release_path(log_path); 2014 inode = read_one_inode(root, location.objectid); 2015 if (!inode) { 2016 kfree(name); 2017 return -EIO; 2018 } 2019 2020 ret = link_to_fixup_dir(trans, root, 2021 path, location.objectid); 2022 if (ret) { 2023 kfree(name); 2024 iput(inode); 2025 goto out; 2026 } 2027 2028 inc_nlink(inode); 2029 ret = btrfs_unlink_inode(trans, root, dir, inode, 2030 name, name_len); 2031 if (!ret) 2032 ret = btrfs_run_delayed_items(trans, root); 2033 kfree(name); 2034 iput(inode); 2035 if (ret) 2036 goto out; 2037 2038 /* there might still be more names under this key 2039 * check and repeat if required 2040 */ 2041 ret = btrfs_search_slot(NULL, root, dir_key, path, 2042 0, 0); 2043 if (ret == 0) 2044 goto again; 2045 ret = 0; 2046 goto out; 2047 } else if (IS_ERR(log_di)) { 2048 kfree(name); 2049 return PTR_ERR(log_di); 2050 } 2051 btrfs_release_path(log_path); 2052 kfree(name); 2053 2054 ptr = (unsigned long)(di + 1); 2055 ptr += name_len; 2056 } 2057 ret = 0; 2058 out: 2059 btrfs_release_path(path); 2060 btrfs_release_path(log_path); 2061 return ret; 2062 } 2063 2064 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2065 struct btrfs_root *root, 2066 struct btrfs_root *log, 2067 struct btrfs_path *path, 2068 const u64 ino) 2069 { 2070 struct btrfs_key search_key; 2071 struct btrfs_path *log_path; 2072 int i; 2073 int nritems; 2074 int ret; 2075 2076 log_path = btrfs_alloc_path(); 2077 if (!log_path) 2078 return -ENOMEM; 2079 2080 search_key.objectid = ino; 2081 search_key.type = BTRFS_XATTR_ITEM_KEY; 2082 search_key.offset = 0; 2083 again: 2084 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2085 if (ret < 0) 2086 goto out; 2087 process_leaf: 2088 nritems = 
btrfs_header_nritems(path->nodes[0]); 2089 for (i = path->slots[0]; i < nritems; i++) { 2090 struct btrfs_key key; 2091 struct btrfs_dir_item *di; 2092 struct btrfs_dir_item *log_di; 2093 u32 total_size; 2094 u32 cur; 2095 2096 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2097 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2098 ret = 0; 2099 goto out; 2100 } 2101 2102 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2103 total_size = btrfs_item_size_nr(path->nodes[0], i); 2104 cur = 0; 2105 while (cur < total_size) { 2106 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2107 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2108 u32 this_len = sizeof(*di) + name_len + data_len; 2109 char *name; 2110 2111 name = kmalloc(name_len, GFP_NOFS); 2112 if (!name) { 2113 ret = -ENOMEM; 2114 goto out; 2115 } 2116 read_extent_buffer(path->nodes[0], name, 2117 (unsigned long)(di + 1), name_len); 2118 2119 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2120 name, name_len, 0); 2121 btrfs_release_path(log_path); 2122 if (!log_di) { 2123 /* Doesn't exist in log tree, so delete it. */ 2124 btrfs_release_path(path); 2125 di = btrfs_lookup_xattr(trans, root, path, ino, 2126 name, name_len, -1); 2127 kfree(name); 2128 if (IS_ERR(di)) { 2129 ret = PTR_ERR(di); 2130 goto out; 2131 } 2132 ASSERT(di); 2133 ret = btrfs_delete_one_dir_name(trans, root, 2134 path, di); 2135 if (ret) 2136 goto out; 2137 btrfs_release_path(path); 2138 search_key = key; 2139 goto again; 2140 } 2141 kfree(name); 2142 if (IS_ERR(log_di)) { 2143 ret = PTR_ERR(log_di); 2144 goto out; 2145 } 2146 cur += this_len; 2147 di = (struct btrfs_dir_item *)((char *)di + this_len); 2148 } 2149 } 2150 ret = btrfs_next_leaf(root, path); 2151 if (ret > 0) 2152 ret = 0; 2153 else if (ret == 0) 2154 goto process_leaf; 2155 out: 2156 btrfs_free_path(log_path); 2157 btrfs_release_path(path); 2158 return ret; 2159 } 2160 2161 2162 /* 2163 * deletion replay happens before we copy any new directory items 2164 * out of the log or out of backreferences from inodes. It 2165 * scans the log to find ranges of keys that the log is authoritative for, 2166 * and then scans the directory to find items in those ranges that are 2167 * not present in the log. 2168 * 2169 * Anything we don't find in the log is unlinked and removed from the 2170 * directory.
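*
* For example (an illustrative sequence, not taken from the code):
*
* touch dir/foo
* sync <- foo's dir entry reaches the subvolume tree
* rm dir/foo
* fsync(dir) <- the log records the key ranges, but no entry for foo
* <crash and log replay>
*
* foo's stale entry in the subvolume falls inside a logged range but has
* no match in the log, so replay unlinks it again.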
2171 */ 2172 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2173 struct btrfs_root *root, 2174 struct btrfs_root *log, 2175 struct btrfs_path *path, 2176 u64 dirid, int del_all) 2177 { 2178 u64 range_start; 2179 u64 range_end; 2180 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2181 int ret = 0; 2182 struct btrfs_key dir_key; 2183 struct btrfs_key found_key; 2184 struct btrfs_path *log_path; 2185 struct inode *dir; 2186 2187 dir_key.objectid = dirid; 2188 dir_key.type = BTRFS_DIR_ITEM_KEY; 2189 log_path = btrfs_alloc_path(); 2190 if (!log_path) 2191 return -ENOMEM; 2192 2193 dir = read_one_inode(root, dirid); 2194 /* it isn't an error if the inode isn't there, that can happen 2195 * because we replay the deletes before we copy in the inode item 2196 * from the log 2197 */ 2198 if (!dir) { 2199 btrfs_free_path(log_path); 2200 return 0; 2201 } 2202 again: 2203 range_start = 0; 2204 range_end = 0; 2205 while (1) { 2206 if (del_all) 2207 range_end = (u64)-1; 2208 else { 2209 ret = find_dir_range(log, path, dirid, key_type, 2210 &range_start, &range_end); 2211 if (ret != 0) 2212 break; 2213 } 2214 2215 dir_key.offset = range_start; 2216 while (1) { 2217 int nritems; 2218 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2219 0, 0); 2220 if (ret < 0) 2221 goto out; 2222 2223 nritems = btrfs_header_nritems(path->nodes[0]); 2224 if (path->slots[0] >= nritems) { 2225 ret = btrfs_next_leaf(root, path); 2226 if (ret) 2227 break; 2228 } 2229 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2230 path->slots[0]); 2231 if (found_key.objectid != dirid || 2232 found_key.type != dir_key.type) 2233 goto next_type; 2234 2235 if (found_key.offset > range_end) 2236 break; 2237 2238 ret = check_item_in_log(trans, root, log, path, 2239 log_path, dir, 2240 &found_key); 2241 if (ret) 2242 goto out; 2243 if (found_key.offset == (u64)-1) 2244 break; 2245 dir_key.offset = found_key.offset + 1; 2246 } 2247 btrfs_release_path(path); 2248 if (range_end == (u64)-1) 2249 break; 2250 range_start = range_end + 1; 2251 } 2252 2253 next_type: 2254 ret = 0; 2255 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2256 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2257 dir_key.type = BTRFS_DIR_INDEX_KEY; 2258 btrfs_release_path(path); 2259 goto again; 2260 } 2261 out: 2262 btrfs_release_path(path); 2263 btrfs_free_path(log_path); 2264 iput(dir); 2265 return ret; 2266 } 2267 2268 /* 2269 * the process_func used to replay items from the log tree. This 2270 * gets called in two different stages. The first stage just looks 2271 * for inodes and makes sure they are all copied into the subvolume. 2272 * 2273 * The second stage copies all the other item types from the log into 2274 * the subvolume. The two stage approach is slower, but gets rid of 2275 * lots of complexity around inodes referencing other inodes that exist 2276 * only in the log (references come from either directory items or inode 2277 * back refs). 
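*
* A sketch of why the stages matter: a dir item in the log may point to
* an inode whose inode item also exists only in the log. Replaying that
* dir item first would reference an inode the subvolume does not have
* yet. Creating every logged inode during the first stage lets the later
* stages assume their target inodes exist.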
2278 */ 2279 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2280 struct walk_control *wc, u64 gen) 2281 { 2282 int nritems; 2283 struct btrfs_path *path; 2284 struct btrfs_root *root = wc->replay_dest; 2285 struct btrfs_key key; 2286 int level; 2287 int i; 2288 int ret; 2289 2290 ret = btrfs_read_buffer(eb, gen); 2291 if (ret) 2292 return ret; 2293 2294 level = btrfs_header_level(eb); 2295 2296 if (level != 0) 2297 return 0; 2298 2299 path = btrfs_alloc_path(); 2300 if (!path) 2301 return -ENOMEM; 2302 2303 nritems = btrfs_header_nritems(eb); 2304 for (i = 0; i < nritems; i++) { 2305 btrfs_item_key_to_cpu(eb, &key, i); 2306 2307 /* inode keys are done during the first stage */ 2308 if (key.type == BTRFS_INODE_ITEM_KEY && 2309 wc->stage == LOG_WALK_REPLAY_INODES) { 2310 struct btrfs_inode_item *inode_item; 2311 u32 mode; 2312 2313 inode_item = btrfs_item_ptr(eb, i, 2314 struct btrfs_inode_item); 2315 ret = replay_xattr_deletes(wc->trans, root, log, 2316 path, key.objectid); 2317 if (ret) 2318 break; 2319 mode = btrfs_inode_mode(eb, inode_item); 2320 if (S_ISDIR(mode)) { 2321 ret = replay_dir_deletes(wc->trans, 2322 root, log, path, key.objectid, 0); 2323 if (ret) 2324 break; 2325 } 2326 ret = overwrite_item(wc->trans, root, path, 2327 eb, i, &key); 2328 if (ret) 2329 break; 2330 2331 /* for regular files, make sure the corresponding 2332 * orphan item exists. extents past the new EOF 2333 * will be truncated later by orphan cleanup. 2334 */ 2335 if (S_ISREG(mode)) { 2336 ret = insert_orphan_item(wc->trans, root, 2337 key.objectid); 2338 if (ret) 2339 break; 2340 } 2341 2342 ret = link_to_fixup_dir(wc->trans, root, 2343 path, key.objectid); 2344 if (ret) 2345 break; 2346 } 2347 2348 if (key.type == BTRFS_DIR_INDEX_KEY && 2349 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2350 ret = replay_one_dir_item(wc->trans, root, path, 2351 eb, i, &key); 2352 if (ret) 2353 break; 2354 } 2355 2356 if (wc->stage < LOG_WALK_REPLAY_ALL) 2357 continue; 2358 2359 /* these keys are simply copied */ 2360 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2361 ret = overwrite_item(wc->trans, root, path, 2362 eb, i, &key); 2363 if (ret) 2364 break; 2365 } else if (key.type == BTRFS_INODE_REF_KEY || 2366 key.type == BTRFS_INODE_EXTREF_KEY) { 2367 ret = add_inode_ref(wc->trans, root, log, path, 2368 eb, i, &key); 2369 if (ret && ret != -ENOENT) 2370 break; 2371 ret = 0; 2372 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2373 ret = replay_one_extent(wc->trans, root, path, 2374 eb, i, &key); 2375 if (ret) 2376 break; 2377 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2378 ret = replay_one_dir_item(wc->trans, root, path, 2379 eb, i, &key); 2380 if (ret) 2381 break; 2382 } 2383 } 2384 btrfs_free_path(path); 2385 return ret; 2386 } 2387 2388 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2389 struct btrfs_root *root, 2390 struct btrfs_path *path, int *level, 2391 struct walk_control *wc) 2392 { 2393 u64 root_owner; 2394 u64 bytenr; 2395 u64 ptr_gen; 2396 struct extent_buffer *next; 2397 struct extent_buffer *cur; 2398 struct extent_buffer *parent; 2399 u32 blocksize; 2400 int ret = 0; 2401 2402 WARN_ON(*level < 0); 2403 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2404 2405 while (*level > 0) { 2406 WARN_ON(*level < 0); 2407 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2408 cur = path->nodes[*level]; 2409 2410 WARN_ON(btrfs_header_level(cur) != *level); 2411 2412 if (path->slots[*level] >= 2413 btrfs_header_nritems(cur)) 2414 break; 2415 2416 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2417 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2418 blocksize = root->nodesize; 2419 2420 parent = path->nodes[*level]; 2421 root_owner = btrfs_header_owner(parent); 2422 2423 next = btrfs_find_create_tree_block(root, bytenr); 2424 if (!next) 2425 return -ENOMEM; 2426 2427 if (*level == 1) { 2428 ret = wc->process_func(root, next, wc, ptr_gen); 2429 if (ret) { 2430 free_extent_buffer(next); 2431 return ret; 2432 } 2433 2434 path->slots[*level]++; 2435 if (wc->free) { 2436 ret = btrfs_read_buffer(next, ptr_gen); 2437 if (ret) { 2438 free_extent_buffer(next); 2439 return ret; 2440 } 2441 2442 if (trans) { 2443 btrfs_tree_lock(next); 2444 btrfs_set_lock_blocking(next); 2445 clean_tree_block(trans, root->fs_info, 2446 next); 2447 btrfs_wait_tree_block_writeback(next); 2448 btrfs_tree_unlock(next); 2449 } 2450 2451 WARN_ON(root_owner != 2452 BTRFS_TREE_LOG_OBJECTID); 2453 ret = btrfs_free_and_pin_reserved_extent(root, 2454 bytenr, blocksize); 2455 if (ret) { 2456 free_extent_buffer(next); 2457 return ret; 2458 } 2459 } 2460 free_extent_buffer(next); 2461 continue; 2462 } 2463 ret = btrfs_read_buffer(next, ptr_gen); 2464 if (ret) { 2465 free_extent_buffer(next); 2466 return ret; 2467 } 2468 2469 WARN_ON(*level <= 0); 2470 if (path->nodes[*level-1]) 2471 free_extent_buffer(path->nodes[*level-1]); 2472 path->nodes[*level-1] = next; 2473 *level = btrfs_header_level(next); 2474 path->slots[*level] = 0; 2475 cond_resched(); 2476 } 2477 WARN_ON(*level < 0); 2478 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2479 2480 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2481 2482 cond_resched(); 2483 return 0; 2484 } 2485 2486 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2487 struct btrfs_root *root, 2488 struct btrfs_path *path, int *level, 2489 struct walk_control *wc) 2490 { 2491 u64 root_owner; 2492 int i; 2493 int slot; 2494 int ret; 2495 2496 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2497 slot = path->slots[i]; 2498 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2499 path->slots[i]++; 2500 *level = i; 2501 WARN_ON(*level == 0); 2502 return 0; 2503 } else { 2504 struct extent_buffer *parent; 2505 if (path->nodes[*level] == root->node) 2506 parent = path->nodes[*level]; 2507 else 2508 parent = path->nodes[*level + 1]; 2509 2510 root_owner = btrfs_header_owner(parent); 2511 ret = wc->process_func(root, path->nodes[*level], wc, 2512 btrfs_header_generation(path->nodes[*level])); 2513 if (ret) 2514 return ret; 2515 2516 if (wc->free) { 2517 struct extent_buffer *next; 2518 2519 next = path->nodes[*level]; 2520 2521 if (trans) { 2522 btrfs_tree_lock(next); 2523 btrfs_set_lock_blocking(next); 2524 clean_tree_block(trans, root->fs_info, 2525 next); 2526 btrfs_wait_tree_block_writeback(next); 2527 btrfs_tree_unlock(next); 2528 } 2529 2530 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2531 ret = btrfs_free_and_pin_reserved_extent(root, 2532 path->nodes[*level]->start, 2533 path->nodes[*level]->len); 2534 if (ret) 2535 return ret; 2536 } 2537 free_extent_buffer(path->nodes[*level]); 2538 path->nodes[*level] = NULL; 2539 *level = i + 1; 2540 } 2541 } 2542 return 1; 2543 } 2544 2545 /* 2546 * drop the reference count on the tree rooted at 'log'. This traverses 2547 * the tree freeing any blocks that have a ref count of zero after being 2548 * decremented.
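*
* Rough shape of the walk (a sketch, not the exact call order): for each
* level 1 node, walk_down_log_tree() runs wc->process_func on every
* level 0 child, walk_up_log_tree() then processes the interior nodes on
* the way back up, and walk_log_tree() finally processes the root block
* it pinned. With wc->free set, each processed block is also cleaned and
* handed to btrfs_free_and_pin_reserved_extent().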
2549 */ 2550 static int walk_log_tree(struct btrfs_trans_handle *trans, 2551 struct btrfs_root *log, struct walk_control *wc) 2552 { 2553 int ret = 0; 2554 int wret; 2555 int level; 2556 struct btrfs_path *path; 2557 int orig_level; 2558 2559 path = btrfs_alloc_path(); 2560 if (!path) 2561 return -ENOMEM; 2562 2563 level = btrfs_header_level(log->node); 2564 orig_level = level; 2565 path->nodes[level] = log->node; 2566 extent_buffer_get(log->node); 2567 path->slots[level] = 0; 2568 2569 while (1) { 2570 wret = walk_down_log_tree(trans, log, path, &level, wc); 2571 if (wret > 0) 2572 break; 2573 if (wret < 0) { 2574 ret = wret; 2575 goto out; 2576 } 2577 2578 wret = walk_up_log_tree(trans, log, path, &level, wc); 2579 if (wret > 0) 2580 break; 2581 if (wret < 0) { 2582 ret = wret; 2583 goto out; 2584 } 2585 } 2586 2587 /* was the root node processed? if not, catch it here */ 2588 if (path->nodes[orig_level]) { 2589 ret = wc->process_func(log, path->nodes[orig_level], wc, 2590 btrfs_header_generation(path->nodes[orig_level])); 2591 if (ret) 2592 goto out; 2593 if (wc->free) { 2594 struct extent_buffer *next; 2595 2596 next = path->nodes[orig_level]; 2597 2598 if (trans) { 2599 btrfs_tree_lock(next); 2600 btrfs_set_lock_blocking(next); 2601 clean_tree_block(trans, log->fs_info, next); 2602 btrfs_wait_tree_block_writeback(next); 2603 btrfs_tree_unlock(next); 2604 } 2605 2606 WARN_ON(log->root_key.objectid != 2607 BTRFS_TREE_LOG_OBJECTID); 2608 ret = btrfs_free_and_pin_reserved_extent(log, next->start, 2609 next->len); 2610 if (ret) 2611 goto out; 2612 } 2613 } 2614 2615 out: 2616 btrfs_free_path(path); 2617 return ret; 2618 } 2619 2620 /* 2621 * helper function to update the item for a given subvolume's log root 2622 * in the tree of log roots 2623 */ 2624 static int update_log_root(struct btrfs_trans_handle *trans, 2625 struct btrfs_root *log) 2626 { 2627 int ret; 2628 2629 if (log->log_transid == 1) { 2630 /* insert root item on the first sync */ 2631 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 2632 &log->root_key, &log->root_item); 2633 } else { 2634 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2635 &log->root_key, &log->root_item); 2636 } 2637 return ret; 2638 } 2639 2640 static void wait_log_commit(struct btrfs_root *root, int transid) 2641 { 2642 DEFINE_WAIT(wait); 2643 int index = transid % 2; 2644 2645 /* 2646 * we only allow two pending log transactions at a time, 2647 * so we know that if ours is more than 2 older than the 2648 * current transaction, we're done 2649 */ 2650 do { 2651 prepare_to_wait(&root->log_commit_wait[index], 2652 &wait, TASK_UNINTERRUPTIBLE); 2653 mutex_unlock(&root->log_mutex); 2654 2655 if (root->log_transid_committed < transid && 2656 atomic_read(&root->log_commit[index])) 2657 schedule(); 2658 2659 finish_wait(&root->log_commit_wait[index], &wait); 2660 mutex_lock(&root->log_mutex); 2661 } while (root->log_transid_committed < transid && 2662 atomic_read(&root->log_commit[index])); 2663 } 2664 2665 static void wait_for_writer(struct btrfs_root *root) 2666 { 2667 DEFINE_WAIT(wait); 2668 2669 while (atomic_read(&root->log_writers)) { 2670 prepare_to_wait(&root->log_writer_wait, 2671 &wait, TASK_UNINTERRUPTIBLE); 2672 mutex_unlock(&root->log_mutex); 2673 if (atomic_read(&root->log_writers)) 2674 schedule(); 2675 finish_wait(&root->log_writer_wait, &wait); 2676 mutex_lock(&root->log_mutex); 2677 } 2678 } 2679 2680 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2681 struct btrfs_log_ctx *ctx) 2682 { 2683 if (!ctx)
2684 return; 2685 2686 mutex_lock(&root->log_mutex); 2687 list_del_init(&ctx->list); 2688 mutex_unlock(&root->log_mutex); 2689 } 2690 2691 /* 2692 * Invoked in log mutex context, or the caller must otherwise make sure 2693 * that no other task can access the list. 2694 */ 2695 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2696 int index, int error) 2697 { 2698 struct btrfs_log_ctx *ctx; 2699 2700 if (!error) { 2701 INIT_LIST_HEAD(&root->log_ctxs[index]); 2702 return; 2703 } 2704 2705 list_for_each_entry(ctx, &root->log_ctxs[index], list) 2706 ctx->log_ret = error; 2707 2708 INIT_LIST_HEAD(&root->log_ctxs[index]); 2709 } 2710 2711 /* 2712 * btrfs_sync_log sends a given tree log down to the disk and 2713 * updates the super blocks to record it. When this call is done, 2714 * you know that any inodes previously logged are safely on disk only 2715 * if it returns 0. 2716 * 2717 * Any other return value means you need to call btrfs_commit_transaction. 2718 * Some of the edge cases for fsyncing directories that have had unlinks 2719 * or renames done in the past mean that sometimes the only safe 2720 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2721 * that has happened. 2722 */ 2723 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2724 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2725 { 2726 int index1; 2727 int index2; 2728 int mark; 2729 int ret; 2730 struct btrfs_root *log = root->log_root; 2731 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2732 int log_transid = 0; 2733 struct btrfs_log_ctx root_log_ctx; 2734 struct blk_plug plug; 2735 2736 mutex_lock(&root->log_mutex); 2737 log_transid = ctx->log_transid; 2738 if (root->log_transid_committed >= log_transid) { 2739 mutex_unlock(&root->log_mutex); 2740 return ctx->log_ret; 2741 } 2742 2743 index1 = log_transid % 2; 2744 if (atomic_read(&root->log_commit[index1])) { 2745 wait_log_commit(root, log_transid); 2746 mutex_unlock(&root->log_mutex); 2747 return ctx->log_ret; 2748 } 2749 ASSERT(log_transid == root->log_transid); 2750 atomic_set(&root->log_commit[index1], 1); 2751 2752 /* wait for previous tree log sync to complete */ 2753 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2754 wait_log_commit(root, log_transid - 1); 2755 2756 while (1) { 2757 int batch = atomic_read(&root->log_batch); 2758 /* when we're on an ssd, just kick the log commit out */ 2759 if (!btrfs_test_opt(root, SSD) && 2760 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2761 mutex_unlock(&root->log_mutex); 2762 schedule_timeout_uninterruptible(1); 2763 mutex_lock(&root->log_mutex); 2764 } 2765 wait_for_writer(root); 2766 if (batch == atomic_read(&root->log_batch)) 2767 break; 2768 } 2769 2770 /* bail out if we need to do a full commit */ 2771 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2772 ret = -EAGAIN; 2773 btrfs_free_logged_extents(log, log_transid); 2774 mutex_unlock(&root->log_mutex); 2775 goto out; 2776 } 2777 2778 if (log_transid % 2 == 0) 2779 mark = EXTENT_DIRTY; 2780 else 2781 mark = EXTENT_NEW; 2782 2783 /* we start IO on all the marked extents here, but we don't actually 2784 * wait for them until later.
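*
* Note how the mark above alternates with log_transid: even transids use
* EXTENT_DIRTY, odd ones EXTENT_NEW. Since at most two log transactions
* can be in flight at once, the idea is that commit N and commit N + 1
* never write or wait on the same set of extent bits.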
2785 */ 2786 blk_start_plug(&plug); 2787 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2788 if (ret) { 2789 blk_finish_plug(&plug); 2790 btrfs_abort_transaction(trans, root, ret); 2791 btrfs_free_logged_extents(log, log_transid); 2792 btrfs_set_log_full_commit(root->fs_info, trans); 2793 mutex_unlock(&root->log_mutex); 2794 goto out; 2795 } 2796 2797 btrfs_set_root_node(&log->root_item, log->node); 2798 2799 root->log_transid++; 2800 log->log_transid = root->log_transid; 2801 root->log_start_pid = 0; 2802 /* 2803 * IO has been started, blocks of the log tree have WRITTEN flag set 2804 * in their headers. new modifications of the log will be written to 2805 * new positions. so it's safe to allow log writers to go in. 2806 */ 2807 mutex_unlock(&root->log_mutex); 2808 2809 btrfs_init_log_ctx(&root_log_ctx); 2810 2811 mutex_lock(&log_root_tree->log_mutex); 2812 atomic_inc(&log_root_tree->log_batch); 2813 atomic_inc(&log_root_tree->log_writers); 2814 2815 index2 = log_root_tree->log_transid % 2; 2816 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2817 root_log_ctx.log_transid = log_root_tree->log_transid; 2818 2819 mutex_unlock(&log_root_tree->log_mutex); 2820 2821 ret = update_log_root(trans, log); 2822 2823 mutex_lock(&log_root_tree->log_mutex); 2824 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2825 /* 2826 * Implicit memory barrier after atomic_dec_and_test 2827 */ 2828 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2829 wake_up(&log_root_tree->log_writer_wait); 2830 } 2831 2832 if (ret) { 2833 if (!list_empty(&root_log_ctx.list)) 2834 list_del_init(&root_log_ctx.list); 2835 2836 blk_finish_plug(&plug); 2837 btrfs_set_log_full_commit(root->fs_info, trans); 2838 2839 if (ret != -ENOSPC) { 2840 btrfs_abort_transaction(trans, root, ret); 2841 mutex_unlock(&log_root_tree->log_mutex); 2842 goto out; 2843 } 2844 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2845 btrfs_free_logged_extents(log, log_transid); 2846 mutex_unlock(&log_root_tree->log_mutex); 2847 ret = -EAGAIN; 2848 goto out; 2849 } 2850 2851 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2852 blk_finish_plug(&plug); 2853 mutex_unlock(&log_root_tree->log_mutex); 2854 ret = root_log_ctx.log_ret; 2855 goto out; 2856 } 2857 2858 index2 = root_log_ctx.log_transid % 2; 2859 if (atomic_read(&log_root_tree->log_commit[index2])) { 2860 blk_finish_plug(&plug); 2861 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, 2862 mark); 2863 btrfs_wait_logged_extents(trans, log, log_transid); 2864 wait_log_commit(log_root_tree, 2865 root_log_ctx.log_transid); 2866 mutex_unlock(&log_root_tree->log_mutex); 2867 if (!ret) 2868 ret = root_log_ctx.log_ret; 2869 goto out; 2870 } 2871 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2872 atomic_set(&log_root_tree->log_commit[index2], 1); 2873 2874 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2875 wait_log_commit(log_root_tree, 2876 root_log_ctx.log_transid - 1); 2877 } 2878 2879 wait_for_writer(log_root_tree); 2880 2881 /* 2882 * now that we've moved on to the tree of log tree roots, 2883 * check the full commit flag again 2884 */ 2885 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2886 blk_finish_plug(&plug); 2887 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2888 btrfs_free_logged_extents(log, log_transid); 2889 mutex_unlock(&log_root_tree->log_mutex); 2890 ret = -EAGAIN; 2891 goto out_wake_log_root; 2892 } 2893 2894 ret = 
btrfs_write_marked_extents(log_root_tree, 2895 &log_root_tree->dirty_log_pages, 2896 EXTENT_DIRTY | EXTENT_NEW); 2897 blk_finish_plug(&plug); 2898 if (ret) { 2899 btrfs_set_log_full_commit(root->fs_info, trans); 2900 btrfs_abort_transaction(trans, root, ret); 2901 btrfs_free_logged_extents(log, log_transid); 2902 mutex_unlock(&log_root_tree->log_mutex); 2903 goto out_wake_log_root; 2904 } 2905 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2906 if (!ret) 2907 ret = btrfs_wait_marked_extents(log_root_tree, 2908 &log_root_tree->dirty_log_pages, 2909 EXTENT_NEW | EXTENT_DIRTY); 2910 if (ret) { 2911 btrfs_set_log_full_commit(root->fs_info, trans); 2912 btrfs_free_logged_extents(log, log_transid); 2913 mutex_unlock(&log_root_tree->log_mutex); 2914 goto out_wake_log_root; 2915 } 2916 btrfs_wait_logged_extents(trans, log, log_transid); 2917 2918 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2919 log_root_tree->node->start); 2920 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2921 btrfs_header_level(log_root_tree->node)); 2922 2923 log_root_tree->log_transid++; 2924 mutex_unlock(&log_root_tree->log_mutex); 2925 2926 /* 2927 * nobody else is going to jump in and write the ctree 2928 * super here because the log_commit atomic below is protecting 2929 * us. We must be called with a transaction handle pinning 2930 * the running transaction open, so a full commit can't hop 2931 * in and cause problems either. 2932 */ 2933 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2934 if (ret) { 2935 btrfs_set_log_full_commit(root->fs_info, trans); 2936 btrfs_abort_transaction(trans, root, ret); 2937 goto out_wake_log_root; 2938 } 2939 2940 mutex_lock(&root->log_mutex); 2941 if (root->last_log_commit < log_transid) 2942 root->last_log_commit = log_transid; 2943 mutex_unlock(&root->log_mutex); 2944 2945 out_wake_log_root: 2946 /* 2947 * We needn't get log_mutex here because we are sure all 2948 * the other tasks are blocked. 2949 */ 2950 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 2951 2952 mutex_lock(&log_root_tree->log_mutex); 2953 log_root_tree->log_transid_committed++; 2954 atomic_set(&log_root_tree->log_commit[index2], 0); 2955 mutex_unlock(&log_root_tree->log_mutex); 2956 2957 /* 2958 * The barrier before waitqueue_active is implied by mutex_unlock 2959 */ 2960 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2961 wake_up(&log_root_tree->log_commit_wait[index2]); 2962 out: 2963 /* See above.
*/ 2964 btrfs_remove_all_log_ctxs(root, index1, ret); 2965 2966 mutex_lock(&root->log_mutex); 2967 root->log_transid_committed++; 2968 atomic_set(&root->log_commit[index1], 0); 2969 mutex_unlock(&root->log_mutex); 2970 2971 /* 2972 * The barrier before waitqueue_active is implied by mutex_unlock 2973 */ 2974 if (waitqueue_active(&root->log_commit_wait[index1])) 2975 wake_up(&root->log_commit_wait[index1]); 2976 return ret; 2977 } 2978 2979 static void free_log_tree(struct btrfs_trans_handle *trans, 2980 struct btrfs_root *log) 2981 { 2982 int ret; 2983 u64 start; 2984 u64 end; 2985 struct walk_control wc = { 2986 .free = 1, 2987 .process_func = process_one_buffer 2988 }; 2989 2990 ret = walk_log_tree(trans, log, &wc); 2991 /* I don't think this can happen but just in case */ 2992 if (ret) 2993 btrfs_abort_transaction(trans, log, ret); 2994 2995 while (1) { 2996 ret = find_first_extent_bit(&log->dirty_log_pages, 2997 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 2998 NULL); 2999 if (ret) 3000 break; 3001 3002 clear_extent_bits(&log->dirty_log_pages, start, end, 3003 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 3004 } 3005 3006 /* 3007 * We may have short-circuited the log tree with the full commit logic 3008 * and left ordered extents on our list, so clear these out to keep us 3009 * from leaking inodes and memory. 3010 */ 3011 btrfs_free_logged_extents(log, 0); 3012 btrfs_free_logged_extents(log, 1); 3013 3014 free_extent_buffer(log->node); 3015 kfree(log); 3016 } 3017 3018 /* 3019 * free all the extents used by the tree log. This should be called 3020 * at commit time of the full transaction 3021 */ 3022 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3023 { 3024 if (root->log_root) { 3025 free_log_tree(trans, root->log_root); 3026 root->log_root = NULL; 3027 } 3028 return 0; 3029 } 3030 3031 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3032 struct btrfs_fs_info *fs_info) 3033 { 3034 if (fs_info->log_root_tree) { 3035 free_log_tree(trans, fs_info->log_root_tree); 3036 fs_info->log_root_tree = NULL; 3037 } 3038 return 0; 3039 } 3040 3041 /* 3042 * If both a file and directory are logged, and unlinks or renames are 3043 * mixed in, we have a few interesting corners: 3044 * 3045 * create file X in dir Y 3046 * link file X to X.link in dir Y 3047 * fsync file X 3048 * unlink file X but leave X.link 3049 * fsync dir Y 3050 * 3051 * After a crash we would expect only X.link to exist. But file X 3052 * didn't get fsync'd again so the log has back refs for X and X.link. 3053 * 3054 * We solve this by removing directory entries and inode backrefs from the 3055 * log when a file that was logged in the current transaction is 3056 * unlinked. Any later fsync will include the updated log entries, and 3057 * we'll be able to reconstruct the proper directory items from backrefs. 3058 * 3059 * This optimization allows us to avoid relogging the entire inode 3060 * or the entire directory.
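*
* In the example above, the unlink of file X removes X's dir entries
* from the log via btrfs_del_dir_entries_in_log() below and the back ref
* for the name via btrfs_del_inode_ref_in_log(), so replaying the log
* written by the later fsync of dir Y recreates only X.link.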
3061 */ 3062 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3063 struct btrfs_root *root, 3064 const char *name, int name_len, 3065 struct inode *dir, u64 index) 3066 { 3067 struct btrfs_root *log; 3068 struct btrfs_dir_item *di; 3069 struct btrfs_path *path; 3070 int ret; 3071 int err = 0; 3072 int bytes_del = 0; 3073 u64 dir_ino = btrfs_ino(dir); 3074 3075 if (BTRFS_I(dir)->logged_trans < trans->transid) 3076 return 0; 3077 3078 ret = join_running_log_trans(root); 3079 if (ret) 3080 return 0; 3081 3082 mutex_lock(&BTRFS_I(dir)->log_mutex); 3083 3084 log = root->log_root; 3085 path = btrfs_alloc_path(); 3086 if (!path) { 3087 err = -ENOMEM; 3088 goto out_unlock; 3089 } 3090 3091 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3092 name, name_len, -1); 3093 if (IS_ERR(di)) { 3094 err = PTR_ERR(di); 3095 goto fail; 3096 } 3097 if (di) { 3098 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3099 bytes_del += name_len; 3100 if (ret) { 3101 err = ret; 3102 goto fail; 3103 } 3104 } 3105 btrfs_release_path(path); 3106 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3107 index, name, name_len, -1); 3108 if (IS_ERR(di)) { 3109 err = PTR_ERR(di); 3110 goto fail; 3111 } 3112 if (di) { 3113 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3114 bytes_del += name_len; 3115 if (ret) { 3116 err = ret; 3117 goto fail; 3118 } 3119 } 3120 3121 /* update the directory size in the log to reflect the names 3122 * we have removed 3123 */ 3124 if (bytes_del) { 3125 struct btrfs_key key; 3126 3127 key.objectid = dir_ino; 3128 key.offset = 0; 3129 key.type = BTRFS_INODE_ITEM_KEY; 3130 btrfs_release_path(path); 3131 3132 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3133 if (ret < 0) { 3134 err = ret; 3135 goto fail; 3136 } 3137 if (ret == 0) { 3138 struct btrfs_inode_item *item; 3139 u64 i_size; 3140 3141 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3142 struct btrfs_inode_item); 3143 i_size = btrfs_inode_size(path->nodes[0], item); 3144 if (i_size > bytes_del) 3145 i_size -= bytes_del; 3146 else 3147 i_size = 0; 3148 btrfs_set_inode_size(path->nodes[0], item, i_size); 3149 btrfs_mark_buffer_dirty(path->nodes[0]); 3150 } else 3151 ret = 0; 3152 btrfs_release_path(path); 3153 } 3154 fail: 3155 btrfs_free_path(path); 3156 out_unlock: 3157 mutex_unlock(&BTRFS_I(dir)->log_mutex); 3158 if (ret == -ENOSPC) { 3159 btrfs_set_log_full_commit(root->fs_info, trans); 3160 ret = 0; 3161 } else if (ret < 0) 3162 btrfs_abort_transaction(trans, root, ret); 3163 3164 btrfs_end_log_trans(root); 3165 3166 return err; 3167 } 3168 3169 /* see comments for btrfs_del_dir_entries_in_log */ 3170 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3171 struct btrfs_root *root, 3172 const char *name, int name_len, 3173 struct inode *inode, u64 dirid) 3174 { 3175 struct btrfs_root *log; 3176 u64 index; 3177 int ret; 3178 3179 if (BTRFS_I(inode)->logged_trans < trans->transid) 3180 return 0; 3181 3182 ret = join_running_log_trans(root); 3183 if (ret) 3184 return 0; 3185 log = root->log_root; 3186 mutex_lock(&BTRFS_I(inode)->log_mutex); 3187 3188 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3189 dirid, &index); 3190 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3191 if (ret == -ENOSPC) { 3192 btrfs_set_log_full_commit(root->fs_info, trans); 3193 ret = 0; 3194 } else if (ret < 0 && ret != -ENOENT) 3195 btrfs_abort_transaction(trans, root, ret); 3196 btrfs_end_log_trans(root); 3197 3198 return ret; 3199 } 3200 3201 /* 3202 * creates a range 
item in the log for 'dirid'. first_offset and 3203 * last_offset tell us which parts of the key space the log should 3204 * be considered authoritative for. 3205 */ 3206 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3207 struct btrfs_root *log, 3208 struct btrfs_path *path, 3209 int key_type, u64 dirid, 3210 u64 first_offset, u64 last_offset) 3211 { 3212 int ret; 3213 struct btrfs_key key; 3214 struct btrfs_dir_log_item *item; 3215 3216 key.objectid = dirid; 3217 key.offset = first_offset; 3218 if (key_type == BTRFS_DIR_ITEM_KEY) 3219 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3220 else 3221 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3222 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3223 if (ret) 3224 return ret; 3225 3226 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3227 struct btrfs_dir_log_item); 3228 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3229 btrfs_mark_buffer_dirty(path->nodes[0]); 3230 btrfs_release_path(path); 3231 return 0; 3232 } 3233 3234 /* 3235 * log all the items included in the current transaction for a given 3236 * directory. This also creates the range items in the log tree required 3237 * to replay anything deleted before the fsync 3238 */ 3239 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3240 struct btrfs_root *root, struct inode *inode, 3241 struct btrfs_path *path, 3242 struct btrfs_path *dst_path, int key_type, 3243 struct btrfs_log_ctx *ctx, 3244 u64 min_offset, u64 *last_offset_ret) 3245 { 3246 struct btrfs_key min_key; 3247 struct btrfs_root *log = root->log_root; 3248 struct extent_buffer *src; 3249 int err = 0; 3250 int ret; 3251 int i; 3252 int nritems; 3253 u64 first_offset = min_offset; 3254 u64 last_offset = (u64)-1; 3255 u64 ino = btrfs_ino(inode); 3256 3257 log = root->log_root; 3258 3259 min_key.objectid = ino; 3260 min_key.type = key_type; 3261 min_key.offset = min_offset; 3262 3263 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3264 3265 /* 3266 * we didn't find anything from this transaction, see if there 3267 * is anything at all 3268 */ 3269 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3270 min_key.objectid = ino; 3271 min_key.type = key_type; 3272 min_key.offset = (u64)-1; 3273 btrfs_release_path(path); 3274 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3275 if (ret < 0) { 3276 btrfs_release_path(path); 3277 return ret; 3278 } 3279 ret = btrfs_previous_item(root, path, ino, key_type); 3280 3281 /* if ret == 0 there are items for this type, 3282 * create a range to tell us the last key of this type. 3283 * otherwise, there are no items in this directory after 3284 * *min_offset, and we create a range to indicate that. 
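*
* A worked example with made-up numbers: if min_offset is 5 and the last
* key of this type in the directory sits at offset 3, we log the range
* [6, (u64)-1] with no items in it. On replay that range says the
* directory holds nothing at offset 6 or beyond, so anything found there
* in the subvolume tree gets deleted.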
3285 */ 3286 if (ret == 0) { 3287 struct btrfs_key tmp; 3288 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3289 path->slots[0]); 3290 if (key_type == tmp.type) 3291 first_offset = max(min_offset, tmp.offset) + 1; 3292 } 3293 goto done; 3294 } 3295 3296 /* go backward to find any previous key */ 3297 ret = btrfs_previous_item(root, path, ino, key_type); 3298 if (ret == 0) { 3299 struct btrfs_key tmp; 3300 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3301 if (key_type == tmp.type) { 3302 first_offset = tmp.offset; 3303 ret = overwrite_item(trans, log, dst_path, 3304 path->nodes[0], path->slots[0], 3305 &tmp); 3306 if (ret) { 3307 err = ret; 3308 goto done; 3309 } 3310 } 3311 } 3312 btrfs_release_path(path); 3313 3314 /* find the first key from this transaction again */ 3315 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3316 if (WARN_ON(ret != 0)) 3317 goto done; 3318 3319 /* 3320 * we have a block from this transaction, log every item in it 3321 * from our directory 3322 */ 3323 while (1) { 3324 struct btrfs_key tmp; 3325 src = path->nodes[0]; 3326 nritems = btrfs_header_nritems(src); 3327 for (i = path->slots[0]; i < nritems; i++) { 3328 struct btrfs_dir_item *di; 3329 3330 btrfs_item_key_to_cpu(src, &min_key, i); 3331 3332 if (min_key.objectid != ino || min_key.type != key_type) 3333 goto done; 3334 ret = overwrite_item(trans, log, dst_path, src, i, 3335 &min_key); 3336 if (ret) { 3337 err = ret; 3338 goto done; 3339 } 3340 3341 /* 3342 * We must make sure that when we log a directory entry, 3343 * the corresponding inode, after log replay, has a 3344 * matching link count. For example: 3345 * 3346 * touch foo 3347 * mkdir mydir 3348 * sync 3349 * ln foo mydir/bar 3350 * xfs_io -c "fsync" mydir 3351 * <crash> 3352 * <mount fs and log replay> 3353 * 3354 * This would result in an fsync log that, when replayed, 3355 * leaves our file inode with a link count of 1 while two 3356 * directory entries point to the same inode. 3357 * After removing one of the names, it would not be 3358 * possible to remove the other name, which always 3359 * resulted in stale file handle errors, and it would not 3360 * be possible to rmdir the parent directory, since 3361 * its i_size could never decrement to the value 3362 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3363 */ 3364 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3365 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3366 if (ctx && 3367 (btrfs_dir_transid(src, di) == trans->transid || 3368 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3369 tmp.type != BTRFS_ROOT_ITEM_KEY) 3370 ctx->log_new_dentries = true; 3371 } 3372 path->slots[0] = nritems; 3373 3374 /* 3375 * look ahead to the next item and see if it is also 3376 * from this directory and from this transaction 3377 */ 3378 ret = btrfs_next_leaf(root, path); 3379 if (ret == 1) { 3380 last_offset = (u64)-1; 3381 goto done; 3382 } 3383 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3384 if (tmp.objectid != ino || tmp.type != key_type) { 3385 last_offset = (u64)-1; 3386 goto done; 3387 } 3388 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3389 ret = overwrite_item(trans, log, dst_path, 3390 path->nodes[0], path->slots[0], 3391 &tmp); 3392 if (ret) 3393 err = ret; 3394 else 3395 last_offset = tmp.offset; 3396 goto done; 3397 } 3398 } 3399 done: 3400 btrfs_release_path(path); 3401 btrfs_release_path(dst_path); 3402 3403 if (err == 0) { 3404 *last_offset_ret = last_offset; 3405 /* 3406 * insert the log range keys to indicate where the log 3407 * is valid 3408 */ 3409 ret = insert_dir_log_key(trans, log, path, key_type, 3410 ino, first_offset, last_offset); 3411 if (ret) 3412 err = ret; 3413 } 3414 return err; 3415 } 3416 3417 /* 3418 * logging directories is very similar to logging inodes. We find all the items 3419 * from the current transaction and write them to the log. 3420 * 3421 * The recovery code scans the directory in the subvolume, and if it finds a 3422 * key in the range logged that is not present in the log tree, then it means 3423 * that dir entry was unlinked during the transaction. 3424 * 3425 * In order for that scan to work, we must include one key smaller than 3426 * the smallest logged by this transaction and one key larger than the largest 3427 * key logged by this transaction. 3428 */ 3429 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3430 struct btrfs_root *root, struct inode *inode, 3431 struct btrfs_path *path, 3432 struct btrfs_path *dst_path, 3433 struct btrfs_log_ctx *ctx) 3434 { 3435 u64 min_key; 3436 u64 max_key; 3437 int ret; 3438 int key_type = BTRFS_DIR_ITEM_KEY; 3439 3440 again: 3441 min_key = 0; 3442 max_key = 0; 3443 while (1) { 3444 ret = log_dir_items(trans, root, inode, path, 3445 dst_path, key_type, ctx, min_key, 3446 &max_key); 3447 if (ret) 3448 return ret; 3449 if (max_key == (u64)-1) 3450 break; 3451 min_key = max_key + 1; 3452 } 3453 3454 if (key_type == BTRFS_DIR_ITEM_KEY) { 3455 key_type = BTRFS_DIR_INDEX_KEY; 3456 goto again; 3457 } 3458 return 0; 3459 } 3460 3461 /* 3462 * a helper function to drop items from the log before we relog an 3463 * inode. max_key_type indicates the highest item type to remove. 3464 * This cannot be run for file data extents because it does not 3465 * free the extents they point to.
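*
* For example, a caller passing max_key_type == BTRFS_XATTR_ITEM_KEY
* drops the previously logged inode item, inode refs and xattrs for the
* objectid (all key types sorting at or below the xattr key), while any
* logged BTRFS_EXTENT_DATA_KEY items, which sort higher, stay in place.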
3466 */ 3467 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3468 struct btrfs_root *log, 3469 struct btrfs_path *path, 3470 u64 objectid, int max_key_type) 3471 { 3472 int ret; 3473 struct btrfs_key key; 3474 struct btrfs_key found_key; 3475 int start_slot; 3476 3477 key.objectid = objectid; 3478 key.type = max_key_type; 3479 key.offset = (u64)-1; 3480 3481 while (1) { 3482 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3483 BUG_ON(ret == 0); /* Logic error */ 3484 if (ret < 0) 3485 break; 3486 3487 if (path->slots[0] == 0) 3488 break; 3489 3490 path->slots[0]--; 3491 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3492 path->slots[0]); 3493 3494 if (found_key.objectid != objectid) 3495 break; 3496 3497 found_key.offset = 0; 3498 found_key.type = 0; 3499 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3500 &start_slot); 3501 3502 ret = btrfs_del_items(trans, log, path, start_slot, 3503 path->slots[0] - start_slot + 1); 3504 /* 3505 * If start slot isn't 0 then we don't need to re-search, we've 3506 * found the last guy with the objectid in this tree. 3507 */ 3508 if (ret || start_slot != 0) 3509 break; 3510 btrfs_release_path(path); 3511 } 3512 btrfs_release_path(path); 3513 if (ret > 0) 3514 ret = 0; 3515 return ret; 3516 } 3517 3518 static void fill_inode_item(struct btrfs_trans_handle *trans, 3519 struct extent_buffer *leaf, 3520 struct btrfs_inode_item *item, 3521 struct inode *inode, int log_inode_only, 3522 u64 logged_isize) 3523 { 3524 struct btrfs_map_token token; 3525 3526 btrfs_init_map_token(&token); 3527 3528 if (log_inode_only) { 3529 /* set the generation to zero so the recovery code 3530 * can tell the difference between logging 3531 * just to say 'this inode exists' and logging 3532 * to say 'update this inode with these values' 3533 */ 3534 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3535 btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 3536 } else { 3537 btrfs_set_token_inode_generation(leaf, item, 3538 BTRFS_I(inode)->generation, 3539 &token); 3540 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3541 } 3542 3543 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3544 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3545 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3546 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3547 3548 btrfs_set_token_timespec_sec(leaf, &item->atime, 3549 inode->i_atime.tv_sec, &token); 3550 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3551 inode->i_atime.tv_nsec, &token); 3552 3553 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3554 inode->i_mtime.tv_sec, &token); 3555 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3556 inode->i_mtime.tv_nsec, &token); 3557 3558 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3559 inode->i_ctime.tv_sec, &token); 3560 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3561 inode->i_ctime.tv_nsec, &token); 3562 3563 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3564 &token); 3565 3566 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3567 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3568 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3569 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3570 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3571 } 3572 3573 static int log_inode_item(struct btrfs_trans_handle *trans, 3574 struct btrfs_root *log, struct
btrfs_path *path, 3575 struct inode *inode) 3576 { 3577 struct btrfs_inode_item *inode_item; 3578 int ret; 3579 3580 ret = btrfs_insert_empty_item(trans, log, path, 3581 &BTRFS_I(inode)->location, 3582 sizeof(*inode_item)); 3583 if (ret && ret != -EEXIST) 3584 return ret; 3585 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3586 struct btrfs_inode_item); 3587 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); 3588 btrfs_release_path(path); 3589 return 0; 3590 } 3591 3592 static noinline int copy_items(struct btrfs_trans_handle *trans, 3593 struct inode *inode, 3594 struct btrfs_path *dst_path, 3595 struct btrfs_path *src_path, u64 *last_extent, 3596 int start_slot, int nr, int inode_only, 3597 u64 logged_isize) 3598 { 3599 unsigned long src_offset; 3600 unsigned long dst_offset; 3601 struct btrfs_root *log = BTRFS_I(inode)->root->log_root; 3602 struct btrfs_file_extent_item *extent; 3603 struct btrfs_inode_item *inode_item; 3604 struct extent_buffer *src = src_path->nodes[0]; 3605 struct btrfs_key first_key, last_key, key; 3606 int ret; 3607 struct btrfs_key *ins_keys; 3608 u32 *ins_sizes; 3609 char *ins_data; 3610 int i; 3611 struct list_head ordered_sums; 3612 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3613 bool has_extents = false; 3614 bool need_find_last_extent = true; 3615 bool done = false; 3616 3617 INIT_LIST_HEAD(&ordered_sums); 3618 3619 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3620 nr * sizeof(u32), GFP_NOFS); 3621 if (!ins_data) 3622 return -ENOMEM; 3623 3624 first_key.objectid = (u64)-1; 3625 3626 ins_sizes = (u32 *)ins_data; 3627 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3628 3629 for (i = 0; i < nr; i++) { 3630 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3631 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3632 } 3633 ret = btrfs_insert_empty_items(trans, log, dst_path, 3634 ins_keys, ins_sizes, nr); 3635 if (ret) { 3636 kfree(ins_data); 3637 return ret; 3638 } 3639 3640 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3641 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3642 dst_path->slots[0]); 3643 3644 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3645 3646 if ((i == (nr - 1))) 3647 last_key = ins_keys[i]; 3648 3649 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3650 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3651 dst_path->slots[0], 3652 struct btrfs_inode_item); 3653 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3654 inode, inode_only == LOG_INODE_EXISTS, 3655 logged_isize); 3656 } else { 3657 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3658 src_offset, ins_sizes[i]); 3659 } 3660 3661 /* 3662 * We set need_find_last_extent here in case we know we were 3663 * processing other items and then walk into the first extent in 3664 * the inode. If we don't hit an extent then nothing changes, 3665 * we'll do the last search the next time around. 
3666 */ 3667 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3668 has_extents = true; 3669 if (first_key.objectid == (u64)-1) 3670 first_key = ins_keys[i]; 3671 } else { 3672 need_find_last_extent = false; 3673 } 3674 3675 /* take a reference on file data extents so that truncates 3676 * or deletes of this inode don't have to relog the inode 3677 * again 3678 */ 3679 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3680 !skip_csum) { 3681 int found_type; 3682 extent = btrfs_item_ptr(src, start_slot + i, 3683 struct btrfs_file_extent_item); 3684 3685 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3686 continue; 3687 3688 found_type = btrfs_file_extent_type(src, extent); 3689 if (found_type == BTRFS_FILE_EXTENT_REG) { 3690 u64 ds, dl, cs, cl; 3691 ds = btrfs_file_extent_disk_bytenr(src, 3692 extent); 3693 /* ds == 0 is a hole */ 3694 if (ds == 0) 3695 continue; 3696 3697 dl = btrfs_file_extent_disk_num_bytes(src, 3698 extent); 3699 cs = btrfs_file_extent_offset(src, extent); 3700 cl = btrfs_file_extent_num_bytes(src, 3701 extent); 3702 if (btrfs_file_extent_compression(src, 3703 extent)) { 3704 cs = 0; 3705 cl = dl; 3706 } 3707 3708 ret = btrfs_lookup_csums_range( 3709 log->fs_info->csum_root, 3710 ds + cs, ds + cs + cl - 1, 3711 &ordered_sums, 0); 3712 if (ret) { 3713 btrfs_release_path(dst_path); 3714 kfree(ins_data); 3715 return ret; 3716 } 3717 } 3718 } 3719 } 3720 3721 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3722 btrfs_release_path(dst_path); 3723 kfree(ins_data); 3724 3725 /* 3726 * we have to do this after the loop above to avoid changing the 3727 * log tree while trying to change the log tree. 3728 */ 3729 ret = 0; 3730 while (!list_empty(&ordered_sums)) { 3731 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3732 struct btrfs_ordered_sum, 3733 list); 3734 if (!ret) 3735 ret = btrfs_csum_file_blocks(trans, log, sums); 3736 list_del(&sums->list); 3737 kfree(sums); 3738 } 3739 3740 if (!has_extents) 3741 return ret; 3742 3743 if (need_find_last_extent && *last_extent == first_key.offset) { 3744 /* 3745 * We don't have any leaves between our current one and the one 3746 * we processed before that can have file extent items for our 3747 * inode (and have a generation number smaller than our current 3748 * transaction id). 3749 */ 3750 need_find_last_extent = false; 3751 } 3752 3753 /* 3754 * Because we use btrfs_search_forward we could skip leaves that were 3755 * not modified and then assume *last_extent is valid when it really 3756 * isn't. So back up to the previous leaf and read the end of the last 3757 * extent before we go and fill in holes.
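*
* A sketch with made-up offsets: if the extent preceding our first
* copied item ends at 4K and the first copied extent starts at 8K,
* *last_extent becomes 4K and the loop further below inserts a hole (a
* file extent item with a disk_bytenr of 0) at offset 4K with length 4K,
* so that replay punches out whatever used to occupy [4K, 8K).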
3758 */ 3759 if (need_find_last_extent) { 3760 u64 len; 3761 3762 ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); 3763 if (ret < 0) 3764 return ret; 3765 if (ret) 3766 goto fill_holes; 3767 if (src_path->slots[0]) 3768 src_path->slots[0]--; 3769 src = src_path->nodes[0]; 3770 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3771 if (key.objectid != btrfs_ino(inode) || 3772 key.type != BTRFS_EXTENT_DATA_KEY) 3773 goto fill_holes; 3774 extent = btrfs_item_ptr(src, src_path->slots[0], 3775 struct btrfs_file_extent_item); 3776 if (btrfs_file_extent_type(src, extent) == 3777 BTRFS_FILE_EXTENT_INLINE) { 3778 len = btrfs_file_extent_inline_len(src, 3779 src_path->slots[0], 3780 extent); 3781 *last_extent = ALIGN(key.offset + len, 3782 log->sectorsize); 3783 } else { 3784 len = btrfs_file_extent_num_bytes(src, extent); 3785 *last_extent = key.offset + len; 3786 } 3787 } 3788 fill_holes: 3789 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3790 * things could have happened 3791 * 3792 * 1) A merge could have happened, so we could currently be on a leaf 3793 * that holds what we were copying in the first place. 3794 * 2) A split could have happened, and now not all of the items we want 3795 * are on the same leaf. 3796 * 3797 * So we need to adjust how we search for holes, we need to drop the 3798 * path and re-search for the first extent key we found, and then walk 3799 * forward until we hit the last one we copied. 3800 */ 3801 if (need_find_last_extent) { 3802 /* btrfs_prev_leaf could return 1 without releasing the path */ 3803 btrfs_release_path(src_path); 3804 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, 3805 src_path, 0, 0); 3806 if (ret < 0) 3807 return ret; 3808 ASSERT(ret == 0); 3809 src = src_path->nodes[0]; 3810 i = src_path->slots[0]; 3811 } else { 3812 i = start_slot; 3813 } 3814 3815 /* 3816 * Ok so here we need to go through and fill in any holes we may have 3817 * to make sure that holes are punched for those areas in case they had 3818 * extents previously. 3819 */ 3820 while (!done) { 3821 u64 offset, len; 3822 u64 extent_end; 3823 3824 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3825 ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); 3826 if (ret < 0) 3827 return ret; 3828 ASSERT(ret == 0); 3829 src = src_path->nodes[0]; 3830 i = 0; 3831 } 3832 3833 btrfs_item_key_to_cpu(src, &key, i); 3834 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3835 done = true; 3836 if (key.objectid != btrfs_ino(inode) || 3837 key.type != BTRFS_EXTENT_DATA_KEY) { 3838 i++; 3839 continue; 3840 } 3841 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3842 if (btrfs_file_extent_type(src, extent) == 3843 BTRFS_FILE_EXTENT_INLINE) { 3844 len = btrfs_file_extent_inline_len(src, i, extent); 3845 extent_end = ALIGN(key.offset + len, log->sectorsize); 3846 } else { 3847 len = btrfs_file_extent_num_bytes(src, extent); 3848 extent_end = key.offset + len; 3849 } 3850 i++; 3851 3852 if (*last_extent == key.offset) { 3853 *last_extent = extent_end; 3854 continue; 3855 } 3856 offset = *last_extent; 3857 len = key.offset - *last_extent; 3858 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3859 offset, 0, 0, len, 0, len, 0, 3860 0, 0); 3861 if (ret) 3862 break; 3863 *last_extent = extent_end; 3864 } 3865 /* 3866 * Need to let the callers know we dropped the path so they should 3867 * re-search. 
3868 */ 3869 if (!ret && need_find_last_extent) 3870 ret = 1; 3871 return ret; 3872 } 3873 3874 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3875 { 3876 struct extent_map *em1, *em2; 3877 3878 em1 = list_entry(a, struct extent_map, list); 3879 em2 = list_entry(b, struct extent_map, list); 3880 3881 if (em1->start < em2->start) 3882 return -1; 3883 else if (em1->start > em2->start) 3884 return 1; 3885 return 0; 3886 } 3887 3888 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 3889 struct inode *inode, 3890 struct btrfs_root *root, 3891 const struct extent_map *em, 3892 const struct list_head *logged_list, 3893 bool *ordered_io_error) 3894 { 3895 struct btrfs_ordered_extent *ordered; 3896 struct btrfs_root *log = root->log_root; 3897 u64 mod_start = em->mod_start; 3898 u64 mod_len = em->mod_len; 3899 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3900 u64 csum_offset; 3901 u64 csum_len; 3902 LIST_HEAD(ordered_sums); 3903 int ret = 0; 3904 3905 *ordered_io_error = false; 3906 3907 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 3908 em->block_start == EXTENT_MAP_HOLE) 3909 return 0; 3910 3911 /* 3912 * Wait for any ordered extent that covers our extent map. If it 3913 * finishes without an error, first check and see if our csums are on 3914 * our outstanding ordered extents. 3915 */ 3916 list_for_each_entry(ordered, logged_list, log_list) { 3917 struct btrfs_ordered_sum *sum; 3918 3919 if (!mod_len) 3920 break; 3921 3922 if (ordered->file_offset + ordered->len <= mod_start || 3923 mod_start + mod_len <= ordered->file_offset) 3924 continue; 3925 3926 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 3927 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 3928 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 3929 const u64 start = ordered->file_offset; 3930 const u64 end = ordered->file_offset + ordered->len - 1; 3931 3932 WARN_ON(ordered->inode != inode); 3933 filemap_fdatawrite_range(inode->i_mapping, start, end); 3934 } 3935 3936 wait_event(ordered->wait, 3937 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 3938 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3939 3940 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3941 /* 3942 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 3943 * i_mapping flags, so that the next fsync won't get 3944 * an outdated io error too. 3945 */ 3946 btrfs_inode_check_errors(inode); 3947 *ordered_io_error = true; 3948 break; 3949 } 3950 /* 3951 * We are going to copy all the csums on this ordered extent, so 3952 * go ahead and adjust mod_start and mod_len in case this 3953 * ordered extent has already been logged. 3954 */ 3955 if (ordered->file_offset > mod_start) { 3956 if (ordered->file_offset + ordered->len >= 3957 mod_start + mod_len) 3958 mod_len = ordered->file_offset - mod_start; 3959 /* 3960 * If we have this case 3961 * 3962 * |--------- logged extent ---------| 3963 * |----- ordered extent ----| 3964 * 3965 * Just don't mess with mod_start and mod_len, we'll 3966 * just end up logging more csums than we need and it 3967 * will be ok.
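*
* The opposite overlap, where the ordered extent starts at or before the
* logged range:
*
* |-- ordered extent --|
*          |--------- logged extent ---------|
*
* is handled just below by advancing mod_start past the end of the
* ordered extent, or by zeroing mod_len when the ordered extent covers
* everything that is left.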
3968 */ 3969 } else { 3970 if (ordered->file_offset + ordered->len < 3971 mod_start + mod_len) { 3972 mod_len = (mod_start + mod_len) - 3973 (ordered->file_offset + ordered->len); 3974 mod_start = ordered->file_offset + 3975 ordered->len; 3976 } else { 3977 mod_len = 0; 3978 } 3979 } 3980 3981 if (skip_csum) 3982 continue; 3983 3984 /* 3985 * To keep us from looping for the above case of an ordered 3986 * extent that falls inside of the logged extent. 3987 */ 3988 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3989 &ordered->flags)) 3990 continue; 3991 3992 list_for_each_entry(sum, &ordered->list, list) { 3993 ret = btrfs_csum_file_blocks(trans, log, sum); 3994 if (ret) 3995 break; 3996 } 3997 } 3998 3999 if (*ordered_io_error || !mod_len || ret || skip_csum) 4000 return ret; 4001 4002 if (em->compress_type) { 4003 csum_offset = 0; 4004 csum_len = max(em->block_len, em->orig_block_len); 4005 } else { 4006 csum_offset = mod_start - em->start; 4007 csum_len = mod_len; 4008 } 4009 4010 /* block start is already adjusted for the file extent offset. */ 4011 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 4012 em->block_start + csum_offset, 4013 em->block_start + csum_offset + 4014 csum_len - 1, &ordered_sums, 0); 4015 if (ret) 4016 return ret; 4017 4018 while (!list_empty(&ordered_sums)) { 4019 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4020 struct btrfs_ordered_sum, 4021 list); 4022 if (!ret) 4023 ret = btrfs_csum_file_blocks(trans, log, sums); 4024 list_del(&sums->list); 4025 kfree(sums); 4026 } 4027 4028 return ret; 4029 } 4030 4031 static int log_one_extent(struct btrfs_trans_handle *trans, 4032 struct inode *inode, struct btrfs_root *root, 4033 const struct extent_map *em, 4034 struct btrfs_path *path, 4035 const struct list_head *logged_list, 4036 struct btrfs_log_ctx *ctx) 4037 { 4038 struct btrfs_root *log = root->log_root; 4039 struct btrfs_file_extent_item *fi; 4040 struct extent_buffer *leaf; 4041 struct btrfs_map_token token; 4042 struct btrfs_key key; 4043 u64 extent_offset = em->start - em->orig_start; 4044 u64 block_len; 4045 int ret; 4046 int extent_inserted = 0; 4047 bool ordered_io_err = false; 4048 4049 ret = wait_ordered_extents(trans, inode, root, em, logged_list, 4050 &ordered_io_err); 4051 if (ret) 4052 return ret; 4053 4054 if (ordered_io_err) { 4055 ctx->io_err = -EIO; 4056 return 0; 4057 } 4058 4059 btrfs_init_map_token(&token); 4060 4061 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 4062 em->start + em->len, NULL, 0, 1, 4063 sizeof(*fi), &extent_inserted); 4064 if (ret) 4065 return ret; 4066 4067 if (!extent_inserted) { 4068 key.objectid = btrfs_ino(inode); 4069 key.type = BTRFS_EXTENT_DATA_KEY; 4070 key.offset = em->start; 4071 4072 ret = btrfs_insert_empty_item(trans, log, path, &key, 4073 sizeof(*fi)); 4074 if (ret) 4075 return ret; 4076 } 4077 leaf = path->nodes[0]; 4078 fi = btrfs_item_ptr(leaf, path->slots[0], 4079 struct btrfs_file_extent_item); 4080 4081 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4082 &token); 4083 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4084 btrfs_set_token_file_extent_type(leaf, fi, 4085 BTRFS_FILE_EXTENT_PREALLOC, 4086 &token); 4087 else 4088 btrfs_set_token_file_extent_type(leaf, fi, 4089 BTRFS_FILE_EXTENT_REG, 4090 &token); 4091 4092 block_len = max(em->block_len, em->orig_block_len); 4093 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4094 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4095 em->block_start, 4096 &token); 4097 
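		/*
		 * The disk byte count set below covers the entire on-disk
		 * compressed extent: compressed extents are referenced whole,
		 * and the logical offset into the uncompressed data is
		 * carried by the file extent offset field further down.
		 */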
btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4098 &token); 4099 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4100 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4101 em->block_start - 4102 extent_offset, &token); 4103 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4104 &token); 4105 } else { 4106 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4107 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4108 &token); 4109 } 4110 4111 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4112 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4113 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4114 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4115 &token); 4116 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4117 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4118 btrfs_mark_buffer_dirty(leaf); 4119 4120 btrfs_release_path(path); 4121 4122 return ret; 4123 } 4124 4125 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4126 struct btrfs_root *root, 4127 struct inode *inode, 4128 struct btrfs_path *path, 4129 struct list_head *logged_list, 4130 struct btrfs_log_ctx *ctx, 4131 const u64 start, 4132 const u64 end) 4133 { 4134 struct extent_map *em, *n; 4135 struct list_head extents; 4136 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 4137 u64 test_gen; 4138 int ret = 0; 4139 int num = 0; 4140 4141 INIT_LIST_HEAD(&extents); 4142 4143 write_lock(&tree->lock); 4144 test_gen = root->fs_info->last_trans_committed; 4145 4146 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4147 list_del_init(&em->list); 4148 4149 /* 4150 * Just an arbitrary number; this can be really CPU intensive 4151 * once we start getting a lot of extents, and once we 4152 * have a bunch of extents we just want to commit since it will 4153 * be faster. 4154 */ 4155 if (++num > 32768) { 4156 list_del_init(&tree->modified_extents); 4157 ret = -EFBIG; 4158 goto process; 4159 } 4160 4161 if (em->generation <= test_gen) 4162 continue; 4163 /* Need a ref to keep it from getting evicted from cache */ 4164 atomic_inc(&em->refs); 4165 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4166 list_add_tail(&em->list, &extents); 4167 num++; 4168 } 4169 4170 list_sort(NULL, &extents, extent_cmp); 4171 /* 4172 * Collect any new ordered extents within the range. This is to 4173 * prevent logging file extent items before the disk 4174 * locations they point to have been written. We do this only to deal 4175 * with races against concurrent lockless direct IO writes. 4176 */ 4177 btrfs_get_logged_extents(inode, logged_list, start, end); 4178 process: 4179 while (!list_empty(&extents)) { 4180 em = list_entry(extents.next, struct extent_map, list); 4181 4182 list_del_init(&em->list); 4183 4184 /* 4185 * If we had an error we just need to delete everybody from our 4186 * private list.
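 * Each em on the list still holds the extra reference and the
 * EXTENT_FLAG_LOGGING bit taken when it was collected above, so both
 * are dropped here as well.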
4187 */ 4188 if (ret) { 4189 clear_em_logging(tree, em); 4190 free_extent_map(em); 4191 continue; 4192 } 4193 4194 write_unlock(&tree->lock); 4195 4196 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4197 ctx); 4198 write_lock(&tree->lock); 4199 clear_em_logging(tree, em); 4200 free_extent_map(em); 4201 } 4202 WARN_ON(!list_empty(&extents)); 4203 write_unlock(&tree->lock); 4204 4205 btrfs_release_path(path); 4206 return ret; 4207 } 4208 4209 static int logged_inode_size(struct btrfs_root *log, struct inode *inode, 4210 struct btrfs_path *path, u64 *size_ret) 4211 { 4212 struct btrfs_key key; 4213 int ret; 4214 4215 key.objectid = btrfs_ino(inode); 4216 key.type = BTRFS_INODE_ITEM_KEY; 4217 key.offset = 0; 4218 4219 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4220 if (ret < 0) { 4221 return ret; 4222 } else if (ret > 0) { 4223 *size_ret = 0; 4224 } else { 4225 struct btrfs_inode_item *item; 4226 4227 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4228 struct btrfs_inode_item); 4229 *size_ret = btrfs_inode_size(path->nodes[0], item); 4230 } 4231 4232 btrfs_release_path(path); 4233 return 0; 4234 } 4235 4236 /* 4237 * At the moment we always log all xattrs. This is to figure out at log replay 4238 * time which xattrs must have their deletion replayed. If an xattr is missing 4239 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4240 * because if an xattr is deleted, then the inode is fsynced and a power failure 4241 * happens, the log is replayed the next time the fs is mounted and we want 4242 * the xattr to no longer exist (the same behaviour as other journaled 4243 * filesystems: ext3/4, xfs, f2fs, etc). 4244 */ 4245 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4246 struct btrfs_root *root, 4247 struct inode *inode, 4248 struct btrfs_path *path, 4249 struct btrfs_path *dst_path) 4250 { 4251 int ret; 4252 struct btrfs_key key; 4253 const u64 ino = btrfs_ino(inode); 4254 int ins_nr = 0; 4255 int start_slot = 0; 4256 4257 key.objectid = ino; 4258 key.type = BTRFS_XATTR_ITEM_KEY; 4259 key.offset = 0; 4260 4261 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4262 if (ret < 0) 4263 return ret; 4264 4265 while (true) { 4266 int slot = path->slots[0]; 4267 struct extent_buffer *leaf = path->nodes[0]; 4268 int nritems = btrfs_header_nritems(leaf); 4269 4270 if (slot >= nritems) { 4271 if (ins_nr > 0) { 4272 u64 last_extent = 0; 4273 4274 ret = copy_items(trans, inode, dst_path, path, 4275 &last_extent, start_slot, 4276 ins_nr, 1, 0); 4277 /* can't be 1, extent items aren't processed */ 4278 ASSERT(ret <= 0); 4279 if (ret < 0) 4280 return ret; 4281 ins_nr = 0; 4282 } 4283 ret = btrfs_next_leaf(root, path); 4284 if (ret < 0) 4285 return ret; 4286 else if (ret > 0) 4287 break; 4288 continue; 4289 } 4290 4291 btrfs_item_key_to_cpu(leaf, &key, slot); 4292 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4293 break; 4294 4295 if (ins_nr == 0) 4296 start_slot = slot; 4297 ins_nr++; 4298 path->slots[0]++; 4299 cond_resched(); 4300 } 4301 if (ins_nr > 0) { 4302 u64 last_extent = 0; 4303 4304 ret = copy_items(trans, inode, dst_path, path, 4305 &last_extent, start_slot, 4306 ins_nr, 1, 0); 4307 /* can't be 1, extent items aren't processed */ 4308 ASSERT(ret <= 0); 4309 if (ret < 0) 4310 return ret; 4311 } 4312 4313 return 0; 4314 } 4315 4316 /* 4317 * If the no holes feature is enabled we need to make sure any hole between the 4318 * last extent and the i_size of our inode is explicitly marked in the log.
This 4319 * is to make sure that doing something like: 4320 * 4321 * 1) create file with 128Kb of data 4322 * 2) truncate file to 64Kb 4323 * 3) truncate file to 256Kb 4324 * 4) fsync file 4325 * 5) <crash/power failure> 4326 * 6) mount fs and trigger log replay 4327 * 4328 * Will give us a file with a size of 256Kb, whose first 64Kb of data match 4329 * what the file had at step 1 and whose last 192Kb 4330 * correspond to a hole. The presence of explicit holes in a log tree is 4331 * what guarantees that log replay will remove/adjust file extent items in the 4332 * fs/subvol tree. 4333 * 4334 * Here we do not need to care about holes between extents; that is already done 4335 * by copy_items(). We also only need to do this in the full sync path, where we 4336 * look up extents from the fs/subvol tree only. In the fast path case, we 4337 * walk the list of modified extent maps and, if any represents a hole, we 4338 * insert a corresponding extent representing a hole in the log tree. 4339 */ 4340 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4341 struct btrfs_root *root, 4342 struct inode *inode, 4343 struct btrfs_path *path) 4344 { 4345 int ret; 4346 struct btrfs_key key; 4347 u64 hole_start; 4348 u64 hole_size; 4349 struct extent_buffer *leaf; 4350 struct btrfs_root *log = root->log_root; 4351 const u64 ino = btrfs_ino(inode); 4352 const u64 i_size = i_size_read(inode); 4353 4354 if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) 4355 return 0; 4356 4357 key.objectid = ino; 4358 key.type = BTRFS_EXTENT_DATA_KEY; 4359 key.offset = (u64)-1; 4360 4361 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4362 ASSERT(ret != 0); 4363 if (ret < 0) 4364 return ret; 4365 4366 ASSERT(path->slots[0] > 0); 4367 path->slots[0]--; 4368 leaf = path->nodes[0]; 4369 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4370 4371 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4372 /* inode does not have any extents */ 4373 hole_start = 0; 4374 hole_size = i_size; 4375 } else { 4376 struct btrfs_file_extent_item *extent; 4377 u64 len; 4378 4379 /* 4380 * If there's an extent beyond i_size, an explicit hole was 4381 * already inserted by copy_items(). 4382 */ 4383 if (key.offset >= i_size) 4384 return 0; 4385 4386 extent = btrfs_item_ptr(leaf, path->slots[0], 4387 struct btrfs_file_extent_item); 4388 4389 if (btrfs_file_extent_type(leaf, extent) == 4390 BTRFS_FILE_EXTENT_INLINE) { 4391 len = btrfs_file_extent_inline_len(leaf, 4392 path->slots[0], 4393 extent); 4394 ASSERT(len == i_size); 4395 return 0; 4396 } 4397 4398 len = btrfs_file_extent_num_bytes(leaf, extent); 4399 /* Last extent goes beyond i_size, no need to log a hole. */ 4400 if (key.offset + len > i_size) 4401 return 0; 4402 hole_start = key.offset + len; 4403 hole_size = i_size - hole_start; 4404 } 4405 btrfs_release_path(path); 4406 4407 /* Last extent ends at i_size. */ 4408 if (hole_size == 0) 4409 return 0; 4410 4411 hole_size = ALIGN(hole_size, root->sectorsize); 4412 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4413 hole_size, 0, hole_size, 0, 0, 0); 4414 return ret; 4415 } 4416 4417 /* log a single inode in the tree log. 4418 * At least one parent directory for this inode must exist in the tree 4419 * or be logged already. 4420 * 4421 * Any items from this inode changed by the current transaction are copied 4422 * to the log tree.
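 * (The inode item and refs are copied by the key-ordered search over the
 * fs/subvol tree below, xattrs by btrfs_log_all_xattrs(), and extents
 * either from the list of modified extent maps in the fast path or by
 * that same search in the full sync path.)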
An extra reference is taken on any extents in this 4423 * file, allowing us to avoid a whole pile of corner cases around logging 4424 * blocks that have been removed from the tree. 4425 * 4426 * See LOG_INODE_ALL and related defines for a description of what inode_only 4427 * does. 4428 * 4429 * This handles both files and directories. 4430 */ 4431 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4432 struct btrfs_root *root, struct inode *inode, 4433 int inode_only, 4434 const loff_t start, 4435 const loff_t end, 4436 struct btrfs_log_ctx *ctx) 4437 { 4438 struct btrfs_path *path; 4439 struct btrfs_path *dst_path; 4440 struct btrfs_key min_key; 4441 struct btrfs_key max_key; 4442 struct btrfs_root *log = root->log_root; 4443 struct extent_buffer *src = NULL; 4444 LIST_HEAD(logged_list); 4445 u64 last_extent = 0; 4446 int err = 0; 4447 int ret; 4448 int nritems; 4449 int ins_start_slot = 0; 4450 int ins_nr; 4451 bool fast_search = false; 4452 u64 ino = btrfs_ino(inode); 4453 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4454 u64 logged_isize = 0; 4455 bool need_log_inode_item = true; 4456 4457 path = btrfs_alloc_path(); 4458 if (!path) 4459 return -ENOMEM; 4460 dst_path = btrfs_alloc_path(); 4461 if (!dst_path) { 4462 btrfs_free_path(path); 4463 return -ENOMEM; 4464 } 4465 4466 min_key.objectid = ino; 4467 min_key.type = BTRFS_INODE_ITEM_KEY; 4468 min_key.offset = 0; 4469 4470 max_key.objectid = ino; 4471 4472 4473 /* today the code can only do partial logging of directories */ 4474 if (S_ISDIR(inode->i_mode) || 4475 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4476 &BTRFS_I(inode)->runtime_flags) && 4477 inode_only == LOG_INODE_EXISTS)) 4478 max_key.type = BTRFS_XATTR_ITEM_KEY; 4479 else 4480 max_key.type = (u8)-1; 4481 max_key.offset = (u64)-1; 4482 4483 /* 4484 * Only run delayed items if we are a dir or a new file. 4485 * Otherwise commit the delayed inode only, which is needed in 4486 * order for the log replay code to mark inodes for link count 4487 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4488 */ 4489 if (S_ISDIR(inode->i_mode) || 4490 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) 4491 ret = btrfs_commit_inode_delayed_items(trans, inode); 4492 else 4493 ret = btrfs_commit_inode_delayed_inode(inode); 4494 4495 if (ret) { 4496 btrfs_free_path(path); 4497 btrfs_free_path(dst_path); 4498 return ret; 4499 } 4500 4501 mutex_lock(&BTRFS_I(inode)->log_mutex); 4502 4503 btrfs_get_logged_extents(inode, &logged_list, start, end); 4504 4505 /* 4506 * a brute force approach to making sure we get the most uptodate 4507 * copies of everything. 4508 */ 4509 if (S_ISDIR(inode->i_mode)) { 4510 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4511 4512 if (inode_only == LOG_INODE_EXISTS) 4513 max_key_type = BTRFS_XATTR_ITEM_KEY; 4514 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4515 } else { 4516 if (inode_only == LOG_INODE_EXISTS) { 4517 /* 4518 * Make sure the new inode item we write to the log has 4519 * the same isize as the current one (if it exists). 4520 * This is necessary to prevent data loss after log 4521 * replay, and also to prevent doing a wrong expanding 4522 * truncate - e.g.: create file, write 4K into offset 4523 * 0, fsync, write 4K into offset 4096, add hard link, 4524 * fsync some other file (to sync log), power fail - if 4525 * we use the inode's current i_size, after log replay 4526 * we get an 8Kb file, with the last 4Kb extent as a hole 4527 * (zeroes), as if an expanding truncate happened, 4528 * instead of getting a file of 4Kb only. 4529 */ 4530 err = logged_inode_size(log, inode, path, 4531 &logged_isize); 4532 if (err) 4533 goto out_unlock; 4534 } 4535 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4536 &BTRFS_I(inode)->runtime_flags)) { 4537 if (inode_only == LOG_INODE_EXISTS) { 4538 max_key.type = BTRFS_XATTR_ITEM_KEY; 4539 ret = drop_objectid_items(trans, log, path, ino, 4540 max_key.type); 4541 } else { 4542 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4543 &BTRFS_I(inode)->runtime_flags); 4544 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4545 &BTRFS_I(inode)->runtime_flags); 4546 while (1) { 4547 ret = btrfs_truncate_inode_items(trans, 4548 log, inode, 0, 0); 4549 if (ret != -EAGAIN) 4550 break; 4551 } 4552 } 4553 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4554 &BTRFS_I(inode)->runtime_flags) || 4555 inode_only == LOG_INODE_EXISTS) { 4556 if (inode_only == LOG_INODE_ALL) 4557 fast_search = true; 4558 max_key.type = BTRFS_XATTR_ITEM_KEY; 4559 ret = drop_objectid_items(trans, log, path, ino, 4560 max_key.type); 4561 } else { 4562 if (inode_only == LOG_INODE_ALL) 4563 fast_search = true; 4564 goto log_extents; 4565 } 4566 4567 } 4568 if (ret) { 4569 err = ret; 4570 goto out_unlock; 4571 } 4572 4573 while (1) { 4574 ins_nr = 0; 4575 ret = btrfs_search_forward(root, &min_key, 4576 path, trans->transid); 4577 if (ret != 0) 4578 break; 4579 again: 4580 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4581 if (min_key.objectid != ino) 4582 break; 4583 if (min_key.type > max_key.type) 4584 break; 4585 4586 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4587 need_log_inode_item = false; 4588 4589 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4590 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4591 if (ins_nr == 0) 4592 goto next_slot; 4593 ret = copy_items(trans, inode, dst_path, path, 4594 &last_extent, ins_start_slot, 4595 ins_nr, inode_only, logged_isize); 4596 if (ret < 0) { 4597 err = ret; 4598 goto out_unlock; 4599 } 4600 ins_nr = 0; 4601 if (ret) { 4602 btrfs_release_path(path); 4603 continue; 4604 } 4605 goto next_slot; 4606 } 4607 4608 src = path->nodes[0]; 4609 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4610 ins_nr++; 4611 goto next_slot; 4612 } else if (!ins_nr) { 4613 ins_start_slot = path->slots[0]; 4614 ins_nr = 1; 4615 goto next_slot; 4616 } 4617 4618 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4619 ins_start_slot, ins_nr, inode_only, 4620 logged_isize); 4621 if (ret < 0) { 4622 err = ret; 4623 goto out_unlock; 4624 } 4625 if (ret) { 4626 ins_nr = 0; 4627 btrfs_release_path(path); 4628 continue; 4629 } 4630 ins_nr = 1; 4631 ins_start_slot = path->slots[0]; 4632 next_slot: 4633 4634 nritems = btrfs_header_nritems(path->nodes[0]); 4635 path->slots[0]++; 4636 if (path->slots[0] < nritems) { 4637 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 4638 path->slots[0]); 4639 goto again; 4640 } 4641 if (ins_nr) { 4642 ret = copy_items(trans, inode, dst_path, path, 4643 &last_extent, ins_start_slot, 4644 ins_nr, inode_only, logged_isize); 4645 if (ret < 0) { 4646 err = ret; 4647 goto out_unlock; 4648 } 4649 ret = 0; 4650 ins_nr = 0; 4651 } 4652 btrfs_release_path(path); 4653 4654 if
(min_key.offset < (u64)-1) { 4655 min_key.offset++; 4656 } else if (min_key.type < max_key.type) { 4657 min_key.type++; 4658 min_key.offset = 0; 4659 } else { 4660 break; 4661 } 4662 } 4663 if (ins_nr) { 4664 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4665 ins_start_slot, ins_nr, inode_only, 4666 logged_isize); 4667 if (ret < 0) { 4668 err = ret; 4669 goto out_unlock; 4670 } 4671 ret = 0; 4672 ins_nr = 0; 4673 } 4674 4675 btrfs_release_path(path); 4676 btrfs_release_path(dst_path); 4677 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 4678 if (err) 4679 goto out_unlock; 4680 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 4681 btrfs_release_path(path); 4682 btrfs_release_path(dst_path); 4683 err = btrfs_log_trailing_hole(trans, root, inode, path); 4684 if (err) 4685 goto out_unlock; 4686 } 4687 log_extents: 4688 btrfs_release_path(path); 4689 btrfs_release_path(dst_path); 4690 if (need_log_inode_item) { 4691 err = log_inode_item(trans, log, dst_path, inode); 4692 if (err) 4693 goto out_unlock; 4694 } 4695 if (fast_search) { 4696 /* 4697 * Some ordered extents started by fsync might have completed 4698 * before we collected the ordered extents in logged_list, which 4699 * means they're gone, not in our logged_list nor in the inode's 4700 * ordered tree. We want the application/user space to know an 4701 * error happened while attempting to persist file data so that 4702 * it can take proper action. If such error happened, we leave 4703 * without writing to the log tree and the fsync must report the 4704 * file data write error and not commit the current transaction. 4705 */ 4706 err = btrfs_inode_check_errors(inode); 4707 if (err) { 4708 ctx->io_err = err; 4709 goto out_unlock; 4710 } 4711 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4712 &logged_list, ctx, start, end); 4713 if (ret) { 4714 err = ret; 4715 goto out_unlock; 4716 } 4717 } else if (inode_only == LOG_INODE_ALL) { 4718 struct extent_map *em, *n; 4719 4720 write_lock(&em_tree->lock); 4721 /* 4722 * We can't just remove every em if we're called for a ranged 4723 * fsync - that is, one that doesn't cover the whole possible 4724 * file range (0 to LLONG_MAX). This is because we can have 4725 * em's that fall outside the range we're logging and therefore 4726 * their ordered operations haven't completed yet 4727 * (btrfs_finish_ordered_io() not invoked yet). This means we 4728 * didn't get their respective file extent item in the fs/subvol 4729 * tree yet, and need to let the next fast fsync (one which 4730 * consults the list of modified extent maps) find the em so 4731 * that it logs a matching file extent item and waits for the 4732 * respective ordered operation to complete (if it's still 4733 * running). 4734 * 4735 * Removing every em outside the range we're logging would make 4736 * the next fast fsync not log their matching file extent items, 4737 * therefore making us lose data after a log replay. 
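 *
 * (E.g. for a ranged fsync of [0, 1MiB) we must keep an em for
 * [1MiB, 2MiB) whose ordered IO has not completed yet, so that a later
 * fast fsync can still log a file extent item for it.)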
4738 */ 4739 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 4740 list) { 4741 const u64 mod_end = em->mod_start + em->mod_len - 1; 4742 4743 if (em->mod_start >= start && mod_end <= end) 4744 list_del_init(&em->list); 4745 } 4746 write_unlock(&em_tree->lock); 4747 } 4748 4749 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 4750 ret = log_directory_changes(trans, root, inode, path, dst_path, 4751 ctx); 4752 if (ret) { 4753 err = ret; 4754 goto out_unlock; 4755 } 4756 } 4757 4758 spin_lock(&BTRFS_I(inode)->lock); 4759 BTRFS_I(inode)->logged_trans = trans->transid; 4760 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4761 spin_unlock(&BTRFS_I(inode)->lock); 4762 out_unlock: 4763 if (unlikely(err)) 4764 btrfs_put_logged_extents(&logged_list); 4765 else 4766 btrfs_submit_logged_extents(&logged_list, log); 4767 mutex_unlock(&BTRFS_I(inode)->log_mutex); 4768 4769 btrfs_free_path(path); 4770 btrfs_free_path(dst_path); 4771 return err; 4772 } 4773 4774 /* 4775 * follow the dentry parent pointers up the chain and see if any 4776 * of the directories in it require a full commit before they can 4777 * be logged. Returns zero if nothing special needs to be done or 1 if 4778 * a full commit is required. 4779 */ 4780 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 4781 struct inode *inode, 4782 struct dentry *parent, 4783 struct super_block *sb, 4784 u64 last_committed) 4785 { 4786 int ret = 0; 4787 struct btrfs_root *root; 4788 struct dentry *old_parent = NULL; 4789 struct inode *orig_inode = inode; 4790 4791 /* 4792 * for a regular file, if its inode is already on disk, we don't 4793 * have to worry about the parents at all. This is because 4794 * we can use the last_unlink_trans field to record renames 4795 * and other fun in this file. 4796 */ 4797 if (S_ISREG(inode->i_mode) && 4798 BTRFS_I(inode)->generation <= last_committed && 4799 BTRFS_I(inode)->last_unlink_trans <= last_committed) 4800 goto out; 4801 4802 if (!S_ISDIR(inode->i_mode)) { 4803 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) 4804 goto out; 4805 inode = d_inode(parent); 4806 } 4807 4808 while (1) { 4809 /* 4810 * If we are logging a directory then we start with our inode, 4811 * not our parent's inode, so we need to skip setting 4812 * logged_trans for it, so that further down in the log code we 4813 * don't think this inode has already been logged. 4814 */ 4815 if (inode != orig_inode) 4816 BTRFS_I(inode)->logged_trans = trans->transid; 4817 smp_mb(); 4818 4819 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 4820 root = BTRFS_I(inode)->root; 4821 4822 /* 4823 * make sure any commits to the log are forced 4824 * to be full commits 4825 */ 4826 btrfs_set_log_full_commit(root->fs_info, trans); 4827 ret = 1; 4828 break; 4829 } 4830 4831 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) 4832 break; 4833 4834 if (IS_ROOT(parent)) 4835 break; 4836 4837 parent = dget_parent(parent); 4838 dput(old_parent); 4839 old_parent = parent; 4840 inode = d_inode(parent); 4841 4842 } 4843 dput(old_parent); 4844 out: 4845 return ret; 4846 } 4847 4848 struct btrfs_dir_list { 4849 u64 ino; 4850 struct list_head list; 4851 }; 4852 4853 /* 4854 * Log the inodes of the new dentries of a directory. See log_dir_items() for 4855 * details about why it is needed.
4856 * This is a recursive operation - if an existing dentry corresponds to a 4857 * directory, that directory's new entries are logged too (same behaviour as 4858 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 4859 * the dentries point to we do not lock their i_mutex, otherwise lockdep 4860 * complains about the following circular lock dependency / possible deadlock: 4861 * 4862 * CPU0 CPU1 4863 * ---- ---- 4864 * lock(&type->i_mutex_dir_key#3/2); 4865 * lock(sb_internal#2); 4866 * lock(&type->i_mutex_dir_key#3/2); 4867 * lock(&sb->s_type->i_mutex_key#14); 4868 * 4869 * Where sb_internal is the lock (a counter that works as a lock) acquired by 4870 * sb_start_intwrite() in btrfs_start_transaction(). 4871 * Not locking i_mutex of the inodes is still safe because: 4872 * 4873 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 4874 * that while logging the inode new references (names) are added or removed 4875 * from the inode, leaving the logged inode item with a link count that does 4876 * not match the number of logged inode reference items. This is fine because 4877 * at log replay time we compute the real number of links and correct the 4878 * link count in the inode item (see replay_one_buffer() and 4879 * link_to_fixup_dir()); 4880 * 4881 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 4882 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 4883 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 4884 * has a size that doesn't match the sum of the lengths of all the logged 4885 * names. This does not result in a problem because if a dir_item key is 4886 * logged but its matching dir_index key is not logged, at log replay time we 4887 * don't use it to replay the respective name (see replay_one_name()). On the 4888 * other hand if only the dir_index key ends up being logged, the respective 4889 * name is added to the fs/subvol tree with both the dir_item and dir_index 4890 * keys created (see replay_one_name()). 4891 * The directory's inode item with a wrong i_size is not a problem as well, 4892 * since we don't use it at log replay time to set the i_size in the inode 4893 * item of the fs/subvol tree (see overwrite_item()). 
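 *
 * The walk below is a breadth-first traversal: dir_list acts as a FIFO
 * queue of directory inodes whose new dentries still have to be
 * scanned.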
4894 */ 4895 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 4896 struct btrfs_root *root, 4897 struct inode *start_inode, 4898 struct btrfs_log_ctx *ctx) 4899 { 4900 struct btrfs_root *log = root->log_root; 4901 struct btrfs_path *path; 4902 LIST_HEAD(dir_list); 4903 struct btrfs_dir_list *dir_elem; 4904 int ret = 0; 4905 4906 path = btrfs_alloc_path(); 4907 if (!path) 4908 return -ENOMEM; 4909 4910 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 4911 if (!dir_elem) { 4912 btrfs_free_path(path); 4913 return -ENOMEM; 4914 } 4915 dir_elem->ino = btrfs_ino(start_inode); 4916 list_add_tail(&dir_elem->list, &dir_list); 4917 4918 while (!list_empty(&dir_list)) { 4919 struct extent_buffer *leaf; 4920 struct btrfs_key min_key; 4921 int nritems; 4922 int i; 4923 4924 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 4925 list); 4926 if (ret) 4927 goto next_dir_inode; 4928 4929 min_key.objectid = dir_elem->ino; 4930 min_key.type = BTRFS_DIR_ITEM_KEY; 4931 min_key.offset = 0; 4932 again: 4933 btrfs_release_path(path); 4934 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 4935 if (ret < 0) { 4936 goto next_dir_inode; 4937 } else if (ret > 0) { 4938 ret = 0; 4939 goto next_dir_inode; 4940 } 4941 4942 process_leaf: 4943 leaf = path->nodes[0]; 4944 nritems = btrfs_header_nritems(leaf); 4945 for (i = path->slots[0]; i < nritems; i++) { 4946 struct btrfs_dir_item *di; 4947 struct btrfs_key di_key; 4948 struct inode *di_inode; 4949 struct btrfs_dir_list *new_dir_elem; 4950 int log_mode = LOG_INODE_EXISTS; 4951 int type; 4952 4953 btrfs_item_key_to_cpu(leaf, &min_key, i); 4954 if (min_key.objectid != dir_elem->ino || 4955 min_key.type != BTRFS_DIR_ITEM_KEY) 4956 goto next_dir_inode; 4957 4958 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 4959 type = btrfs_dir_type(leaf, di); 4960 if (btrfs_dir_transid(leaf, di) < trans->transid && 4961 type != BTRFS_FT_DIR) 4962 continue; 4963 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 4964 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 4965 continue; 4966 4967 di_inode = btrfs_iget(root->fs_info->sb, &di_key, 4968 root, NULL); 4969 if (IS_ERR(di_inode)) { 4970 ret = PTR_ERR(di_inode); 4971 goto next_dir_inode; 4972 } 4973 4974 if (btrfs_inode_in_log(di_inode, trans->transid)) { 4975 iput(di_inode); 4976 continue; 4977 } 4978 4979 ctx->log_new_dentries = false; 4980 if (type == BTRFS_FT_DIR) 4981 log_mode = LOG_INODE_ALL; 4982 btrfs_release_path(path); 4983 ret = btrfs_log_inode(trans, root, di_inode, 4984 log_mode, 0, LLONG_MAX, ctx); 4985 iput(di_inode); 4986 if (ret) 4987 goto next_dir_inode; 4988 if (ctx->log_new_dentries) { 4989 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 4990 GFP_NOFS); 4991 if (!new_dir_elem) { 4992 ret = -ENOMEM; 4993 goto next_dir_inode; 4994 } 4995 new_dir_elem->ino = di_key.objectid; 4996 list_add_tail(&new_dir_elem->list, &dir_list); 4997 } 4998 break; 4999 } 5000 if (i == nritems) { 5001 ret = btrfs_next_leaf(log, path); 5002 if (ret < 0) { 5003 goto next_dir_inode; 5004 } else if (ret > 0) { 5005 ret = 0; 5006 goto next_dir_inode; 5007 } 5008 goto process_leaf; 5009 } 5010 if (min_key.offset < (u64)-1) { 5011 min_key.offset++; 5012 goto again; 5013 } 5014 next_dir_inode: 5015 list_del(&dir_elem->list); 5016 kfree(dir_elem); 5017 } 5018 5019 btrfs_free_path(path); 5020 return ret; 5021 } 5022 5023 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5024 struct inode *inode, 5025 struct btrfs_log_ctx *ctx) 5026 { 5027 int ret; 5028 struct btrfs_path *path; 5029 struct btrfs_key 
key; 5030 struct btrfs_root *root = BTRFS_I(inode)->root; 5031 const u64 ino = btrfs_ino(inode); 5032 5033 path = btrfs_alloc_path(); 5034 if (!path) 5035 return -ENOMEM; 5036 path->skip_locking = 1; 5037 path->search_commit_root = 1; 5038 5039 key.objectid = ino; 5040 key.type = BTRFS_INODE_REF_KEY; 5041 key.offset = 0; 5042 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5043 if (ret < 0) 5044 goto out; 5045 5046 while (true) { 5047 struct extent_buffer *leaf = path->nodes[0]; 5048 int slot = path->slots[0]; 5049 u32 cur_offset = 0; 5050 u32 item_size; 5051 unsigned long ptr; 5052 5053 if (slot >= btrfs_header_nritems(leaf)) { 5054 ret = btrfs_next_leaf(root, path); 5055 if (ret < 0) 5056 goto out; 5057 else if (ret > 0) 5058 break; 5059 continue; 5060 } 5061 5062 btrfs_item_key_to_cpu(leaf, &key, slot); 5063 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5064 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5065 break; 5066 5067 item_size = btrfs_item_size_nr(leaf, slot); 5068 ptr = btrfs_item_ptr_offset(leaf, slot); 5069 while (cur_offset < item_size) { 5070 struct btrfs_key inode_key; 5071 struct inode *dir_inode; 5072 5073 inode_key.type = BTRFS_INODE_ITEM_KEY; 5074 inode_key.offset = 0; 5075 5076 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5077 struct btrfs_inode_extref *extref; 5078 5079 extref = (struct btrfs_inode_extref *) 5080 (ptr + cur_offset); 5081 inode_key.objectid = btrfs_inode_extref_parent( 5082 leaf, extref); 5083 cur_offset += sizeof(*extref); 5084 cur_offset += btrfs_inode_extref_name_len(leaf, 5085 extref); 5086 } else { 5087 inode_key.objectid = key.offset; 5088 cur_offset = item_size; 5089 } 5090 5091 dir_inode = btrfs_iget(root->fs_info->sb, &inode_key, 5092 root, NULL); 5093 /* If parent inode was deleted, skip it. */ 5094 if (IS_ERR(dir_inode)) 5095 continue; 5096 5097 ret = btrfs_log_inode(trans, root, dir_inode, 5098 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5099 iput(dir_inode); 5100 if (ret) 5101 goto out; 5102 } 5103 path->slots[0]++; 5104 } 5105 ret = 0; 5106 out: 5107 btrfs_free_path(path); 5108 return ret; 5109 } 5110 5111 /* 5112 * helper function around btrfs_log_inode to make sure newly created 5113 * parent directories also end up in the log. Minimal inode and backref 5114 * only logging is done for any parent directories that are newer than 5115 * the last committed transaction. 5116 */ 5117 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5118 struct btrfs_root *root, struct inode *inode, 5119 struct dentry *parent, 5120 const loff_t start, 5121 const loff_t end, 5122 int exists_only, 5123 struct btrfs_log_ctx *ctx) 5124 { 5125 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 5126 struct super_block *sb; 5127 struct dentry *old_parent = NULL; 5128 int ret = 0; 5129 u64 last_committed = root->fs_info->last_trans_committed; 5130 bool log_dentries = false; 5131 struct inode *orig_inode = inode; 5132 5133 sb = inode->i_sb; 5134 5135 if (btrfs_test_opt(root, NOTREELOG)) { 5136 ret = 1; 5137 goto end_no_trans; 5138 } 5139 5140 /* 5141 * The previous transaction's commit didn't complete, so we need to do 5142 * a full commit ourselves.
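 * (btrfs_set_log_full_commit() raised last_trans_log_full_commit when a
 * situation requiring a full commit was detected earlier in this
 * transaction.)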
5143 */ 5144 if (root->fs_info->last_trans_log_full_commit > 5145 root->fs_info->last_trans_committed) { 5146 ret = 1; 5147 goto end_no_trans; 5148 } 5149 5150 if (root != BTRFS_I(inode)->root || 5151 btrfs_root_refs(&root->root_item) == 0) { 5152 ret = 1; 5153 goto end_no_trans; 5154 } 5155 5156 ret = check_parent_dirs_for_sync(trans, inode, parent, 5157 sb, last_committed); 5158 if (ret) 5159 goto end_no_trans; 5160 5161 if (btrfs_inode_in_log(inode, trans->transid)) { 5162 ret = BTRFS_NO_LOG_SYNC; 5163 goto end_no_trans; 5164 } 5165 5166 ret = start_log_trans(trans, root, ctx); 5167 if (ret) 5168 goto end_no_trans; 5169 5170 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5171 if (ret) 5172 goto end_trans; 5173 5174 /* 5175 * for a regular file, if its inode is already on disk, we don't 5176 * have to worry about the parents at all. This is because 5177 * we can use the last_unlink_trans field to record renames 5178 * and other fun in this file. 5179 */ 5180 if (S_ISREG(inode->i_mode) && 5181 BTRFS_I(inode)->generation <= last_committed && 5182 BTRFS_I(inode)->last_unlink_trans <= last_committed) { 5183 ret = 0; 5184 goto end_trans; 5185 } 5186 5187 if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) 5188 log_dentries = true; 5189 5190 /* 5191 * On unlink we must make sure all our current and old parent directory 5192 * inodes are fully logged. This is to prevent leaving dangling 5193 * directory index entries in directories that were our parents but are 5194 * not anymore. Not doing this results in the old parent directory being 5195 * impossible to delete after log replay (rmdir will always fail with 5196 * error -ENOTEMPTY). 5197 * 5198 * Example 1: 5199 * 5200 * mkdir testdir 5201 * touch testdir/foo 5202 * ln testdir/foo testdir/bar 5203 * sync 5204 * unlink testdir/bar 5205 * xfs_io -c fsync testdir/foo 5206 * <power failure> 5207 * mount fs, triggers log replay 5208 * 5209 * If we don't log the parent directory (testdir), after log replay the 5210 * directory still has an entry pointing to the file inode using the bar 5211 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5212 * the file inode has a link count of 1. 5213 * 5214 * Example 2: 5215 * 5216 * mkdir testdir 5217 * touch foo 5218 * ln foo testdir/foo2 5219 * ln foo testdir/foo3 5220 * sync 5221 * unlink testdir/foo3 5222 * xfs_io -c fsync foo 5223 * <power failure> 5224 * mount fs, triggers log replay 5225 * 5226 * Similar to the first example: after log replay the parent directory 5227 * testdir still has an entry pointing to the file inode with the name 5228 * foo3, but the file inode does not have a matching BTRFS_INODE_REF_KEY 5229 * item and has a link count of 2.
5230 */ 5231 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 5232 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5233 if (ret) 5234 goto end_trans; 5235 } 5236 5237 while (1) { 5238 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) 5239 break; 5240 5241 inode = d_inode(parent); 5242 if (root != BTRFS_I(inode)->root) 5243 break; 5244 5245 if (BTRFS_I(inode)->generation > last_committed) { 5246 ret = btrfs_log_inode(trans, root, inode, 5247 LOG_INODE_EXISTS, 5248 0, LLONG_MAX, ctx); 5249 if (ret) 5250 goto end_trans; 5251 } 5252 if (IS_ROOT(parent)) 5253 break; 5254 5255 parent = dget_parent(parent); 5256 dput(old_parent); 5257 old_parent = parent; 5258 } 5259 if (log_dentries) 5260 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5261 else 5262 ret = 0; 5263 end_trans: 5264 dput(old_parent); 5265 if (ret < 0) { 5266 btrfs_set_log_full_commit(root->fs_info, trans); 5267 ret = 1; 5268 } 5269 5270 if (ret) 5271 btrfs_remove_log_ctx(root, ctx); 5272 btrfs_end_log_trans(root); 5273 end_no_trans: 5274 return ret; 5275 } 5276 5277 /* 5278 * it is not safe to log a dentry if the chunk root has added new 5279 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5280 * If this returns 1, you must commit the transaction to safely get your 5281 * data on disk. 5282 */ 5283 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5284 struct btrfs_root *root, struct dentry *dentry, 5285 const loff_t start, 5286 const loff_t end, 5287 struct btrfs_log_ctx *ctx) 5288 { 5289 struct dentry *parent = dget_parent(dentry); 5290 int ret; 5291 5292 ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, 5293 start, end, 0, ctx); 5294 dput(parent); 5295 5296 return ret; 5297 } 5298 5299 /* 5300 * should be called during mount to replay any log trees 5301 * from the FS 5302 */ 5303 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5304 { 5305 int ret; 5306 struct btrfs_path *path; 5307 struct btrfs_trans_handle *trans; 5308 struct btrfs_key key; 5309 struct btrfs_key found_key; 5310 struct btrfs_key tmp_key; 5311 struct btrfs_root *log; 5312 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5313 struct walk_control wc = { 5314 .process_func = process_one_buffer, 5315 .stage = 0, 5316 }; 5317 5318 path = btrfs_alloc_path(); 5319 if (!path) 5320 return -ENOMEM; 5321 5322 fs_info->log_root_recovering = 1; 5323 5324 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5325 if (IS_ERR(trans)) { 5326 ret = PTR_ERR(trans); 5327 goto error; 5328 } 5329 5330 wc.trans = trans; 5331 wc.pin = 1; 5332 5333 ret = walk_log_tree(trans, log_root_tree, &wc); 5334 if (ret) { 5335 btrfs_std_error(fs_info, ret, "Failed to pin buffers while " 5336 "recovering log root tree."); 5337 goto error; 5338 } 5339 5340 again: 5341 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5342 key.offset = (u64)-1; 5343 key.type = BTRFS_ROOT_ITEM_KEY; 5344 5345 while (1) { 5346 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5347 5348 if (ret < 0) { 5349 btrfs_std_error(fs_info, ret, 5350 "Couldn't find tree log root."); 5351 goto error; 5352 } 5353 if (ret > 0) { 5354 if (path->slots[0] == 0) 5355 break; 5356 path->slots[0]--; 5357 } 5358 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5359 path->slots[0]); 5360 btrfs_release_path(path); 5361 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5362 break; 5363 5364 log = btrfs_read_fs_root(log_root_tree, &found_key); 5365 if (IS_ERR(log)) { 5366 ret = PTR_ERR(log); 5367
btrfs_std_error(fs_info, ret, 5368 "Couldn't read tree log root."); 5369 goto error; 5370 } 5371 5372 tmp_key.objectid = found_key.offset; 5373 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5374 tmp_key.offset = (u64)-1; 5375 5376 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5377 if (IS_ERR(wc.replay_dest)) { 5378 ret = PTR_ERR(wc.replay_dest); 5379 free_extent_buffer(log->node); 5380 free_extent_buffer(log->commit_root); 5381 kfree(log); 5382 btrfs_std_error(fs_info, ret, "Couldn't read target root " 5383 "for tree log recovery."); 5384 goto error; 5385 } 5386 5387 wc.replay_dest->log_root = log; 5388 btrfs_record_root_in_trans(trans, wc.replay_dest); 5389 ret = walk_log_tree(trans, log, &wc); 5390 5391 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5392 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5393 path); 5394 } 5395 5396 key.offset = found_key.offset - 1; 5397 wc.replay_dest->log_root = NULL; 5398 free_extent_buffer(log->node); 5399 free_extent_buffer(log->commit_root); 5400 kfree(log); 5401 5402 if (ret) 5403 goto error; 5404 5405 if (found_key.offset == 0) 5406 break; 5407 } 5408 btrfs_release_path(path); 5409 5410 /* step one is to pin it all, step two is to replay just inodes */ 5411 if (wc.pin) { 5412 wc.pin = 0; 5413 wc.process_func = replay_one_buffer; 5414 wc.stage = LOG_WALK_REPLAY_INODES; 5415 goto again; 5416 } 5417 /* step three is to replay everything */ 5418 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5419 wc.stage++; 5420 goto again; 5421 } 5422 5423 btrfs_free_path(path); 5424 5425 /* step four: commit the transaction, which also unpins the blocks */ 5426 ret = btrfs_commit_transaction(trans, fs_info->tree_root); 5427 if (ret) 5428 return ret; 5429 5430 free_extent_buffer(log_root_tree->node); 5431 log_root_tree->log_root = NULL; 5432 fs_info->log_root_recovering = 0; 5433 kfree(log_root_tree); 5434 5435 return 0; 5436 error: 5437 if (wc.trans) 5438 btrfs_end_transaction(wc.trans, fs_info->tree_root); 5439 btrfs_free_path(path); 5440 return ret; 5441 } 5442 5443 /* 5444 * there are some corner cases where we want to force a full 5445 * commit instead of allowing a directory to be logged. 5446 * 5447 * They revolve around files that were unlinked from the directory, and 5448 * this function updates the parent directory so that a full commit is 5449 * properly done if it is fsync'd later after the unlinks are done. 5450 */ 5451 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5452 struct inode *dir, struct inode *inode, 5453 int for_rename) 5454 { 5455 /* 5456 * when we're logging a file, if it hasn't been renamed 5457 * or unlinked, and its inode is fully committed on disk, 5458 * we don't have to worry about walking up the directory chain 5459 * to log its parents. 5460 * 5461 * So, we use the last_unlink_trans field to put this transid 5462 * into the file. When the file is logged we check it and 5463 * don't log the parents if the file is fully on disk.
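 *
 * (check_parent_dirs_for_sync() and btrfs_log_inode_parent() compare
 * last_unlink_trans against last_trans_committed to decide whether the
 * parent walk or a full transaction commit is needed.)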
5464 */ 5465 if (S_ISREG(inode->i_mode)) 5466 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5467 5468 /* 5469 * if this directory was already logged, any new 5470 * names for this file/dir will get recorded 5471 */ 5472 smp_mb(); 5473 if (BTRFS_I(dir)->logged_trans == trans->transid) 5474 return; 5475 5476 /* 5477 * if the inode we're about to unlink was logged, 5478 * the log will be properly updated for any new names 5479 */ 5480 if (BTRFS_I(inode)->logged_trans == trans->transid) 5481 return; 5482 5483 /* 5484 * when renaming files across directories, if the directory 5485 * we're unlinking from gets fsync'd later on, there's 5486 * no way to find the destination directory later and fsync it 5487 * properly. So, we have to be conservative and force commits 5488 * so the new name gets discovered. 5489 */ 5490 if (for_rename) 5491 goto record; 5492 5493 /* we can safely do the unlink without any special recording */ 5494 return; 5495 5496 record: 5497 BTRFS_I(dir)->last_unlink_trans = trans->transid; 5498 } 5499 5500 /* 5501 * Call this after adding a new name for a file and it will properly 5502 * update the log to reflect the new name. 5503 * 5504 * It will return zero if all goes well, and it will return 1 if a 5505 * full transaction commit is required. 5506 */ 5507 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5508 struct inode *inode, struct inode *old_dir, 5509 struct dentry *parent) 5510 { 5511 struct btrfs_root *root = BTRFS_I(inode)->root; 5512 5513 /* 5514 * this will force the logging code to walk the dentry chain 5515 * up for the file 5516 */ 5517 if (S_ISREG(inode->i_mode)) 5518 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5519 5520 /* 5521 * if this inode hasn't been logged and the directory we're renaming it 5522 * from hasn't been logged, we don't need to log it 5523 */ 5524 if (BTRFS_I(inode)->logged_trans <= 5525 root->fs_info->last_trans_committed && 5526 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 5527 root->fs_info->last_trans_committed)) 5528 return 0; 5529 5530 return btrfs_log_inode_parent(trans, root, inode, parent, 0, 5531 LLONG_MAX, 1, NULL); 5532 } 5533 5534