/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
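 * The third stage (2) replays just the directory index items.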
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it
 * is using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
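 *
 * (Those correspond to the LOG_WALK_* stages defined above; directory
 * index items also get a replay pass of their own before the final pass.)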
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(root->fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&root->fs_info->tree_log_mutex);
		if (!root->fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, root->fs_info);
		mutex_unlock(&root->fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return 0;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		smp_mb();
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}


/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?
	 * Also used while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
						      eb->start, eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(log, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
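 *
 * Returns 0 on success, or a negative errno on failure.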
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(root, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(root, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
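			 * (A generation of zero is the marker used when the
			 * inode was logged only to record its existence,
			 * e.g. for LOG_INODE_EXISTS.)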
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size, root->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?
			 * If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(root, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset, 0);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root, root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range.  We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls).  In this case if we end up
			 * doing the replay of the one that partially
			 * references the extent first, and we do not do the
			 * csum deletion below, we can get 2 csum items in the
			 * csum tree that overlap each other.  For example,
			 * imagine our log has the two following file extent
			 * items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.  Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
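			 *
			 * (Here 12865536 == 12845056 + 20480, and with the
			 * default 4 bytes of csum per 4K block, a 20 byte
			 * csum item covers exactly that 20K sub-range.)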
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_del_csums(trans,
						      root->fs_info->csum_root,
						      sums->bytenr,
						      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						      root->fs_info->csum_root,
						      sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, root);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
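 *
 * Returns 1 if the name is found in the log and 0 if it is not.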
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct inode *dir, struct inode *inode,
				  struct extent_buffer *eb,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.
		 * if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir,
							 inode, victim_name,
							 victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, root);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					inc_nlink(inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
								 victim_parent,
								 inode,
								 victim_name,
								 victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans, root);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
			if (ret)
				return ret;
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.
	 * The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
				  ref_index, name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata.  if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      dir, inode, eb,
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, dir, inode, name, namelen,
					     0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}
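/*
 * count how many INODE_REF names exist for the given inode; used together
 * with count_inode_extrefs() to recompute an inode's link count during
 * replay.
 */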
static int count_inode_refs(struct btrfs_root *root,
			    struct inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
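 *
 * An orphan item is also inserted for any inode whose link count drops
 * to zero, so it is fully removed once the last reference goes away.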
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.
 * This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.
 * Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fixup the link count of
		 * the inode the entry points to.  Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for this
		 * dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir.  This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
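 *
 * find_dir_range() below returns 0 and sets *start_ret and *end_ret to
 * the next logged range for the directory, returns 1 when no further
 * range exists, or a negative errno on search errors.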
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * this looks for a given directory item in the log.
If the directory 1951 * item is not in the log, the item is removed and the inode it points 1952 * to is unlinked 1953 */ 1954 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1955 struct btrfs_root *root, 1956 struct btrfs_root *log, 1957 struct btrfs_path *path, 1958 struct btrfs_path *log_path, 1959 struct inode *dir, 1960 struct btrfs_key *dir_key) 1961 { 1962 int ret; 1963 struct extent_buffer *eb; 1964 int slot; 1965 u32 item_size; 1966 struct btrfs_dir_item *di; 1967 struct btrfs_dir_item *log_di; 1968 int name_len; 1969 unsigned long ptr; 1970 unsigned long ptr_end; 1971 char *name; 1972 struct inode *inode; 1973 struct btrfs_key location; 1974 1975 again: 1976 eb = path->nodes[0]; 1977 slot = path->slots[0]; 1978 item_size = btrfs_item_size_nr(eb, slot); 1979 ptr = btrfs_item_ptr_offset(eb, slot); 1980 ptr_end = ptr + item_size; 1981 while (ptr < ptr_end) { 1982 di = (struct btrfs_dir_item *)ptr; 1983 if (verify_dir_item(root, eb, di)) { 1984 ret = -EIO; 1985 goto out; 1986 } 1987 1988 name_len = btrfs_dir_name_len(eb, di); 1989 name = kmalloc(name_len, GFP_NOFS); 1990 if (!name) { 1991 ret = -ENOMEM; 1992 goto out; 1993 } 1994 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1995 name_len); 1996 log_di = NULL; 1997 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1998 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1999 dir_key->objectid, 2000 name, name_len, 0); 2001 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2002 log_di = btrfs_lookup_dir_index_item(trans, log, 2003 log_path, 2004 dir_key->objectid, 2005 dir_key->offset, 2006 name, name_len, 0); 2007 } 2008 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2009 btrfs_dir_item_key_to_cpu(eb, di, &location); 2010 btrfs_release_path(path); 2011 btrfs_release_path(log_path); 2012 inode = read_one_inode(root, location.objectid); 2013 if (!inode) { 2014 kfree(name); 2015 return -EIO; 2016 } 2017 2018 ret = link_to_fixup_dir(trans, root, 2019 path, location.objectid); 2020 if (ret) { 2021 kfree(name); 2022 iput(inode); 2023 goto out; 2024 } 2025 2026 inc_nlink(inode); 2027 ret = btrfs_unlink_inode(trans, root, dir, inode, 2028 name, name_len); 2029 if (!ret) 2030 ret = btrfs_run_delayed_items(trans, root); 2031 kfree(name); 2032 iput(inode); 2033 if (ret) 2034 goto out; 2035 2036 /* there might still be more names under this key 2037 * check and repeat if required 2038 */ 2039 ret = btrfs_search_slot(NULL, root, dir_key, path, 2040 0, 0); 2041 if (ret == 0) 2042 goto again; 2043 ret = 0; 2044 goto out; 2045 } else if (IS_ERR(log_di)) { 2046 kfree(name); 2047 return PTR_ERR(log_di); 2048 } 2049 btrfs_release_path(log_path); 2050 kfree(name); 2051 2052 ptr = (unsigned long)(di + 1); 2053 ptr += name_len; 2054 } 2055 ret = 0; 2056 out: 2057 btrfs_release_path(path); 2058 btrfs_release_path(log_path); 2059 return ret; 2060 } 2061 2062 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2063 struct btrfs_root *root, 2064 struct btrfs_root *log, 2065 struct btrfs_path *path, 2066 const u64 ino) 2067 { 2068 struct btrfs_key search_key; 2069 struct btrfs_path *log_path; 2070 int i; 2071 int nritems; 2072 int ret; 2073 2074 log_path = btrfs_alloc_path(); 2075 if (!log_path) 2076 return -ENOMEM; 2077 2078 search_key.objectid = ino; 2079 search_key.type = BTRFS_XATTR_ITEM_KEY; 2080 search_key.offset = 0; 2081 again: 2082 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2083 if (ret < 0) 2084 goto out; 2085 process_leaf: 2086 nritems = 
btrfs_header_nritems(path->nodes[0]); 2087 for (i = path->slots[0]; i < nritems; i++) { 2088 struct btrfs_key key; 2089 struct btrfs_dir_item *di; 2090 struct btrfs_dir_item *log_di; 2091 u32 total_size; 2092 u32 cur; 2093 2094 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2095 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2096 ret = 0; 2097 goto out; 2098 } 2099 2100 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2101 total_size = btrfs_item_size_nr(path->nodes[0], i); 2102 cur = 0; 2103 while (cur < total_size) { 2104 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2105 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2106 u32 this_len = sizeof(*di) + name_len + data_len; 2107 char *name; 2108 2109 name = kmalloc(name_len, GFP_NOFS); 2110 if (!name) { 2111 ret = -ENOMEM; 2112 goto out; 2113 } 2114 read_extent_buffer(path->nodes[0], name, 2115 (unsigned long)(di + 1), name_len); 2116 2117 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2118 name, name_len, 0); 2119 btrfs_release_path(log_path); 2120 if (!log_di) { 2121 /* Doesn't exist in log tree, so delete it. */ 2122 btrfs_release_path(path); 2123 di = btrfs_lookup_xattr(trans, root, path, ino, 2124 name, name_len, -1); 2125 kfree(name); 2126 if (IS_ERR(di)) { 2127 ret = PTR_ERR(di); 2128 goto out; 2129 } 2130 ASSERT(di); 2131 ret = btrfs_delete_one_dir_name(trans, root, 2132 path, di); 2133 if (ret) 2134 goto out; 2135 btrfs_release_path(path); 2136 search_key = key; 2137 goto again; 2138 } 2139 kfree(name); 2140 if (IS_ERR(log_di)) { 2141 ret = PTR_ERR(log_di); 2142 goto out; 2143 } 2144 cur += this_len; 2145 di = (struct btrfs_dir_item *)((char *)di + this_len); 2146 } 2147 } 2148 ret = btrfs_next_leaf(root, path); 2149 if (ret > 0) 2150 ret = 0; 2151 else if (ret == 0) 2152 goto process_leaf; 2153 out: 2154 btrfs_free_path(log_path); 2155 btrfs_release_path(path); 2156 return ret; 2157 } 2158 2159 2160 /* 2161 * deletion replay happens before we copy any new directory items 2162 * out of the log or out of backreferences from inodes. It 2163 * scans the log to find ranges of keys that the log is authoritative for, 2164 * and then scans the directory to find items in those ranges that are 2165 * not present in the log. 2166 * 2167 * Anything we don't find in the log is unlinked and removed from the 2168 * directory.
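 *
 * Note that the scan below is done twice, first for the
 * BTRFS_DIR_LOG_ITEM_KEY ranges and then again for the
 * BTRFS_DIR_LOG_INDEX_KEY ranges (see the next_type label), so both the
 * name-keyed and index-keyed halves of the directory are cleaned up.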
2169 */ 2170 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2171 struct btrfs_root *root, 2172 struct btrfs_root *log, 2173 struct btrfs_path *path, 2174 u64 dirid, int del_all) 2175 { 2176 u64 range_start; 2177 u64 range_end; 2178 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2179 int ret = 0; 2180 struct btrfs_key dir_key; 2181 struct btrfs_key found_key; 2182 struct btrfs_path *log_path; 2183 struct inode *dir; 2184 2185 dir_key.objectid = dirid; 2186 dir_key.type = BTRFS_DIR_ITEM_KEY; 2187 log_path = btrfs_alloc_path(); 2188 if (!log_path) 2189 return -ENOMEM; 2190 2191 dir = read_one_inode(root, dirid); 2192 /* it isn't an error if the inode isn't there, that can happen 2193 * because we replay the deletes before we copy in the inode item 2194 * from the log 2195 */ 2196 if (!dir) { 2197 btrfs_free_path(log_path); 2198 return 0; 2199 } 2200 again: 2201 range_start = 0; 2202 range_end = 0; 2203 while (1) { 2204 if (del_all) 2205 range_end = (u64)-1; 2206 else { 2207 ret = find_dir_range(log, path, dirid, key_type, 2208 &range_start, &range_end); 2209 if (ret != 0) 2210 break; 2211 } 2212 2213 dir_key.offset = range_start; 2214 while (1) { 2215 int nritems; 2216 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2217 0, 0); 2218 if (ret < 0) 2219 goto out; 2220 2221 nritems = btrfs_header_nritems(path->nodes[0]); 2222 if (path->slots[0] >= nritems) { 2223 ret = btrfs_next_leaf(root, path); 2224 if (ret) 2225 break; 2226 } 2227 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2228 path->slots[0]); 2229 if (found_key.objectid != dirid || 2230 found_key.type != dir_key.type) 2231 goto next_type; 2232 2233 if (found_key.offset > range_end) 2234 break; 2235 2236 ret = check_item_in_log(trans, root, log, path, 2237 log_path, dir, 2238 &found_key); 2239 if (ret) 2240 goto out; 2241 if (found_key.offset == (u64)-1) 2242 break; 2243 dir_key.offset = found_key.offset + 1; 2244 } 2245 btrfs_release_path(path); 2246 if (range_end == (u64)-1) 2247 break; 2248 range_start = range_end + 1; 2249 } 2250 2251 next_type: 2252 ret = 0; 2253 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2254 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2255 dir_key.type = BTRFS_DIR_INDEX_KEY; 2256 btrfs_release_path(path); 2257 goto again; 2258 } 2259 out: 2260 btrfs_release_path(path); 2261 btrfs_free_path(log_path); 2262 iput(dir); 2263 return ret; 2264 } 2265 2266 /* 2267 * the process_func used to replay items from the log tree. This 2268 * gets called in two different stages. The first stage just looks 2269 * for inodes and makes sure they are all copied into the subvolume. 2270 * 2271 * The second stage copies all the other item types from the log into 2272 * the subvolume. The two stage approach is slower, but gets rid of 2273 * lots of complexity around inodes referencing other inodes that exist 2274 * only in the log (references come from either directory items or inode 2275 * back refs). 
2276 */ 2277 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2278 struct walk_control *wc, u64 gen) 2279 { 2280 int nritems; 2281 struct btrfs_path *path; 2282 struct btrfs_root *root = wc->replay_dest; 2283 struct btrfs_key key; 2284 int level; 2285 int i; 2286 int ret; 2287 2288 ret = btrfs_read_buffer(eb, gen); 2289 if (ret) 2290 return ret; 2291 2292 level = btrfs_header_level(eb); 2293 2294 if (level != 0) 2295 return 0; 2296 2297 path = btrfs_alloc_path(); 2298 if (!path) 2299 return -ENOMEM; 2300 2301 nritems = btrfs_header_nritems(eb); 2302 for (i = 0; i < nritems; i++) { 2303 btrfs_item_key_to_cpu(eb, &key, i); 2304 2305 /* inode keys are done during the first stage */ 2306 if (key.type == BTRFS_INODE_ITEM_KEY && 2307 wc->stage == LOG_WALK_REPLAY_INODES) { 2308 struct btrfs_inode_item *inode_item; 2309 u32 mode; 2310 2311 inode_item = btrfs_item_ptr(eb, i, 2312 struct btrfs_inode_item); 2313 ret = replay_xattr_deletes(wc->trans, root, log, 2314 path, key.objectid); 2315 if (ret) 2316 break; 2317 mode = btrfs_inode_mode(eb, inode_item); 2318 if (S_ISDIR(mode)) { 2319 ret = replay_dir_deletes(wc->trans, 2320 root, log, path, key.objectid, 0); 2321 if (ret) 2322 break; 2323 } 2324 ret = overwrite_item(wc->trans, root, path, 2325 eb, i, &key); 2326 if (ret) 2327 break; 2328 2329 /* for regular files, make sure the corresponding 2330 * orphan item exists. Extents past the new EOF 2331 * will be truncated later by orphan cleanup. 2332 */ 2333 if (S_ISREG(mode)) { 2334 ret = insert_orphan_item(wc->trans, root, 2335 key.objectid); 2336 if (ret) 2337 break; 2338 } 2339 2340 ret = link_to_fixup_dir(wc->trans, root, 2341 path, key.objectid); 2342 if (ret) 2343 break; 2344 } 2345 2346 if (key.type == BTRFS_DIR_INDEX_KEY && 2347 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2348 ret = replay_one_dir_item(wc->trans, root, path, 2349 eb, i, &key); 2350 if (ret) 2351 break; 2352 } 2353 2354 if (wc->stage < LOG_WALK_REPLAY_ALL) 2355 continue; 2356 2357 /* these keys are simply copied */ 2358 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2359 ret = overwrite_item(wc->trans, root, path, 2360 eb, i, &key); 2361 if (ret) 2362 break; 2363 } else if (key.type == BTRFS_INODE_REF_KEY || 2364 key.type == BTRFS_INODE_EXTREF_KEY) { 2365 ret = add_inode_ref(wc->trans, root, log, path, 2366 eb, i, &key); 2367 if (ret && ret != -ENOENT) 2368 break; 2369 ret = 0; 2370 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2371 ret = replay_one_extent(wc->trans, root, path, 2372 eb, i, &key); 2373 if (ret) 2374 break; 2375 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2376 ret = replay_one_dir_item(wc->trans, root, path, 2377 eb, i, &key); 2378 if (ret) 2379 break; 2380 } 2381 } 2382 btrfs_free_path(path); 2383 return ret; 2384 } 2385 2386 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2387 struct btrfs_root *root, 2388 struct btrfs_path *path, int *level, 2389 struct walk_control *wc) 2390 { 2391 u64 root_owner; 2392 u64 bytenr; 2393 u64 ptr_gen; 2394 struct extent_buffer *next; 2395 struct extent_buffer *cur; 2396 struct extent_buffer *parent; 2397 u32 blocksize; 2398 int ret = 0; 2399 2400 WARN_ON(*level < 0); 2401 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2402 2403 while (*level > 0) { 2404 WARN_ON(*level < 0); 2405 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2406 cur = path->nodes[*level]; 2407 2408 WARN_ON(btrfs_header_level(cur) != *level); 2409 2410 if (path->slots[*level] >= 2411 btrfs_header_nritems(cur)) 2412 break; 2413 2414 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
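/* read the expected generation alongside the block pointer; it is handed to process_func/btrfs_read_buffer below so a stale or misplaced block can be detected when it is actually read */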
2415 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2416 blocksize = root->nodesize; 2417 2418 parent = path->nodes[*level]; 2419 root_owner = btrfs_header_owner(parent); 2420 2421 next = btrfs_find_create_tree_block(root, bytenr); 2422 if (!next) 2423 return -ENOMEM; 2424 2425 if (*level == 1) { 2426 ret = wc->process_func(root, next, wc, ptr_gen); 2427 if (ret) { 2428 free_extent_buffer(next); 2429 return ret; 2430 } 2431 2432 path->slots[*level]++; 2433 if (wc->free) { 2434 ret = btrfs_read_buffer(next, ptr_gen); 2435 if (ret) { 2436 free_extent_buffer(next); 2437 return ret; 2438 } 2439 2440 if (trans) { 2441 btrfs_tree_lock(next); 2442 btrfs_set_lock_blocking(next); 2443 clean_tree_block(trans, root->fs_info, 2444 next); 2445 btrfs_wait_tree_block_writeback(next); 2446 btrfs_tree_unlock(next); 2447 } 2448 2449 WARN_ON(root_owner != 2450 BTRFS_TREE_LOG_OBJECTID); 2451 ret = btrfs_free_and_pin_reserved_extent(root, 2452 bytenr, blocksize); 2453 if (ret) { 2454 free_extent_buffer(next); 2455 return ret; 2456 } 2457 } 2458 free_extent_buffer(next); 2459 continue; 2460 } 2461 ret = btrfs_read_buffer(next, ptr_gen); 2462 if (ret) { 2463 free_extent_buffer(next); 2464 return ret; 2465 } 2466 2467 WARN_ON(*level <= 0); 2468 if (path->nodes[*level-1]) 2469 free_extent_buffer(path->nodes[*level-1]); 2470 path->nodes[*level-1] = next; 2471 *level = btrfs_header_level(next); 2472 path->slots[*level] = 0; 2473 cond_resched(); 2474 } 2475 WARN_ON(*level < 0); 2476 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2477 2478 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2479 2480 cond_resched(); 2481 return 0; 2482 } 2483 2484 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2485 struct btrfs_root *root, 2486 struct btrfs_path *path, int *level, 2487 struct walk_control *wc) 2488 { 2489 u64 root_owner; 2490 int i; 2491 int slot; 2492 int ret; 2493 2494 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2495 slot = path->slots[i]; 2496 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2497 path->slots[i]++; 2498 *level = i; 2499 WARN_ON(*level == 0); 2500 return 0; 2501 } else { 2502 struct extent_buffer *parent; 2503 if (path->nodes[*level] == root->node) 2504 parent = path->nodes[*level]; 2505 else 2506 parent = path->nodes[*level + 1]; 2507 2508 root_owner = btrfs_header_owner(parent); 2509 ret = wc->process_func(root, path->nodes[*level], wc, 2510 btrfs_header_generation(path->nodes[*level])); 2511 if (ret) 2512 return ret; 2513 2514 if (wc->free) { 2515 struct extent_buffer *next; 2516 2517 next = path->nodes[*level]; 2518 2519 if (trans) { 2520 btrfs_tree_lock(next); 2521 btrfs_set_lock_blocking(next); 2522 clean_tree_block(trans, root->fs_info, 2523 next); 2524 btrfs_wait_tree_block_writeback(next); 2525 btrfs_tree_unlock(next); 2526 } 2527 2528 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2529 ret = btrfs_free_and_pin_reserved_extent(root, 2530 path->nodes[*level]->start, 2531 path->nodes[*level]->len); 2532 if (ret) 2533 return ret; 2534 } 2535 free_extent_buffer(path->nodes[*level]); 2536 path->nodes[*level] = NULL; 2537 *level = i + 1; 2538 } 2539 } 2540 return 1; 2541 } 2542 2543 /* 2544 * drop the reference count on the tree rooted at 'log'. This traverses 2545 * the tree freeing any blocks that have a ref count of zero after being 2546 * decremented.
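 *
 * The walk alternates between walk_down_log_tree() and
 * walk_up_log_tree() until the whole tree has been visited; whether
 * blocks are merely processed or also freed is controlled by the
 * wc->free flag in the walk_control.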
2547 */ 2548 static int walk_log_tree(struct btrfs_trans_handle *trans, 2549 struct btrfs_root *log, struct walk_control *wc) 2550 { 2551 int ret = 0; 2552 int wret; 2553 int level; 2554 struct btrfs_path *path; 2555 int orig_level; 2556 2557 path = btrfs_alloc_path(); 2558 if (!path) 2559 return -ENOMEM; 2560 2561 level = btrfs_header_level(log->node); 2562 orig_level = level; 2563 path->nodes[level] = log->node; 2564 extent_buffer_get(log->node); 2565 path->slots[level] = 0; 2566 2567 while (1) { 2568 wret = walk_down_log_tree(trans, log, path, &level, wc); 2569 if (wret > 0) 2570 break; 2571 if (wret < 0) { 2572 ret = wret; 2573 goto out; 2574 } 2575 2576 wret = walk_up_log_tree(trans, log, path, &level, wc); 2577 if (wret > 0) 2578 break; 2579 if (wret < 0) { 2580 ret = wret; 2581 goto out; 2582 } 2583 } 2584 2585 /* was the root node processed? if not, catch it here */ 2586 if (path->nodes[orig_level]) { 2587 ret = wc->process_func(log, path->nodes[orig_level], wc, 2588 btrfs_header_generation(path->nodes[orig_level])); 2589 if (ret) 2590 goto out; 2591 if (wc->free) { 2592 struct extent_buffer *next; 2593 2594 next = path->nodes[orig_level]; 2595 2596 if (trans) { 2597 btrfs_tree_lock(next); 2598 btrfs_set_lock_blocking(next); 2599 clean_tree_block(trans, log->fs_info, next); 2600 btrfs_wait_tree_block_writeback(next); 2601 btrfs_tree_unlock(next); 2602 } 2603 2604 WARN_ON(log->root_key.objectid != 2605 BTRFS_TREE_LOG_OBJECTID); 2606 ret = btrfs_free_and_pin_reserved_extent(log, next->start, 2607 next->len); 2608 if (ret) 2609 goto out; 2610 } 2611 } 2612 2613 out: 2614 btrfs_free_path(path); 2615 return ret; 2616 } 2617 2618 /* 2619 * helper function to update the item for a given subvolume's log root 2620 * in the tree of log roots 2621 */ 2622 static int update_log_root(struct btrfs_trans_handle *trans, 2623 struct btrfs_root *log) 2624 { 2625 int ret; 2626 2627 if (log->log_transid == 1) { 2628 /* insert root item on the first sync */ 2629 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 2630 &log->root_key, &log->root_item); 2631 } else { 2632 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2633 &log->root_key, &log->root_item); 2634 } 2635 return ret; 2636 } 2637 2638 static void wait_log_commit(struct btrfs_root *root, int transid) 2639 { 2640 DEFINE_WAIT(wait); 2641 int index = transid % 2; 2642 2643 /* 2644 * we only allow two pending log transactions at a time, 2645 * so we know that if ours is more than 2 older than the 2646 * current transaction, we're done 2647 */ 2648 do { 2649 prepare_to_wait(&root->log_commit_wait[index], 2650 &wait, TASK_UNINTERRUPTIBLE); 2651 mutex_unlock(&root->log_mutex); 2652 2653 if (root->log_transid_committed < transid && 2654 atomic_read(&root->log_commit[index])) 2655 schedule(); 2656 2657 finish_wait(&root->log_commit_wait[index], &wait); 2658 mutex_lock(&root->log_mutex); 2659 } while (root->log_transid_committed < transid && 2660 atomic_read(&root->log_commit[index])); 2661 } 2662 2663 static void wait_for_writer(struct btrfs_root *root) 2664 { 2665 DEFINE_WAIT(wait); 2666 2667 while (atomic_read(&root->log_writers)) { 2668 prepare_to_wait(&root->log_writer_wait, 2669 &wait, TASK_UNINTERRUPTIBLE); 2670 mutex_unlock(&root->log_mutex); 2671 if (atomic_read(&root->log_writers)) 2672 schedule(); 2673 finish_wait(&root->log_writer_wait, &wait); 2674 mutex_lock(&root->log_mutex); 2675 } 2676 } 2677 2678 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2679 struct btrfs_log_ctx *ctx) 2680 { 2681 if (!ctx)
return; 2683 2684 mutex_lock(&root->log_mutex); 2685 list_del_init(&ctx->list); 2686 mutex_unlock(&root->log_mutex); 2687 } 2688 2689 /* 2690 * Invoked in log mutex context, or when the caller is sure that no other 2691 * task can access the list. 2692 */ 2693 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2694 int index, int error) 2695 { 2696 struct btrfs_log_ctx *ctx; 2697 2698 if (!error) { 2699 INIT_LIST_HEAD(&root->log_ctxs[index]); 2700 return; 2701 } 2702 2703 list_for_each_entry(ctx, &root->log_ctxs[index], list) 2704 ctx->log_ret = error; 2705 2706 INIT_LIST_HEAD(&root->log_ctxs[index]); 2707 } 2708 2709 /* 2710 * btrfs_sync_log sends a given tree log down to the disk and 2711 * updates the super blocks to record it. When this call is done, 2712 * you know that any inodes previously logged are safely on disk only 2713 * if it returns 0. 2714 * 2715 * Any other return value means you need to call btrfs_commit_transaction. 2716 * Some of the edge cases for fsyncing directories that have had unlinks 2717 * or renames done in the past mean that sometimes the only safe 2718 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2719 * that has happened. 2720 */ 2721 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2722 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2723 { 2724 int index1; 2725 int index2; 2726 int mark; 2727 int ret; 2728 struct btrfs_root *log = root->log_root; 2729 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2730 int log_transid = 0; 2731 struct btrfs_log_ctx root_log_ctx; 2732 struct blk_plug plug; 2733 2734 mutex_lock(&root->log_mutex); 2735 log_transid = ctx->log_transid; 2736 if (root->log_transid_committed >= log_transid) { 2737 mutex_unlock(&root->log_mutex); 2738 return ctx->log_ret; 2739 } 2740 2741 index1 = log_transid % 2; 2742 if (atomic_read(&root->log_commit[index1])) { 2743 wait_log_commit(root, log_transid); 2744 mutex_unlock(&root->log_mutex); 2745 return ctx->log_ret; 2746 } 2747 ASSERT(log_transid == root->log_transid); 2748 atomic_set(&root->log_commit[index1], 1); 2749 2750 /* wait for previous tree log sync to complete */ 2751 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2752 wait_log_commit(root, log_transid - 1); 2753 2754 while (1) { 2755 int batch = atomic_read(&root->log_batch); 2756 /* when we're on an ssd, just kick the log commit out */ 2757 if (!btrfs_test_opt(root, SSD) && 2758 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2759 mutex_unlock(&root->log_mutex); 2760 schedule_timeout_uninterruptible(1); 2761 mutex_lock(&root->log_mutex); 2762 } 2763 wait_for_writer(root); 2764 if (batch == atomic_read(&root->log_batch)) 2765 break; 2766 } 2767 2768 /* bail out if we need to do a full commit */ 2769 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2770 ret = -EAGAIN; 2771 btrfs_free_logged_extents(log, log_transid); 2772 mutex_unlock(&root->log_mutex); 2773 goto out; 2774 } 2775 2776 if (log_transid % 2 == 0) 2777 mark = EXTENT_DIRTY; 2778 else 2779 mark = EXTENT_NEW; 2780 2781 /* we start IO on all the marked extents here, but we don't actually 2782 * wait for them until later.
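 *
 * The mark alternates between EXTENT_DIRTY and EXTENT_NEW based on the
 * parity of the log transid (see just above); since at most two log
 * transactions can be pending at once, the pages of two in-flight log
 * commits never share a mark and can be waited on independently.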
2783 */ 2784 blk_start_plug(&plug); 2785 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2786 if (ret) { 2787 blk_finish_plug(&plug); 2788 btrfs_abort_transaction(trans, root, ret); 2789 btrfs_free_logged_extents(log, log_transid); 2790 btrfs_set_log_full_commit(root->fs_info, trans); 2791 mutex_unlock(&root->log_mutex); 2792 goto out; 2793 } 2794 2795 btrfs_set_root_node(&log->root_item, log->node); 2796 2797 root->log_transid++; 2798 log->log_transid = root->log_transid; 2799 root->log_start_pid = 0; 2800 /* 2801 * IO has been started, blocks of the log tree have WRITTEN flag set 2802 * in their headers. new modifications of the log will be written to 2803 * new positions. so it's safe to allow log writers to go in. 2804 */ 2805 mutex_unlock(&root->log_mutex); 2806 2807 btrfs_init_log_ctx(&root_log_ctx); 2808 2809 mutex_lock(&log_root_tree->log_mutex); 2810 atomic_inc(&log_root_tree->log_batch); 2811 atomic_inc(&log_root_tree->log_writers); 2812 2813 index2 = log_root_tree->log_transid % 2; 2814 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2815 root_log_ctx.log_transid = log_root_tree->log_transid; 2816 2817 mutex_unlock(&log_root_tree->log_mutex); 2818 2819 ret = update_log_root(trans, log); 2820 2821 mutex_lock(&log_root_tree->log_mutex); 2822 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2823 smp_mb(); 2824 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2825 wake_up(&log_root_tree->log_writer_wait); 2826 } 2827 2828 if (ret) { 2829 if (!list_empty(&root_log_ctx.list)) 2830 list_del_init(&root_log_ctx.list); 2831 2832 blk_finish_plug(&plug); 2833 btrfs_set_log_full_commit(root->fs_info, trans); 2834 2835 if (ret != -ENOSPC) { 2836 btrfs_abort_transaction(trans, root, ret); 2837 mutex_unlock(&log_root_tree->log_mutex); 2838 goto out; 2839 } 2840 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2841 btrfs_free_logged_extents(log, log_transid); 2842 mutex_unlock(&log_root_tree->log_mutex); 2843 ret = -EAGAIN; 2844 goto out; 2845 } 2846 2847 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2848 blk_finish_plug(&plug); 2849 mutex_unlock(&log_root_tree->log_mutex); 2850 ret = root_log_ctx.log_ret; 2851 goto out; 2852 } 2853 2854 index2 = root_log_ctx.log_transid % 2; 2855 if (atomic_read(&log_root_tree->log_commit[index2])) { 2856 blk_finish_plug(&plug); 2857 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, 2858 mark); 2859 btrfs_wait_logged_extents(trans, log, log_transid); 2860 wait_log_commit(log_root_tree, 2861 root_log_ctx.log_transid); 2862 mutex_unlock(&log_root_tree->log_mutex); 2863 if (!ret) 2864 ret = root_log_ctx.log_ret; 2865 goto out; 2866 } 2867 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2868 atomic_set(&log_root_tree->log_commit[index2], 1); 2869 2870 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2871 wait_log_commit(log_root_tree, 2872 root_log_ctx.log_transid - 1); 2873 } 2874 2875 wait_for_writer(log_root_tree); 2876 2877 /* 2878 * now that we've moved on to the tree of log tree roots, 2879 * check the full commit flag again 2880 */ 2881 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2882 blk_finish_plug(&plug); 2883 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2884 btrfs_free_logged_extents(log, log_transid); 2885 mutex_unlock(&log_root_tree->log_mutex); 2886 ret = -EAGAIN; 2887 goto out_wake_log_root; 2888 } 2889 2890 ret = btrfs_write_marked_extents(log_root_tree, 2891 
&log_root_tree->dirty_log_pages, 2892 EXTENT_DIRTY | EXTENT_NEW); 2893 blk_finish_plug(&plug); 2894 if (ret) { 2895 btrfs_set_log_full_commit(root->fs_info, trans); 2896 btrfs_abort_transaction(trans, root, ret); 2897 btrfs_free_logged_extents(log, log_transid); 2898 mutex_unlock(&log_root_tree->log_mutex); 2899 goto out_wake_log_root; 2900 } 2901 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2902 if (!ret) 2903 ret = btrfs_wait_marked_extents(log_root_tree, 2904 &log_root_tree->dirty_log_pages, 2905 EXTENT_NEW | EXTENT_DIRTY); 2906 if (ret) { 2907 btrfs_set_log_full_commit(root->fs_info, trans); 2908 btrfs_free_logged_extents(log, log_transid); 2909 mutex_unlock(&log_root_tree->log_mutex); 2910 goto out_wake_log_root; 2911 } 2912 btrfs_wait_logged_extents(trans, log, log_transid); 2913 2914 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2915 log_root_tree->node->start); 2916 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2917 btrfs_header_level(log_root_tree->node)); 2918 2919 log_root_tree->log_transid++; 2920 mutex_unlock(&log_root_tree->log_mutex); 2921 2922 /* 2923 * nobody else is going to jump in and write the ctree 2924 * super here because the log_commit atomic below is protecting 2925 * us. We must be called with a transaction handle pinning 2926 * the running transaction open, so a full commit can't hop 2927 * in and cause problems either. 2928 */ 2929 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2930 if (ret) { 2931 btrfs_set_log_full_commit(root->fs_info, trans); 2932 btrfs_abort_transaction(trans, root, ret); 2933 goto out_wake_log_root; 2934 } 2935 2936 mutex_lock(&root->log_mutex); 2937 if (root->last_log_commit < log_transid) 2938 root->last_log_commit = log_transid; 2939 mutex_unlock(&root->log_mutex); 2940 2941 out_wake_log_root: 2942 /* 2943 * We needn't get log_mutex here because we are sure all 2944 * the other tasks are blocked. 2945 */ 2946 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 2947 2948 mutex_lock(&log_root_tree->log_mutex); 2949 log_root_tree->log_transid_committed++; 2950 atomic_set(&log_root_tree->log_commit[index2], 0); 2951 mutex_unlock(&log_root_tree->log_mutex); 2952 2953 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2954 wake_up(&log_root_tree->log_commit_wait[index2]); 2955 out: 2956 /* See above.
*/ 2957 btrfs_remove_all_log_ctxs(root, index1, ret); 2958 2959 mutex_lock(&root->log_mutex); 2960 root->log_transid_committed++; 2961 atomic_set(&root->log_commit[index1], 0); 2962 mutex_unlock(&root->log_mutex); 2963 2964 if (waitqueue_active(&root->log_commit_wait[index1])) 2965 wake_up(&root->log_commit_wait[index1]); 2966 return ret; 2967 } 2968 2969 static void free_log_tree(struct btrfs_trans_handle *trans, 2970 struct btrfs_root *log) 2971 { 2972 int ret; 2973 u64 start; 2974 u64 end; 2975 struct walk_control wc = { 2976 .free = 1, 2977 .process_func = process_one_buffer 2978 }; 2979 2980 ret = walk_log_tree(trans, log, &wc); 2981 /* I don't think this can happen but just in case */ 2982 if (ret) 2983 btrfs_abort_transaction(trans, log, ret); 2984 2985 while (1) { 2986 ret = find_first_extent_bit(&log->dirty_log_pages, 2987 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 2988 NULL); 2989 if (ret) 2990 break; 2991 2992 clear_extent_bits(&log->dirty_log_pages, start, end, 2993 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2994 } 2995 2996 /* 2997 * We may have short-circuited the log tree with the full commit logic 2998 * and left ordered extents on our list, so clear these out to keep us 2999 * from leaking inodes and memory. 3000 */ 3001 btrfs_free_logged_extents(log, 0); 3002 btrfs_free_logged_extents(log, 1); 3003 3004 free_extent_buffer(log->node); 3005 kfree(log); 3006 } 3007 3008 /* 3009 * free all the extents used by the tree log. This should be called 3010 * at commit time of the full transaction 3011 */ 3012 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3013 { 3014 if (root->log_root) { 3015 free_log_tree(trans, root->log_root); 3016 root->log_root = NULL; 3017 } 3018 return 0; 3019 } 3020 3021 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3022 struct btrfs_fs_info *fs_info) 3023 { 3024 if (fs_info->log_root_tree) { 3025 free_log_tree(trans, fs_info->log_root_tree); 3026 fs_info->log_root_tree = NULL; 3027 } 3028 return 0; 3029 } 3030 3031 /* 3032 * If both a file and directory are logged, and unlinks or renames are 3033 * mixed in, we have a few interesting corners: 3034 * 3035 * create file X in dir Y 3036 * link file X to X.link in dir Y 3037 * fsync file X 3038 * unlink file X but leave X.link 3039 * fsync dir Y 3040 * 3041 * After a crash we would expect only X.link to exist. But file X 3042 * didn't get fsync'd again so the log has back refs for X and X.link. 3043 * 3044 * We solve this by removing directory entries and inode backrefs from the 3045 * log when a file that was logged in the current transaction is 3046 * unlinked. Any later fsync will include the updated log entries, and 3047 * we'll be able to reconstruct the proper directory items from backrefs. 3048 * 3049 * This optimization allows us to avoid relogging the entire inode 3050 * or the entire directory.
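 *
 * btrfs_del_dir_entries_in_log() below implements this for directory
 * entries: it removes both the dir item and the dir index item for the
 * name from the log tree and shrinks the logged directory size by the
 * name length, so replay will not resurrect the unlinked name.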
3051 */ 3052 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3053 struct btrfs_root *root, 3054 const char *name, int name_len, 3055 struct inode *dir, u64 index) 3056 { 3057 struct btrfs_root *log; 3058 struct btrfs_dir_item *di; 3059 struct btrfs_path *path; 3060 int ret; 3061 int err = 0; 3062 int bytes_del = 0; 3063 u64 dir_ino = btrfs_ino(dir); 3064 3065 if (BTRFS_I(dir)->logged_trans < trans->transid) 3066 return 0; 3067 3068 ret = join_running_log_trans(root); 3069 if (ret) 3070 return 0; 3071 3072 mutex_lock(&BTRFS_I(dir)->log_mutex); 3073 3074 log = root->log_root; 3075 path = btrfs_alloc_path(); 3076 if (!path) { 3077 err = -ENOMEM; 3078 goto out_unlock; 3079 } 3080 3081 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3082 name, name_len, -1); 3083 if (IS_ERR(di)) { 3084 err = PTR_ERR(di); 3085 goto fail; 3086 } 3087 if (di) { 3088 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3089 bytes_del += name_len; 3090 if (ret) { 3091 err = ret; 3092 goto fail; 3093 } 3094 } 3095 btrfs_release_path(path); 3096 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3097 index, name, name_len, -1); 3098 if (IS_ERR(di)) { 3099 err = PTR_ERR(di); 3100 goto fail; 3101 } 3102 if (di) { 3103 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3104 bytes_del += name_len; 3105 if (ret) { 3106 err = ret; 3107 goto fail; 3108 } 3109 } 3110 3111 /* update the directory size in the log to reflect the names 3112 * we have removed 3113 */ 3114 if (bytes_del) { 3115 struct btrfs_key key; 3116 3117 key.objectid = dir_ino; 3118 key.offset = 0; 3119 key.type = BTRFS_INODE_ITEM_KEY; 3120 btrfs_release_path(path); 3121 3122 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3123 if (ret < 0) { 3124 err = ret; 3125 goto fail; 3126 } 3127 if (ret == 0) { 3128 struct btrfs_inode_item *item; 3129 u64 i_size; 3130 3131 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3132 struct btrfs_inode_item); 3133 i_size = btrfs_inode_size(path->nodes[0], item); 3134 if (i_size > bytes_del) 3135 i_size -= bytes_del; 3136 else 3137 i_size = 0; 3138 btrfs_set_inode_size(path->nodes[0], item, i_size); 3139 btrfs_mark_buffer_dirty(path->nodes[0]); 3140 } else 3141 ret = 0; 3142 btrfs_release_path(path); 3143 } 3144 fail: 3145 btrfs_free_path(path); 3146 out_unlock: 3147 mutex_unlock(&BTRFS_I(dir)->log_mutex); 3148 if (ret == -ENOSPC) { 3149 btrfs_set_log_full_commit(root->fs_info, trans); 3150 ret = 0; 3151 } else if (ret < 0) 3152 btrfs_abort_transaction(trans, root, ret); 3153 3154 btrfs_end_log_trans(root); 3155 3156 return err; 3157 } 3158 3159 /* see comments for btrfs_del_dir_entries_in_log */ 3160 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3161 struct btrfs_root *root, 3162 const char *name, int name_len, 3163 struct inode *inode, u64 dirid) 3164 { 3165 struct btrfs_root *log; 3166 u64 index; 3167 int ret; 3168 3169 if (BTRFS_I(inode)->logged_trans < trans->transid) 3170 return 0; 3171 3172 ret = join_running_log_trans(root); 3173 if (ret) 3174 return 0; 3175 log = root->log_root; 3176 mutex_lock(&BTRFS_I(inode)->log_mutex); 3177 3178 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3179 dirid, &index); 3180 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3181 if (ret == -ENOSPC) { 3182 btrfs_set_log_full_commit(root->fs_info, trans); 3183 ret = 0; 3184 } else if (ret < 0 && ret != -ENOENT) 3185 btrfs_abort_transaction(trans, root, ret); 3186 btrfs_end_log_trans(root); 3187 3188 return ret; 3189 } 3190 3191 /* 3192 * creates a range 
item in the log for 'dirid'. first_offset and 3193 * last_offset tell us which parts of the key space the log should 3194 * be considered authoritative for. 3195 */ 3196 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3197 struct btrfs_root *log, 3198 struct btrfs_path *path, 3199 int key_type, u64 dirid, 3200 u64 first_offset, u64 last_offset) 3201 { 3202 int ret; 3203 struct btrfs_key key; 3204 struct btrfs_dir_log_item *item; 3205 3206 key.objectid = dirid; 3207 key.offset = first_offset; 3208 if (key_type == BTRFS_DIR_ITEM_KEY) 3209 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3210 else 3211 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3212 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3213 if (ret) 3214 return ret; 3215 3216 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3217 struct btrfs_dir_log_item); 3218 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3219 btrfs_mark_buffer_dirty(path->nodes[0]); 3220 btrfs_release_path(path); 3221 return 0; 3222 } 3223 3224 /* 3225 * log all the items included in the current transaction for a given 3226 * directory. This also creates the range items in the log tree required 3227 * to replay anything deleted before the fsync 3228 */ 3229 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3230 struct btrfs_root *root, struct inode *inode, 3231 struct btrfs_path *path, 3232 struct btrfs_path *dst_path, int key_type, 3233 struct btrfs_log_ctx *ctx, 3234 u64 min_offset, u64 *last_offset_ret) 3235 { 3236 struct btrfs_key min_key; 3237 struct btrfs_root *log = root->log_root; 3238 struct extent_buffer *src; 3239 int err = 0; 3240 int ret; 3241 int i; 3242 int nritems; 3243 u64 first_offset = min_offset; 3244 u64 last_offset = (u64)-1; 3245 u64 ino = btrfs_ino(inode); 3246 3247 log = root->log_root; 3248 3249 min_key.objectid = ino; 3250 min_key.type = key_type; 3251 min_key.offset = min_offset; 3252 3253 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3254 3255 /* 3256 * we didn't find anything from this transaction, see if there 3257 * is anything at all 3258 */ 3259 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3260 min_key.objectid = ino; 3261 min_key.type = key_type; 3262 min_key.offset = (u64)-1; 3263 btrfs_release_path(path); 3264 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3265 if (ret < 0) { 3266 btrfs_release_path(path); 3267 return ret; 3268 } 3269 ret = btrfs_previous_item(root, path, ino, key_type); 3270 3271 /* if ret == 0 there are items for this type, 3272 * create a range to tell us the last key of this type. 3273 * otherwise, there are no items in this directory after 3274 * *min_offset, and we create a range to indicate that. 
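 *
 * For example (hypothetical offsets): if the directory's last logged
 * key of this type is at offset 10 and min_offset is 20, the range
 * inserted at done is [21, (u64)-1], marking the log authoritative for
 * the whole tail of the key space so that entries deleted from that
 * range before the fsync are removed during replay.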
3275 */ 3276 if (ret == 0) { 3277 struct btrfs_key tmp; 3278 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3279 path->slots[0]); 3280 if (key_type == tmp.type) 3281 first_offset = max(min_offset, tmp.offset) + 1; 3282 } 3283 goto done; 3284 } 3285 3286 /* go backward to find any previous key */ 3287 ret = btrfs_previous_item(root, path, ino, key_type); 3288 if (ret == 0) { 3289 struct btrfs_key tmp; 3290 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3291 if (key_type == tmp.type) { 3292 first_offset = tmp.offset; 3293 ret = overwrite_item(trans, log, dst_path, 3294 path->nodes[0], path->slots[0], 3295 &tmp); 3296 if (ret) { 3297 err = ret; 3298 goto done; 3299 } 3300 } 3301 } 3302 btrfs_release_path(path); 3303 3304 /* find the first key from this transaction again */ 3305 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3306 if (WARN_ON(ret != 0)) 3307 goto done; 3308 3309 /* 3310 * we have a block from this transaction, log every item in it 3311 * from our directory 3312 */ 3313 while (1) { 3314 struct btrfs_key tmp; 3315 src = path->nodes[0]; 3316 nritems = btrfs_header_nritems(src); 3317 for (i = path->slots[0]; i < nritems; i++) { 3318 struct btrfs_dir_item *di; 3319 3320 btrfs_item_key_to_cpu(src, &min_key, i); 3321 3322 if (min_key.objectid != ino || min_key.type != key_type) 3323 goto done; 3324 ret = overwrite_item(trans, log, dst_path, src, i, 3325 &min_key); 3326 if (ret) { 3327 err = ret; 3328 goto done; 3329 } 3330 3331 /* 3332 * We must make sure that when we log a directory entry, 3333 * the corresponding inode, after log replay, has a 3334 * matching link count. For example: 3335 * 3336 * touch foo 3337 * mkdir mydir 3338 * sync 3339 * ln foo mydir/bar 3340 * xfs_io -c "fsync" mydir 3341 * <crash> 3342 * <mount fs and log replay> 3343 * 3344 * Would result in an fsync log that, when replayed, would 3345 * leave our file inode with a link count of 1, but with 3346 * two directory entries pointing to the same inode. 3347 * After removing one of the names, it would not be 3348 * possible to remove the other name, which always 3349 * resulted in stale file handle errors, and it would not 3350 * be possible to rmdir the parent directory, since 3351 * its i_size could never decrement to the value 3352 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
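 *
 * That is why ctx->log_new_dentries is set below when an entry
 * was created in this transaction or points to a directory: it
 * lets the caller also log the inodes behind those dentries so
 * link counts and directory contents match after replay.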
3353 */ 3354 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3355 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3356 if (ctx && 3357 (btrfs_dir_transid(src, di) == trans->transid || 3358 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3359 tmp.type != BTRFS_ROOT_ITEM_KEY) 3360 ctx->log_new_dentries = true; 3361 } 3362 path->slots[0] = nritems; 3363 3364 /* 3365 * look ahead to the next item and see if it is also 3366 * from this directory and from this transaction 3367 */ 3368 ret = btrfs_next_leaf(root, path); 3369 if (ret == 1) { 3370 last_offset = (u64)-1; 3371 goto done; 3372 } 3373 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3374 if (tmp.objectid != ino || tmp.type != key_type) { 3375 last_offset = (u64)-1; 3376 goto done; 3377 } 3378 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3379 ret = overwrite_item(trans, log, dst_path, 3380 path->nodes[0], path->slots[0], 3381 &tmp); 3382 if (ret) 3383 err = ret; 3384 else 3385 last_offset = tmp.offset; 3386 goto done; 3387 } 3388 } 3389 done: 3390 btrfs_release_path(path); 3391 btrfs_release_path(dst_path); 3392 3393 if (err == 0) { 3394 *last_offset_ret = last_offset; 3395 /* 3396 * insert the log range keys to indicate where the log 3397 * is valid 3398 */ 3399 ret = insert_dir_log_key(trans, log, path, key_type, 3400 ino, first_offset, last_offset); 3401 if (ret) 3402 err = ret; 3403 } 3404 return err; 3405 } 3406 3407 /* 3408 * logging directories is very similar to logging inodes. We find all the items 3409 * from the current transaction and write them to the log. 3410 * 3411 * The recovery code scans the directory in the subvolume, and if it finds a 3412 * key in the range logged that is not present in the log tree, then it means 3413 * that dir entry was unlinked during the transaction. 3414 * 3415 * In order for that scan to work, we must include one key smaller than 3416 * the smallest logged by this transaction and one key larger than the largest 3417 * key logged by this transaction. 3418 */ 3419 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3420 struct btrfs_root *root, struct inode *inode, 3421 struct btrfs_path *path, 3422 struct btrfs_path *dst_path, 3423 struct btrfs_log_ctx *ctx) 3424 { 3425 u64 min_key; 3426 u64 max_key; 3427 int ret; 3428 int key_type = BTRFS_DIR_ITEM_KEY; 3429 3430 again: 3431 min_key = 0; 3432 max_key = 0; 3433 while (1) { 3434 ret = log_dir_items(trans, root, inode, path, 3435 dst_path, key_type, ctx, min_key, 3436 &max_key); 3437 if (ret) 3438 return ret; 3439 if (max_key == (u64)-1) 3440 break; 3441 min_key = max_key + 1; 3442 } 3443 3444 if (key_type == BTRFS_DIR_ITEM_KEY) { 3445 key_type = BTRFS_DIR_INDEX_KEY; 3446 goto again; 3447 } 3448 return 0; 3449 } 3450 3451 /* 3452 * a helper function to drop items from the log before we relog an 3453 * inode. max_key_type indicates the highest item type to remove. 3454 * This cannot be run for file data extents because it does not 3455 * free the extents they point to.
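 *
 * The loop below searches from the highest possible key for the
 * objectid and deletes backwards: each pass removes every matching
 * item in the current leaf, and it only needs to re-search when the
 * deletion started at slot 0, i.e. when earlier leaves may still hold
 * items for this objectid.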
3456 */ 3457 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3458 struct btrfs_root *log, 3459 struct btrfs_path *path, 3460 u64 objectid, int max_key_type) 3461 { 3462 int ret; 3463 struct btrfs_key key; 3464 struct btrfs_key found_key; 3465 int start_slot; 3466 3467 key.objectid = objectid; 3468 key.type = max_key_type; 3469 key.offset = (u64)-1; 3470 3471 while (1) { 3472 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3473 BUG_ON(ret == 0); /* Logic error */ 3474 if (ret < 0) 3475 break; 3476 3477 if (path->slots[0] == 0) 3478 break; 3479 3480 path->slots[0]--; 3481 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3482 path->slots[0]); 3483 3484 if (found_key.objectid != objectid) 3485 break; 3486 3487 found_key.offset = 0; 3488 found_key.type = 0; 3489 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3490 &start_slot); 3491 3492 ret = btrfs_del_items(trans, log, path, start_slot, 3493 path->slots[0] - start_slot + 1); 3494 /* 3495 * If start slot isn't 0 then we don't need to re-search, we've 3496 * found the last guy with the objectid in this tree. 3497 */ 3498 if (ret || start_slot != 0) 3499 break; 3500 btrfs_release_path(path); 3501 } 3502 btrfs_release_path(path); 3503 if (ret > 0) 3504 ret = 0; 3505 return ret; 3506 } 3507 3508 static void fill_inode_item(struct btrfs_trans_handle *trans, 3509 struct extent_buffer *leaf, 3510 struct btrfs_inode_item *item, 3511 struct inode *inode, int log_inode_only, 3512 u64 logged_isize) 3513 { 3514 struct btrfs_map_token token; 3515 3516 btrfs_init_map_token(&token); 3517 3518 if (log_inode_only) { 3519 /* set the generation to zero so the recovery code 3520 * can tell the difference between logging 3521 * just to say 'this inode exists' and logging 3522 * to say 'update this inode with these values' 3523 */ 3524 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3525 btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 3526 } else { 3527 btrfs_set_token_inode_generation(leaf, item, 3528 BTRFS_I(inode)->generation, 3529 &token); 3530 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3531 } 3532 3533 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3534 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3535 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3536 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3537 3538 btrfs_set_token_timespec_sec(leaf, &item->atime, 3539 inode->i_atime.tv_sec, &token); 3540 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3541 inode->i_atime.tv_nsec, &token); 3542 3543 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3544 inode->i_mtime.tv_sec, &token); 3545 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3546 inode->i_mtime.tv_nsec, &token); 3547 3548 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3549 inode->i_ctime.tv_sec, &token); 3550 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3551 inode->i_ctime.tv_nsec, &token); 3552 3553 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3554 &token); 3555 3556 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3557 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3558 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3559 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3560 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3561 } 3562 3563 static int log_inode_item(struct btrfs_trans_handle *trans, 3564 struct btrfs_root *log, struct
btrfs_path *path, 3565 struct inode *inode) 3566 { 3567 struct btrfs_inode_item *inode_item; 3568 int ret; 3569 3570 ret = btrfs_insert_empty_item(trans, log, path, 3571 &BTRFS_I(inode)->location, 3572 sizeof(*inode_item)); 3573 if (ret && ret != -EEXIST) 3574 return ret; 3575 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3576 struct btrfs_inode_item); 3577 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); 3578 btrfs_release_path(path); 3579 return 0; 3580 } 3581 3582 static noinline int copy_items(struct btrfs_trans_handle *trans, 3583 struct inode *inode, 3584 struct btrfs_path *dst_path, 3585 struct btrfs_path *src_path, u64 *last_extent, 3586 int start_slot, int nr, int inode_only, 3587 u64 logged_isize) 3588 { 3589 unsigned long src_offset; 3590 unsigned long dst_offset; 3591 struct btrfs_root *log = BTRFS_I(inode)->root->log_root; 3592 struct btrfs_file_extent_item *extent; 3593 struct btrfs_inode_item *inode_item; 3594 struct extent_buffer *src = src_path->nodes[0]; 3595 struct btrfs_key first_key, last_key, key; 3596 int ret; 3597 struct btrfs_key *ins_keys; 3598 u32 *ins_sizes; 3599 char *ins_data; 3600 int i; 3601 struct list_head ordered_sums; 3602 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3603 bool has_extents = false; 3604 bool need_find_last_extent = true; 3605 bool done = false; 3606 3607 INIT_LIST_HEAD(&ordered_sums); 3608 3609 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3610 nr * sizeof(u32), GFP_NOFS); 3611 if (!ins_data) 3612 return -ENOMEM; 3613 3614 first_key.objectid = (u64)-1; 3615 3616 ins_sizes = (u32 *)ins_data; 3617 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3618 3619 for (i = 0; i < nr; i++) { 3620 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3621 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3622 } 3623 ret = btrfs_insert_empty_items(trans, log, dst_path, 3624 ins_keys, ins_sizes, nr); 3625 if (ret) { 3626 kfree(ins_data); 3627 return ret; 3628 } 3629 3630 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3631 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3632 dst_path->slots[0]); 3633 3634 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3635 3636 if ((i == (nr - 1))) 3637 last_key = ins_keys[i]; 3638 3639 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3640 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3641 dst_path->slots[0], 3642 struct btrfs_inode_item); 3643 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3644 inode, inode_only == LOG_INODE_EXISTS, 3645 logged_isize); 3646 } else { 3647 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3648 src_offset, ins_sizes[i]); 3649 } 3650 3651 /* 3652 * We set need_find_last_extent here in case we know we were 3653 * processing other items and then walk into the first extent in 3654 * the inode. If we don't hit an extent then nothing changes, 3655 * we'll do the last search the next time around. 
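 *
 * Put differently: need_find_last_extent stays true only when this
 * batch consists entirely of file extent items, which is exactly the
 * case where *last_extent may describe a leaf that
 * btrfs_search_forward() skipped (see the comment further below).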
3656 */ 3657 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3658 has_extents = true; 3659 if (first_key.objectid == (u64)-1) 3660 first_key = ins_keys[i]; 3661 } else { 3662 need_find_last_extent = false; 3663 } 3664 3665 /* take a reference on file data extents so that truncates 3666 * or deletes of this inode don't have to relog the inode 3667 * again 3668 */ 3669 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3670 !skip_csum) { 3671 int found_type; 3672 extent = btrfs_item_ptr(src, start_slot + i, 3673 struct btrfs_file_extent_item); 3674 3675 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3676 continue; 3677 3678 found_type = btrfs_file_extent_type(src, extent); 3679 if (found_type == BTRFS_FILE_EXTENT_REG) { 3680 u64 ds, dl, cs, cl; 3681 ds = btrfs_file_extent_disk_bytenr(src, 3682 extent); 3683 /* ds == 0 is a hole */ 3684 if (ds == 0) 3685 continue; 3686 3687 dl = btrfs_file_extent_disk_num_bytes(src, 3688 extent); 3689 cs = btrfs_file_extent_offset(src, extent); 3690 cl = btrfs_file_extent_num_bytes(src, 3691 extent); 3692 if (btrfs_file_extent_compression(src, 3693 extent)) { 3694 cs = 0; 3695 cl = dl; 3696 } 3697 3698 ret = btrfs_lookup_csums_range( 3699 log->fs_info->csum_root, 3700 ds + cs, ds + cs + cl - 1, 3701 &ordered_sums, 0); 3702 if (ret) { 3703 btrfs_release_path(dst_path); 3704 kfree(ins_data); 3705 return ret; 3706 } 3707 } 3708 } 3709 } 3710 3711 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3712 btrfs_release_path(dst_path); 3713 kfree(ins_data); 3714 3715 /* 3716 * we have to do this after the loop above to avoid changing the 3717 * log tree while trying to change the log tree. 3718 */ 3719 ret = 0; 3720 while (!list_empty(&ordered_sums)) { 3721 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3722 struct btrfs_ordered_sum, 3723 list); 3724 if (!ret) 3725 ret = btrfs_csum_file_blocks(trans, log, sums); 3726 list_del(&sums->list); 3727 kfree(sums); 3728 } 3729 3730 if (!has_extents) 3731 return ret; 3732 3733 if (need_find_last_extent && *last_extent == first_key.offset) { 3734 /* 3735 * We don't have any leaves between our current one and the one 3736 * we processed before that can have file extent items for our 3737 * inode (and have a generation number smaller than our current 3738 * transaction id). 3739 */ 3740 need_find_last_extent = false; 3741 } 3742 3743 /* 3744 * Because we use btrfs_search_forward we could skip leaves that were 3745 * not modified and then assume *last_extent is valid when it really 3746 * isn't. So back up to the previous leaf and read the end of the last 3747 * extent before we go and fill in holes.
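 *
 * Example of the problem (with a made-up layout): suppose the previous
 * leaf ends with an extent covering [0, 64k) from an older transaction
 * and the current leaf starts with an extent at [128k, 192k). Without
 * reading back, the hole-filling pass could log a hole over all of
 * [0, 128k) instead of only [64k, 128k), wrongly discarding the old
 * extent at replay time.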
3748 */ 3749 if (need_find_last_extent) { 3750 u64 len; 3751 3752 ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); 3753 if (ret < 0) 3754 return ret; 3755 if (ret) 3756 goto fill_holes; 3757 if (src_path->slots[0]) 3758 src_path->slots[0]--; 3759 src = src_path->nodes[0]; 3760 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3761 if (key.objectid != btrfs_ino(inode) || 3762 key.type != BTRFS_EXTENT_DATA_KEY) 3763 goto fill_holes; 3764 extent = btrfs_item_ptr(src, src_path->slots[0], 3765 struct btrfs_file_extent_item); 3766 if (btrfs_file_extent_type(src, extent) == 3767 BTRFS_FILE_EXTENT_INLINE) { 3768 len = btrfs_file_extent_inline_len(src, 3769 src_path->slots[0], 3770 extent); 3771 *last_extent = ALIGN(key.offset + len, 3772 log->sectorsize); 3773 } else { 3774 len = btrfs_file_extent_num_bytes(src, extent); 3775 *last_extent = key.offset + len; 3776 } 3777 } 3778 fill_holes: 3779 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3780 * things could have happened 3781 * 3782 * 1) A merge could have happened, so we could currently be on a leaf 3783 * that holds what we were copying in the first place. 3784 * 2) A split could have happened, and now not all of the items we want 3785 * are on the same leaf. 3786 * 3787 * So we need to adjust how we search for holes, we need to drop the 3788 * path and re-search for the first extent key we found, and then walk 3789 * forward until we hit the last one we copied. 3790 */ 3791 if (need_find_last_extent) { 3792 /* btrfs_prev_leaf could return 1 without releasing the path */ 3793 btrfs_release_path(src_path); 3794 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, 3795 src_path, 0, 0); 3796 if (ret < 0) 3797 return ret; 3798 ASSERT(ret == 0); 3799 src = src_path->nodes[0]; 3800 i = src_path->slots[0]; 3801 } else { 3802 i = start_slot; 3803 } 3804 3805 /* 3806 * Ok so here we need to go through and fill in any holes we may have 3807 * to make sure that holes are punched for those areas in case they had 3808 * extents previously. 3809 */ 3810 while (!done) { 3811 u64 offset, len; 3812 u64 extent_end; 3813 3814 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3815 ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); 3816 if (ret < 0) 3817 return ret; 3818 ASSERT(ret == 0); 3819 src = src_path->nodes[0]; 3820 i = 0; 3821 } 3822 3823 btrfs_item_key_to_cpu(src, &key, i); 3824 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3825 done = true; 3826 if (key.objectid != btrfs_ino(inode) || 3827 key.type != BTRFS_EXTENT_DATA_KEY) { 3828 i++; 3829 continue; 3830 } 3831 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3832 if (btrfs_file_extent_type(src, extent) == 3833 BTRFS_FILE_EXTENT_INLINE) { 3834 len = btrfs_file_extent_inline_len(src, i, extent); 3835 extent_end = ALIGN(key.offset + len, log->sectorsize); 3836 } else { 3837 len = btrfs_file_extent_num_bytes(src, extent); 3838 extent_end = key.offset + len; 3839 } 3840 i++; 3841 3842 if (*last_extent == key.offset) { 3843 *last_extent = extent_end; 3844 continue; 3845 } 3846 offset = *last_extent; 3847 len = key.offset - *last_extent; 3848 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3849 offset, 0, 0, len, 0, len, 0, 3850 0, 0); 3851 if (ret) 3852 break; 3853 *last_extent = extent_end; 3854 } 3855 /* 3856 * Need to let the callers know we dropped the path so they should 3857 * re-search. 
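 *
 * (Returning 1 rather than 0 just below is what signals this; it is
 * not an error.)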
3858 */ 3859 if (!ret && need_find_last_extent) 3860 ret = 1; 3861 return ret; 3862 } 3863 3864 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3865 { 3866 struct extent_map *em1, *em2; 3867 3868 em1 = list_entry(a, struct extent_map, list); 3869 em2 = list_entry(b, struct extent_map, list); 3870 3871 if (em1->start < em2->start) 3872 return -1; 3873 else if (em1->start > em2->start) 3874 return 1; 3875 return 0; 3876 } 3877 3878 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 3879 struct inode *inode, 3880 struct btrfs_root *root, 3881 const struct extent_map *em, 3882 const struct list_head *logged_list, 3883 bool *ordered_io_error) 3884 { 3885 struct btrfs_ordered_extent *ordered; 3886 struct btrfs_root *log = root->log_root; 3887 u64 mod_start = em->mod_start; 3888 u64 mod_len = em->mod_len; 3889 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3890 u64 csum_offset; 3891 u64 csum_len; 3892 LIST_HEAD(ordered_sums); 3893 int ret = 0; 3894 3895 *ordered_io_error = false; 3896 3897 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 3898 em->block_start == EXTENT_MAP_HOLE) 3899 return 0; 3900 3901 /* 3902 * Wait for any ordered extent that covers our extent map. If it 3903 * finishes without an error, first check and see if our csums are on 3904 * our outstanding ordered extents. 3905 */ 3906 list_for_each_entry(ordered, logged_list, log_list) { 3907 struct btrfs_ordered_sum *sum; 3908 3909 if (!mod_len) 3910 break; 3911 3912 if (ordered->file_offset + ordered->len <= mod_start || 3913 mod_start + mod_len <= ordered->file_offset) 3914 continue; 3915 3916 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 3917 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 3918 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 3919 const u64 start = ordered->file_offset; 3920 const u64 end = ordered->file_offset + ordered->len - 1; 3921 3922 WARN_ON(ordered->inode != inode); 3923 filemap_fdatawrite_range(inode->i_mapping, start, end); 3924 } 3925 3926 wait_event(ordered->wait, 3927 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 3928 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3929 3930 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3931 /* 3932 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 3933 * i_mapping flags, so that the next fsync won't get 3934 * an outdated io error too. 3935 */ 3936 btrfs_inode_check_errors(inode); 3937 *ordered_io_error = true; 3938 break; 3939 } 3940 /* 3941 * We are going to copy all the csums on this ordered extent, so 3942 * go ahead and adjust mod_start and mod_len in case this 3943 * ordered extent has already been logged. 3944 */ 3945 if (ordered->file_offset > mod_start) { 3946 if (ordered->file_offset + ordered->len >= 3947 mod_start + mod_len) 3948 mod_len = ordered->file_offset - mod_start; 3949 /* 3950 * If we have this case 3951 * 3952 * |--------- logged extent ---------| 3953 * |----- ordered extent ----| 3954 * 3955 * Just don't mess with mod_start and mod_len, we'll 3956 * just end up logging more csums than we need and it 3957 * will be ok.
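 *
 * The opposite overlap, where the ordered extent starts at or
 * before the logged extent, is handled by the else branch below:
 *
 *       |--------- logged extent ---------|
 * |----- ordered extent ----|
 *
 * in which case mod_start is advanced past the end of the
 * ordered extent (or mod_len becomes 0 if the ordered extent
 * covers the rest of the logged range).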
3958 */ 3959 } else { 3960 if (ordered->file_offset + ordered->len < 3961 mod_start + mod_len) { 3962 mod_len = (mod_start + mod_len) - 3963 (ordered->file_offset + ordered->len); 3964 mod_start = ordered->file_offset + 3965 ordered->len; 3966 } else { 3967 mod_len = 0; 3968 } 3969 } 3970 3971 if (skip_csum) 3972 continue; 3973 3974 /* 3975 * To keep us from looping for the above case of an ordered 3976 * extent that falls inside of the logged extent. 3977 */ 3978 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3979 &ordered->flags)) 3980 continue; 3981 3982 list_for_each_entry(sum, &ordered->list, list) { 3983 ret = btrfs_csum_file_blocks(trans, log, sum); 3984 if (ret) 3985 break; 3986 } 3987 } 3988 3989 if (*ordered_io_error || !mod_len || ret || skip_csum) 3990 return ret; 3991 3992 if (em->compress_type) { 3993 csum_offset = 0; 3994 csum_len = max(em->block_len, em->orig_block_len); 3995 } else { 3996 csum_offset = mod_start - em->start; 3997 csum_len = mod_len; 3998 } 3999 4000 /* block start is already adjusted for the file extent offset. */ 4001 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 4002 em->block_start + csum_offset, 4003 em->block_start + csum_offset + 4004 csum_len - 1, &ordered_sums, 0); 4005 if (ret) 4006 return ret; 4007 4008 while (!list_empty(&ordered_sums)) { 4009 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4010 struct btrfs_ordered_sum, 4011 list); 4012 if (!ret) 4013 ret = btrfs_csum_file_blocks(trans, log, sums); 4014 list_del(&sums->list); 4015 kfree(sums); 4016 } 4017 4018 return ret; 4019 } 4020 4021 static int log_one_extent(struct btrfs_trans_handle *trans, 4022 struct inode *inode, struct btrfs_root *root, 4023 const struct extent_map *em, 4024 struct btrfs_path *path, 4025 const struct list_head *logged_list, 4026 struct btrfs_log_ctx *ctx) 4027 { 4028 struct btrfs_root *log = root->log_root; 4029 struct btrfs_file_extent_item *fi; 4030 struct extent_buffer *leaf; 4031 struct btrfs_map_token token; 4032 struct btrfs_key key; 4033 u64 extent_offset = em->start - em->orig_start; 4034 u64 block_len; 4035 int ret; 4036 int extent_inserted = 0; 4037 bool ordered_io_err = false; 4038 4039 ret = wait_ordered_extents(trans, inode, root, em, logged_list, 4040 &ordered_io_err); 4041 if (ret) 4042 return ret; 4043 4044 if (ordered_io_err) { 4045 ctx->io_err = -EIO; 4046 return 0; 4047 } 4048 4049 btrfs_init_map_token(&token); 4050 4051 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 4052 em->start + em->len, NULL, 0, 1, 4053 sizeof(*fi), &extent_inserted); 4054 if (ret) 4055 return ret; 4056 4057 if (!extent_inserted) { 4058 key.objectid = btrfs_ino(inode); 4059 key.type = BTRFS_EXTENT_DATA_KEY; 4060 key.offset = em->start; 4061 4062 ret = btrfs_insert_empty_item(trans, log, path, &key, 4063 sizeof(*fi)); 4064 if (ret) 4065 return ret; 4066 } 4067 leaf = path->nodes[0]; 4068 fi = btrfs_item_ptr(leaf, path->slots[0], 4069 struct btrfs_file_extent_item); 4070 4071 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4072 &token); 4073 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4074 btrfs_set_token_file_extent_type(leaf, fi, 4075 BTRFS_FILE_EXTENT_PREALLOC, 4076 &token); 4077 else 4078 btrfs_set_token_file_extent_type(leaf, fi, 4079 BTRFS_FILE_EXTENT_REG, 4080 &token); 4081 4082 block_len = max(em->block_len, em->orig_block_len); 4083 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4084 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4085 em->block_start, 4086 &token); 4087 
btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4088 &token); 4089 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4090 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4091 em->block_start - 4092 extent_offset, &token); 4093 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4094 &token); 4095 } else { 4096 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4097 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4098 &token); 4099 } 4100 4101 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4102 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4103 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4104 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4105 &token); 4106 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4107 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4108 btrfs_mark_buffer_dirty(leaf); 4109 4110 btrfs_release_path(path); 4111 4112 return ret; 4113 } 4114 4115 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4116 struct btrfs_root *root, 4117 struct inode *inode, 4118 struct btrfs_path *path, 4119 struct list_head *logged_list, 4120 struct btrfs_log_ctx *ctx) 4121 { 4122 struct extent_map *em, *n; 4123 struct list_head extents; 4124 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 4125 u64 test_gen; 4126 int ret = 0; 4127 int num = 0; 4128 4129 INIT_LIST_HEAD(&extents); 4130 4131 write_lock(&tree->lock); 4132 test_gen = root->fs_info->last_trans_committed; 4133 4134 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4135 list_del_init(&em->list); 4136 4137 /* 4138 * Just an arbitrary number, this can be really CPU intensive 4139 * once we start getting a lot of extents, and really once we 4140 * have a bunch of extents we just want to commit since it will 4141 * be faster. 4142 */ 4143 if (++num > 32768) { 4144 list_del_init(&tree->modified_extents); 4145 ret = -EFBIG; 4146 goto process; 4147 } 4148 4149 if (em->generation <= test_gen) 4150 continue; 4151 /* Need a ref to keep it from getting evicted from cache */ 4152 atomic_inc(&em->refs); 4153 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4154 list_add_tail(&em->list, &extents); 4155 num++; 4156 } 4157 4158 list_sort(NULL, &extents, extent_cmp); 4159 4160 process: 4161 while (!list_empty(&extents)) { 4162 em = list_entry(extents.next, struct extent_map, list); 4163 4164 list_del_init(&em->list); 4165 4166 /* 4167 * If we had an error we just need to delete everybody from our 4168 * private list. 
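 *
 * (Editor's note: even on error we still go through clear_em_logging()
 * and free_extent_map() below, undoing the EXTENT_FLAG_LOGGING bit and
 * the extra reference taken when the em was added to our private list.)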
 */
		if (ret) {
			clear_em_logging(tree, em);
			free_extent_map(em);
			continue;
		}

		write_unlock(&tree->lock);

		ret = log_one_extent(trans, inode, root, em, path, logged_list,
				     ctx);
		write_lock(&tree->lock);
		clear_em_logging(tree, em);
		free_extent_map(em);
	}
	WARN_ON(!list_empty(&extents));
	write_unlock(&tree->lock);

	btrfs_release_path(path);
	return ret;
}

static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
			     struct btrfs_path *path, u64 *size_ret)
{
	struct btrfs_key key;
	int ret;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		*size_ret = 0;
	} else {
		struct btrfs_inode_item *item;

		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				      struct btrfs_inode_item);
		*size_ret = btrfs_inode_size(path->nodes[0], item);
	}

	btrfs_release_path(path);
	return 0;
}

/*
 * At the moment we always log all xattrs. This is to figure out at log replay
 * time which xattrs must have their deletion replayed. If an xattr is missing
 * in the log tree and exists in the fs/subvol tree, we delete it. This is
 * because if an xattr is deleted, the inode is fsynced, and a power failure
 * then happens, causing the log to be replayed the next time the fs is
 * mounted, we want the xattr to not exist anymore (same behaviour as other
 * filesystems with a journal, such as ext3/4, xfs, f2fs, etc).
 */
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct inode *inode,
				struct btrfs_path *path,
				struct btrfs_path *dst_path)
{
	int ret;
	struct btrfs_key key;
	const u64 ino = btrfs_ino(inode);
	int ins_nr = 0;
	int start_slot = 0;

	key.objectid = ino;
	key.type = BTRFS_XATTR_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	while (true) {
		int slot = path->slots[0];
		struct extent_buffer *leaf = path->nodes[0];
		int nritems = btrfs_header_nritems(leaf);

		if (slot >= nritems) {
			if (ins_nr > 0) {
				u64 last_extent = 0;

				ret = copy_items(trans, inode, dst_path, path,
						 &last_extent, start_slot,
						 ins_nr, 1, 0);
				/* can't be 1, extent items aren't processed */
				ASSERT(ret <= 0);
				if (ret < 0)
					return ret;
				ins_nr = 0;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
			break;

		if (ins_nr == 0)
			start_slot = slot;
		ins_nr++;
		path->slots[0]++;
		cond_resched();
	}
	if (ins_nr > 0) {
		u64 last_extent = 0;

		ret = copy_items(trans, inode, dst_path, path,
				 &last_extent, start_slot,
				 ins_nr, 1, 0);
		/* can't be 1, extent items aren't processed */
		ASSERT(ret <= 0);
		if (ret < 0)
			return ret;
	}

	return 0;
}
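/*
 * Illustrative sequence (editor's addition, in the style of the examples
 * above) for the xattr deletion case that logging all xattrs handles:
 *
 *	setfattr -n user.foo -v bar file
 *	sync
 *	setfattr -x user.foo file
 *	xfs_io -c fsync file
 *	<power failure>
 *	mount fs, triggers log replay
 *
 * Replay finds no user.foo xattr item in the log tree while one exists in
 * the fs/subvol tree, so the deletion is replayed.
 */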
/*
 * If the no holes feature is enabled we need to make sure any hole between
 * the last extent and the i_size of our inode is explicitly marked in the
 * log. This is to make sure that doing something like:
 *
 * 1) create file with 128Kb of data
 * 2) truncate file to 64Kb
 * 3) truncate file to 256Kb
 * 4) fsync file
 * 5) <crash/power failure>
 * 6) mount fs and trigger log replay
 *
 * will give us a file with a size of 256Kb, whose first 64Kb of data match
 * what the file had in its first 64Kb of data at step 1 and whose last 192Kb
 * correspond to a hole. The presence of explicit holes in a log tree is what
 * guarantees that log replay will remove/adjust file extent items in the
 * fs/subvol tree.
 *
 * Here we do not need to care about holes between extents, as that is
 * already done by copy_items(). We also only need to do this in the full
 * sync path, where we look up extents from the fs/subvol tree only. In the
 * fast path case, we look up the list of modified extent maps and if any
 * represents a hole, we insert a corresponding extent representing a hole
 * in the log tree.
 */
static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct inode *inode,
				   struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	u64 hole_start;
	u64 hole_size;
	struct extent_buffer *leaf;
	struct btrfs_root *log = root->log_root;
	const u64 ino = btrfs_ino(inode);
	const u64 i_size = i_size_read(inode);

	if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
		return 0;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	ASSERT(ret != 0);
	if (ret < 0)
		return ret;

	ASSERT(path->slots[0] > 0);
	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
		/* inode does not have any extents */
		hole_start = 0;
		hole_size = i_size;
	} else {
		struct btrfs_file_extent_item *extent;
		u64 len;

		/*
		 * If there's an extent beyond i_size, an explicit hole was
		 * already inserted by copy_items().
		 */
		if (key.offset >= i_size)
			return 0;

		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);

		if (btrfs_file_extent_type(leaf, extent) ==
		    BTRFS_FILE_EXTENT_INLINE) {
			len = btrfs_file_extent_inline_len(leaf,
							   path->slots[0],
							   extent);
			ASSERT(len == i_size);
			return 0;
		}

		len = btrfs_file_extent_num_bytes(leaf, extent);
		/* Last extent goes beyond i_size, no need to log a hole. */
		if (key.offset + len > i_size)
			return 0;
		hole_start = key.offset + len;
		hole_size = i_size - hole_start;
	}
	btrfs_release_path(path);

	/* Last extent ends at i_size. */
	if (hole_size == 0)
		return 0;

	hole_size = ALIGN(hole_size, root->sectorsize);
	ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
				       hole_size, 0, hole_size, 0, 0, 0);
	return ret;
}

/* log a single inode in the tree log.
 * At least one parent directory for this inode must exist in the tree
 * or be logged already.
 *
 * Any items from this inode changed by the current transaction are copied
 * to the log tree.
An extra reference is taken on any extents in this 4405 * file, allowing us to avoid a whole pile of corner cases around logging 4406 * blocks that have been removed from the tree. 4407 * 4408 * See LOG_INODE_ALL and related defines for a description of what inode_only 4409 * does. 4410 * 4411 * This handles both files and directories. 4412 */ 4413 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4414 struct btrfs_root *root, struct inode *inode, 4415 int inode_only, 4416 const loff_t start, 4417 const loff_t end, 4418 struct btrfs_log_ctx *ctx) 4419 { 4420 struct btrfs_path *path; 4421 struct btrfs_path *dst_path; 4422 struct btrfs_key min_key; 4423 struct btrfs_key max_key; 4424 struct btrfs_root *log = root->log_root; 4425 struct extent_buffer *src = NULL; 4426 LIST_HEAD(logged_list); 4427 u64 last_extent = 0; 4428 int err = 0; 4429 int ret; 4430 int nritems; 4431 int ins_start_slot = 0; 4432 int ins_nr; 4433 bool fast_search = false; 4434 u64 ino = btrfs_ino(inode); 4435 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4436 u64 logged_isize = 0; 4437 bool need_log_inode_item = true; 4438 4439 path = btrfs_alloc_path(); 4440 if (!path) 4441 return -ENOMEM; 4442 dst_path = btrfs_alloc_path(); 4443 if (!dst_path) { 4444 btrfs_free_path(path); 4445 return -ENOMEM; 4446 } 4447 4448 min_key.objectid = ino; 4449 min_key.type = BTRFS_INODE_ITEM_KEY; 4450 min_key.offset = 0; 4451 4452 max_key.objectid = ino; 4453 4454 4455 /* today the code can only do partial logging of directories */ 4456 if (S_ISDIR(inode->i_mode) || 4457 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4458 &BTRFS_I(inode)->runtime_flags) && 4459 inode_only == LOG_INODE_EXISTS)) 4460 max_key.type = BTRFS_XATTR_ITEM_KEY; 4461 else 4462 max_key.type = (u8)-1; 4463 max_key.offset = (u64)-1; 4464 4465 /* 4466 * Only run delayed items if we are a dir or a new file. 4467 * Otherwise commit the delayed inode only, which is needed in 4468 * order for the log replay code to mark inodes for link count 4469 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4470 */ 4471 if (S_ISDIR(inode->i_mode) || 4472 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) 4473 ret = btrfs_commit_inode_delayed_items(trans, inode); 4474 else 4475 ret = btrfs_commit_inode_delayed_inode(inode); 4476 4477 if (ret) { 4478 btrfs_free_path(path); 4479 btrfs_free_path(dst_path); 4480 return ret; 4481 } 4482 4483 mutex_lock(&BTRFS_I(inode)->log_mutex); 4484 4485 btrfs_get_logged_extents(inode, &logged_list, start, end); 4486 4487 /* 4488 * a brute force approach to making sure we get the most uptodate 4489 * copies of everything. 4490 */ 4491 if (S_ISDIR(inode->i_mode)) { 4492 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4493 4494 if (inode_only == LOG_INODE_EXISTS) 4495 max_key_type = BTRFS_XATTR_ITEM_KEY; 4496 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4497 } else { 4498 if (inode_only == LOG_INODE_EXISTS) { 4499 /* 4500 * Make sure the new inode item we write to the log has 4501 * the same isize as the current one (if it exists). 4502 * This is necessary to prevent data loss after log 4503 * replay, and also to prevent doing a wrong expanding 4504 * truncate - for e.g. 
create file, write 4K into offset
			 * 0, fsync, write 4K into offset 4096, add hard link,
			 * fsync some other file (to sync log), power fail - if
			 * we use the inode's current i_size, after log replay
			 * we get an 8Kb file, with the last 4Kb extent as a
			 * hole (zeroes), as if an expanding truncate happened,
			 * instead of getting a file of 4Kb only.
			 */
			err = logged_inode_size(log, inode, path,
						&logged_isize);
			if (err)
				goto out_unlock;
		}
		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			     &BTRFS_I(inode)->runtime_flags)) {
			if (inode_only == LOG_INODE_EXISTS) {
				max_key.type = BTRFS_XATTR_ITEM_KEY;
				ret = drop_objectid_items(trans, log, path, ino,
							  max_key.type);
			} else {
				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
					  &BTRFS_I(inode)->runtime_flags);
				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
					  &BTRFS_I(inode)->runtime_flags);
				while (1) {
					ret = btrfs_truncate_inode_items(trans,
							 log, inode, 0, 0);
					if (ret != -EAGAIN)
						break;
				}
			}
		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
					      &BTRFS_I(inode)->runtime_flags) ||
			   inode_only == LOG_INODE_EXISTS) {
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			max_key.type = BTRFS_XATTR_ITEM_KEY;
			ret = drop_objectid_items(trans, log, path, ino,
						  max_key.type);
		} else {
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			goto log_extents;
		}

	}
	if (ret) {
		err = ret;
		goto out_unlock;
	}

	while (1) {
		ins_nr = 0;
		ret = btrfs_search_forward(root, &min_key,
					   path, trans->transid);
		if (ret != 0)
			break;
again:
		/* note, ins_nr might be > 0 here, cleanup outside the loop */
		if (min_key.objectid != ino)
			break;
		if (min_key.type > max_key.type)
			break;

		if (min_key.type == BTRFS_INODE_ITEM_KEY)
			need_log_inode_item = false;

		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
		if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
			if (ins_nr == 0)
				goto next_slot;
			ret = copy_items(trans, inode, dst_path, path,
					 &last_extent, ins_start_slot,
					 ins_nr, inode_only, logged_isize);
			if (ret < 0) {
				err = ret;
				goto out_unlock;
			}
			ins_nr = 0;
			if (ret) {
				btrfs_release_path(path);
				continue;
			}
			goto next_slot;
		}

		src = path->nodes[0];
		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
			ins_nr++;
			goto next_slot;
		} else if (!ins_nr) {
			ins_start_slot = path->slots[0];
			ins_nr = 1;
			goto next_slot;
		}

		ret = copy_items(trans, inode, dst_path, path, &last_extent,
				 ins_start_slot, ins_nr, inode_only,
				 logged_isize);
		if (ret < 0) {
			err = ret;
			goto out_unlock;
		}
		if (ret) {
			ins_nr = 0;
			btrfs_release_path(path);
			continue;
		}
		ins_nr = 1;
		ins_start_slot = path->slots[0];
next_slot:

		nritems = btrfs_header_nritems(path->nodes[0]);
		path->slots[0]++;
		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
					      path->slots[0]);
			goto again;
		}
		if (ins_nr) {
			ret = copy_items(trans, inode, dst_path, path,
					 &last_extent, ins_start_slot,
					 ins_nr, inode_only, logged_isize);
			if (ret < 0) {
				err = ret;
				goto out_unlock;
			}
			ret = 0;
			ins_nr = 0;
		}
		btrfs_release_path(path);

		if
(min_key.offset < (u64)-1) { 4637 min_key.offset++; 4638 } else if (min_key.type < max_key.type) { 4639 min_key.type++; 4640 min_key.offset = 0; 4641 } else { 4642 break; 4643 } 4644 } 4645 if (ins_nr) { 4646 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4647 ins_start_slot, ins_nr, inode_only, 4648 logged_isize); 4649 if (ret < 0) { 4650 err = ret; 4651 goto out_unlock; 4652 } 4653 ret = 0; 4654 ins_nr = 0; 4655 } 4656 4657 btrfs_release_path(path); 4658 btrfs_release_path(dst_path); 4659 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 4660 if (err) 4661 goto out_unlock; 4662 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 4663 btrfs_release_path(path); 4664 btrfs_release_path(dst_path); 4665 err = btrfs_log_trailing_hole(trans, root, inode, path); 4666 if (err) 4667 goto out_unlock; 4668 } 4669 log_extents: 4670 btrfs_release_path(path); 4671 btrfs_release_path(dst_path); 4672 if (need_log_inode_item) { 4673 err = log_inode_item(trans, log, dst_path, inode); 4674 if (err) 4675 goto out_unlock; 4676 } 4677 if (fast_search) { 4678 /* 4679 * Some ordered extents started by fsync might have completed 4680 * before we collected the ordered extents in logged_list, which 4681 * means they're gone, not in our logged_list nor in the inode's 4682 * ordered tree. We want the application/user space to know an 4683 * error happened while attempting to persist file data so that 4684 * it can take proper action. If such error happened, we leave 4685 * without writing to the log tree and the fsync must report the 4686 * file data write error and not commit the current transaction. 4687 */ 4688 err = btrfs_inode_check_errors(inode); 4689 if (err) { 4690 ctx->io_err = err; 4691 goto out_unlock; 4692 } 4693 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4694 &logged_list, ctx); 4695 if (ret) { 4696 err = ret; 4697 goto out_unlock; 4698 } 4699 } else if (inode_only == LOG_INODE_ALL) { 4700 struct extent_map *em, *n; 4701 4702 write_lock(&em_tree->lock); 4703 /* 4704 * We can't just remove every em if we're called for a ranged 4705 * fsync - that is, one that doesn't cover the whole possible 4706 * file range (0 to LLONG_MAX). This is because we can have 4707 * em's that fall outside the range we're logging and therefore 4708 * their ordered operations haven't completed yet 4709 * (btrfs_finish_ordered_io() not invoked yet). This means we 4710 * didn't get their respective file extent item in the fs/subvol 4711 * tree yet, and need to let the next fast fsync (one which 4712 * consults the list of modified extent maps) find the em so 4713 * that it logs a matching file extent item and waits for the 4714 * respective ordered operation to complete (if it's still 4715 * running). 4716 * 4717 * Removing every em outside the range we're logging would make 4718 * the next fast fsync not log their matching file extent items, 4719 * therefore making us lose data after a log replay. 
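 *
 * For example (illustrative ranges, editor's addition): on a ranged
 * fsync of [0, 256K), an em for [512K, 768K) whose ordered extent has
 * not finished must stay on modified_extents, so that a later fast
 * fsync still finds and logs it.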
 */
		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
					 list) {
			const u64 mod_end = em->mod_start + em->mod_len - 1;

			if (em->mod_start >= start && mod_end <= end)
				list_del_init(&em->list);
		}
		write_unlock(&em_tree->lock);
	}

	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
		ret = log_directory_changes(trans, root, inode, path, dst_path,
					    ctx);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	}

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->logged_trans = trans->transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
	spin_unlock(&BTRFS_I(inode)->lock);
out_unlock:
	if (unlikely(err))
		btrfs_put_logged_extents(&logged_list);
	else
		btrfs_submit_logged_extents(&logged_list, log);
	mutex_unlock(&BTRFS_I(inode)->log_mutex);

	btrfs_free_path(path);
	btrfs_free_path(dst_path);
	return err;
}

/*
 * follow the dentry parent pointers up the chain and see if any
 * of the directories in it require a full commit before they can
 * be logged. Returns zero if nothing special needs to be done or 1 if
 * a full commit is required.
 */
static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
					       struct inode *inode,
					       struct dentry *parent,
					       struct super_block *sb,
					       u64 last_committed)
{
	int ret = 0;
	struct btrfs_root *root;
	struct dentry *old_parent = NULL;
	struct inode *orig_inode = inode;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
		goto out;

	if (!S_ISDIR(inode->i_mode)) {
		if (!parent || d_really_is_negative(parent) ||
		    sb != d_inode(parent)->i_sb)
			goto out;
		inode = d_inode(parent);
	}

	while (1) {
		/*
		 * If we are logging a directory then we start with our inode,
		 * not our parent's inode, so we need to skip setting the
		 * logged_trans so that further down in the log code we don't
		 * think this inode has already been logged.
		 */
		if (inode != orig_inode)
			BTRFS_I(inode)->logged_trans = trans->transid;
		smp_mb();

		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
			root = BTRFS_I(inode)->root;

			/*
			 * make sure any commits to the log are forced
			 * to be full commits
			 */
			btrfs_set_log_full_commit(root->fs_info, trans);
			ret = 1;
			break;
		}

		if (!parent || d_really_is_negative(parent) ||
		    sb != d_inode(parent)->i_sb)
			break;

		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
		inode = d_inode(parent);
	}
	dput(old_parent);
out:
	return ret;
}

struct btrfs_dir_list {
	u64 ino;
	struct list_head list;
};

/*
 * Log the inodes of the new dentries of a directory. See log_dir_items() for
 * details about why it is needed.
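 *
 * A short illustration (editor's addition):
 *
 *	mkdir dir
 *	touch dir/file
 *	xfs_io -c fsync dir
 *
 * The log of dir now references dir/file's dentry, so dir/file's inode
 * must be logged as well for replay to be able to recreate that dentry.
 *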
4838 * This is a recursive operation - if an existing dentry corresponds to a 4839 * directory, that directory's new entries are logged too (same behaviour as 4840 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 4841 * the dentries point to we do not lock their i_mutex, otherwise lockdep 4842 * complains about the following circular lock dependency / possible deadlock: 4843 * 4844 * CPU0 CPU1 4845 * ---- ---- 4846 * lock(&type->i_mutex_dir_key#3/2); 4847 * lock(sb_internal#2); 4848 * lock(&type->i_mutex_dir_key#3/2); 4849 * lock(&sb->s_type->i_mutex_key#14); 4850 * 4851 * Where sb_internal is the lock (a counter that works as a lock) acquired by 4852 * sb_start_intwrite() in btrfs_start_transaction(). 4853 * Not locking i_mutex of the inodes is still safe because: 4854 * 4855 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 4856 * that while logging the inode new references (names) are added or removed 4857 * from the inode, leaving the logged inode item with a link count that does 4858 * not match the number of logged inode reference items. This is fine because 4859 * at log replay time we compute the real number of links and correct the 4860 * link count in the inode item (see replay_one_buffer() and 4861 * link_to_fixup_dir()); 4862 * 4863 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 4864 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 4865 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 4866 * has a size that doesn't match the sum of the lengths of all the logged 4867 * names. This does not result in a problem because if a dir_item key is 4868 * logged but its matching dir_index key is not logged, at log replay time we 4869 * don't use it to replay the respective name (see replay_one_name()). On the 4870 * other hand if only the dir_index key ends up being logged, the respective 4871 * name is added to the fs/subvol tree with both the dir_item and dir_index 4872 * keys created (see replay_one_name()). 4873 * The directory's inode item with a wrong i_size is not a problem as well, 4874 * since we don't use it at log replay time to set the i_size in the inode 4875 * item of the fs/subvol tree (see overwrite_item()). 
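 *
 * For example (editor's illustration of case 1): if a new hard link is
 * added while the inode is being logged, the logged inode item may have
 * a link count of 2 with only one inode ref item in the log; replay
 * recomputes the link count from the refs it finds and corrects the
 * inode item.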
4876 */ 4877 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 4878 struct btrfs_root *root, 4879 struct inode *start_inode, 4880 struct btrfs_log_ctx *ctx) 4881 { 4882 struct btrfs_root *log = root->log_root; 4883 struct btrfs_path *path; 4884 LIST_HEAD(dir_list); 4885 struct btrfs_dir_list *dir_elem; 4886 int ret = 0; 4887 4888 path = btrfs_alloc_path(); 4889 if (!path) 4890 return -ENOMEM; 4891 4892 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 4893 if (!dir_elem) { 4894 btrfs_free_path(path); 4895 return -ENOMEM; 4896 } 4897 dir_elem->ino = btrfs_ino(start_inode); 4898 list_add_tail(&dir_elem->list, &dir_list); 4899 4900 while (!list_empty(&dir_list)) { 4901 struct extent_buffer *leaf; 4902 struct btrfs_key min_key; 4903 int nritems; 4904 int i; 4905 4906 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 4907 list); 4908 if (ret) 4909 goto next_dir_inode; 4910 4911 min_key.objectid = dir_elem->ino; 4912 min_key.type = BTRFS_DIR_ITEM_KEY; 4913 min_key.offset = 0; 4914 again: 4915 btrfs_release_path(path); 4916 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 4917 if (ret < 0) { 4918 goto next_dir_inode; 4919 } else if (ret > 0) { 4920 ret = 0; 4921 goto next_dir_inode; 4922 } 4923 4924 process_leaf: 4925 leaf = path->nodes[0]; 4926 nritems = btrfs_header_nritems(leaf); 4927 for (i = path->slots[0]; i < nritems; i++) { 4928 struct btrfs_dir_item *di; 4929 struct btrfs_key di_key; 4930 struct inode *di_inode; 4931 struct btrfs_dir_list *new_dir_elem; 4932 int log_mode = LOG_INODE_EXISTS; 4933 int type; 4934 4935 btrfs_item_key_to_cpu(leaf, &min_key, i); 4936 if (min_key.objectid != dir_elem->ino || 4937 min_key.type != BTRFS_DIR_ITEM_KEY) 4938 goto next_dir_inode; 4939 4940 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 4941 type = btrfs_dir_type(leaf, di); 4942 if (btrfs_dir_transid(leaf, di) < trans->transid && 4943 type != BTRFS_FT_DIR) 4944 continue; 4945 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 4946 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 4947 continue; 4948 4949 di_inode = btrfs_iget(root->fs_info->sb, &di_key, 4950 root, NULL); 4951 if (IS_ERR(di_inode)) { 4952 ret = PTR_ERR(di_inode); 4953 goto next_dir_inode; 4954 } 4955 4956 if (btrfs_inode_in_log(di_inode, trans->transid)) { 4957 iput(di_inode); 4958 continue; 4959 } 4960 4961 ctx->log_new_dentries = false; 4962 if (type == BTRFS_FT_DIR) 4963 log_mode = LOG_INODE_ALL; 4964 btrfs_release_path(path); 4965 ret = btrfs_log_inode(trans, root, di_inode, 4966 log_mode, 0, LLONG_MAX, ctx); 4967 iput(di_inode); 4968 if (ret) 4969 goto next_dir_inode; 4970 if (ctx->log_new_dentries) { 4971 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 4972 GFP_NOFS); 4973 if (!new_dir_elem) { 4974 ret = -ENOMEM; 4975 goto next_dir_inode; 4976 } 4977 new_dir_elem->ino = di_key.objectid; 4978 list_add_tail(&new_dir_elem->list, &dir_list); 4979 } 4980 break; 4981 } 4982 if (i == nritems) { 4983 ret = btrfs_next_leaf(log, path); 4984 if (ret < 0) { 4985 goto next_dir_inode; 4986 } else if (ret > 0) { 4987 ret = 0; 4988 goto next_dir_inode; 4989 } 4990 goto process_leaf; 4991 } 4992 if (min_key.offset < (u64)-1) { 4993 min_key.offset++; 4994 goto again; 4995 } 4996 next_dir_inode: 4997 list_del(&dir_elem->list); 4998 kfree(dir_elem); 4999 } 5000 5001 btrfs_free_path(path); 5002 return ret; 5003 } 5004 5005 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5006 struct inode *inode, 5007 struct btrfs_log_ctx *ctx) 5008 { 5009 int ret; 5010 struct btrfs_path *path; 5011 struct btrfs_key 
key;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	const u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->skip_locking = 1;
	path->search_commit_root = 1;

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		u32 cur_offset = 0;
		u32 item_size;
		unsigned long ptr;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		item_size = btrfs_item_size_nr(leaf, slot);
		ptr = btrfs_item_ptr_offset(leaf, slot);
		while (cur_offset < item_size) {
			struct btrfs_key inode_key;
			struct inode *dir_inode;

			inode_key.type = BTRFS_INODE_ITEM_KEY;
			inode_key.offset = 0;

			if (key.type == BTRFS_INODE_EXTREF_KEY) {
				struct btrfs_inode_extref *extref;

				extref = (struct btrfs_inode_extref *)
					(ptr + cur_offset);
				inode_key.objectid = btrfs_inode_extref_parent(
					leaf, extref);
				cur_offset += sizeof(*extref);
				cur_offset += btrfs_inode_extref_name_len(leaf,
									  extref);
			} else {
				inode_key.objectid = key.offset;
				cur_offset = item_size;
			}

			dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
					       root, NULL);
			/* If parent inode was deleted, skip it. */
			if (IS_ERR(dir_inode))
				continue;

			ret = btrfs_log_inode(trans, root, dir_inode,
					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
			iput(dir_inode);
			if (ret)
				goto out;
		}
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
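/*
 * Editor's note on the loop above: for a BTRFS_INODE_REF_KEY item the key
 * offset is the objectid (inode number) of the parent directory, which is
 * why inode_key.objectid is taken from key.offset; extended refs instead
 * store the parent objectid in the item body, read via
 * btrfs_inode_extref_parent().
 */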
/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log. Only a minimal amount (the
 * inode item and backrefs) is logged for any parent directories that are
 * older than the last committed transaction.
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root, struct inode *inode,
				  struct dentry *parent,
				  const loff_t start,
				  const loff_t end,
				  int exists_only,
				  struct btrfs_log_ctx *ctx)
{
	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
	struct super_block *sb;
	struct dentry *old_parent = NULL;
	int ret = 0;
	u64 last_committed = root->fs_info->last_trans_committed;
	bool log_dentries = false;
	struct inode *orig_inode = inode;

	sb = inode->i_sb;

	if (btrfs_test_opt(root, NOTREELOG)) {
		ret = 1;
		goto end_no_trans;
	}

	/*
	 * If the previous transaction's commit didn't complete, we have to
	 * do a full commit ourselves.
	 */
	if (root->fs_info->last_trans_log_full_commit >
	    root->fs_info->last_trans_committed) {
		ret = 1;
		goto end_no_trans;
	}

	if (root != BTRFS_I(inode)->root ||
	    btrfs_root_refs(&root->root_item) == 0) {
		ret = 1;
		goto end_no_trans;
	}

	ret = check_parent_dirs_for_sync(trans, inode, parent,
					 sb, last_committed);
	if (ret)
		goto end_no_trans;

	if (btrfs_inode_in_log(inode, trans->transid)) {
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
	if (ret)
		goto end_trans;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
		ret = 0;
		goto end_trans;
	}

	if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
		log_dentries = true;

	/*
	 * On unlink we must make sure all our current and old parent
	 * directories' inodes are fully logged. This is to prevent leaving
	 * dangling directory index entries in directories that were our
	 * parents but are not anymore. Not doing this results in the old
	 * parent directory being impossible to delete after log replay
	 * (rmdir will always fail with error -ENOTEMPTY).
	 *
	 * Example 1:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * ln testdir/foo testdir/bar
	 * sync
	 * unlink testdir/bar
	 * xfs_io -c fsync testdir/foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * If we don't log the parent directory (testdir), after log replay the
	 * directory still has an entry pointing to the file inode using the
	 * bar name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist
	 * and the file inode has a link count of 1.
	 *
	 * Example 2:
	 *
	 * mkdir testdir
	 * touch foo
	 * ln foo testdir/foo2
	 * ln foo testdir/foo3
	 * sync
	 * unlink testdir/foo3
	 * xfs_io -c fsync foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * Similar to the first example, after log replay the parent directory
	 * testdir still has an entry pointing to the file inode using the foo3
	 * name, but the file inode does not have a matching
	 * BTRFS_INODE_REF_KEY item and has a link count of 2.
	 */
	if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
		ret = btrfs_log_all_parents(trans, orig_inode, ctx);
		if (ret)
			goto end_trans;
	}

	while (1) {
		if (!parent || d_really_is_negative(parent) ||
		    sb != d_inode(parent)->i_sb)
			break;

		inode = d_inode(parent);
		if (root != BTRFS_I(inode)->root)
			break;

		if (BTRFS_I(inode)->generation > last_committed) {
			ret = btrfs_log_inode(trans, root, inode,
					      LOG_INODE_EXISTS,
					      0, LLONG_MAX, ctx);
			if (ret)
				goto end_trans;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	if (log_dentries)
		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
	else
		ret = 0;
end_trans:
	dput(old_parent);
	if (ret < 0) {
		btrfs_set_log_full_commit(root->fs_info, trans);
		ret = 1;
	}

	if (ret)
		btrfs_remove_log_ctx(root, ctx);
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}

/*
 * it is not safe to log dentry if the chunk root has added new
 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
			  const loff_t start,
			  const loff_t end,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent,
				     start, end, 0, ctx);
	dput(parent);

	return ret;
}

/*
 * should be called during mount to recover and replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key tmp_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = 0,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	fs_info->log_root_recovering = 1;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_error(fs_info, ret, "Failed to pin buffers while "
			    "recovering log root tree.");
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_error(fs_info, ret,
				    "Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_fs_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_error(fs_info, ret,
				    "Couldn't read tree log root.");
			goto error;
		}

		tmp_key.objectid = found_key.offset;
		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
		tmp_key.offset = (u64)-1;

		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);
			free_extent_buffer(log->node);
			free_extent_buffer(log->commit_root);
			kfree(log);
			btrfs_error(fs_info, ret, "Couldn't read target root "
				    "for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		key.offset = found_key.offset - 1;
		wc.replay_dest->log_root = NULL;
		free_extent_buffer(log->node);
		free_extent_buffer(log->commit_root);
		kfree(log);

		if (ret)
			goto error;

		if (found_key.offset == 0)
			break;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans, fs_info->tree_root);
	if (ret)
		return ret;

	free_extent_buffer(log_root_tree->node);
	log_root_tree->log_root = NULL;
	fs_info->log_root_recovering = 0;
	kfree(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans, fs_info->tree_root);
	btrfs_free_path(path);
	return ret;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct inode *dir, struct inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
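	 *
	 * For example (editor's illustration):
	 *
	 *	touch foo
	 *	sync
	 *	mv foo bar
	 *	xfs_io -c fsync bar
	 *
	 * The rename recorded this transid in last_unlink_trans, so the
	 * fsync of bar knows it cannot skip logging the parent directories.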
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this directory was already logged any new
	 * names for this file/dir will get recorded
	 */
	smp_mb();
	if (BTRFS_I(dir)->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (BTRFS_I(inode)->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	BTRFS_I(dir)->last_unlink_trans = trans->transid;
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 *
 * It will return zero if all goes well, and it will return 1 if a
 * full transaction commit is required.
 */
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		       struct inode *inode, struct inode *old_dir,
		       struct dentry *parent)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming
	 * it from hasn't been logged, we don't need to log it
	 */
	if (BTRFS_I(inode)->logged_trans <=
	    root->fs_info->last_trans_committed &&
	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
		    root->fs_info->last_trans_committed))
		return 0;

	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
				      LLONG_MAX, 1, NULL);
}
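/*
 * Editor's sketch (hypothetical caller, not from this file) of the return
 * convention documented above:
 *
 *	ret = btrfs_log_new_name(trans, inode, old_dir, parent);
 *	if (ret == 1)
 *		a full transaction commit is needed before the new
 *		name is safe against a power failure
 */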