/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 * LOG_OTHER_INODE means we are logging an inode that is not the one
 * an fsync was done on, e.g. a conflicting inode found while logging
 * references
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  With the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */
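
/*
 * Rough lifecycle, for orientation (a sketch, not an exhaustive call
 * graph):
 *
 * fsync path:
 *	start_log_trans()	joins or creates the per-subvolume log tree
 *	btrfs_log_inode()	copies the changed items into the log tree
 *	btrfs_end_log_trans()	drops our writer count
 *	btrfs_sync_log()	writes the log tree out to disk
 *
 * replay path (mount after a crash):
 *	btrfs_recover_log_trees() walks every committed log tree, first
 *	pinning its blocks (LOG_WALK_PIN_ONLY) and then replaying items
 *	stage by stage (the LOG_WALK_REPLAY_* values) via the replay_*
 *	helpers below.
 */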

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction in
 * progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/*
		 * Implicit memory barrier after atomic_dec_and_test
		 */
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}
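
/*
 * Note: btrfs_pin_log_trans()/btrfs_end_log_trans() are used in pairs by
 * callers such as rename, which must hold the log writer count across an
 * operation so a concurrent log sync cannot run in the middle of it,
 * e.g. (sketch):
 *
 *	btrfs_pin_log_trans(root);
 *	... unlink the old name, add the new one, maybe log the inode ...
 *	btrfs_end_log_trans(root);
 */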

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(fs_info, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
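
/*
 * For orientation, a pin-only walk during log recovery is set up roughly
 * like this (a sketch of what btrfs_recover_log_trees() does):
 *
 *	struct walk_control wc = {
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *	};
 *
 *	wc.trans = trans;
 *	wc.pin = 1;
 *	ret = walk_log_tree(trans, log_root_tree, &wc);
 *
 * and the stage is then advanced through the LOG_WALK_REPLAY_* values
 * for the later passes.
 */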

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(fs_info, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(fs_info, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before.  In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0) {
				struct btrfs_map_token token;

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}
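
/*
 * Illustrative example of the resize handling above: if the destination
 * tree already holds the key with a 160 byte item but the log's copy is
 * 144 bytes, btrfs_insert_empty_item() returns -EEXIST and the item is
 * truncated to 144 bytes before the raw copy; in the opposite case it is
 * extended first.  Either way the byte-for-byte copy_extent_buffer()
 * ends up writing over an item of exactly the right size.
 */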

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before btrfs_drop_extents runs,
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the dirty extent: here we did a shallow
		 * copy of the file extent item and skipped the normal
		 * backref update, modifying the extent tree all by
		 * ourselves.  So we need to manually record the dirty
		 * extent for qgroups, as the owner of the file extent
		 * changed from the log tree (doesn't affect qgroups) to
		 * the fs/file tree (affects qgroups).
		 */
		ret = btrfs_qgroup_trace_extent(trans, fs_info,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						fs_info,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range.  We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls).  In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other.  For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.  Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher, will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans, fs_info,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
			name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}
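
/*
 * For reference: every directory entry is indexed twice in a subvolume.
 * There is a (dirid, BTRFS_DIR_ITEM_KEY, hash-of-name) item for lookups
 * by name, and a (dirid, BTRFS_DIR_INDEX_KEY, index) item, where index
 * is an always increasing sequence number, for ordered readdir.  This is
 * why inode_in_dir() above checks both forms, and why the replay code
 * below treats DIR_ITEM and DIR_INDEX keys separately.
 */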

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path->nodes[0],
						   path->slots[0],
						   ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
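
/*
 * For reference, the on-disk layout that the two helpers above parse.
 * An INODE_REF item is an array of:
 *
 *	struct btrfs_inode_ref { index; name_len; } followed by name_len
 *	bytes of name
 *
 * with the parent directory's objectid in the key offset.  An
 * INODE_EXTREF item is an array of:
 *
 *	struct btrfs_inode_extref { parent_objectid; index; name_len; }
 *	followed by name_len bytes of name
 *
 * keyed by a hash of (parent objectid, name), which is why the extref
 * walkers also have to compare the parent objectid explicitly.
 */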

/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists).  For any name that is not in the inode reference item from the
 * log tree, do a proper unlink of that name (that is, remove its entry from
 * the inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
							     parent_id, name,
							     namelen, NULL);
		else
			ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
							 namelen, NULL);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
					btrfs_ino(BTRFS_I(inode)), ref_index,
					name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata.  if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, BTRFS_I(dir),
					     BTRFS_I(inode),
					     name, namelen, 0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item but not in the log tree's, otherwise we end up with an
	 * inconsistent subvolume tree where dir index entries exist for a
	 * name but there is no inode reference item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}
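
/*
 * Example of what the two counters above add up: a file with three hard
 * links, "a" and "b" in one parent directory plus "c" in another,
 * typically has one INODE_REF item holding the (name, index) pairs for
 * "a" and "b" and a second INODE_REF item (different key offset, the
 * other parent's objectid) for "c", for a total nlink of 3.  Extended
 * refs only come into play once a ref item can no longer grow inside
 * its leaf.
 */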

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
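
/*
 * How the fixup records flow, in short: whenever replay touches a name
 * (link_to_fixup_dir() below), it inserts an orphan item keyed
 * (BTRFS_TREE_LOG_FIXUP_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, inode number)
 * and bumps the in-memory link count so the inode stays alive.  Once the
 * whole log has been replayed, fixup_inode_link_counts() above walks
 * those records, recounts the real back references and corrects i_nlink,
 * turning inodes that ended with zero links into proper orphans.
 */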

/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories can not
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fixup the link count of
		 * the inode the entry points to.  Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for this
		 * dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir.  This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
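
/*
 * Concretely, a range item is keyed (dirid, BTRFS_DIR_LOG_ITEM_KEY or
 * BTRFS_DIR_LOG_INDEX_KEY, start) and carries btrfs_dir_log_end = end,
 * meaning the log is authoritative for directory keys whose offsets fall
 * in [start, end] for that dirid.  find_dir_range() below maps a starting
 * offset to the range item covering it (or the next one after it).
 */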
2013 */ 2014 static noinline int find_dir_range(struct btrfs_root *root, 2015 struct btrfs_path *path, 2016 u64 dirid, int key_type, 2017 u64 *start_ret, u64 *end_ret) 2018 { 2019 struct btrfs_key key; 2020 u64 found_end; 2021 struct btrfs_dir_log_item *item; 2022 int ret; 2023 int nritems; 2024 2025 if (*start_ret == (u64)-1) 2026 return 1; 2027 2028 key.objectid = dirid; 2029 key.type = key_type; 2030 key.offset = *start_ret; 2031 2032 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2033 if (ret < 0) 2034 goto out; 2035 if (ret > 0) { 2036 if (path->slots[0] == 0) 2037 goto out; 2038 path->slots[0]--; 2039 } 2040 if (ret != 0) 2041 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2042 2043 if (key.type != key_type || key.objectid != dirid) { 2044 ret = 1; 2045 goto next; 2046 } 2047 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2048 struct btrfs_dir_log_item); 2049 found_end = btrfs_dir_log_end(path->nodes[0], item); 2050 2051 if (*start_ret >= key.offset && *start_ret <= found_end) { 2052 ret = 0; 2053 *start_ret = key.offset; 2054 *end_ret = found_end; 2055 goto out; 2056 } 2057 ret = 1; 2058 next: 2059 /* check the next slot in the tree to see if it is a valid item */ 2060 nritems = btrfs_header_nritems(path->nodes[0]); 2061 path->slots[0]++; 2062 if (path->slots[0] >= nritems) { 2063 ret = btrfs_next_leaf(root, path); 2064 if (ret) 2065 goto out; 2066 } 2067 2068 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2069 2070 if (key.type != key_type || key.objectid != dirid) { 2071 ret = 1; 2072 goto out; 2073 } 2074 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2075 struct btrfs_dir_log_item); 2076 found_end = btrfs_dir_log_end(path->nodes[0], item); 2077 *start_ret = key.offset; 2078 *end_ret = found_end; 2079 ret = 0; 2080 out: 2081 btrfs_release_path(path); 2082 return ret; 2083 } 2084 2085 /* 2086 * this looks for a given directory item in the log. 
If the directory 2087 * item is not in the log, the item is removed and the inode it points 2088 * to is unlinked 2089 */ 2090 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2091 struct btrfs_root *root, 2092 struct btrfs_root *log, 2093 struct btrfs_path *path, 2094 struct btrfs_path *log_path, 2095 struct inode *dir, 2096 struct btrfs_key *dir_key) 2097 { 2098 int ret; 2099 struct extent_buffer *eb; 2100 int slot; 2101 u32 item_size; 2102 struct btrfs_dir_item *di; 2103 struct btrfs_dir_item *log_di; 2104 int name_len; 2105 unsigned long ptr; 2106 unsigned long ptr_end; 2107 char *name; 2108 struct inode *inode; 2109 struct btrfs_key location; 2110 2111 again: 2112 eb = path->nodes[0]; 2113 slot = path->slots[0]; 2114 item_size = btrfs_item_size_nr(eb, slot); 2115 ptr = btrfs_item_ptr_offset(eb, slot); 2116 ptr_end = ptr + item_size; 2117 while (ptr < ptr_end) { 2118 di = (struct btrfs_dir_item *)ptr; 2119 name_len = btrfs_dir_name_len(eb, di); 2120 name = kmalloc(name_len, GFP_NOFS); 2121 if (!name) { 2122 ret = -ENOMEM; 2123 goto out; 2124 } 2125 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2126 name_len); 2127 log_di = NULL; 2128 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2129 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2130 dir_key->objectid, 2131 name, name_len, 0); 2132 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2133 log_di = btrfs_lookup_dir_index_item(trans, log, 2134 log_path, 2135 dir_key->objectid, 2136 dir_key->offset, 2137 name, name_len, 0); 2138 } 2139 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2140 btrfs_dir_item_key_to_cpu(eb, di, &location); 2141 btrfs_release_path(path); 2142 btrfs_release_path(log_path); 2143 inode = read_one_inode(root, location.objectid); 2144 if (!inode) { 2145 kfree(name); 2146 return -EIO; 2147 } 2148 2149 ret = link_to_fixup_dir(trans, root, 2150 path, location.objectid); 2151 if (ret) { 2152 kfree(name); 2153 iput(inode); 2154 goto out; 2155 } 2156 2157 inc_nlink(inode); 2158 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2159 BTRFS_I(inode), name, name_len); 2160 if (!ret) 2161 ret = btrfs_run_delayed_items(trans); 2162 kfree(name); 2163 iput(inode); 2164 if (ret) 2165 goto out; 2166 2167 /* there might still be more names under this key 2168 * check and repeat if required 2169 */ 2170 ret = btrfs_search_slot(NULL, root, dir_key, path, 2171 0, 0); 2172 if (ret == 0) 2173 goto again; 2174 ret = 0; 2175 goto out; 2176 } else if (IS_ERR(log_di)) { 2177 kfree(name); 2178 return PTR_ERR(log_di); 2179 } 2180 btrfs_release_path(log_path); 2181 kfree(name); 2182 2183 ptr = (unsigned long)(di + 1); 2184 ptr += name_len; 2185 } 2186 ret = 0; 2187 out: 2188 btrfs_release_path(path); 2189 btrfs_release_path(log_path); 2190 return ret; 2191 } 2192 2193 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2194 struct btrfs_root *root, 2195 struct btrfs_root *log, 2196 struct btrfs_path *path, 2197 const u64 ino) 2198 { 2199 struct btrfs_key search_key; 2200 struct btrfs_path *log_path; 2201 int i; 2202 int nritems; 2203 int ret; 2204 2205 log_path = btrfs_alloc_path(); 2206 if (!log_path) 2207 return -ENOMEM; 2208 2209 search_key.objectid = ino; 2210 search_key.type = BTRFS_XATTR_ITEM_KEY; 2211 search_key.offset = 0; 2212 again: 2213 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2214 if (ret < 0) 2215 goto out; 2216 process_leaf: 2217 nritems = btrfs_header_nritems(path->nodes[0]); 2218 for (i = path->slots[0]; i < nritems; i++) { 2219 
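/*
 * Each XATTR_ITEM leaf item can pack several xattrs back to back; the
 * inner loop below walks every (name, data) pair in one item and
 * deletes from the subvolume any xattr name the log no longer has.
 */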
struct btrfs_key key; 2220 struct btrfs_dir_item *di; 2221 struct btrfs_dir_item *log_di; 2222 u32 total_size; 2223 u32 cur; 2224 2225 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2226 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2227 ret = 0; 2228 goto out; 2229 } 2230 2231 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2232 total_size = btrfs_item_size_nr(path->nodes[0], i); 2233 cur = 0; 2234 while (cur < total_size) { 2235 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2236 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2237 u32 this_len = sizeof(*di) + name_len + data_len; 2238 char *name; 2239 2240 name = kmalloc(name_len, GFP_NOFS); 2241 if (!name) { 2242 ret = -ENOMEM; 2243 goto out; 2244 } 2245 read_extent_buffer(path->nodes[0], name, 2246 (unsigned long)(di + 1), name_len); 2247 2248 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2249 name, name_len, 0); 2250 btrfs_release_path(log_path); 2251 if (!log_di) { 2252 /* Doesn't exist in log tree, so delete it. */ 2253 btrfs_release_path(path); 2254 di = btrfs_lookup_xattr(trans, root, path, ino, 2255 name, name_len, -1); 2256 kfree(name); 2257 if (IS_ERR(di)) { 2258 ret = PTR_ERR(di); 2259 goto out; 2260 } 2261 ASSERT(di); 2262 ret = btrfs_delete_one_dir_name(trans, root, 2263 path, di); 2264 if (ret) 2265 goto out; 2266 btrfs_release_path(path); 2267 search_key = key; 2268 goto again; 2269 } 2270 kfree(name); 2271 if (IS_ERR(log_di)) { 2272 ret = PTR_ERR(log_di); 2273 goto out; 2274 } 2275 cur += this_len; 2276 di = (struct btrfs_dir_item *)((char *)di + this_len); 2277 } 2278 } 2279 ret = btrfs_next_leaf(root, path); 2280 if (ret > 0) 2281 ret = 0; 2282 else if (ret == 0) 2283 goto process_leaf; 2284 out: 2285 btrfs_free_path(log_path); 2286 btrfs_release_path(path); 2287 return ret; 2288 } 2289 2290 2291 /* 2292 * deletion replay happens before we copy any new directory items 2293 * out of the log or out of backreferences from inodes. It 2294 * scans the log to find ranges of keys that log is authoritative for, 2295 * and then scans the directory to find items in those ranges that are 2296 * not present in the log. 2297 * 2298 * Anything we don't find in the log is unlinked and removed from the 2299 * directory. 
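 *
 * Note that the scan below runs twice, once per range item type:
 * DIR_LOG_ITEM ranges cover the DIR_ITEM (name hash) key space and
 * DIR_LOG_INDEX ranges cover the DIR_INDEX (sequence number) key
 * space; the key_type switch at the next_type label implements this.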
2300 */ 2301 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2302 struct btrfs_root *root, 2303 struct btrfs_root *log, 2304 struct btrfs_path *path, 2305 u64 dirid, int del_all) 2306 { 2307 u64 range_start; 2308 u64 range_end; 2309 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2310 int ret = 0; 2311 struct btrfs_key dir_key; 2312 struct btrfs_key found_key; 2313 struct btrfs_path *log_path; 2314 struct inode *dir; 2315 2316 dir_key.objectid = dirid; 2317 dir_key.type = BTRFS_DIR_ITEM_KEY; 2318 log_path = btrfs_alloc_path(); 2319 if (!log_path) 2320 return -ENOMEM; 2321 2322 dir = read_one_inode(root, dirid); 2323 /* it isn't an error if the inode isn't there, that can happen 2324 * because we replay the deletes before we copy in the inode item 2325 * from the log 2326 */ 2327 if (!dir) { 2328 btrfs_free_path(log_path); 2329 return 0; 2330 } 2331 again: 2332 range_start = 0; 2333 range_end = 0; 2334 while (1) { 2335 if (del_all) 2336 range_end = (u64)-1; 2337 else { 2338 ret = find_dir_range(log, path, dirid, key_type, 2339 &range_start, &range_end); 2340 if (ret != 0) 2341 break; 2342 } 2343 2344 dir_key.offset = range_start; 2345 while (1) { 2346 int nritems; 2347 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2348 0, 0); 2349 if (ret < 0) 2350 goto out; 2351 2352 nritems = btrfs_header_nritems(path->nodes[0]); 2353 if (path->slots[0] >= nritems) { 2354 ret = btrfs_next_leaf(root, path); 2355 if (ret) 2356 break; 2357 } 2358 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2359 path->slots[0]); 2360 if (found_key.objectid != dirid || 2361 found_key.type != dir_key.type) 2362 goto next_type; 2363 2364 if (found_key.offset > range_end) 2365 break; 2366 2367 ret = check_item_in_log(trans, root, log, path, 2368 log_path, dir, 2369 &found_key); 2370 if (ret) 2371 goto out; 2372 if (found_key.offset == (u64)-1) 2373 break; 2374 dir_key.offset = found_key.offset + 1; 2375 } 2376 btrfs_release_path(path); 2377 if (range_end == (u64)-1) 2378 break; 2379 range_start = range_end + 1; 2380 } 2381 2382 next_type: 2383 ret = 0; 2384 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2385 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2386 dir_key.type = BTRFS_DIR_INDEX_KEY; 2387 btrfs_release_path(path); 2388 goto again; 2389 } 2390 out: 2391 btrfs_release_path(path); 2392 btrfs_free_path(log_path); 2393 iput(dir); 2394 return ret; 2395 } 2396 2397 /* 2398 * the process_func used to replay items from the log tree. This 2399 * gets called in two different stages. The first stage just looks 2400 * for inodes and makes sure they are all copied into the subvolume. 2401 * 2402 * The second stage copies all the other item types from the log into 2403 * the subvolume. The two stage approach is slower, but gets rid of 2404 * lots of complexity around inodes referencing other inodes that exist 2405 * only in the log (references come from either directory items or inode 2406 * back refs). 
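 *
 * For example, an INODE_REF for a hard link created in this
 * transaction can only be replayed once the inode item it refers to
 * exists in the subvolume; replaying every inode item first removes
 * any such ordering dependency between items.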
2407 */ 2408 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2409 struct walk_control *wc, u64 gen, int level) 2410 { 2411 int nritems; 2412 struct btrfs_path *path; 2413 struct btrfs_root *root = wc->replay_dest; 2414 struct btrfs_key key; 2415 int i; 2416 int ret; 2417 2418 ret = btrfs_read_buffer(eb, gen, level, NULL); 2419 if (ret) 2420 return ret; 2421 2422 level = btrfs_header_level(eb); 2423 2424 if (level != 0) 2425 return 0; 2426 2427 path = btrfs_alloc_path(); 2428 if (!path) 2429 return -ENOMEM; 2430 2431 nritems = btrfs_header_nritems(eb); 2432 for (i = 0; i < nritems; i++) { 2433 btrfs_item_key_to_cpu(eb, &key, i); 2434 2435 /* inode keys are done during the first stage */ 2436 if (key.type == BTRFS_INODE_ITEM_KEY && 2437 wc->stage == LOG_WALK_REPLAY_INODES) { 2438 struct btrfs_inode_item *inode_item; 2439 u32 mode; 2440 2441 inode_item = btrfs_item_ptr(eb, i, 2442 struct btrfs_inode_item); 2443 ret = replay_xattr_deletes(wc->trans, root, log, 2444 path, key.objectid); 2445 if (ret) 2446 break; 2447 mode = btrfs_inode_mode(eb, inode_item); 2448 if (S_ISDIR(mode)) { 2449 ret = replay_dir_deletes(wc->trans, 2450 root, log, path, key.objectid, 0); 2451 if (ret) 2452 break; 2453 } 2454 ret = overwrite_item(wc->trans, root, path, 2455 eb, i, &key); 2456 if (ret) 2457 break; 2458 2459 /* for regular files, make sure corresponding 2460 * orphan item exist. extents past the new EOF 2461 * will be truncated later by orphan cleanup. 2462 */ 2463 if (S_ISREG(mode)) { 2464 ret = insert_orphan_item(wc->trans, root, 2465 key.objectid); 2466 if (ret) 2467 break; 2468 } 2469 2470 ret = link_to_fixup_dir(wc->trans, root, 2471 path, key.objectid); 2472 if (ret) 2473 break; 2474 } 2475 2476 if (key.type == BTRFS_DIR_INDEX_KEY && 2477 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2478 ret = replay_one_dir_item(wc->trans, root, path, 2479 eb, i, &key); 2480 if (ret) 2481 break; 2482 } 2483 2484 if (wc->stage < LOG_WALK_REPLAY_ALL) 2485 continue; 2486 2487 /* these keys are simply copied */ 2488 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2489 ret = overwrite_item(wc->trans, root, path, 2490 eb, i, &key); 2491 if (ret) 2492 break; 2493 } else if (key.type == BTRFS_INODE_REF_KEY || 2494 key.type == BTRFS_INODE_EXTREF_KEY) { 2495 ret = add_inode_ref(wc->trans, root, log, path, 2496 eb, i, &key); 2497 if (ret && ret != -ENOENT) 2498 break; 2499 ret = 0; 2500 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2501 ret = replay_one_extent(wc->trans, root, path, 2502 eb, i, &key); 2503 if (ret) 2504 break; 2505 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2506 ret = replay_one_dir_item(wc->trans, root, path, 2507 eb, i, &key); 2508 if (ret) 2509 break; 2510 } 2511 } 2512 btrfs_free_path(path); 2513 return ret; 2514 } 2515 2516 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2517 struct btrfs_root *root, 2518 struct btrfs_path *path, int *level, 2519 struct walk_control *wc) 2520 { 2521 struct btrfs_fs_info *fs_info = root->fs_info; 2522 u64 root_owner; 2523 u64 bytenr; 2524 u64 ptr_gen; 2525 struct extent_buffer *next; 2526 struct extent_buffer *cur; 2527 struct extent_buffer *parent; 2528 u32 blocksize; 2529 int ret = 0; 2530 2531 WARN_ON(*level < 0); 2532 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2533 2534 while (*level > 0) { 2535 struct btrfs_key first_key; 2536 2537 WARN_ON(*level < 0); 2538 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2539 cur = path->nodes[*level]; 2540 2541 WARN_ON(btrfs_header_level(cur) != *level); 2542 2543 if (path->slots[*level] >= 2544 
btrfs_header_nritems(cur)) 2545 break; 2546 2547 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2548 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2549 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2550 blocksize = fs_info->nodesize; 2551 2552 parent = path->nodes[*level]; 2553 root_owner = btrfs_header_owner(parent); 2554 2555 next = btrfs_find_create_tree_block(fs_info, bytenr); 2556 if (IS_ERR(next)) 2557 return PTR_ERR(next); 2558 2559 if (*level == 1) { 2560 ret = wc->process_func(root, next, wc, ptr_gen, 2561 *level - 1); 2562 if (ret) { 2563 free_extent_buffer(next); 2564 return ret; 2565 } 2566 2567 path->slots[*level]++; 2568 if (wc->free) { 2569 ret = btrfs_read_buffer(next, ptr_gen, 2570 *level - 1, &first_key); 2571 if (ret) { 2572 free_extent_buffer(next); 2573 return ret; 2574 } 2575 2576 if (trans) { 2577 btrfs_tree_lock(next); 2578 btrfs_set_lock_blocking(next); 2579 clean_tree_block(fs_info, next); 2580 btrfs_wait_tree_block_writeback(next); 2581 btrfs_tree_unlock(next); 2582 } else { 2583 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2584 clear_extent_buffer_dirty(next); 2585 } 2586 2587 WARN_ON(root_owner != 2588 BTRFS_TREE_LOG_OBJECTID); 2589 ret = btrfs_free_and_pin_reserved_extent( 2590 fs_info, bytenr, 2591 blocksize); 2592 if (ret) { 2593 free_extent_buffer(next); 2594 return ret; 2595 } 2596 } 2597 free_extent_buffer(next); 2598 continue; 2599 } 2600 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2601 if (ret) { 2602 free_extent_buffer(next); 2603 return ret; 2604 } 2605 2606 WARN_ON(*level <= 0); 2607 if (path->nodes[*level-1]) 2608 free_extent_buffer(path->nodes[*level-1]); 2609 path->nodes[*level-1] = next; 2610 *level = btrfs_header_level(next); 2611 path->slots[*level] = 0; 2612 cond_resched(); 2613 } 2614 WARN_ON(*level < 0); 2615 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2616 2617 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2618 2619 cond_resched(); 2620 return 0; 2621 } 2622 2623 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2624 struct btrfs_root *root, 2625 struct btrfs_path *path, int *level, 2626 struct walk_control *wc) 2627 { 2628 struct btrfs_fs_info *fs_info = root->fs_info; 2629 u64 root_owner; 2630 int i; 2631 int slot; 2632 int ret; 2633 2634 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2635 slot = path->slots[i]; 2636 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2637 path->slots[i]++; 2638 *level = i; 2639 WARN_ON(*level == 0); 2640 return 0; 2641 } else { 2642 struct extent_buffer *parent; 2643 if (path->nodes[*level] == root->node) 2644 parent = path->nodes[*level]; 2645 else 2646 parent = path->nodes[*level + 1]; 2647 2648 root_owner = btrfs_header_owner(parent); 2649 ret = wc->process_func(root, path->nodes[*level], wc, 2650 btrfs_header_generation(path->nodes[*level]), 2651 *level); 2652 if (ret) 2653 return ret; 2654 2655 if (wc->free) { 2656 struct extent_buffer *next; 2657 2658 next = path->nodes[*level]; 2659 2660 if (trans) { 2661 btrfs_tree_lock(next); 2662 btrfs_set_lock_blocking(next); 2663 clean_tree_block(fs_info, next); 2664 btrfs_wait_tree_block_writeback(next); 2665 btrfs_tree_unlock(next); 2666 } else { 2667 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2668 clear_extent_buffer_dirty(next); 2669 } 2670 2671 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2672 ret = btrfs_free_and_pin_reserved_extent( 2673 fs_info, 2674 path->nodes[*level]->start, 2675 
path->nodes[*level]->len);
2676 if (ret)
2677 return ret;
2678 }
2679 free_extent_buffer(path->nodes[*level]);
2680 path->nodes[*level] = NULL;
2681 *level = i + 1;
2682 }
2683 }
2684 return 1;
2685 }
2686
2687 /*
2688 * drop the reference count on the tree rooted at 'log'. This traverses
2689 * the tree freeing any blocks that have a ref count of zero after being
2690 * decremented.
2691 */
2692 static int walk_log_tree(struct btrfs_trans_handle *trans,
2693 struct btrfs_root *log, struct walk_control *wc)
2694 {
2695 struct btrfs_fs_info *fs_info = log->fs_info;
2696 int ret = 0;
2697 int wret;
2698 int level;
2699 struct btrfs_path *path;
2700 int orig_level;
2701
2702 path = btrfs_alloc_path();
2703 if (!path)
2704 return -ENOMEM;
2705
2706 level = btrfs_header_level(log->node);
2707 orig_level = level;
2708 path->nodes[level] = log->node;
2709 extent_buffer_get(log->node);
2710 path->slots[level] = 0;
2711
2712 while (1) {
2713 wret = walk_down_log_tree(trans, log, path, &level, wc);
2714 if (wret > 0)
2715 break;
2716 if (wret < 0) {
2717 ret = wret;
2718 goto out;
2719 }
2720
2721 wret = walk_up_log_tree(trans, log, path, &level, wc);
2722 if (wret > 0)
2723 break;
2724 if (wret < 0) {
2725 ret = wret;
2726 goto out;
2727 }
2728 }
2729
2730 /* was the root node processed? if not, catch it here */
2731 if (path->nodes[orig_level]) {
2732 ret = wc->process_func(log, path->nodes[orig_level], wc,
2733 btrfs_header_generation(path->nodes[orig_level]),
2734 orig_level);
2735 if (ret)
2736 goto out;
2737 if (wc->free) {
2738 struct extent_buffer *next;
2739
2740 next = path->nodes[orig_level];
2741
2742 if (trans) {
2743 btrfs_tree_lock(next);
2744 btrfs_set_lock_blocking(next);
2745 clean_tree_block(fs_info, next);
2746 btrfs_wait_tree_block_writeback(next);
2747 btrfs_tree_unlock(next);
2748 } else {
2749 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2750 clear_extent_buffer_dirty(next);
2751 }
2752
2753 WARN_ON(log->root_key.objectid !=
2754 BTRFS_TREE_LOG_OBJECTID);
2755 ret = btrfs_free_and_pin_reserved_extent(fs_info,
2756 next->start, next->len);
2757 if (ret)
2758 goto out;
2759 }
2760 }
2761
2762 out:
2763 btrfs_free_path(path);
2764 return ret;
2765 }
2766
2767 /*
2768 * helper function to update the item for a given subvolume's log root
2769 * in the tree of log roots
2770 */
2771 static int update_log_root(struct btrfs_trans_handle *trans,
2772 struct btrfs_root *log)
2773 {
2774 struct btrfs_fs_info *fs_info = log->fs_info;
2775 int ret;
2776
2777 if (log->log_transid == 1) {
2778 /* insert root item on the first sync */
2779 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2780 &log->root_key, &log->root_item);
2781 } else {
2782 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2783 &log->root_key, &log->root_item);
2784 }
2785 return ret;
2786 }
2787
2788 static void wait_log_commit(struct btrfs_root *root, int transid)
2789 {
2790 DEFINE_WAIT(wait);
2791 int index = transid % 2;
2792
2793 /*
2794 * we only allow two pending log transactions at a time,
2795 * so we know that if ours is more than 2 older than the
2796 * current transaction, we're done
2797 */
2798 for (;;) {
2799 prepare_to_wait(&root->log_commit_wait[index],
2800 &wait, TASK_UNINTERRUPTIBLE);
2801
2802 if (!(root->log_transid_committed < transid &&
2803 atomic_read(&root->log_commit[index])))
2804 break;
2805
2806 mutex_unlock(&root->log_mutex);
2807 schedule();
2808 mutex_lock(&root->log_mutex);
2809 }
2810 finish_wait(&root->log_commit_wait[index], &wait);
2811 }
2812
2813 static
void wait_for_writer(struct btrfs_root *root)
2814 {
2815 DEFINE_WAIT(wait);
2816
2817 for (;;) {
2818 prepare_to_wait(&root->log_writer_wait, &wait,
2819 TASK_UNINTERRUPTIBLE);
2820 if (!atomic_read(&root->log_writers))
2821 break;
2822
2823 mutex_unlock(&root->log_mutex);
2824 schedule();
2825 mutex_lock(&root->log_mutex);
2826 }
2827 finish_wait(&root->log_writer_wait, &wait);
2828 }
2829
2830 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2831 struct btrfs_log_ctx *ctx)
2832 {
2833 if (!ctx)
2834 return;
2835
2836 mutex_lock(&root->log_mutex);
2837 list_del_init(&ctx->list);
2838 mutex_unlock(&root->log_mutex);
2839 }
2840
2841 /*
2842 * Invoked in log mutex context, or when it is certain that no other task
2843 * can access the list.
2844 */
2845 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2846 int index, int error)
2847 {
2848 struct btrfs_log_ctx *ctx;
2849 struct btrfs_log_ctx *safe;
2850
2851 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2852 list_del_init(&ctx->list);
2853 ctx->log_ret = error;
2854 }
2855
2856 INIT_LIST_HEAD(&root->log_ctxs[index]);
2857 }
2858
2859 /*
2860 * btrfs_sync_log sends a given tree log down to the disk and
2861 * updates the super blocks to record it. When this call is done,
2862 * you know that any inodes previously logged are safely on disk only
2863 * if it returns 0.
2864 *
2865 * Any other return value means you need to call btrfs_commit_transaction.
2866 * Some of the edge cases for fsyncing directories that have had unlinks
2867 * or renames done in the past mean that sometimes the only safe
2868 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
2869 * that has happened.
2870 */
2871 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2872 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2873 {
2874 int index1;
2875 int index2;
2876 int mark;
2877 int ret;
2878 struct btrfs_fs_info *fs_info = root->fs_info;
2879 struct btrfs_root *log = root->log_root;
2880 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2881 int log_transid = 0;
2882 struct btrfs_log_ctx root_log_ctx;
2883 struct blk_plug plug;
2884
2885 mutex_lock(&root->log_mutex);
2886 log_transid = ctx->log_transid;
2887 if (root->log_transid_committed >= log_transid) {
2888 mutex_unlock(&root->log_mutex);
2889 return ctx->log_ret;
2890 }
2891
2892 index1 = log_transid % 2;
2893 if (atomic_read(&root->log_commit[index1])) {
2894 wait_log_commit(root, log_transid);
2895 mutex_unlock(&root->log_mutex);
2896 return ctx->log_ret;
2897 }
2898 ASSERT(log_transid == root->log_transid);
2899 atomic_set(&root->log_commit[index1], 1);
2900
2901 /* wait for previous tree log sync to complete */
2902 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2903 wait_log_commit(root, log_transid - 1);
2904
2905 while (1) {
2906 int batch = atomic_read(&root->log_batch);
2907 /* when we're on an ssd, just kick the log commit out */
2908 if (!btrfs_test_opt(fs_info, SSD) &&
2909 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2910 mutex_unlock(&root->log_mutex);
2911 schedule_timeout_uninterruptible(1);
2912 mutex_lock(&root->log_mutex);
2913 }
2914 wait_for_writer(root);
2915 if (batch == atomic_read(&root->log_batch))
2916 break;
2917 }
2918
2919 /* bail out if we need to do a full commit */
2920 if (btrfs_need_log_full_commit(fs_info, trans)) {
2921 ret = -EAGAIN;
2922 btrfs_free_logged_extents(log, log_transid);
2923 mutex_unlock(&root->log_mutex);
2924 goto out;
2925 }
2926
2927 if
(log_transid % 2 == 0) 2928 mark = EXTENT_DIRTY; 2929 else 2930 mark = EXTENT_NEW; 2931 2932 /* we start IO on all the marked extents here, but we don't actually 2933 * wait for them until later. 2934 */ 2935 blk_start_plug(&plug); 2936 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2937 if (ret) { 2938 blk_finish_plug(&plug); 2939 btrfs_abort_transaction(trans, ret); 2940 btrfs_free_logged_extents(log, log_transid); 2941 btrfs_set_log_full_commit(fs_info, trans); 2942 mutex_unlock(&root->log_mutex); 2943 goto out; 2944 } 2945 2946 btrfs_set_root_node(&log->root_item, log->node); 2947 2948 root->log_transid++; 2949 log->log_transid = root->log_transid; 2950 root->log_start_pid = 0; 2951 /* 2952 * IO has been started, blocks of the log tree have WRITTEN flag set 2953 * in their headers. new modifications of the log will be written to 2954 * new positions. so it's safe to allow log writers to go in. 2955 */ 2956 mutex_unlock(&root->log_mutex); 2957 2958 btrfs_init_log_ctx(&root_log_ctx, NULL); 2959 2960 mutex_lock(&log_root_tree->log_mutex); 2961 atomic_inc(&log_root_tree->log_batch); 2962 atomic_inc(&log_root_tree->log_writers); 2963 2964 index2 = log_root_tree->log_transid % 2; 2965 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2966 root_log_ctx.log_transid = log_root_tree->log_transid; 2967 2968 mutex_unlock(&log_root_tree->log_mutex); 2969 2970 ret = update_log_root(trans, log); 2971 2972 mutex_lock(&log_root_tree->log_mutex); 2973 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2974 /* 2975 * Implicit memory barrier after atomic_dec_and_test 2976 */ 2977 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2978 wake_up(&log_root_tree->log_writer_wait); 2979 } 2980 2981 if (ret) { 2982 if (!list_empty(&root_log_ctx.list)) 2983 list_del_init(&root_log_ctx.list); 2984 2985 blk_finish_plug(&plug); 2986 btrfs_set_log_full_commit(fs_info, trans); 2987 2988 if (ret != -ENOSPC) { 2989 btrfs_abort_transaction(trans, ret); 2990 mutex_unlock(&log_root_tree->log_mutex); 2991 goto out; 2992 } 2993 btrfs_wait_tree_log_extents(log, mark); 2994 btrfs_free_logged_extents(log, log_transid); 2995 mutex_unlock(&log_root_tree->log_mutex); 2996 ret = -EAGAIN; 2997 goto out; 2998 } 2999 3000 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3001 blk_finish_plug(&plug); 3002 list_del_init(&root_log_ctx.list); 3003 mutex_unlock(&log_root_tree->log_mutex); 3004 ret = root_log_ctx.log_ret; 3005 goto out; 3006 } 3007 3008 index2 = root_log_ctx.log_transid % 2; 3009 if (atomic_read(&log_root_tree->log_commit[index2])) { 3010 blk_finish_plug(&plug); 3011 ret = btrfs_wait_tree_log_extents(log, mark); 3012 btrfs_wait_logged_extents(trans, log, log_transid); 3013 wait_log_commit(log_root_tree, 3014 root_log_ctx.log_transid); 3015 mutex_unlock(&log_root_tree->log_mutex); 3016 if (!ret) 3017 ret = root_log_ctx.log_ret; 3018 goto out; 3019 } 3020 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3021 atomic_set(&log_root_tree->log_commit[index2], 1); 3022 3023 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3024 wait_log_commit(log_root_tree, 3025 root_log_ctx.log_transid - 1); 3026 } 3027 3028 wait_for_writer(log_root_tree); 3029 3030 /* 3031 * now that we've moved on to the tree of log tree roots, 3032 * check the full commit flag again 3033 */ 3034 if (btrfs_need_log_full_commit(fs_info, trans)) { 3035 blk_finish_plug(&plug); 3036 btrfs_wait_tree_log_extents(log, mark); 3037 btrfs_free_logged_extents(log, 
log_transid);
3038 mutex_unlock(&log_root_tree->log_mutex);
3039 ret = -EAGAIN;
3040 goto out_wake_log_root;
3041 }
3042
3043 ret = btrfs_write_marked_extents(fs_info,
3044 &log_root_tree->dirty_log_pages,
3045 EXTENT_DIRTY | EXTENT_NEW);
3046 blk_finish_plug(&plug);
3047 if (ret) {
3048 btrfs_set_log_full_commit(fs_info, trans);
3049 btrfs_abort_transaction(trans, ret);
3050 btrfs_free_logged_extents(log, log_transid);
3051 mutex_unlock(&log_root_tree->log_mutex);
3052 goto out_wake_log_root;
3053 }
3054 ret = btrfs_wait_tree_log_extents(log, mark);
3055 if (!ret)
3056 ret = btrfs_wait_tree_log_extents(log_root_tree,
3057 EXTENT_NEW | EXTENT_DIRTY);
3058 if (ret) {
3059 btrfs_set_log_full_commit(fs_info, trans);
3060 btrfs_free_logged_extents(log, log_transid);
3061 mutex_unlock(&log_root_tree->log_mutex);
3062 goto out_wake_log_root;
3063 }
3064 btrfs_wait_logged_extents(trans, log, log_transid);
3065
3066 btrfs_set_super_log_root(fs_info->super_for_commit,
3067 log_root_tree->node->start);
3068 btrfs_set_super_log_root_level(fs_info->super_for_commit,
3069 btrfs_header_level(log_root_tree->node));
3070
3071 log_root_tree->log_transid++;
3072 mutex_unlock(&log_root_tree->log_mutex);
3073
3074 /*
3075 * nobody else is going to jump in and write the ctree
3076 * super here because the log_commit atomic below is protecting
3077 * us. We must be called with a transaction handle pinning
3078 * the running transaction open, so a full commit can't hop
3079 * in and cause problems either.
3080 */
3081 ret = write_all_supers(fs_info, 1);
3082 if (ret) {
3083 btrfs_set_log_full_commit(fs_info, trans);
3084 btrfs_abort_transaction(trans, ret);
3085 goto out_wake_log_root;
3086 }
3087
3088 mutex_lock(&root->log_mutex);
3089 if (root->last_log_commit < log_transid)
3090 root->last_log_commit = log_transid;
3091 mutex_unlock(&root->log_mutex);
3092
3093 out_wake_log_root:
3094 mutex_lock(&log_root_tree->log_mutex);
3095 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3096
3097 log_root_tree->log_transid_committed++;
3098 atomic_set(&log_root_tree->log_commit[index2], 0);
3099 mutex_unlock(&log_root_tree->log_mutex);
3100
3101 /*
3102 * The barrier before waitqueue_active is implied by mutex_unlock
3103 */
3104 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
3105 wake_up(&log_root_tree->log_commit_wait[index2]);
3106 out:
3107 mutex_lock(&root->log_mutex);
3108 btrfs_remove_all_log_ctxs(root, index1, ret);
3109 root->log_transid_committed++;
3110 atomic_set(&root->log_commit[index1], 0);
3111 mutex_unlock(&root->log_mutex);
3112
3113 /*
3114 * The barrier before waitqueue_active is implied by mutex_unlock
3115 */
3116 if (waitqueue_active(&root->log_commit_wait[index1]))
3117 wake_up(&root->log_commit_wait[index1]);
3118 return ret;
3119 }
3120
3121 static void free_log_tree(struct btrfs_trans_handle *trans,
3122 struct btrfs_root *log)
3123 {
3124 int ret;
3125 u64 start;
3126 u64 end;
3127 struct walk_control wc = {
3128 .free = 1,
3129 .process_func = process_one_buffer
3130 };
3131
3132 ret = walk_log_tree(trans, log, &wc);
3133 /* I don't think this can happen but just in case */
3134 if (ret)
3135 btrfs_abort_transaction(trans, ret);
3136
3137 while (1) {
3138 ret = find_first_extent_bit(&log->dirty_log_pages,
3139 0, &start, &end,
3140 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
3141 NULL);
3142 if (ret)
3143 break;
3144
3145 clear_extent_bits(&log->dirty_log_pages, start, end,
3146 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3147 }
3148
3149 /*
3150 * We may have
short-circuited the log tree with the full commit logic
3151 * and left ordered extents on our list, so clear these out to keep us
3152 * from leaking inodes and memory.
3153 */
3154 btrfs_free_logged_extents(log, 0);
3155 btrfs_free_logged_extents(log, 1);
3156
3157 free_extent_buffer(log->node);
3158 kfree(log);
3159 }
3160
3161 /*
3162 * free all the extents used by the tree log. This should be called
3163 * at commit time of the full transaction
3164 */
3165 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3166 {
3167 if (root->log_root) {
3168 free_log_tree(trans, root->log_root);
3169 root->log_root = NULL;
3170 }
3171 return 0;
3172 }
3173
3174 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3175 struct btrfs_fs_info *fs_info)
3176 {
3177 if (fs_info->log_root_tree) {
3178 free_log_tree(trans, fs_info->log_root_tree);
3179 fs_info->log_root_tree = NULL;
3180 }
3181 return 0;
3182 }
3183
3184 /*
3185 * If both a file and directory are logged, and unlinks or renames are
3186 * mixed in, we have a few interesting corners:
3187 *
3188 * create file X in dir Y
3189 * link file X to X.link in dir Y
3190 * fsync file X
3191 * unlink file X but leave X.link
3192 * fsync dir Y
3193 *
3194 * After a crash we would expect only X.link to exist. But file X
3195 * didn't get fsync'd again so the log has back refs for X and X.link.
3196 *
3197 * We solve this by removing directory entries and inode backrefs from the
3198 * log when a file that was logged in the current transaction is
3199 * unlinked. Any later fsync will include the updated log entries, and
3200 * we'll be able to reconstruct the proper directory items from backrefs.
3201 *
3202 * This optimization allows us to avoid relogging the entire inode
3203 * or the entire directory.
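 *
 * (Directory i_size in btrfs accounts each name twice, once for the
 * DIR_ITEM copy and once for the DIR_INDEX copy, which is why
 * bytes_del below is bumped by name_len for each of the two items
 * that get deleted.)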
3204 */ 3205 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3206 struct btrfs_root *root, 3207 const char *name, int name_len, 3208 struct btrfs_inode *dir, u64 index) 3209 { 3210 struct btrfs_root *log; 3211 struct btrfs_dir_item *di; 3212 struct btrfs_path *path; 3213 int ret; 3214 int err = 0; 3215 int bytes_del = 0; 3216 u64 dir_ino = btrfs_ino(dir); 3217 3218 if (dir->logged_trans < trans->transid) 3219 return 0; 3220 3221 ret = join_running_log_trans(root); 3222 if (ret) 3223 return 0; 3224 3225 mutex_lock(&dir->log_mutex); 3226 3227 log = root->log_root; 3228 path = btrfs_alloc_path(); 3229 if (!path) { 3230 err = -ENOMEM; 3231 goto out_unlock; 3232 } 3233 3234 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3235 name, name_len, -1); 3236 if (IS_ERR(di)) { 3237 err = PTR_ERR(di); 3238 goto fail; 3239 } 3240 if (di) { 3241 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3242 bytes_del += name_len; 3243 if (ret) { 3244 err = ret; 3245 goto fail; 3246 } 3247 } 3248 btrfs_release_path(path); 3249 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3250 index, name, name_len, -1); 3251 if (IS_ERR(di)) { 3252 err = PTR_ERR(di); 3253 goto fail; 3254 } 3255 if (di) { 3256 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3257 bytes_del += name_len; 3258 if (ret) { 3259 err = ret; 3260 goto fail; 3261 } 3262 } 3263 3264 /* update the directory size in the log to reflect the names 3265 * we have removed 3266 */ 3267 if (bytes_del) { 3268 struct btrfs_key key; 3269 3270 key.objectid = dir_ino; 3271 key.offset = 0; 3272 key.type = BTRFS_INODE_ITEM_KEY; 3273 btrfs_release_path(path); 3274 3275 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 3276 if (ret < 0) { 3277 err = ret; 3278 goto fail; 3279 } 3280 if (ret == 0) { 3281 struct btrfs_inode_item *item; 3282 u64 i_size; 3283 3284 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3285 struct btrfs_inode_item); 3286 i_size = btrfs_inode_size(path->nodes[0], item); 3287 if (i_size > bytes_del) 3288 i_size -= bytes_del; 3289 else 3290 i_size = 0; 3291 btrfs_set_inode_size(path->nodes[0], item, i_size); 3292 btrfs_mark_buffer_dirty(path->nodes[0]); 3293 } else 3294 ret = 0; 3295 btrfs_release_path(path); 3296 } 3297 fail: 3298 btrfs_free_path(path); 3299 out_unlock: 3300 mutex_unlock(&dir->log_mutex); 3301 if (ret == -ENOSPC) { 3302 btrfs_set_log_full_commit(root->fs_info, trans); 3303 ret = 0; 3304 } else if (ret < 0) 3305 btrfs_abort_transaction(trans, ret); 3306 3307 btrfs_end_log_trans(root); 3308 3309 return err; 3310 } 3311 3312 /* see comments for btrfs_del_dir_entries_in_log */ 3313 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3314 struct btrfs_root *root, 3315 const char *name, int name_len, 3316 struct btrfs_inode *inode, u64 dirid) 3317 { 3318 struct btrfs_fs_info *fs_info = root->fs_info; 3319 struct btrfs_root *log; 3320 u64 index; 3321 int ret; 3322 3323 if (inode->logged_trans < trans->transid) 3324 return 0; 3325 3326 ret = join_running_log_trans(root); 3327 if (ret) 3328 return 0; 3329 log = root->log_root; 3330 mutex_lock(&inode->log_mutex); 3331 3332 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3333 dirid, &index); 3334 mutex_unlock(&inode->log_mutex); 3335 if (ret == -ENOSPC) { 3336 btrfs_set_log_full_commit(fs_info, trans); 3337 ret = 0; 3338 } else if (ret < 0 && ret != -ENOENT) 3339 btrfs_abort_transaction(trans, ret); 3340 btrfs_end_log_trans(root); 3341 3342 return ret; 3343 } 3344 3345 /* 3346 * creates a range item in 
the log for 'dirid'. first_offset and 3347 * last_offset tell us which parts of the key space the log should 3348 * be considered authoritative for. 3349 */ 3350 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3351 struct btrfs_root *log, 3352 struct btrfs_path *path, 3353 int key_type, u64 dirid, 3354 u64 first_offset, u64 last_offset) 3355 { 3356 int ret; 3357 struct btrfs_key key; 3358 struct btrfs_dir_log_item *item; 3359 3360 key.objectid = dirid; 3361 key.offset = first_offset; 3362 if (key_type == BTRFS_DIR_ITEM_KEY) 3363 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3364 else 3365 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3366 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3367 if (ret) 3368 return ret; 3369 3370 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3371 struct btrfs_dir_log_item); 3372 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3373 btrfs_mark_buffer_dirty(path->nodes[0]); 3374 btrfs_release_path(path); 3375 return 0; 3376 } 3377 3378 /* 3379 * log all the items included in the current transaction for a given 3380 * directory. This also creates the range items in the log tree required 3381 * to replay anything deleted before the fsync 3382 */ 3383 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3384 struct btrfs_root *root, struct btrfs_inode *inode, 3385 struct btrfs_path *path, 3386 struct btrfs_path *dst_path, int key_type, 3387 struct btrfs_log_ctx *ctx, 3388 u64 min_offset, u64 *last_offset_ret) 3389 { 3390 struct btrfs_key min_key; 3391 struct btrfs_root *log = root->log_root; 3392 struct extent_buffer *src; 3393 int err = 0; 3394 int ret; 3395 int i; 3396 int nritems; 3397 u64 first_offset = min_offset; 3398 u64 last_offset = (u64)-1; 3399 u64 ino = btrfs_ino(inode); 3400 3401 log = root->log_root; 3402 3403 min_key.objectid = ino; 3404 min_key.type = key_type; 3405 min_key.offset = min_offset; 3406 3407 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3408 3409 /* 3410 * we didn't find anything from this transaction, see if there 3411 * is anything at all 3412 */ 3413 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3414 min_key.objectid = ino; 3415 min_key.type = key_type; 3416 min_key.offset = (u64)-1; 3417 btrfs_release_path(path); 3418 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3419 if (ret < 0) { 3420 btrfs_release_path(path); 3421 return ret; 3422 } 3423 ret = btrfs_previous_item(root, path, ino, key_type); 3424 3425 /* if ret == 0 there are items for this type, 3426 * create a range to tell us the last key of this type. 3427 * otherwise, there are no items in this directory after 3428 * *min_offset, and we create a range to indicate that. 
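 *
 * An example with made-up offsets: if the directory's last key of
 * this type sits at offset 10 and min_offset is 15, first_offset
 * becomes max(15, 10) + 1 == 16 and we log the range [16, (u64)-1],
 * telling replay the log is authoritative for the whole empty tail
 * of the key space.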
3429 */ 3430 if (ret == 0) { 3431 struct btrfs_key tmp; 3432 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3433 path->slots[0]); 3434 if (key_type == tmp.type) 3435 first_offset = max(min_offset, tmp.offset) + 1; 3436 } 3437 goto done; 3438 } 3439 3440 /* go backward to find any previous key */ 3441 ret = btrfs_previous_item(root, path, ino, key_type); 3442 if (ret == 0) { 3443 struct btrfs_key tmp; 3444 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3445 if (key_type == tmp.type) { 3446 first_offset = tmp.offset; 3447 ret = overwrite_item(trans, log, dst_path, 3448 path->nodes[0], path->slots[0], 3449 &tmp); 3450 if (ret) { 3451 err = ret; 3452 goto done; 3453 } 3454 } 3455 } 3456 btrfs_release_path(path); 3457 3458 /* find the first key from this transaction again */ 3459 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3460 if (WARN_ON(ret != 0)) 3461 goto done; 3462 3463 /* 3464 * we have a block from this transaction, log every item in it 3465 * from our directory 3466 */ 3467 while (1) { 3468 struct btrfs_key tmp; 3469 src = path->nodes[0]; 3470 nritems = btrfs_header_nritems(src); 3471 for (i = path->slots[0]; i < nritems; i++) { 3472 struct btrfs_dir_item *di; 3473 3474 btrfs_item_key_to_cpu(src, &min_key, i); 3475 3476 if (min_key.objectid != ino || min_key.type != key_type) 3477 goto done; 3478 ret = overwrite_item(trans, log, dst_path, src, i, 3479 &min_key); 3480 if (ret) { 3481 err = ret; 3482 goto done; 3483 } 3484 3485 /* 3486 * We must make sure that when we log a directory entry, 3487 * the corresponding inode, after log replay, has a 3488 * matching link count. For example: 3489 * 3490 * touch foo 3491 * mkdir mydir 3492 * sync 3493 * ln foo mydir/bar 3494 * xfs_io -c "fsync" mydir 3495 * <crash> 3496 * <mount fs and log replay> 3497 * 3498 * Would result in a fsync log that when replayed, our 3499 * file inode would have a link count of 1, but we get 3500 * two directory entries pointing to the same inode. 3501 * After removing one of the names, it would not be 3502 * possible to remove the other name, which resulted 3503 * always in stale file handle errors, and would not 3504 * be possible to rmdir the parent directory, since 3505 * its i_size could never decrement to the value 3506 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 
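 *
 * The ctx->log_new_dentries flag set just below is what deals with
 * this: when a dentry created in this transaction is copied, the
 * caller is expected to also log the inode that dentry points to, so
 * the link count seen after replay matches the directory contents.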
3507 */
3508 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3509 btrfs_dir_item_key_to_cpu(src, di, &tmp);
3510 if (ctx &&
3511 (btrfs_dir_transid(src, di) == trans->transid ||
3512 btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3513 tmp.type != BTRFS_ROOT_ITEM_KEY)
3514 ctx->log_new_dentries = true;
3515 }
3516 path->slots[0] = nritems;
3517
3518 /*
3519 * look ahead to the next item and see if it is also
3520 * from this directory and from this transaction
3521 */
3522 ret = btrfs_next_leaf(root, path);
3523 if (ret == 1) {
3524 last_offset = (u64)-1;
3525 goto done;
3526 }
3527 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3528 if (tmp.objectid != ino || tmp.type != key_type) {
3529 last_offset = (u64)-1;
3530 goto done;
3531 }
3532 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3533 ret = overwrite_item(trans, log, dst_path,
3534 path->nodes[0], path->slots[0],
3535 &tmp);
3536 if (ret)
3537 err = ret;
3538 else
3539 last_offset = tmp.offset;
3540 goto done;
3541 }
3542 }
3543 done:
3544 btrfs_release_path(path);
3545 btrfs_release_path(dst_path);
3546
3547 if (err == 0) {
3548 *last_offset_ret = last_offset;
3549 /*
3550 * insert the log range keys to indicate where the log
3551 * is valid
3552 */
3553 ret = insert_dir_log_key(trans, log, path, key_type,
3554 ino, first_offset, last_offset);
3555 if (ret)
3556 err = ret;
3557 }
3558 return err;
3559 }
3560
3561 /*
3562 * logging directories is very similar to logging inodes. We find all the items
3563 * from the current transaction and write them to the log.
3564 *
3565 * The recovery code scans the directory in the subvolume, and if it finds a
3566 * key in the range logged that is not present in the log tree, then it means
3567 * that dir entry was unlinked during the transaction.
3568 *
3569 * In order for that scan to work, we must include one key smaller than
3570 * the smallest key logged by this transaction and one key larger than the largest
3571 * key logged by this transaction.
3572 */
3573 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3574 struct btrfs_root *root, struct btrfs_inode *inode,
3575 struct btrfs_path *path,
3576 struct btrfs_path *dst_path,
3577 struct btrfs_log_ctx *ctx)
3578 {
3579 u64 min_key;
3580 u64 max_key;
3581 int ret;
3582 int key_type = BTRFS_DIR_ITEM_KEY;
3583
3584 again:
3585 min_key = 0;
3586 max_key = 0;
3587 while (1) {
3588 ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3589 ctx, min_key, &max_key);
3590 if (ret)
3591 return ret;
3592 if (max_key == (u64)-1)
3593 break;
3594 min_key = max_key + 1;
3595 }
3596
3597 if (key_type == BTRFS_DIR_ITEM_KEY) {
3598 key_type = BTRFS_DIR_INDEX_KEY;
3599 goto again;
3600 }
3601 return 0;
3602 }
3603
3604 /*
3605 * a helper function to drop items from the log before we relog an
3606 * inode. max_key_type indicates the highest item type to remove.
3607 * This cannot be run for file data extents because it does not
3608 * free the extents they point to.
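 *
 * e.g. a call with max_key_type == BTRFS_XATTR_ITEM_KEY removes the
 * inode item, inode refs and xattrs of 'objectid' from the log while
 * leaving its EXTENT_DATA items untouched.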
3609 */
3610 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3611 struct btrfs_root *log,
3612 struct btrfs_path *path,
3613 u64 objectid, int max_key_type)
3614 {
3615 int ret;
3616 struct btrfs_key key;
3617 struct btrfs_key found_key;
3618 int start_slot;
3619
3620 key.objectid = objectid;
3621 key.type = max_key_type;
3622 key.offset = (u64)-1;
3623
3624 while (1) {
3625 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3626 BUG_ON(ret == 0); /* Logic error */
3627 if (ret < 0)
3628 break;
3629
3630 if (path->slots[0] == 0)
3631 break;
3632
3633 path->slots[0]--;
3634 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3635 path->slots[0]);
3636
3637 if (found_key.objectid != objectid)
3638 break;
3639
3640 found_key.offset = 0;
3641 found_key.type = 0;
3642 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3643 &start_slot);
3644
3645 ret = btrfs_del_items(trans, log, path, start_slot,
3646 path->slots[0] - start_slot + 1);
3647 /*
3648 * If start slot isn't 0 then we don't need to re-search, we've
3649 * found the last guy with the objectid in this tree.
3650 */
3651 if (ret || start_slot != 0)
3652 break;
3653 btrfs_release_path(path);
3654 }
3655 btrfs_release_path(path);
3656 if (ret > 0)
3657 ret = 0;
3658 return ret;
3659 }
3660
3661 static void fill_inode_item(struct btrfs_trans_handle *trans,
3662 struct extent_buffer *leaf,
3663 struct btrfs_inode_item *item,
3664 struct inode *inode, int log_inode_only,
3665 u64 logged_isize)
3666 {
3667 struct btrfs_map_token token;
3668
3669 btrfs_init_map_token(&token);
3670
3671 if (log_inode_only) {
3672 /* set the generation to zero so the recovery code
3673 * can tell the difference between a logging
3674 * just to say 'this inode exists' and a logging
3675 * to say 'update this inode with these values'
3676 */
3677 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3678 btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3679 } else {
3680 btrfs_set_token_inode_generation(leaf, item,
3681 BTRFS_I(inode)->generation,
3682 &token);
3683 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3684 }
3685
3686 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3687 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3688 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3689 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3690
3691 btrfs_set_token_timespec_sec(leaf, &item->atime,
3692 inode->i_atime.tv_sec, &token);
3693 btrfs_set_token_timespec_nsec(leaf, &item->atime,
3694 inode->i_atime.tv_nsec, &token);
3695
3696 btrfs_set_token_timespec_sec(leaf, &item->mtime,
3697 inode->i_mtime.tv_sec, &token);
3698 btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3699 inode->i_mtime.tv_nsec, &token);
3700
3701 btrfs_set_token_timespec_sec(leaf, &item->ctime,
3702 inode->i_ctime.tv_sec, &token);
3703 btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3704 inode->i_ctime.tv_nsec, &token);
3705
3706 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3707 &token);
3708
3709 btrfs_set_token_inode_sequence(leaf, item,
3710 inode_peek_iversion(inode), &token);
3711 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3712 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3713 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3714 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3715 }
3716
3717 static int log_inode_item(struct btrfs_trans_handle *trans,
3718 struct btrfs_root
*log, struct btrfs_path *path, 3719 struct btrfs_inode *inode) 3720 { 3721 struct btrfs_inode_item *inode_item; 3722 int ret; 3723 3724 ret = btrfs_insert_empty_item(trans, log, path, 3725 &inode->location, sizeof(*inode_item)); 3726 if (ret && ret != -EEXIST) 3727 return ret; 3728 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3729 struct btrfs_inode_item); 3730 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 3731 0, 0); 3732 btrfs_release_path(path); 3733 return 0; 3734 } 3735 3736 static noinline int copy_items(struct btrfs_trans_handle *trans, 3737 struct btrfs_inode *inode, 3738 struct btrfs_path *dst_path, 3739 struct btrfs_path *src_path, u64 *last_extent, 3740 int start_slot, int nr, int inode_only, 3741 u64 logged_isize) 3742 { 3743 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 3744 unsigned long src_offset; 3745 unsigned long dst_offset; 3746 struct btrfs_root *log = inode->root->log_root; 3747 struct btrfs_file_extent_item *extent; 3748 struct btrfs_inode_item *inode_item; 3749 struct extent_buffer *src = src_path->nodes[0]; 3750 struct btrfs_key first_key, last_key, key; 3751 int ret; 3752 struct btrfs_key *ins_keys; 3753 u32 *ins_sizes; 3754 char *ins_data; 3755 int i; 3756 struct list_head ordered_sums; 3757 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 3758 bool has_extents = false; 3759 bool need_find_last_extent = true; 3760 bool done = false; 3761 3762 INIT_LIST_HEAD(&ordered_sums); 3763 3764 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3765 nr * sizeof(u32), GFP_NOFS); 3766 if (!ins_data) 3767 return -ENOMEM; 3768 3769 first_key.objectid = (u64)-1; 3770 3771 ins_sizes = (u32 *)ins_data; 3772 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3773 3774 for (i = 0; i < nr; i++) { 3775 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3776 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3777 } 3778 ret = btrfs_insert_empty_items(trans, log, dst_path, 3779 ins_keys, ins_sizes, nr); 3780 if (ret) { 3781 kfree(ins_data); 3782 return ret; 3783 } 3784 3785 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3786 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3787 dst_path->slots[0]); 3788 3789 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3790 3791 if (i == nr - 1) 3792 last_key = ins_keys[i]; 3793 3794 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3795 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3796 dst_path->slots[0], 3797 struct btrfs_inode_item); 3798 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3799 &inode->vfs_inode, 3800 inode_only == LOG_INODE_EXISTS, 3801 logged_isize); 3802 } else { 3803 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3804 src_offset, ins_sizes[i]); 3805 } 3806 3807 /* 3808 * We set need_find_last_extent here in case we know we were 3809 * processing other items and then walk into the first extent in 3810 * the inode. If we don't hit an extent then nothing changes, 3811 * we'll do the last search the next time around. 
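 *
 * An illustration: a batch of [INODE_ITEM, INODE_REF, EXTENT_DATA]
 * clears need_find_last_extent, since an extent preceded by
 * non-extent items must be the inode's first one, and there is no
 * earlier extent whose end we would have to look up. A batch that is
 * nothing but EXTENT_DATA items leaves the flag set, so the backwards
 * search further down can run.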
3812 */ 3813 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3814 has_extents = true; 3815 if (first_key.objectid == (u64)-1) 3816 first_key = ins_keys[i]; 3817 } else { 3818 need_find_last_extent = false; 3819 } 3820 3821 /* take a reference on file data extents so that truncates 3822 * or deletes of this inode don't have to relog the inode 3823 * again 3824 */ 3825 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3826 !skip_csum) { 3827 int found_type; 3828 extent = btrfs_item_ptr(src, start_slot + i, 3829 struct btrfs_file_extent_item); 3830 3831 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3832 continue; 3833 3834 found_type = btrfs_file_extent_type(src, extent); 3835 if (found_type == BTRFS_FILE_EXTENT_REG) { 3836 u64 ds, dl, cs, cl; 3837 ds = btrfs_file_extent_disk_bytenr(src, 3838 extent); 3839 /* ds == 0 is a hole */ 3840 if (ds == 0) 3841 continue; 3842 3843 dl = btrfs_file_extent_disk_num_bytes(src, 3844 extent); 3845 cs = btrfs_file_extent_offset(src, extent); 3846 cl = btrfs_file_extent_num_bytes(src, 3847 extent); 3848 if (btrfs_file_extent_compression(src, 3849 extent)) { 3850 cs = 0; 3851 cl = dl; 3852 } 3853 3854 ret = btrfs_lookup_csums_range( 3855 fs_info->csum_root, 3856 ds + cs, ds + cs + cl - 1, 3857 &ordered_sums, 0); 3858 if (ret) { 3859 btrfs_release_path(dst_path); 3860 kfree(ins_data); 3861 return ret; 3862 } 3863 } 3864 } 3865 } 3866 3867 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3868 btrfs_release_path(dst_path); 3869 kfree(ins_data); 3870 3871 /* 3872 * we have to do this after the loop above to avoid changing the 3873 * log tree while trying to change the log tree. 3874 */ 3875 ret = 0; 3876 while (!list_empty(&ordered_sums)) { 3877 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3878 struct btrfs_ordered_sum, 3879 list); 3880 if (!ret) 3881 ret = btrfs_csum_file_blocks(trans, log, sums); 3882 list_del(&sums->list); 3883 kfree(sums); 3884 } 3885 3886 if (!has_extents) 3887 return ret; 3888 3889 if (need_find_last_extent && *last_extent == first_key.offset) { 3890 /* 3891 * We don't have any leafs between our current one and the one 3892 * we processed before that can have file extent items for our 3893 * inode (and have a generation number smaller than our current 3894 * transaction id). 3895 */ 3896 need_find_last_extent = false; 3897 } 3898 3899 /* 3900 * Because we use btrfs_search_forward we could skip leaves that were 3901 * not modified and then assume *last_extent is valid when it really 3902 * isn't. So back up to the previous leaf and read the end of the last 3903 * extent before we go and fill in holes. 
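 *
 * With made-up numbers: if the previous leaf's last extent covers
 * [0, 64k) and the first extent we copied starts at 96k, *last_extent
 * is set to 64k here and the loop below inserts an explicit 32k hole:
 *
 *     offset = *last_extent;          (64k)
 *     len    = key.offset - offset;   (96k - 64k = 32k)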
3904 */ 3905 if (need_find_last_extent) { 3906 u64 len; 3907 3908 ret = btrfs_prev_leaf(inode->root, src_path); 3909 if (ret < 0) 3910 return ret; 3911 if (ret) 3912 goto fill_holes; 3913 if (src_path->slots[0]) 3914 src_path->slots[0]--; 3915 src = src_path->nodes[0]; 3916 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3917 if (key.objectid != btrfs_ino(inode) || 3918 key.type != BTRFS_EXTENT_DATA_KEY) 3919 goto fill_holes; 3920 extent = btrfs_item_ptr(src, src_path->slots[0], 3921 struct btrfs_file_extent_item); 3922 if (btrfs_file_extent_type(src, extent) == 3923 BTRFS_FILE_EXTENT_INLINE) { 3924 len = btrfs_file_extent_inline_len(src, 3925 src_path->slots[0], 3926 extent); 3927 *last_extent = ALIGN(key.offset + len, 3928 fs_info->sectorsize); 3929 } else { 3930 len = btrfs_file_extent_num_bytes(src, extent); 3931 *last_extent = key.offset + len; 3932 } 3933 } 3934 fill_holes: 3935 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3936 * things could have happened 3937 * 3938 * 1) A merge could have happened, so we could currently be on a leaf 3939 * that holds what we were copying in the first place. 3940 * 2) A split could have happened, and now not all of the items we want 3941 * are on the same leaf. 3942 * 3943 * So we need to adjust how we search for holes, we need to drop the 3944 * path and re-search for the first extent key we found, and then walk 3945 * forward until we hit the last one we copied. 3946 */ 3947 if (need_find_last_extent) { 3948 /* btrfs_prev_leaf could return 1 without releasing the path */ 3949 btrfs_release_path(src_path); 3950 ret = btrfs_search_slot(NULL, inode->root, &first_key, 3951 src_path, 0, 0); 3952 if (ret < 0) 3953 return ret; 3954 ASSERT(ret == 0); 3955 src = src_path->nodes[0]; 3956 i = src_path->slots[0]; 3957 } else { 3958 i = start_slot; 3959 } 3960 3961 /* 3962 * Ok so here we need to go through and fill in any holes we may have 3963 * to make sure that holes are punched for those areas in case they had 3964 * extents previously. 3965 */ 3966 while (!done) { 3967 u64 offset, len; 3968 u64 extent_end; 3969 3970 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3971 ret = btrfs_next_leaf(inode->root, src_path); 3972 if (ret < 0) 3973 return ret; 3974 ASSERT(ret == 0); 3975 src = src_path->nodes[0]; 3976 i = 0; 3977 need_find_last_extent = true; 3978 } 3979 3980 btrfs_item_key_to_cpu(src, &key, i); 3981 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3982 done = true; 3983 if (key.objectid != btrfs_ino(inode) || 3984 key.type != BTRFS_EXTENT_DATA_KEY) { 3985 i++; 3986 continue; 3987 } 3988 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3989 if (btrfs_file_extent_type(src, extent) == 3990 BTRFS_FILE_EXTENT_INLINE) { 3991 len = btrfs_file_extent_inline_len(src, i, extent); 3992 extent_end = ALIGN(key.offset + len, 3993 fs_info->sectorsize); 3994 } else { 3995 len = btrfs_file_extent_num_bytes(src, extent); 3996 extent_end = key.offset + len; 3997 } 3998 i++; 3999 4000 if (*last_extent == key.offset) { 4001 *last_extent = extent_end; 4002 continue; 4003 } 4004 offset = *last_extent; 4005 len = key.offset - *last_extent; 4006 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 4007 offset, 0, 0, len, 0, len, 0, 0, 0); 4008 if (ret) 4009 break; 4010 *last_extent = extent_end; 4011 } 4012 4013 /* 4014 * Check if there is a hole between the last extent found in our leaf 4015 * and the first extent in the next leaf. 
If there is one, we need to 4016 * log an explicit hole so that at replay time we can punch the hole. 4017 */ 4018 if (ret == 0 && 4019 key.objectid == btrfs_ino(inode) && 4020 key.type == BTRFS_EXTENT_DATA_KEY && 4021 i == btrfs_header_nritems(src_path->nodes[0])) { 4022 ret = btrfs_next_leaf(inode->root, src_path); 4023 need_find_last_extent = true; 4024 if (ret > 0) { 4025 ret = 0; 4026 } else if (ret == 0) { 4027 btrfs_item_key_to_cpu(src_path->nodes[0], &key, 4028 src_path->slots[0]); 4029 if (key.objectid == btrfs_ino(inode) && 4030 key.type == BTRFS_EXTENT_DATA_KEY && 4031 *last_extent < key.offset) { 4032 const u64 len = key.offset - *last_extent; 4033 4034 ret = btrfs_insert_file_extent(trans, log, 4035 btrfs_ino(inode), 4036 *last_extent, 0, 4037 0, len, 0, len, 4038 0, 0, 0); 4039 } 4040 } 4041 } 4042 /* 4043 * Need to let the callers know we dropped the path so they should 4044 * re-search. 4045 */ 4046 if (!ret && need_find_last_extent) 4047 ret = 1; 4048 return ret; 4049 } 4050 4051 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 4052 { 4053 struct extent_map *em1, *em2; 4054 4055 em1 = list_entry(a, struct extent_map, list); 4056 em2 = list_entry(b, struct extent_map, list); 4057 4058 if (em1->start < em2->start) 4059 return -1; 4060 else if (em1->start > em2->start) 4061 return 1; 4062 return 0; 4063 } 4064 4065 static int wait_ordered_extents(struct btrfs_trans_handle *trans, 4066 struct inode *inode, 4067 struct btrfs_root *root, 4068 const struct extent_map *em, 4069 const struct list_head *logged_list, 4070 bool *ordered_io_error) 4071 { 4072 struct btrfs_fs_info *fs_info = root->fs_info; 4073 struct btrfs_ordered_extent *ordered; 4074 struct btrfs_root *log = root->log_root; 4075 u64 mod_start = em->mod_start; 4076 u64 mod_len = em->mod_len; 4077 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 4078 u64 csum_offset; 4079 u64 csum_len; 4080 LIST_HEAD(ordered_sums); 4081 int ret = 0; 4082 4083 *ordered_io_error = false; 4084 4085 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4086 em->block_start == EXTENT_MAP_HOLE) 4087 return 0; 4088 4089 /* 4090 * Wait for any ordered extent that covers our extent map. If it 4091 * finishes without an error, first check and see if our csums are on 4092 * our outstanding ordered extents. 4093 */ 4094 list_for_each_entry(ordered, logged_list, log_list) { 4095 struct btrfs_ordered_sum *sum; 4096 4097 if (!mod_len) 4098 break; 4099 4100 if (ordered->file_offset + ordered->len <= mod_start || 4101 mod_start + mod_len <= ordered->file_offset) 4102 continue; 4103 4104 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 4105 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 4106 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 4107 const u64 start = ordered->file_offset; 4108 const u64 end = ordered->file_offset + ordered->len - 1; 4109 4110 WARN_ON(ordered->inode != inode); 4111 filemap_fdatawrite_range(inode->i_mapping, start, end); 4112 } 4113 4114 wait_event(ordered->wait, 4115 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 4116 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 4117 4118 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 4119 /* 4120 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 4121 * i_mapping flags, so that the next fsync won't get 4122 * an outdated io error too.
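* (filemap_check_errors() test-and-clears those AS_* bits, so the call below consumes the error for this inode even though its return value is ignored here.)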
4123 */ 4124 filemap_check_errors(inode->i_mapping); 4125 *ordered_io_error = true; 4126 break; 4127 } 4128 /* 4129 * We are going to copy all the csums on this ordered extent, so 4130 * go ahead and adjust mod_start and mod_len in case this 4131 * ordered extent has already been logged. 4132 */ 4133 if (ordered->file_offset > mod_start) { 4134 if (ordered->file_offset + ordered->len >= 4135 mod_start + mod_len) 4136 mod_len = ordered->file_offset - mod_start; 4137 /* 4138 * If we have this case 4139 * 4140 * |--------- logged extent ---------| 4141 * |----- ordered extent ----| 4142 * 4143 * Just don't mess with mod_start and mod_len, we'll 4144 * just end up logging more csums than we need and it 4145 * will be ok. 4146 */ 4147 } else { 4148 if (ordered->file_offset + ordered->len < 4149 mod_start + mod_len) { 4150 mod_len = (mod_start + mod_len) - 4151 (ordered->file_offset + ordered->len); 4152 mod_start = ordered->file_offset + 4153 ordered->len; 4154 } else { 4155 mod_len = 0; 4156 } 4157 } 4158 4159 if (skip_csum) 4160 continue; 4161 4162 /* 4163 * To keep us from looping for the above case of an ordered 4164 * extent that falls inside of the logged extent. 4165 */ 4166 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 4167 &ordered->flags)) 4168 continue; 4169 4170 list_for_each_entry(sum, &ordered->list, list) { 4171 ret = btrfs_csum_file_blocks(trans, log, sum); 4172 if (ret) 4173 break; 4174 } 4175 } 4176 4177 if (*ordered_io_error || !mod_len || ret || skip_csum) 4178 return ret; 4179 4180 if (em->compress_type) { 4181 csum_offset = 0; 4182 csum_len = max(em->block_len, em->orig_block_len); 4183 } else { 4184 csum_offset = mod_start - em->start; 4185 csum_len = mod_len; 4186 } 4187 4188 /* block start is already adjusted for the file extent offset. 
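* For example (a sketch, uncompressed case): an extent map for file range [0, 128K) with block_start 1M and a modified range of [64K, 128K) yields csum_offset = 64K and csum_len = 64K, so the lookup below covers disk bytes [1M + 64K, 1M + 128K - 1].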
*/ 4189 ret = btrfs_lookup_csums_range(fs_info->csum_root, 4190 em->block_start + csum_offset, 4191 em->block_start + csum_offset + 4192 csum_len - 1, &ordered_sums, 0); 4193 if (ret) 4194 return ret; 4195 4196 while (!list_empty(&ordered_sums)) { 4197 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4198 struct btrfs_ordered_sum, 4199 list); 4200 if (!ret) 4201 ret = btrfs_csum_file_blocks(trans, log, sums); 4202 list_del(&sums->list); 4203 kfree(sums); 4204 } 4205 4206 return ret; 4207 } 4208 4209 static int log_one_extent(struct btrfs_trans_handle *trans, 4210 struct btrfs_inode *inode, struct btrfs_root *root, 4211 const struct extent_map *em, 4212 struct btrfs_path *path, 4213 const struct list_head *logged_list, 4214 struct btrfs_log_ctx *ctx) 4215 { 4216 struct btrfs_root *log = root->log_root; 4217 struct btrfs_file_extent_item *fi; 4218 struct extent_buffer *leaf; 4219 struct btrfs_map_token token; 4220 struct btrfs_key key; 4221 u64 extent_offset = em->start - em->orig_start; 4222 u64 block_len; 4223 int ret; 4224 int extent_inserted = 0; 4225 bool ordered_io_err = false; 4226 4227 ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, 4228 logged_list, &ordered_io_err); 4229 if (ret) 4230 return ret; 4231 4232 if (ordered_io_err) { 4233 ctx->io_err = -EIO; 4234 return ctx->io_err; 4235 } 4236 4237 btrfs_init_map_token(&token); 4238 4239 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, 4240 em->start + em->len, NULL, 0, 1, 4241 sizeof(*fi), &extent_inserted); 4242 if (ret) 4243 return ret; 4244 4245 if (!extent_inserted) { 4246 key.objectid = btrfs_ino(inode); 4247 key.type = BTRFS_EXTENT_DATA_KEY; 4248 key.offset = em->start; 4249 4250 ret = btrfs_insert_empty_item(trans, log, path, &key, 4251 sizeof(*fi)); 4252 if (ret) 4253 return ret; 4254 } 4255 leaf = path->nodes[0]; 4256 fi = btrfs_item_ptr(leaf, path->slots[0], 4257 struct btrfs_file_extent_item); 4258 4259 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 4260 &token); 4261 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4262 btrfs_set_token_file_extent_type(leaf, fi, 4263 BTRFS_FILE_EXTENT_PREALLOC, 4264 &token); 4265 else 4266 btrfs_set_token_file_extent_type(leaf, fi, 4267 BTRFS_FILE_EXTENT_REG, 4268 &token); 4269 4270 block_len = max(em->block_len, em->orig_block_len); 4271 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4272 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4273 em->block_start, 4274 &token); 4275 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4276 &token); 4277 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4278 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 4279 em->block_start - 4280 extent_offset, &token); 4281 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 4282 &token); 4283 } else { 4284 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 4285 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 4286 &token); 4287 } 4288 4289 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 4290 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 4291 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 4292 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 4293 &token); 4294 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 4295 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 4296 btrfs_mark_buffer_dirty(leaf); 4297 4298 btrfs_release_path(path); 4299 4300 return ret; 4301 } 4302 4303 static 
int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4304 struct btrfs_root *root, 4305 struct btrfs_inode *inode, 4306 struct btrfs_path *path, 4307 struct list_head *logged_list, 4308 struct btrfs_log_ctx *ctx, 4309 const u64 start, 4310 const u64 end) 4311 { 4312 struct extent_map *em, *n; 4313 struct list_head extents; 4314 struct extent_map_tree *tree = &inode->extent_tree; 4315 u64 logged_start, logged_end; 4316 u64 test_gen; 4317 int ret = 0; 4318 int num = 0; 4319 4320 INIT_LIST_HEAD(&extents); 4321 4322 down_write(&inode->dio_sem); 4323 write_lock(&tree->lock); 4324 test_gen = root->fs_info->last_trans_committed; 4325 logged_start = start; 4326 logged_end = end; 4327 4328 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4329 list_del_init(&em->list); 4330 /* 4331 * Just an arbitrary limit: processing extents gets really CPU 4332 * intensive once there are a lot of them, and once we have that 4333 * many extents we just want to commit, since the commit will 4334 * be faster. 4335 */ 4336 if (++num > 32768) { 4337 list_del_init(&tree->modified_extents); 4338 ret = -EFBIG; 4339 goto process; 4340 } 4341 4342 if (em->generation <= test_gen) 4343 continue; 4344 4345 if (em->start < logged_start) 4346 logged_start = em->start; 4347 if ((em->start + em->len - 1) > logged_end) 4348 logged_end = em->start + em->len - 1; 4349 4350 /* Need a ref to keep it from getting evicted from cache */ 4351 refcount_inc(&em->refs); 4352 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4353 list_add_tail(&em->list, &extents); 4354 num++; 4355 } 4356 4357 list_sort(NULL, &extents, extent_cmp); 4358 btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); 4359 /* 4360 * Some ordered extents started by fsync might have completed 4361 * before we could collect them into the list logged_list, which 4362 * means they're gone, not in our logged_list nor in the inode's 4363 * ordered tree. We want the application/user space to know an 4364 * error happened while attempting to persist file data so that 4365 * it can take proper action. If such an error happened, we leave 4366 * without writing to the log tree and the fsync must report the 4367 * file data write error and not commit the current transaction. 4368 */ 4369 ret = filemap_check_errors(inode->vfs_inode.i_mapping); 4370 if (ret) 4371 ctx->io_err = ret; 4372 process: 4373 while (!list_empty(&extents)) { 4374 em = list_entry(extents.next, struct extent_map, list); 4375 4376 list_del_init(&em->list); 4377 4378 /* 4379 * If we had an error we just need to delete everybody from our 4380 * private list.
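* Even then each em still needs clear_em_logging() and its extra reference dropped, which is why we keep iterating instead of just splicing the list away.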
4381 */ 4382 if (ret) { 4383 clear_em_logging(tree, em); 4384 free_extent_map(em); 4385 continue; 4386 } 4387 4388 write_unlock(&tree->lock); 4389 4390 ret = log_one_extent(trans, inode, root, em, path, logged_list, 4391 ctx); 4392 write_lock(&tree->lock); 4393 clear_em_logging(tree, em); 4394 free_extent_map(em); 4395 } 4396 WARN_ON(!list_empty(&extents)); 4397 write_unlock(&tree->lock); 4398 up_write(&inode->dio_sem); 4399 4400 btrfs_release_path(path); 4401 return ret; 4402 } 4403 4404 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4405 struct btrfs_path *path, u64 *size_ret) 4406 { 4407 struct btrfs_key key; 4408 int ret; 4409 4410 key.objectid = btrfs_ino(inode); 4411 key.type = BTRFS_INODE_ITEM_KEY; 4412 key.offset = 0; 4413 4414 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4415 if (ret < 0) { 4416 return ret; 4417 } else if (ret > 0) { 4418 *size_ret = 0; 4419 } else { 4420 struct btrfs_inode_item *item; 4421 4422 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4423 struct btrfs_inode_item); 4424 *size_ret = btrfs_inode_size(path->nodes[0], item); 4425 } 4426 4427 btrfs_release_path(path); 4428 return 0; 4429 } 4430 4431 /* 4432 * At the moment we always log all xattrs. This is to figure out at log replay 4433 * time which xattrs must have their deletion replayed. If an xattr is missing 4434 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4435 * because if an xattr is deleted, the inode is fsynced and a power failure 4436 * happens, the log gets replayed the next time the fs is mounted and 4437 * we want the xattr to not exist anymore (same behaviour as other filesystems 4438 * with a journal: ext3/4, xfs, f2fs, etc). 4439 */ 4440 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4441 struct btrfs_root *root, 4442 struct btrfs_inode *inode, 4443 struct btrfs_path *path, 4444 struct btrfs_path *dst_path) 4445 { 4446 int ret; 4447 struct btrfs_key key; 4448 const u64 ino = btrfs_ino(inode); 4449 int ins_nr = 0; 4450 int start_slot = 0; 4451 4452 key.objectid = ino; 4453 key.type = BTRFS_XATTR_ITEM_KEY; 4454 key.offset = 0; 4455 4456 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4457 if (ret < 0) 4458 return ret; 4459 4460 while (true) { 4461 int slot = path->slots[0]; 4462 struct extent_buffer *leaf = path->nodes[0]; 4463 int nritems = btrfs_header_nritems(leaf); 4464 4465 if (slot >= nritems) { 4466 if (ins_nr > 0) { 4467 u64 last_extent = 0; 4468 4469 ret = copy_items(trans, inode, dst_path, path, 4470 &last_extent, start_slot, 4471 ins_nr, 1, 0); 4472 /* can't be 1, extent items aren't processed */ 4473 ASSERT(ret <= 0); 4474 if (ret < 0) 4475 return ret; 4476 ins_nr = 0; 4477 } 4478 ret = btrfs_next_leaf(root, path); 4479 if (ret < 0) 4480 return ret; 4481 else if (ret > 0) 4482 break; 4483 continue; 4484 } 4485 4486 btrfs_item_key_to_cpu(leaf, &key, slot); 4487 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4488 break; 4489 4490 if (ins_nr == 0) 4491 start_slot = slot; 4492 ins_nr++; 4493 path->slots[0]++; 4494 cond_resched(); 4495 } 4496 if (ins_nr > 0) { 4497 u64 last_extent = 0; 4498 4499 ret = copy_items(trans, inode, dst_path, path, 4500 &last_extent, start_slot, 4501 ins_nr, 1, 0); 4502 /* can't be 1, extent items aren't processed */ 4503 ASSERT(ret <= 0); 4504 if (ret < 0) 4505 return ret; 4506 } 4507 4508 return 0; 4509 } 4510 4511 /* 4512 * If the no holes feature is enabled we need to make sure any hole between the 4513 * last extent and the i_size of our
inode is explicitly marked in the log. This 4514 * is to make sure that doing something like: 4515 * 4516 * 1) create file with 128Kb of data 4517 * 2) truncate file to 64Kb 4518 * 3) truncate file to 256Kb 4519 * 4) fsync file 4520 * 5) <crash/power failure> 4521 * 6) mount fs and trigger log replay 4522 * 4523 * Will give us a file with a size of 256Kb, the first 64Kb of data match what 4524 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4525 * file correspond to a hole. The presence of explicit holes in a log tree is 4526 * what guarantees that log replay will remove/adjust file extent items in the 4527 * fs/subvol tree. 4528 * 4529 * Here we do not need to care about holes between extents; that is already done 4530 * by copy_items(). We also only need to do this in the full sync path, where we 4531 * look up extents from the fs/subvol tree only. In the fast path case, we 4532 * walk the list of modified extent maps and if any represents a hole, we 4533 * insert a corresponding extent representing a hole in the log tree. 4534 */ 4535 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4536 struct btrfs_root *root, 4537 struct btrfs_inode *inode, 4538 struct btrfs_path *path) 4539 { 4540 struct btrfs_fs_info *fs_info = root->fs_info; 4541 int ret; 4542 struct btrfs_key key; 4543 u64 hole_start; 4544 u64 hole_size; 4545 struct extent_buffer *leaf; 4546 struct btrfs_root *log = root->log_root; 4547 const u64 ino = btrfs_ino(inode); 4548 const u64 i_size = i_size_read(&inode->vfs_inode); 4549 4550 if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4551 return 0; 4552 4553 key.objectid = ino; 4554 key.type = BTRFS_EXTENT_DATA_KEY; 4555 key.offset = (u64)-1; 4556 4557 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4558 ASSERT(ret != 0); 4559 if (ret < 0) 4560 return ret; 4561 4562 ASSERT(path->slots[0] > 0); 4563 path->slots[0]--; 4564 leaf = path->nodes[0]; 4565 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4566 4567 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4568 /* inode does not have any extents */ 4569 hole_start = 0; 4570 hole_size = i_size; 4571 } else { 4572 struct btrfs_file_extent_item *extent; 4573 u64 len; 4574 4575 /* 4576 * If there's an extent beyond i_size, an explicit hole was 4577 * already inserted by copy_items(). 4578 */ 4579 if (key.offset >= i_size) 4580 return 0; 4581 4582 extent = btrfs_item_ptr(leaf, path->slots[0], 4583 struct btrfs_file_extent_item); 4584 4585 if (btrfs_file_extent_type(leaf, extent) == 4586 BTRFS_FILE_EXTENT_INLINE) { 4587 len = btrfs_file_extent_inline_len(leaf, 4588 path->slots[0], 4589 extent); 4590 ASSERT(len == i_size || 4591 (len == fs_info->sectorsize && 4592 btrfs_file_extent_compression(leaf, extent) != 4593 BTRFS_COMPRESS_NONE)); 4594 return 0; 4595 } 4596 4597 len = btrfs_file_extent_num_bytes(leaf, extent); 4598 /* Last extent goes beyond i_size, no need to log a hole. */ 4599 if (key.offset + len > i_size) 4600 return 0; 4601 hole_start = key.offset + len; 4602 hole_size = i_size - hole_start; 4603 } 4604 btrfs_release_path(path); 4605 4606 /* Last extent ends at i_size.
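* For example, with i_size = 200K and a last extent covering [100K, 150K), hole_start is 150K and hole_size is 50K, logged below as a file extent item with a disk bytenr of 0 (the convention for holes).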
*/ 4607 if (hole_size == 0) 4608 return 0; 4609 4610 hole_size = ALIGN(hole_size, fs_info->sectorsize); 4611 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4612 hole_size, 0, hole_size, 0, 0, 0); 4613 return ret; 4614 } 4615 4616 /* 4617 * When we are logging a new inode X, check that it does not have a reference 4618 * matching the reference of some other inode Y created in a past transaction 4619 * and renamed in the current transaction. If we don't do this check, then at 4620 * log replay time we can lose inode Y (and all its files if it's a directory): 4621 * 4622 * mkdir /mnt/x 4623 * echo "hello world" > /mnt/x/foobar 4624 * sync 4625 * mv /mnt/x /mnt/y 4626 * mkdir /mnt/x # or touch /mnt/x 4627 * xfs_io -c fsync /mnt/x 4628 * <power fail> 4629 * mount fs, trigger log replay 4630 * 4631 * After the log replay procedure, we would lose the first directory and all its 4632 * files (file foobar). 4633 * For the case where inode Y is not a directory we simply end up losing it: 4634 * 4635 * echo "123" > /mnt/foo 4636 * sync 4637 * mv /mnt/foo /mnt/bar 4638 * echo "abc" > /mnt/foo 4639 * xfs_io -c fsync /mnt/foo 4640 * <power fail> 4641 * 4642 * We also need this for cases where a snapshot entry is replaced by some other 4643 * entry (file or directory), otherwise we end up with an unreplayable log due to 4644 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4645 * if it were a regular entry: 4646 * 4647 * mkdir /mnt/x 4648 * btrfs subvolume snapshot /mnt /mnt/x/snap 4649 * btrfs subvolume delete /mnt/x/snap 4650 * rmdir /mnt/x 4651 * mkdir /mnt/x 4652 * fsync /mnt/x or fsync some new file inside it 4653 * <power fail> 4654 * 4655 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4656 * the same transaction.
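* * As implemented below: returns 1 and sets *other_ino when a conflicting name is found pointing at a regular inode item, -EAGAIN when it points at something else (e.g. a root/snapshot entry), 0 when there is no conflict, and a negative errno on failure.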
4657 */ 4658 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4659 const int slot, 4660 const struct btrfs_key *key, 4661 struct btrfs_inode *inode, 4662 u64 *other_ino) 4663 { 4664 int ret; 4665 struct btrfs_path *search_path; 4666 char *name = NULL; 4667 u32 name_len = 0; 4668 u32 item_size = btrfs_item_size_nr(eb, slot); 4669 u32 cur_offset = 0; 4670 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4671 4672 search_path = btrfs_alloc_path(); 4673 if (!search_path) 4674 return -ENOMEM; 4675 search_path->search_commit_root = 1; 4676 search_path->skip_locking = 1; 4677 4678 while (cur_offset < item_size) { 4679 u64 parent; 4680 u32 this_name_len; 4681 u32 this_len; 4682 unsigned long name_ptr; 4683 struct btrfs_dir_item *di; 4684 4685 if (key->type == BTRFS_INODE_REF_KEY) { 4686 struct btrfs_inode_ref *iref; 4687 4688 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4689 parent = key->offset; 4690 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4691 name_ptr = (unsigned long)(iref + 1); 4692 this_len = sizeof(*iref) + this_name_len; 4693 } else { 4694 struct btrfs_inode_extref *extref; 4695 4696 extref = (struct btrfs_inode_extref *)(ptr + 4697 cur_offset); 4698 parent = btrfs_inode_extref_parent(eb, extref); 4699 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4700 name_ptr = (unsigned long)&extref->name; 4701 this_len = sizeof(*extref) + this_name_len; 4702 } 4703 4704 if (this_name_len > name_len) { 4705 char *new_name; 4706 4707 new_name = krealloc(name, this_name_len, GFP_NOFS); 4708 if (!new_name) { 4709 ret = -ENOMEM; 4710 goto out; 4711 } 4712 name_len = this_name_len; 4713 name = new_name; 4714 } 4715 4716 read_extent_buffer(eb, name, name_ptr, this_name_len); 4717 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4718 parent, name, this_name_len, 0); 4719 if (di && !IS_ERR(di)) { 4720 struct btrfs_key di_key; 4721 4722 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4723 di, &di_key); 4724 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4725 ret = 1; 4726 *other_ino = di_key.objectid; 4727 } else { 4728 ret = -EAGAIN; 4729 } 4730 goto out; 4731 } else if (IS_ERR(di)) { 4732 ret = PTR_ERR(di); 4733 goto out; 4734 } 4735 btrfs_release_path(search_path); 4736 4737 cur_offset += this_len; 4738 } 4739 ret = 0; 4740 out: 4741 btrfs_free_path(search_path); 4742 kfree(name); 4743 return ret; 4744 } 4745 4746 /* log a single inode in the tree log. 4747 * At least one parent directory for this inode must exist in the tree 4748 * or be logged already. 4749 * 4750 * Any items from this inode changed by the current transaction are copied 4751 * to the log tree. An extra reference is taken on any extents in this 4752 * file, allowing us to avoid a whole pile of corner cases around logging 4753 * blocks that have been removed from the tree. 4754 * 4755 * See LOG_INODE_ALL and related defines for a description of what inode_only 4756 * does. 4757 * 4758 * This handles both files and directories. 
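* * Roughly, the flow below is: commit the inode's delayed items (or just its delayed inode), take the inode's log_mutex, drop stale items from any previous log of this inode, copy the changed items with btrfs_search_forward(), then log the xattrs, a possible trailing hole, the changed extents (fast path) and, for directories, the directory items.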
4759 */ 4760 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 4761 struct btrfs_root *root, struct btrfs_inode *inode, 4762 int inode_only, 4763 const loff_t start, 4764 const loff_t end, 4765 struct btrfs_log_ctx *ctx) 4766 { 4767 struct btrfs_fs_info *fs_info = root->fs_info; 4768 struct btrfs_path *path; 4769 struct btrfs_path *dst_path; 4770 struct btrfs_key min_key; 4771 struct btrfs_key max_key; 4772 struct btrfs_root *log = root->log_root; 4773 LIST_HEAD(logged_list); 4774 u64 last_extent = 0; 4775 int err = 0; 4776 int ret; 4777 int nritems; 4778 int ins_start_slot = 0; 4779 int ins_nr; 4780 bool fast_search = false; 4781 u64 ino = btrfs_ino(inode); 4782 struct extent_map_tree *em_tree = &inode->extent_tree; 4783 u64 logged_isize = 0; 4784 bool need_log_inode_item = true; 4785 4786 path = btrfs_alloc_path(); 4787 if (!path) 4788 return -ENOMEM; 4789 dst_path = btrfs_alloc_path(); 4790 if (!dst_path) { 4791 btrfs_free_path(path); 4792 return -ENOMEM; 4793 } 4794 4795 min_key.objectid = ino; 4796 min_key.type = BTRFS_INODE_ITEM_KEY; 4797 min_key.offset = 0; 4798 4799 max_key.objectid = ino; 4800 4801 4802 /* today the code can only do partial logging of directories */ 4803 if (S_ISDIR(inode->vfs_inode.i_mode) || 4804 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4805 &inode->runtime_flags) && 4806 inode_only >= LOG_INODE_EXISTS)) 4807 max_key.type = BTRFS_XATTR_ITEM_KEY; 4808 else 4809 max_key.type = (u8)-1; 4810 max_key.offset = (u64)-1; 4811 4812 /* 4813 * Only run delayed items if we are a dir or a new file. 4814 * Otherwise commit the delayed inode only, which is needed in 4815 * order for the log replay code to mark inodes for link count 4816 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4817 */ 4818 if (S_ISDIR(inode->vfs_inode.i_mode) || 4819 inode->generation > fs_info->last_trans_committed) 4820 ret = btrfs_commit_inode_delayed_items(trans, inode); 4821 else 4822 ret = btrfs_commit_inode_delayed_inode(inode); 4823 4824 if (ret) { 4825 btrfs_free_path(path); 4826 btrfs_free_path(dst_path); 4827 return ret; 4828 } 4829 4830 if (inode_only == LOG_OTHER_INODE) { 4831 inode_only = LOG_INODE_EXISTS; 4832 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 4833 } else { 4834 mutex_lock(&inode->log_mutex); 4835 } 4836 4837 /* 4838 * a brute force approach to making sure we get the most uptodate 4839 * copies of everything. 4840 */ 4841 if (S_ISDIR(inode->vfs_inode.i_mode)) { 4842 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4843 4844 if (inode_only == LOG_INODE_EXISTS) 4845 max_key_type = BTRFS_XATTR_ITEM_KEY; 4846 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4847 } else { 4848 if (inode_only == LOG_INODE_EXISTS) { 4849 /* 4850 * Make sure the new inode item we write to the log has 4851 * the same isize as the current one (if it exists). 4852 * This is necessary to prevent data loss after log 4853 * replay, and also to prevent doing a wrong expanding 4854 * truncate - e.g. create file, write 4K into offset 4855 * 0, fsync, write 4K into offset 4096, add hard link, 4856 * fsync some other file (to sync log), power fail - if 4857 * we use the inode's current i_size, after log replay 4858 * we get an 8Kb file, with the last 4Kb extent as a hole 4859 * (zeroes), as if an expanding truncate happened, 4860 * instead of getting a file of 4Kb only.
4861 */ 4862 err = logged_inode_size(log, inode, path, &logged_isize); 4863 if (err) 4864 goto out_unlock; 4865 } 4866 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4867 &inode->runtime_flags)) { 4868 if (inode_only == LOG_INODE_EXISTS) { 4869 max_key.type = BTRFS_XATTR_ITEM_KEY; 4870 ret = drop_objectid_items(trans, log, path, ino, 4871 max_key.type); 4872 } else { 4873 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4874 &inode->runtime_flags); 4875 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4876 &inode->runtime_flags); 4877 while (1) { 4878 ret = btrfs_truncate_inode_items(trans, 4879 log, &inode->vfs_inode, 0, 0); 4880 if (ret != -EAGAIN) 4881 break; 4882 } 4883 } 4884 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4885 &inode->runtime_flags) || 4886 inode_only == LOG_INODE_EXISTS) { 4887 if (inode_only == LOG_INODE_ALL) 4888 fast_search = true; 4889 max_key.type = BTRFS_XATTR_ITEM_KEY; 4890 ret = drop_objectid_items(trans, log, path, ino, 4891 max_key.type); 4892 } else { 4893 if (inode_only == LOG_INODE_ALL) 4894 fast_search = true; 4895 goto log_extents; 4896 } 4897 4898 } 4899 if (ret) { 4900 err = ret; 4901 goto out_unlock; 4902 } 4903 4904 while (1) { 4905 ins_nr = 0; 4906 ret = btrfs_search_forward(root, &min_key, 4907 path, trans->transid); 4908 if (ret < 0) { 4909 err = ret; 4910 goto out_unlock; 4911 } 4912 if (ret != 0) 4913 break; 4914 again: 4915 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 4916 if (min_key.objectid != ino) 4917 break; 4918 if (min_key.type > max_key.type) 4919 break; 4920 4921 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4922 need_log_inode_item = false; 4923 4924 if ((min_key.type == BTRFS_INODE_REF_KEY || 4925 min_key.type == BTRFS_INODE_EXTREF_KEY) && 4926 inode->generation == trans->transid) { 4927 u64 other_ino = 0; 4928 4929 ret = btrfs_check_ref_name_override(path->nodes[0], 4930 path->slots[0], &min_key, inode, 4931 &other_ino); 4932 if (ret < 0) { 4933 err = ret; 4934 goto out_unlock; 4935 } else if (ret > 0 && ctx && 4936 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 4937 struct btrfs_key inode_key; 4938 struct inode *other_inode; 4939 4940 if (ins_nr > 0) { 4941 ins_nr++; 4942 } else { 4943 ins_nr = 1; 4944 ins_start_slot = path->slots[0]; 4945 } 4946 ret = copy_items(trans, inode, dst_path, path, 4947 &last_extent, ins_start_slot, 4948 ins_nr, inode_only, 4949 logged_isize); 4950 if (ret < 0) { 4951 err = ret; 4952 goto out_unlock; 4953 } 4954 ins_nr = 0; 4955 btrfs_release_path(path); 4956 inode_key.objectid = other_ino; 4957 inode_key.type = BTRFS_INODE_ITEM_KEY; 4958 inode_key.offset = 0; 4959 other_inode = btrfs_iget(fs_info->sb, 4960 &inode_key, root, 4961 NULL); 4962 /* 4963 * If the other inode that had a conflicting dir 4964 * entry was deleted in the current transaction, 4965 * we don't need to do more work nor fall back to 4966 * a transaction commit. 4967 */ 4968 if (IS_ERR(other_inode) && 4969 PTR_ERR(other_inode) == -ENOENT) { 4970 goto next_key; 4971 } else if (IS_ERR(other_inode)) { 4972 err = PTR_ERR(other_inode); 4973 goto out_unlock; 4974 } 4975 /* 4976 * We are safe logging the other inode without 4977 * acquiring its i_mutex as long as we log with 4978 * the LOG_INODE_EXISTS mode. We're safe against 4979 * concurrent renames of the other inode as well 4980 * because during a rename we pin the log and 4981 * update the log with the new name before we 4982 * unpin it.
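* (LOG_OTHER_INODE also makes the recursive btrfs_log_inode() call below take the other inode's log_mutex with SINGLE_DEPTH_NESTING, since we are still holding our own log_mutex at this point.)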
4983 */ 4984 err = btrfs_log_inode(trans, root, 4985 BTRFS_I(other_inode), 4986 LOG_OTHER_INODE, 0, LLONG_MAX, 4987 ctx); 4988 iput(other_inode); 4989 if (err) 4990 goto out_unlock; 4991 else 4992 goto next_key; 4993 } 4994 } 4995 4996 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4997 if (min_key.type == BTRFS_XATTR_ITEM_KEY) { 4998 if (ins_nr == 0) 4999 goto next_slot; 5000 ret = copy_items(trans, inode, dst_path, path, 5001 &last_extent, ins_start_slot, 5002 ins_nr, inode_only, logged_isize); 5003 if (ret < 0) { 5004 err = ret; 5005 goto out_unlock; 5006 } 5007 ins_nr = 0; 5008 if (ret) { 5009 btrfs_release_path(path); 5010 continue; 5011 } 5012 goto next_slot; 5013 } 5014 5015 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5016 ins_nr++; 5017 goto next_slot; 5018 } else if (!ins_nr) { 5019 ins_start_slot = path->slots[0]; 5020 ins_nr = 1; 5021 goto next_slot; 5022 } 5023 5024 ret = copy_items(trans, inode, dst_path, path, &last_extent, 5025 ins_start_slot, ins_nr, inode_only, 5026 logged_isize); 5027 if (ret < 0) { 5028 err = ret; 5029 goto out_unlock; 5030 } 5031 if (ret) { 5032 ins_nr = 0; 5033 btrfs_release_path(path); 5034 continue; 5035 } 5036 ins_nr = 1; 5037 ins_start_slot = path->slots[0]; 5038 next_slot: 5039 5040 nritems = btrfs_header_nritems(path->nodes[0]); 5041 path->slots[0]++; 5042 if (path->slots[0] < nritems) { 5043 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 5044 path->slots[0]); 5045 goto again; 5046 } 5047 if (ins_nr) { 5048 ret = copy_items(trans, inode, dst_path, path, 5049 &last_extent, ins_start_slot, 5050 ins_nr, inode_only, logged_isize); 5051 if (ret < 0) { 5052 err = ret; 5053 goto out_unlock; 5054 } 5055 ret = 0; 5056 ins_nr = 0; 5057 } 5058 btrfs_release_path(path); 5059 next_key: 5060 if (min_key.offset < (u64)-1) { 5061 min_key.offset++; 5062 } else if (min_key.type < max_key.type) { 5063 min_key.type++; 5064 min_key.offset = 0; 5065 } else { 5066 break; 5067 } 5068 } 5069 if (ins_nr) { 5070 ret = copy_items(trans, inode, dst_path, path, &last_extent, 5071 ins_start_slot, ins_nr, inode_only, 5072 logged_isize); 5073 if (ret < 0) { 5074 err = ret; 5075 goto out_unlock; 5076 } 5077 ret = 0; 5078 ins_nr = 0; 5079 } 5080 5081 btrfs_release_path(path); 5082 btrfs_release_path(dst_path); 5083 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5084 if (err) 5085 goto out_unlock; 5086 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5087 btrfs_release_path(path); 5088 btrfs_release_path(dst_path); 5089 err = btrfs_log_trailing_hole(trans, root, inode, path); 5090 if (err) 5091 goto out_unlock; 5092 } 5093 log_extents: 5094 btrfs_release_path(path); 5095 btrfs_release_path(dst_path); 5096 if (need_log_inode_item) { 5097 err = log_inode_item(trans, log, dst_path, inode); 5098 if (err) 5099 goto out_unlock; 5100 } 5101 if (fast_search) { 5102 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5103 &logged_list, ctx, start, end); 5104 if (ret) { 5105 err = ret; 5106 goto out_unlock; 5107 } 5108 } else if (inode_only == LOG_INODE_ALL) { 5109 struct extent_map *em, *n; 5110 5111 write_lock(&em_tree->lock); 5112 /* 5113 * We can't just remove every em if we're called for a ranged 5114 * fsync - that is, one that doesn't cover the whole possible 5115 * file range (0 to LLONG_MAX). This is because we can have 5116 * em's that fall outside the range we're logging and therefore 5117 * their ordered operations haven't completed yet 5118 * (btrfs_finish_ordered_io() not invoked yet). 
This means we 5119 * didn't get their respective file extent item in the fs/subvol 5120 * tree yet, and need to let the next fast fsync (one which 5121 * consults the list of modified extent maps) find the em so 5122 * that it logs a matching file extent item and waits for the 5123 * respective ordered operation to complete (if it's still 5124 * running). 5125 * 5126 * Removing every em outside the range we're logging would make 5127 * the next fast fsync not log their matching file extent items, 5128 * therefore making us lose data after a log replay. 5129 */ 5130 list_for_each_entry_safe(em, n, &em_tree->modified_extents, 5131 list) { 5132 const u64 mod_end = em->mod_start + em->mod_len - 1; 5133 5134 if (em->mod_start >= start && mod_end <= end) 5135 list_del_init(&em->list); 5136 } 5137 write_unlock(&em_tree->lock); 5138 } 5139 5140 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5141 ret = log_directory_changes(trans, root, inode, path, dst_path, 5142 ctx); 5143 if (ret) { 5144 err = ret; 5145 goto out_unlock; 5146 } 5147 } 5148 5149 spin_lock(&inode->lock); 5150 inode->logged_trans = trans->transid; 5151 inode->last_log_commit = inode->last_sub_trans; 5152 spin_unlock(&inode->lock); 5153 out_unlock: 5154 if (unlikely(err)) 5155 btrfs_put_logged_extents(&logged_list); 5156 else 5157 btrfs_submit_logged_extents(&logged_list, log); 5158 mutex_unlock(&inode->log_mutex); 5159 5160 btrfs_free_path(path); 5161 btrfs_free_path(dst_path); 5162 return err; 5163 } 5164 5165 /* 5166 * Check if we must fall back to a transaction commit when logging an inode. 5167 * This must be called after logging the inode and is used only in the context 5168 * where fsyncing an inode requires logging some other inode - in which 5169 * case we can't lock the i_mutex of each other inode we need to log as that 5170 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5171 * log inodes up or down in the hierarchy) or rename operations for example. So 5172 * we take the log_mutex of the inode after we have logged it and then check for 5173 * its last_unlink_trans value - this is safe because any task setting 5174 * last_unlink_trans must take the log_mutex and it must do this before it does 5175 * the actual unlink operation, so if we do this check before a concurrent task 5176 * sets last_unlink_trans it means we've logged a consistent version/state of 5177 * all the inode items, otherwise we are not sure and must do a transaction 5178 * commit (the concurrent task might have only updated last_unlink_trans before 5179 * we logged the inode or it might have also done the unlink). 5180 */ 5181 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5182 struct btrfs_inode *inode) 5183 { 5184 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5185 bool ret = false; 5186 5187 mutex_lock(&inode->log_mutex); 5188 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5189 /* 5190 * Make sure any commits to the log are forced to be full 5191 * commits. 5192 */ 5193 btrfs_set_log_full_commit(fs_info, trans); 5194 ret = true; 5195 } 5196 mutex_unlock(&inode->log_mutex); 5197 5198 return ret; 5199 } 5200 5201 /* 5202 * follow the dentry parent pointers up the chain and see if any 5203 * of the directories in it require a full commit before they can 5204 * be logged. Returns zero if nothing special needs to be done or 1 if 5205 * a full commit is required.
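* A return of 1 makes btrfs_log_inode_parent() bail out before a log transaction is even started (see btrfs_log_inode_parent() further below), leaving the caller to do a full transaction commit.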
5206 */ 5207 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5208 struct btrfs_inode *inode, 5209 struct dentry *parent, 5210 struct super_block *sb, 5211 u64 last_committed) 5212 { 5213 int ret = 0; 5214 struct dentry *old_parent = NULL; 5215 struct btrfs_inode *orig_inode = inode; 5216 5217 /* 5218 * for a regular file, if its inode is already on disk, we don't 5219 * have to worry about the parents at all. This is because 5220 * we can use the last_unlink_trans field to record renames 5221 * and other fun in this file. 5222 */ 5223 if (S_ISREG(inode->vfs_inode.i_mode) && 5224 inode->generation <= last_committed && 5225 inode->last_unlink_trans <= last_committed) 5226 goto out; 5227 5228 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5229 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5230 goto out; 5231 inode = BTRFS_I(d_inode(parent)); 5232 } 5233 5234 while (1) { 5235 /* 5236 * If we are logging a directory then we start with our inode, 5237 * not our parent's inode, so we need to skip setting the 5238 * logged_trans so that further down in the log code we don't 5239 * think this inode has already been logged. 5240 */ 5241 if (inode != orig_inode) 5242 inode->logged_trans = trans->transid; 5243 smp_mb(); 5244 5245 if (btrfs_must_commit_transaction(trans, inode)) { 5246 ret = 1; 5247 break; 5248 } 5249 5250 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5251 break; 5252 5253 if (IS_ROOT(parent)) { 5254 inode = BTRFS_I(d_inode(parent)); 5255 if (btrfs_must_commit_transaction(trans, inode)) 5256 ret = 1; 5257 break; 5258 } 5259 5260 parent = dget_parent(parent); 5261 dput(old_parent); 5262 old_parent = parent; 5263 inode = BTRFS_I(d_inode(parent)); 5264 5265 } 5266 dput(old_parent); 5267 out: 5268 return ret; 5269 } 5270 5271 struct btrfs_dir_list { 5272 u64 ino; 5273 struct list_head list; 5274 }; 5275 5276 /* 5277 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5278 * details about why it is needed. 5279 * This is a recursive operation - if an existing dentry corresponds to a 5280 * directory, that directory's new entries are logged too (same behaviour as 5281 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5282 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5283 * complains about the following circular lock dependency / possible deadlock: 5284 * 5285 * CPU0 CPU1 5286 * ---- ---- 5287 * lock(&type->i_mutex_dir_key#3/2); 5288 * lock(sb_internal#2); 5289 * lock(&type->i_mutex_dir_key#3/2); 5290 * lock(&sb->s_type->i_mutex_key#14); 5291 * 5292 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5293 * sb_start_intwrite() in btrfs_start_transaction(). 5294 * Not locking i_mutex of the inodes is still safe because: 5295 * 5296 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5297 * that while logging the inode new references (names) are added or removed 5298 * from the inode, leaving the logged inode item with a link count that does 5299 * not match the number of logged inode reference items. This is fine because 5300 * at log replay time we compute the real number of links and correct the 5301 * link count in the inode item (see replay_one_buffer() and 5302 * link_to_fixup_dir()); 5303 * 5304 * 2) For directories we log with a mode of LOG_INODE_ALL.
It's possible that 5305 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5306 * BTRFS_DIR_INDEX_KEY are added to the fs/subvol tree and the logged inode item 5307 * has a size that doesn't match the sum of the lengths of all the logged 5308 * names. This does not result in a problem because if a dir_item key is 5309 * logged but its matching dir_index key is not logged, at log replay time we 5310 * don't use it to replay the respective name (see replay_one_name()). On the 5311 * other hand if only the dir_index key ends up being logged, the respective 5312 * name is added to the fs/subvol tree with both the dir_item and dir_index 5313 * keys created (see replay_one_name()). 5314 * The directory's inode item with a wrong i_size is not a problem either, 5315 * since we don't use it at log replay time to set the i_size in the inode 5316 * item of the fs/subvol tree (see overwrite_item()). 5317 */ 5318 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5319 struct btrfs_root *root, 5320 struct btrfs_inode *start_inode, 5321 struct btrfs_log_ctx *ctx) 5322 { 5323 struct btrfs_fs_info *fs_info = root->fs_info; 5324 struct btrfs_root *log = root->log_root; 5325 struct btrfs_path *path; 5326 LIST_HEAD(dir_list); 5327 struct btrfs_dir_list *dir_elem; 5328 int ret = 0; 5329 5330 path = btrfs_alloc_path(); 5331 if (!path) 5332 return -ENOMEM; 5333 5334 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5335 if (!dir_elem) { 5336 btrfs_free_path(path); 5337 return -ENOMEM; 5338 } 5339 dir_elem->ino = btrfs_ino(start_inode); 5340 list_add_tail(&dir_elem->list, &dir_list); 5341 5342 while (!list_empty(&dir_list)) { 5343 struct extent_buffer *leaf; 5344 struct btrfs_key min_key; 5345 int nritems; 5346 int i; 5347 5348 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5349 list); 5350 if (ret) 5351 goto next_dir_inode; 5352 5353 min_key.objectid = dir_elem->ino; 5354 min_key.type = BTRFS_DIR_ITEM_KEY; 5355 min_key.offset = 0; 5356 again: 5357 btrfs_release_path(path); 5358 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5359 if (ret < 0) { 5360 goto next_dir_inode; 5361 } else if (ret > 0) { 5362 ret = 0; 5363 goto next_dir_inode; 5364 } 5365 5366 process_leaf: 5367 leaf = path->nodes[0]; 5368 nritems = btrfs_header_nritems(leaf); 5369 for (i = path->slots[0]; i < nritems; i++) { 5370 struct btrfs_dir_item *di; 5371 struct btrfs_key di_key; 5372 struct inode *di_inode; 5373 struct btrfs_dir_list *new_dir_elem; 5374 int log_mode = LOG_INODE_EXISTS; 5375 int type; 5376 5377 btrfs_item_key_to_cpu(leaf, &min_key, i); 5378 if (min_key.objectid != dir_elem->ino || 5379 min_key.type != BTRFS_DIR_ITEM_KEY) 5380 goto next_dir_inode; 5381 5382 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5383 type = btrfs_dir_type(leaf, di); 5384 if (btrfs_dir_transid(leaf, di) < trans->transid && 5385 type != BTRFS_FT_DIR) 5386 continue; 5387 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5388 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5389 continue; 5390 5391 btrfs_release_path(path); 5392 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); 5393 if (IS_ERR(di_inode)) { 5394 ret = PTR_ERR(di_inode); 5395 goto next_dir_inode; 5396 } 5397 5398 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5399 iput(di_inode); 5400 break; 5401 } 5402 5403 ctx->log_new_dentries = false; 5404 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5405 log_mode = LOG_INODE_ALL; 5406 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5407 log_mode, 0, LLONG_MAX,
ctx); 5408 if (!ret && 5409 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5410 ret = 1; 5411 iput(di_inode); 5412 if (ret) 5413 goto next_dir_inode; 5414 if (ctx->log_new_dentries) { 5415 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5416 GFP_NOFS); 5417 if (!new_dir_elem) { 5418 ret = -ENOMEM; 5419 goto next_dir_inode; 5420 } 5421 new_dir_elem->ino = di_key.objectid; 5422 list_add_tail(&new_dir_elem->list, &dir_list); 5423 } 5424 break; 5425 } 5426 if (i == nritems) { 5427 ret = btrfs_next_leaf(log, path); 5428 if (ret < 0) { 5429 goto next_dir_inode; 5430 } else if (ret > 0) { 5431 ret = 0; 5432 goto next_dir_inode; 5433 } 5434 goto process_leaf; 5435 } 5436 if (min_key.offset < (u64)-1) { 5437 min_key.offset++; 5438 goto again; 5439 } 5440 next_dir_inode: 5441 list_del(&dir_elem->list); 5442 kfree(dir_elem); 5443 } 5444 5445 btrfs_free_path(path); 5446 return ret; 5447 } 5448 5449 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5450 struct btrfs_inode *inode, 5451 struct btrfs_log_ctx *ctx) 5452 { 5453 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5454 int ret; 5455 struct btrfs_path *path; 5456 struct btrfs_key key; 5457 struct btrfs_root *root = inode->root; 5458 const u64 ino = btrfs_ino(inode); 5459 5460 path = btrfs_alloc_path(); 5461 if (!path) 5462 return -ENOMEM; 5463 path->skip_locking = 1; 5464 path->search_commit_root = 1; 5465 5466 key.objectid = ino; 5467 key.type = BTRFS_INODE_REF_KEY; 5468 key.offset = 0; 5469 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5470 if (ret < 0) 5471 goto out; 5472 5473 while (true) { 5474 struct extent_buffer *leaf = path->nodes[0]; 5475 int slot = path->slots[0]; 5476 u32 cur_offset = 0; 5477 u32 item_size; 5478 unsigned long ptr; 5479 5480 if (slot >= btrfs_header_nritems(leaf)) { 5481 ret = btrfs_next_leaf(root, path); 5482 if (ret < 0) 5483 goto out; 5484 else if (ret > 0) 5485 break; 5486 continue; 5487 } 5488 5489 btrfs_item_key_to_cpu(leaf, &key, slot); 5490 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5491 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5492 break; 5493 5494 item_size = btrfs_item_size_nr(leaf, slot); 5495 ptr = btrfs_item_ptr_offset(leaf, slot); 5496 while (cur_offset < item_size) { 5497 struct btrfs_key inode_key; 5498 struct inode *dir_inode; 5499 5500 inode_key.type = BTRFS_INODE_ITEM_KEY; 5501 inode_key.offset = 0; 5502 5503 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5504 struct btrfs_inode_extref *extref; 5505 5506 extref = (struct btrfs_inode_extref *) 5507 (ptr + cur_offset); 5508 inode_key.objectid = btrfs_inode_extref_parent( 5509 leaf, extref); 5510 cur_offset += sizeof(*extref); 5511 cur_offset += btrfs_inode_extref_name_len(leaf, 5512 extref); 5513 } else { 5514 inode_key.objectid = key.offset; 5515 cur_offset = item_size; 5516 } 5517 5518 dir_inode = btrfs_iget(fs_info->sb, &inode_key, 5519 root, NULL); 5520 /* If parent inode was deleted, skip it. 
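* (btrfs_iget() returns ERR_PTR(-ENOENT) in that case; note any other error from it is skipped here as well rather than propagated.)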
*/ 5521 if (IS_ERR(dir_inode)) 5522 continue; 5523 5524 if (ctx) 5525 ctx->log_new_dentries = false; 5526 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5527 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5528 if (!ret && 5529 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5530 ret = 1; 5531 if (!ret && ctx && ctx->log_new_dentries) 5532 ret = log_new_dir_dentries(trans, root, 5533 BTRFS_I(dir_inode), ctx); 5534 iput(dir_inode); 5535 if (ret) 5536 goto out; 5537 } 5538 path->slots[0]++; 5539 } 5540 ret = 0; 5541 out: 5542 btrfs_free_path(path); 5543 return ret; 5544 } 5545 5546 /* 5547 * helper function around btrfs_log_inode to make sure newly created 5548 * parent directories also end up in the log. A minimal, inode-and-backref-only 5549 * logging is done for any parent directories that are older than 5550 * the last committed transaction 5551 */ 5552 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 5553 struct btrfs_inode *inode, 5554 struct dentry *parent, 5555 const loff_t start, 5556 const loff_t end, 5557 int inode_only, 5558 struct btrfs_log_ctx *ctx) 5559 { 5560 struct btrfs_root *root = inode->root; 5561 struct btrfs_fs_info *fs_info = root->fs_info; 5562 struct super_block *sb; 5563 struct dentry *old_parent = NULL; 5564 int ret = 0; 5565 u64 last_committed = fs_info->last_trans_committed; 5566 bool log_dentries = false; 5567 struct btrfs_inode *orig_inode = inode; 5568 5569 sb = inode->vfs_inode.i_sb; 5570 5571 if (btrfs_test_opt(fs_info, NOTREELOG)) { 5572 ret = 1; 5573 goto end_no_trans; 5574 } 5575 5576 /* 5577 * If the previous transaction commit didn't complete, we have to do a 5578 * full commit ourselves. 5579 */ 5580 if (fs_info->last_trans_log_full_commit > 5581 fs_info->last_trans_committed) { 5582 ret = 1; 5583 goto end_no_trans; 5584 } 5585 5586 if (btrfs_root_refs(&root->root_item) == 0) { 5587 ret = 1; 5588 goto end_no_trans; 5589 } 5590 5591 ret = check_parent_dirs_for_sync(trans, inode, parent, sb, 5592 last_committed); 5593 if (ret) 5594 goto end_no_trans; 5595 5596 if (btrfs_inode_in_log(inode, trans->transid)) { 5597 ret = BTRFS_NO_LOG_SYNC; 5598 goto end_no_trans; 5599 } 5600 5601 ret = start_log_trans(trans, root, ctx); 5602 if (ret) 5603 goto end_no_trans; 5604 5605 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 5606 if (ret) 5607 goto end_trans; 5608 5609 /* 5610 * for a regular file, if its inode is already on disk, we don't 5611 * have to worry about the parents at all. This is because 5612 * we can use the last_unlink_trans field to record renames 5613 * and other fun in this file. 5614 */ 5615 if (S_ISREG(inode->vfs_inode.i_mode) && 5616 inode->generation <= last_committed && 5617 inode->last_unlink_trans <= last_committed) { 5618 ret = 0; 5619 goto end_trans; 5620 } 5621 5622 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 5623 log_dentries = true; 5624 5625 /* 5626 * On unlink we must make sure all our current and old parent directory 5627 * inodes are fully logged. This is to prevent leaving dangling 5628 * directory index entries in directories that were our parents but are 5629 * not anymore. Not doing this results in the old parent directory being 5630 * impossible to delete after log replay (rmdir will always fail with 5631 * error -ENOTEMPTY).
5632 * 5633 * Example 1: 5634 * 5635 * mkdir testdir 5636 * touch testdir/foo 5637 * ln testdir/foo testdir/bar 5638 * sync 5639 * unlink testdir/bar 5640 * xfs_io -c fsync testdir/foo 5641 * <power failure> 5642 * mount fs, triggers log replay 5643 * 5644 * If we don't log the parent directory (testdir), after log replay the 5645 * directory still has an entry pointing to the file inode using the bar 5646 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 5647 * the file inode has a link count of 1. 5648 * 5649 * Example 2: 5650 * 5651 * mkdir testdir 5652 * touch foo 5653 * ln foo testdir/foo2 5654 * ln foo testdir/foo3 5655 * sync 5656 * unlink testdir/foo3 5657 * xfs_io -c fsync foo 5658 * <power failure> 5659 * mount fs, triggers log replay 5660 * 5661 * Similar to the first example, after log replay the parent directory 5662 * testdir still has an entry pointing to the file inode with name foo3 5663 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 5664 * and has a link count of 2. 5665 */ 5666 if (inode->last_unlink_trans > last_committed) { 5667 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 5668 if (ret) 5669 goto end_trans; 5670 } 5671 5672 while (1) { 5673 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5674 break; 5675 5676 inode = BTRFS_I(d_inode(parent)); 5677 if (root != inode->root) 5678 break; 5679 5680 if (inode->generation > last_committed) { 5681 ret = btrfs_log_inode(trans, root, inode, 5682 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); 5683 if (ret) 5684 goto end_trans; 5685 } 5686 if (IS_ROOT(parent)) 5687 break; 5688 5689 parent = dget_parent(parent); 5690 dput(old_parent); 5691 old_parent = parent; 5692 } 5693 if (log_dentries) 5694 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 5695 else 5696 ret = 0; 5697 end_trans: 5698 dput(old_parent); 5699 if (ret < 0) { 5700 btrfs_set_log_full_commit(fs_info, trans); 5701 ret = 1; 5702 } 5703 5704 if (ret) 5705 btrfs_remove_log_ctx(root, ctx); 5706 btrfs_end_log_trans(root); 5707 end_no_trans: 5708 return ret; 5709 } 5710 5711 /* 5712 * it is not safe to log a dentry if the chunk root has added new 5713 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5714 * If this returns 1, you must commit the transaction to safely get your 5715 * data on disk.
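* * A sketch of how the fsync path is expected to consume the return value (assuming a caller shaped like btrfs_sync_file()): * * ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); * if (ret == BTRFS_NO_LOG_SYNC) * ret = btrfs_end_transaction(trans); // nothing was logged * else if (ret == 0) * ret = btrfs_sync_log(trans, root, &ctx); // write and sync the log * else * ret = btrfs_commit_transaction(trans); // full commit needed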
5716 */ 5717 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 5718 struct dentry *dentry, 5719 const loff_t start, 5720 const loff_t end, 5721 struct btrfs_log_ctx *ctx) 5722 { 5723 struct dentry *parent = dget_parent(dentry); 5724 int ret; 5725 5726 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 5727 start, end, LOG_INODE_ALL, ctx); 5728 dput(parent); 5729 5730 return ret; 5731 } 5732 5733 /* 5734 * should be called during mount to recover and replay any log trees 5735 * from the FS 5736 */ 5737 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5738 { 5739 int ret; 5740 struct btrfs_path *path; 5741 struct btrfs_trans_handle *trans; 5742 struct btrfs_key key; 5743 struct btrfs_key found_key; 5744 struct btrfs_key tmp_key; 5745 struct btrfs_root *log; 5746 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5747 struct walk_control wc = { 5748 .process_func = process_one_buffer, 5749 .stage = 0, 5750 }; 5751 5752 path = btrfs_alloc_path(); 5753 if (!path) 5754 return -ENOMEM; 5755 5756 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5757 5758 trans = btrfs_start_transaction(fs_info->tree_root, 0); 5759 if (IS_ERR(trans)) { 5760 ret = PTR_ERR(trans); 5761 goto error; 5762 } 5763 5764 wc.trans = trans; 5765 wc.pin = 1; 5766 5767 ret = walk_log_tree(trans, log_root_tree, &wc); 5768 if (ret) { 5769 btrfs_handle_fs_error(fs_info, ret, 5770 "Failed to pin buffers while recovering log root tree."); 5771 goto error; 5772 } 5773 5774 again: 5775 key.objectid = BTRFS_TREE_LOG_OBJECTID; 5776 key.offset = (u64)-1; 5777 key.type = BTRFS_ROOT_ITEM_KEY; 5778 5779 while (1) { 5780 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5781 5782 if (ret < 0) { 5783 btrfs_handle_fs_error(fs_info, ret, 5784 "Couldn't find tree log root."); 5785 goto error; 5786 } 5787 if (ret > 0) { 5788 if (path->slots[0] == 0) 5789 break; 5790 path->slots[0]--; 5791 } 5792 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5793 path->slots[0]); 5794 btrfs_release_path(path); 5795 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5796 break; 5797 5798 log = btrfs_read_fs_root(log_root_tree, &found_key); 5799 if (IS_ERR(log)) { 5800 ret = PTR_ERR(log); 5801 btrfs_handle_fs_error(fs_info, ret, 5802 "Couldn't read tree log root."); 5803 goto error; 5804 } 5805 5806 tmp_key.objectid = found_key.offset; 5807 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5808 tmp_key.offset = (u64)-1; 5809 5810 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 5811 if (IS_ERR(wc.replay_dest)) { 5812 ret = PTR_ERR(wc.replay_dest); 5813 free_extent_buffer(log->node); 5814 free_extent_buffer(log->commit_root); 5815 kfree(log); 5816 btrfs_handle_fs_error(fs_info, ret, 5817 "Couldn't read target root for tree log recovery."); 5818 goto error; 5819 } 5820 5821 wc.replay_dest->log_root = log; 5822 btrfs_record_root_in_trans(trans, wc.replay_dest); 5823 ret = walk_log_tree(trans, log, &wc); 5824 5825 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5826 ret = fixup_inode_link_counts(trans, wc.replay_dest, 5827 path); 5828 } 5829 5830 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5831 struct btrfs_root *root = wc.replay_dest; 5832 5833 btrfs_release_path(path); 5834 5835 /* 5836 * We have just replayed everything, and the highest 5837 * objectid of fs roots probably has changed in case 5838 * some inode_items got replayed. 5839 * 5840 * root->objectid_mutex is not acquired as log replay 5841 * could only happen during mount.
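* * Refreshing highest_objectid keeps the inode number allocator (see btrfs_find_free_objectid()) from handing out an objectid that a replayed inode item is already using.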
5842 */ 5843 ret = btrfs_find_highest_objectid(root, 5844 &root->highest_objectid); 5845 } 5846 5847 key.offset = found_key.offset - 1; 5848 wc.replay_dest->log_root = NULL; 5849 free_extent_buffer(log->node); 5850 free_extent_buffer(log->commit_root); 5851 kfree(log); 5852 5853 if (ret) 5854 goto error; 5855 5856 if (found_key.offset == 0) 5857 break; 5858 } 5859 btrfs_release_path(path); 5860 5861 /* step one is to pin it all, step two is to replay just inodes */ 5862 if (wc.pin) { 5863 wc.pin = 0; 5864 wc.process_func = replay_one_buffer; 5865 wc.stage = LOG_WALK_REPLAY_INODES; 5866 goto again; 5867 } 5868 /* step three is to replay everything */ 5869 if (wc.stage < LOG_WALK_REPLAY_ALL) { 5870 wc.stage++; 5871 goto again; 5872 } 5873 5874 btrfs_free_path(path); 5875 5876 /* step 4: commit the transaction, which also unpins the blocks */ 5877 ret = btrfs_commit_transaction(trans); 5878 if (ret) 5879 return ret; 5880 5881 free_extent_buffer(log_root_tree->node); 5882 log_root_tree->log_root = NULL; 5883 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5884 kfree(log_root_tree); 5885 5886 return 0; 5887 error: 5888 if (wc.trans) 5889 btrfs_end_transaction(wc.trans); 5890 btrfs_free_path(path); 5891 return ret; 5892 } 5893 5894 /* 5895 * there are some corner cases where we want to force a full 5896 * commit instead of allowing a directory to be logged. 5897 * 5898 * They revolve around files that were unlinked from the directory, and 5899 * this function updates the parent directory so that a full commit is 5900 * properly done if it is fsync'd later after the unlinks are done. 5901 * 5902 * Must be called before the unlink operations (updates to the subvolume tree, 5903 * inodes, etc) are done. 5904 */ 5905 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5906 struct btrfs_inode *dir, struct btrfs_inode *inode, 5907 int for_rename) 5908 { 5909 /* 5910 * when we're logging a file, if it hasn't been renamed 5911 * or unlinked, and its inode is fully committed on disk, 5912 * we don't have to worry about walking up the directory chain 5913 * to log its parents. 5914 * 5915 * So, we use the last_unlink_trans field to put this transid 5916 * into the file. When the file is logged we check it and 5917 * don't log the parents if the file is fully on disk. 5918 */ 5919 mutex_lock(&inode->log_mutex); 5920 inode->last_unlink_trans = trans->transid; 5921 mutex_unlock(&inode->log_mutex); 5922 5923 /* 5924 * if this directory was already logged any new 5925 * names for this file/dir will get recorded 5926 */ 5927 smp_mb(); 5928 if (dir->logged_trans == trans->transid) 5929 return; 5930 5931 /* 5932 * if the inode we're about to unlink was logged, 5933 * the log will be properly updated for any new names 5934 */ 5935 if (inode->logged_trans == trans->transid) 5936 return; 5937 5938 /* 5939 * when renaming files across directories, if the directory 5940 * we're unlinking from gets fsync'd later on, there's 5941 * no way to find the destination directory later and fsync it 5942 * properly. So, we have to be conservative and force commits 5943 * so the new name gets discovered.
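* * A sketch of the rename case being guarded against: * * mkdir A ; mkdir B * touch A/foo * sync * mv A/foo B/foo * xfs_io -c fsync A * <power failure> * * Without last_unlink_trans forcing a full commit, replaying A's log could drop the old name while the new name in B was never persisted, losing foo entirely.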
5944 */ 5945 if (for_rename) 5946 goto record; 5947 5948 /* we can safely do the unlink without any special recording */ 5949 return; 5950 5951 record: 5952 mutex_lock(&dir->log_mutex); 5953 dir->last_unlink_trans = trans->transid; 5954 mutex_unlock(&dir->log_mutex); 5955 } 5956 5957 /* 5958 * Make sure that if someone attempts to fsync the parent directory of a deleted 5959 * snapshot, it ends up triggering a transaction commit. This is to guarantee 5960 * that after replaying the log tree of the parent directory's root we will not 5961 * see the snapshot anymore and at log replay time we will not see any log tree 5962 * corresponding to the deleted snapshot's root, which could lead to replaying 5963 * it after replaying the log tree of the parent directory (which would replay 5964 * the snapshot delete operation). 5965 * 5966 * Must be called before the actual snapshot destroy operation (updates to the 5967 * parent root and tree of tree roots trees, etc) are done. 5968 */ 5969 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 5970 struct btrfs_inode *dir) 5971 { 5972 mutex_lock(&dir->log_mutex); 5973 dir->last_unlink_trans = trans->transid; 5974 mutex_unlock(&dir->log_mutex); 5975 } 5976 5977 /* 5978 * Call this after adding a new name for a file and it will properly 5979 * update the log to reflect the new name. 5980 * 5981 * It will return zero if all goes well, and it will return 1 if a 5982 * full transaction commit is required. 5983 */ 5984 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 5985 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 5986 struct dentry *parent) 5987 { 5988 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5989 5990 /* 5991 * this will force the logging code to walk the dentry chain 5992 * up for the file 5993 */ 5994 if (!S_ISDIR(inode->vfs_inode.i_mode)) 5995 inode->last_unlink_trans = trans->transid; 5996 5997 /* 5998 * if this inode hasn't been logged and the directory we're renaming it 5999 * from hasn't been logged, we don't need to log it 6000 */ 6001 if (inode->logged_trans <= fs_info->last_trans_committed && 6002 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) 6003 return 0; 6004 6005 return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, 6006 LOG_INODE_EXISTS, NULL); 6007 } 6008 6009