/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first stage (0) only pins down
 * the blocks we find, and the second stage (1) makes sure that all
 * the inodes we find in the log are created in the subvolume.
 *
 * The third stage (2) replays directory index items, and the last
 * stage deals with directories, links, extents and all the other
 * fun semantics.
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it
 * is using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	int index;
	int ret;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		if (btrfs_need_log_full_commit(root->fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}
		if (!root->log_start_pid) {
			root->log_start_pid = current->pid;
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}

		atomic_inc(&root->log_batch);
		atomic_inc(&root->log_writers);
		if (ctx) {
			index = root->log_transid % 2;
			list_add_tail(&ctx->list, &root->log_ctxs[index]);
			ctx->log_transid = root->log_transid;
		}
		mutex_unlock(&root->log_mutex);
		return 0;
	}

	ret = 0;
	mutex_lock(&root->fs_info->tree_log_mutex);
	if (!root->fs_info->log_root_tree)
		ret = btrfs_init_log_root_tree(trans, root->fs_info);
	mutex_unlock(&root->fs_info->tree_log_mutex);
	if (ret)
		goto out;

	if (!root->log_root) {
		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;
	}
	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
	root->log_start_pid = current->pid;
	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}
out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		smp_mb();
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}


/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
						      eb->start, eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(log, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(root, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(root, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];

			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
				struct btrfs_map_token token;
				u64 ino_size = btrfs_inode_size(eb, src_item);

				btrfs_init_map_token(&token);
				btrfs_set_token_inode_size(dst_eb, dst_item,
							   ino_size, &token);
			}
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size, root->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(root, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset, 0);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root, root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						root->fs_info->csum_root,
						sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, root);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

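/*
 * helper for add_inode_ref().  Before a name from the log can be
 * linked into the subvolume, any conflicting names must go away:
 * back references in the subvolume that are not also in the log,
 * and directory entries that use the same index or the same name.
 * Each conflict found here is unlinked.  Returns 1 if the back ref
 * is for the root directory itself and there is nothing to do.
 */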
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct inode *dir, struct inode *inode,
				  struct extent_buffer *eb,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir,
							 inode, victim_name,
							 victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, root);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name,
					   (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					inc_nlink(inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
								 victim_parent,
								 inode,
								 victim_name,
								 victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans, root);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
			if (ret)
				return ret;
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

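/*
 * The two helpers below decode a single name entry from an inode
 * extref or inode ref item: the name is copied into a freshly
 * allocated buffer and the directory index (plus, for extrefs, the
 * parent objectid) is returned.  The caller must kfree() *name.
 */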
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
				  ref_index, name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      dir, inode, eb,
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, dir, inode, name, namelen,
					     0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

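/*
 * insert an orphan item for the given inode, treating an orphan item
 * that already exists as success
 */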
static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

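/*
 * count the names recorded in the extended back ref items for this
 * inode.  Returns the number of links found, or a negative errno if
 * the extref lookup fails.
 */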
static int count_inode_extrefs(struct btrfs_root *root,
			       struct inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

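/*
 * same as count_inode_extrefs(), but for the old style back refs:
 * walk all the INODE_REF items for this inode and count every name
 * they contain
 */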
static int count_inode_refs(struct btrfs_root *root,
			    struct inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

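/*
 * after replay is done, go through all the inodes recorded under
 * BTRFS_TREE_LOG_FIXUP_OBJECTID by link_to_fixup_dir() and run the
 * link count fixup on each of them, deleting the fixup items as
 * they are processed
 */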
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * Return true if an inode reference exists in the log for the given name,
 * inode and parent inode.
 */
static bool name_in_log_ref(struct btrfs_root *log_root,
			    const char *name, const int name_len,
			    const u64 dirid, const u64 ino)
{
	struct btrfs_key search_key;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = dirid;
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
		return true;

	return false;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	return ret;

insert:
	if (name_in_log_ref(root->log_root, name, name_len,
			    key->objectid, log_key.objectid)) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
			      name, name_len, log_type, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret)
			return ret;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	return 0;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log,
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * this looks for a given directory item in the log.  If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 */
If the directory 1843 * item is not in the log, the item is removed and the inode it points 1844 * to is unlinked 1845 */ 1846 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1847 struct btrfs_root *root, 1848 struct btrfs_root *log, 1849 struct btrfs_path *path, 1850 struct btrfs_path *log_path, 1851 struct inode *dir, 1852 struct btrfs_key *dir_key) 1853 { 1854 int ret; 1855 struct extent_buffer *eb; 1856 int slot; 1857 u32 item_size; 1858 struct btrfs_dir_item *di; 1859 struct btrfs_dir_item *log_di; 1860 int name_len; 1861 unsigned long ptr; 1862 unsigned long ptr_end; 1863 char *name; 1864 struct inode *inode; 1865 struct btrfs_key location; 1866 1867 again: 1868 eb = path->nodes[0]; 1869 slot = path->slots[0]; 1870 item_size = btrfs_item_size_nr(eb, slot); 1871 ptr = btrfs_item_ptr_offset(eb, slot); 1872 ptr_end = ptr + item_size; 1873 while (ptr < ptr_end) { 1874 di = (struct btrfs_dir_item *)ptr; 1875 if (verify_dir_item(root, eb, di)) { 1876 ret = -EIO; 1877 goto out; 1878 } 1879 1880 name_len = btrfs_dir_name_len(eb, di); 1881 name = kmalloc(name_len, GFP_NOFS); 1882 if (!name) { 1883 ret = -ENOMEM; 1884 goto out; 1885 } 1886 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1887 name_len); 1888 log_di = NULL; 1889 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1890 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1891 dir_key->objectid, 1892 name, name_len, 0); 1893 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1894 log_di = btrfs_lookup_dir_index_item(trans, log, 1895 log_path, 1896 dir_key->objectid, 1897 dir_key->offset, 1898 name, name_len, 0); 1899 } 1900 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 1901 btrfs_dir_item_key_to_cpu(eb, di, &location); 1902 btrfs_release_path(path); 1903 btrfs_release_path(log_path); 1904 inode = read_one_inode(root, location.objectid); 1905 if (!inode) { 1906 kfree(name); 1907 return -EIO; 1908 } 1909 1910 ret = link_to_fixup_dir(trans, root, 1911 path, location.objectid); 1912 if (ret) { 1913 kfree(name); 1914 iput(inode); 1915 goto out; 1916 } 1917 1918 inc_nlink(inode); 1919 ret = btrfs_unlink_inode(trans, root, dir, inode, 1920 name, name_len); 1921 if (!ret) 1922 ret = btrfs_run_delayed_items(trans, root); 1923 kfree(name); 1924 iput(inode); 1925 if (ret) 1926 goto out; 1927 1928 /* there might still be more names under this key 1929 * check and repeat if required 1930 */ 1931 ret = btrfs_search_slot(NULL, root, dir_key, path, 1932 0, 0); 1933 if (ret == 0) 1934 goto again; 1935 ret = 0; 1936 goto out; 1937 } else if (IS_ERR(log_di)) { 1938 kfree(name); 1939 return PTR_ERR(log_di); 1940 } 1941 btrfs_release_path(log_path); 1942 kfree(name); 1943 1944 ptr = (unsigned long)(di + 1); 1945 ptr += name_len; 1946 } 1947 ret = 0; 1948 out: 1949 btrfs_release_path(path); 1950 btrfs_release_path(log_path); 1951 return ret; 1952 } 1953 1954 /* 1955 * deletion replay happens before we copy any new directory items 1956 * out of the log or out of backreferences from inodes. It 1957 * scans the log to find ranges of keys that log is authoritative for, 1958 * and then scans the directory to find items in those ranges that are 1959 * not present in the log. 1960 * 1961 * Anything we don't find in the log is unlinked and removed from the 1962 * directory. 
1963 */ 1964 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1965 struct btrfs_root *root, 1966 struct btrfs_root *log, 1967 struct btrfs_path *path, 1968 u64 dirid, int del_all) 1969 { 1970 u64 range_start; 1971 u64 range_end; 1972 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1973 int ret = 0; 1974 struct btrfs_key dir_key; 1975 struct btrfs_key found_key; 1976 struct btrfs_path *log_path; 1977 struct inode *dir; 1978 1979 dir_key.objectid = dirid; 1980 dir_key.type = BTRFS_DIR_ITEM_KEY; 1981 log_path = btrfs_alloc_path(); 1982 if (!log_path) 1983 return -ENOMEM; 1984 1985 dir = read_one_inode(root, dirid); 1986 /* it isn't an error if the inode isn't there, that can happen 1987 * because we replay the deletes before we copy in the inode item 1988 * from the log 1989 */ 1990 if (!dir) { 1991 btrfs_free_path(log_path); 1992 return 0; 1993 } 1994 again: 1995 range_start = 0; 1996 range_end = 0; 1997 while (1) { 1998 if (del_all) 1999 range_end = (u64)-1; 2000 else { 2001 ret = find_dir_range(log, path, dirid, key_type, 2002 &range_start, &range_end); 2003 if (ret != 0) 2004 break; 2005 } 2006 2007 dir_key.offset = range_start; 2008 while (1) { 2009 int nritems; 2010 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2011 0, 0); 2012 if (ret < 0) 2013 goto out; 2014 2015 nritems = btrfs_header_nritems(path->nodes[0]); 2016 if (path->slots[0] >= nritems) { 2017 ret = btrfs_next_leaf(root, path); 2018 if (ret) 2019 break; 2020 } 2021 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2022 path->slots[0]); 2023 if (found_key.objectid != dirid || 2024 found_key.type != dir_key.type) 2025 goto next_type; 2026 2027 if (found_key.offset > range_end) 2028 break; 2029 2030 ret = check_item_in_log(trans, root, log, path, 2031 log_path, dir, 2032 &found_key); 2033 if (ret) 2034 goto out; 2035 if (found_key.offset == (u64)-1) 2036 break; 2037 dir_key.offset = found_key.offset + 1; 2038 } 2039 btrfs_release_path(path); 2040 if (range_end == (u64)-1) 2041 break; 2042 range_start = range_end + 1; 2043 } 2044 2045 next_type: 2046 ret = 0; 2047 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2048 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2049 dir_key.type = BTRFS_DIR_INDEX_KEY; 2050 btrfs_release_path(path); 2051 goto again; 2052 } 2053 out: 2054 btrfs_release_path(path); 2055 btrfs_free_path(log_path); 2056 iput(dir); 2057 return ret; 2058 } 2059 2060 /* 2061 * the process_func used to replay items from the log tree. This 2062 * gets called in two different stages. The first stage just looks 2063 * for inodes and makes sure they are all copied into the subvolume. 2064 * 2065 * The second stage copies all the other item types from the log into 2066 * the subvolume. The two stage approach is slower, but gets rid of 2067 * lots of complexity around inodes referencing other inodes that exist 2068 * only in the log (references come from either directory items or inode 2069 * back refs). 
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	int level;
	int i;
	int ret;

	ret = btrfs_read_buffer(eb, gen);
	if (ret)
		return ret;

	level = btrfs_header_level(eb);

	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
					    struct btrfs_inode_item);
			mode = btrfs_inode_mode(eb, inode_item);
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid, 0);
				if (ret)
					break;
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;

			/* for regular files, make sure the corresponding
			 * orphan item exists. extents past the new EOF
			 * will be truncated later by orphan cleanup.
			 */
			if (S_ISREG(mode)) {
				ret = insert_orphan_item(wc->trans, root,
							 key.objectid);
				if (ret)
					break;
			}

			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			if (ret)
				break;
		}

		if (key.type == BTRFS_DIR_INDEX_KEY &&
		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}

		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_INODE_REF_KEY ||
			   key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			if (ret && ret != -ENOENT)
				break;
			ret = 0;
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
						eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}
	}
	btrfs_free_path(path);
	return ret;
}

static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, int *level,
				   struct walk_control *wc)
{
	u64 root_owner;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		WARN_ON(btrfs_header_level(cur) != *level);

		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = root->nodesize;
2208 parent = path->nodes[*level]; 2209 root_owner = btrfs_header_owner(parent); 2210 2211 next = btrfs_find_create_tree_block(root, bytenr); 2212 if (!next) 2213 return -ENOMEM; 2214 2215 if (*level == 1) { 2216 ret = wc->process_func(root, next, wc, ptr_gen); 2217 if (ret) { 2218 free_extent_buffer(next); 2219 return ret; 2220 } 2221 2222 path->slots[*level]++; 2223 if (wc->free) { 2224 ret = btrfs_read_buffer(next, ptr_gen); 2225 if (ret) { 2226 free_extent_buffer(next); 2227 return ret; 2228 } 2229 2230 if (trans) { 2231 btrfs_tree_lock(next); 2232 btrfs_set_lock_blocking(next); 2233 clean_tree_block(trans, root, next); 2234 btrfs_wait_tree_block_writeback(next); 2235 btrfs_tree_unlock(next); 2236 } 2237 2238 WARN_ON(root_owner != 2239 BTRFS_TREE_LOG_OBJECTID); 2240 ret = btrfs_free_and_pin_reserved_extent(root, 2241 bytenr, blocksize); 2242 if (ret) { 2243 free_extent_buffer(next); 2244 return ret; 2245 } 2246 } 2247 free_extent_buffer(next); 2248 continue; 2249 } 2250 ret = btrfs_read_buffer(next, ptr_gen); 2251 if (ret) { 2252 free_extent_buffer(next); 2253 return ret; 2254 } 2255 2256 WARN_ON(*level <= 0); 2257 if (path->nodes[*level-1]) 2258 free_extent_buffer(path->nodes[*level-1]); 2259 path->nodes[*level-1] = next; 2260 *level = btrfs_header_level(next); 2261 path->slots[*level] = 0; 2262 cond_resched(); 2263 } 2264 WARN_ON(*level < 0); 2265 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2266 2267 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2268 2269 cond_resched(); 2270 return 0; 2271 } 2272 2273 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2274 struct btrfs_root *root, 2275 struct btrfs_path *path, int *level, 2276 struct walk_control *wc) 2277 { 2278 u64 root_owner; 2279 int i; 2280 int slot; 2281 int ret; 2282 2283 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2284 slot = path->slots[i]; 2285 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2286 path->slots[i]++; 2287 *level = i; 2288 WARN_ON(*level == 0); 2289 return 0; 2290 } else { 2291 struct extent_buffer *parent; 2292 if (path->nodes[*level] == root->node) 2293 parent = path->nodes[*level]; 2294 else 2295 parent = path->nodes[*level + 1]; 2296 2297 root_owner = btrfs_header_owner(parent); 2298 ret = wc->process_func(root, path->nodes[*level], wc, 2299 btrfs_header_generation(path->nodes[*level])); 2300 if (ret) 2301 return ret; 2302 2303 if (wc->free) { 2304 struct extent_buffer *next; 2305 2306 next = path->nodes[*level]; 2307 2308 if (trans) { 2309 btrfs_tree_lock(next); 2310 btrfs_set_lock_blocking(next); 2311 clean_tree_block(trans, root, next); 2312 btrfs_wait_tree_block_writeback(next); 2313 btrfs_tree_unlock(next); 2314 } 2315 2316 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2317 ret = btrfs_free_and_pin_reserved_extent(root, 2318 path->nodes[*level]->start, 2319 path->nodes[*level]->len); 2320 if (ret) 2321 return ret; 2322 } 2323 free_extent_buffer(path->nodes[*level]); 2324 path->nodes[*level] = NULL; 2325 *level = i + 1; 2326 } 2327 } 2328 return 1; 2329 } 2330 2331 /* 2332 * drop the reference count on the tree rooted at 'snap'. This traverses 2333 * the tree freeing any blocks that have a ref count of zero after being 2334 * decremented. 
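 *
 * For a log tree (passed in here as 'log' via walk_log_tree) this is
 * driven by walk_control. A minimal usage sketch, matching how
 * free_log_tree() below drives it with ->free set so each block is
 * released after ->process_func runs:
 *
 *	struct walk_control wc = {
 *		.free = 1,
 *		.process_func = process_one_buffer
 *	};
 *	ret = walk_log_tree(trans, log, &wc);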
2335 */ 2336 static int walk_log_tree(struct btrfs_trans_handle *trans, 2337 struct btrfs_root *log, struct walk_control *wc) 2338 { 2339 int ret = 0; 2340 int wret; 2341 int level; 2342 struct btrfs_path *path; 2343 int orig_level; 2344 2345 path = btrfs_alloc_path(); 2346 if (!path) 2347 return -ENOMEM; 2348 2349 level = btrfs_header_level(log->node); 2350 orig_level = level; 2351 path->nodes[level] = log->node; 2352 extent_buffer_get(log->node); 2353 path->slots[level] = 0; 2354 2355 while (1) { 2356 wret = walk_down_log_tree(trans, log, path, &level, wc); 2357 if (wret > 0) 2358 break; 2359 if (wret < 0) { 2360 ret = wret; 2361 goto out; 2362 } 2363 2364 wret = walk_up_log_tree(trans, log, path, &level, wc); 2365 if (wret > 0) 2366 break; 2367 if (wret < 0) { 2368 ret = wret; 2369 goto out; 2370 } 2371 } 2372 2373 /* was the root node processed? if not, catch it here */ 2374 if (path->nodes[orig_level]) { 2375 ret = wc->process_func(log, path->nodes[orig_level], wc, 2376 btrfs_header_generation(path->nodes[orig_level])); 2377 if (ret) 2378 goto out; 2379 if (wc->free) { 2380 struct extent_buffer *next; 2381 2382 next = path->nodes[orig_level]; 2383 2384 if (trans) { 2385 btrfs_tree_lock(next); 2386 btrfs_set_lock_blocking(next); 2387 clean_tree_block(trans, log, next); 2388 btrfs_wait_tree_block_writeback(next); 2389 btrfs_tree_unlock(next); 2390 } 2391 2392 WARN_ON(log->root_key.objectid != 2393 BTRFS_TREE_LOG_OBJECTID); 2394 ret = btrfs_free_and_pin_reserved_extent(log, next->start, 2395 next->len); 2396 if (ret) 2397 goto out; 2398 } 2399 } 2400 2401 out: 2402 btrfs_free_path(path); 2403 return ret; 2404 } 2405 2406 /* 2407 * helper function to update the item for a given subvolumes log root 2408 * in the tree of log roots 2409 */ 2410 static int update_log_root(struct btrfs_trans_handle *trans, 2411 struct btrfs_root *log) 2412 { 2413 int ret; 2414 2415 if (log->log_transid == 1) { 2416 /* insert root item on the first sync */ 2417 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 2418 &log->root_key, &log->root_item); 2419 } else { 2420 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2421 &log->root_key, &log->root_item); 2422 } 2423 return ret; 2424 } 2425 2426 static void wait_log_commit(struct btrfs_trans_handle *trans, 2427 struct btrfs_root *root, int transid) 2428 { 2429 DEFINE_WAIT(wait); 2430 int index = transid % 2; 2431 2432 /* 2433 * we only allow two pending log transactions at a time, 2434 * so we know that if ours is more than 2 older than the 2435 * current transaction, we're done 2436 */ 2437 do { 2438 prepare_to_wait(&root->log_commit_wait[index], 2439 &wait, TASK_UNINTERRUPTIBLE); 2440 mutex_unlock(&root->log_mutex); 2441 2442 if (root->log_transid_committed < transid && 2443 atomic_read(&root->log_commit[index])) 2444 schedule(); 2445 2446 finish_wait(&root->log_commit_wait[index], &wait); 2447 mutex_lock(&root->log_mutex); 2448 } while (root->log_transid_committed < transid && 2449 atomic_read(&root->log_commit[index])); 2450 } 2451 2452 static void wait_for_writer(struct btrfs_trans_handle *trans, 2453 struct btrfs_root *root) 2454 { 2455 DEFINE_WAIT(wait); 2456 2457 while (atomic_read(&root->log_writers)) { 2458 prepare_to_wait(&root->log_writer_wait, 2459 &wait, TASK_UNINTERRUPTIBLE); 2460 mutex_unlock(&root->log_mutex); 2461 if (atomic_read(&root->log_writers)) 2462 schedule(); 2463 finish_wait(&root->log_writer_wait, &wait); 2464 mutex_lock(&root->log_mutex); 2465 } 2466 } 2467 2468 static inline void btrfs_remove_log_ctx(struct 
btrfs_root *root,
					struct btrfs_log_ctx *ctx)
{
	if (!ctx)
		return;

	mutex_lock(&root->log_mutex);
	list_del_init(&ctx->list);
	mutex_unlock(&root->log_mutex);
}

/*
 * Invoked in log mutex context, or from a context where it is certain
 * that no other task can access the list.
 */
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
					     int index, int error)
{
	struct btrfs_log_ctx *ctx;

	if (!error) {
		INIT_LIST_HEAD(&root->log_ctxs[index]);
		return;
	}

	list_for_each_entry(ctx, &root->log_ctxs[index], list)
		ctx->log_ret = error;

	INIT_LIST_HEAD(&root->log_ctxs[index]);
}

/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it. When this call is done,
 * you know that any inodes previously logged are safely on disk only
 * if it returns 0.
 *
 * Any other return value means you need to call btrfs_commit_transaction.
 * Some of the edge cases for fsyncing directories that have had unlinks
 * or renames done in the past mean that sometimes the only safe
 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
 * that has happened.
 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
	int index1;
	int index2;
	int mark;
	int ret;
	struct btrfs_root *log = root->log_root;
	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
	int log_transid = 0;
	struct btrfs_log_ctx root_log_ctx;
	struct blk_plug plug;

	mutex_lock(&root->log_mutex);
	log_transid = ctx->log_transid;
	if (root->log_transid_committed >= log_transid) {
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}

	index1 = log_transid % 2;
	if (atomic_read(&root->log_commit[index1])) {
		wait_log_commit(trans, root, log_transid);
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}
	ASSERT(log_transid == root->log_transid);
	atomic_set(&root->log_commit[index1], 1);

	/* wait for previous tree log sync to complete */
	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
		wait_log_commit(trans, root, log_transid - 1);

	while (1) {
		int batch = atomic_read(&root->log_batch);
		/* when we're on an ssd, just kick the log commit out */
		if (!btrfs_test_opt(root, SSD) &&
		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
			mutex_unlock(&root->log_mutex);
			schedule_timeout_uninterruptible(1);
			mutex_lock(&root->log_mutex);
		}
		wait_for_writer(trans, root);
		if (batch == atomic_read(&root->log_batch))
			break;
	}

	/* bail out if we need to do a full commit */
	if (btrfs_need_log_full_commit(root->fs_info, trans)) {
		ret = -EAGAIN;
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	if (log_transid % 2 == 0)
		mark = EXTENT_DIRTY;
	else
		mark = EXTENT_NEW;

	/* we start IO on all the marked extents here, but we don't actually
	 * wait for them until later.
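	 *
	 * Note that the mark alternates with the log transid, as set just
	 * above:
	 *
	 *	mark = (log_transid % 2 == 0) ? EXTENT_DIRTY : EXTENT_NEW;
	 *
	 * so the dirty pages of two consecutive log commits are tracked
	 * with different extent bits, and one commit never waits on (or
	 * clears) the other's pages.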
2573 */ 2574 blk_start_plug(&plug); 2575 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2576 if (ret) { 2577 blk_finish_plug(&plug); 2578 btrfs_abort_transaction(trans, root, ret); 2579 btrfs_free_logged_extents(log, log_transid); 2580 btrfs_set_log_full_commit(root->fs_info, trans); 2581 mutex_unlock(&root->log_mutex); 2582 goto out; 2583 } 2584 2585 btrfs_set_root_node(&log->root_item, log->node); 2586 2587 root->log_transid++; 2588 log->log_transid = root->log_transid; 2589 root->log_start_pid = 0; 2590 /* 2591 * IO has been started, blocks of the log tree have WRITTEN flag set 2592 * in their headers. new modifications of the log will be written to 2593 * new positions. so it's safe to allow log writers to go in. 2594 */ 2595 mutex_unlock(&root->log_mutex); 2596 2597 btrfs_init_log_ctx(&root_log_ctx); 2598 2599 mutex_lock(&log_root_tree->log_mutex); 2600 atomic_inc(&log_root_tree->log_batch); 2601 atomic_inc(&log_root_tree->log_writers); 2602 2603 index2 = log_root_tree->log_transid % 2; 2604 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2605 root_log_ctx.log_transid = log_root_tree->log_transid; 2606 2607 mutex_unlock(&log_root_tree->log_mutex); 2608 2609 ret = update_log_root(trans, log); 2610 2611 mutex_lock(&log_root_tree->log_mutex); 2612 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2613 smp_mb(); 2614 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2615 wake_up(&log_root_tree->log_writer_wait); 2616 } 2617 2618 if (ret) { 2619 if (!list_empty(&root_log_ctx.list)) 2620 list_del_init(&root_log_ctx.list); 2621 2622 blk_finish_plug(&plug); 2623 btrfs_set_log_full_commit(root->fs_info, trans); 2624 2625 if (ret != -ENOSPC) { 2626 btrfs_abort_transaction(trans, root, ret); 2627 mutex_unlock(&log_root_tree->log_mutex); 2628 goto out; 2629 } 2630 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2631 btrfs_free_logged_extents(log, log_transid); 2632 mutex_unlock(&log_root_tree->log_mutex); 2633 ret = -EAGAIN; 2634 goto out; 2635 } 2636 2637 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2638 blk_finish_plug(&plug); 2639 mutex_unlock(&log_root_tree->log_mutex); 2640 ret = root_log_ctx.log_ret; 2641 goto out; 2642 } 2643 2644 index2 = root_log_ctx.log_transid % 2; 2645 if (atomic_read(&log_root_tree->log_commit[index2])) { 2646 blk_finish_plug(&plug); 2647 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, 2648 mark); 2649 btrfs_wait_logged_extents(trans, log, log_transid); 2650 wait_log_commit(trans, log_root_tree, 2651 root_log_ctx.log_transid); 2652 mutex_unlock(&log_root_tree->log_mutex); 2653 if (!ret) 2654 ret = root_log_ctx.log_ret; 2655 goto out; 2656 } 2657 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2658 atomic_set(&log_root_tree->log_commit[index2], 1); 2659 2660 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2661 wait_log_commit(trans, log_root_tree, 2662 root_log_ctx.log_transid - 1); 2663 } 2664 2665 wait_for_writer(trans, log_root_tree); 2666 2667 /* 2668 * now that we've moved on to the tree of log tree roots, 2669 * check the full commit flag again 2670 */ 2671 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2672 blk_finish_plug(&plug); 2673 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2674 btrfs_free_logged_extents(log, log_transid); 2675 mutex_unlock(&log_root_tree->log_mutex); 2676 ret = -EAGAIN; 2677 goto out_wake_log_root; 2678 } 2679 2680 ret = btrfs_write_marked_extents(log_root_tree, 2681 
&log_root_tree->dirty_log_pages,
					 EXTENT_DIRTY | EXTENT_NEW);
	blk_finish_plug(&plug);
	if (ret) {
		btrfs_set_log_full_commit(root->fs_info, trans);
		btrfs_abort_transaction(trans, root, ret);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
	if (!ret)
		ret = btrfs_wait_marked_extents(log_root_tree,
						&log_root_tree->dirty_log_pages,
						EXTENT_NEW | EXTENT_DIRTY);
	if (ret) {
		btrfs_set_log_full_commit(root->fs_info, trans);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	btrfs_wait_logged_extents(trans, log, log_transid);

	btrfs_set_super_log_root(root->fs_info->super_for_commit,
				 log_root_tree->node->start);
	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
				       btrfs_header_level(log_root_tree->node));

	log_root_tree->log_transid++;
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * nobody else is going to jump in and write the ctree
	 * super here because the log_commit atomic below is protecting
	 * us. We must be called with a transaction handle pinning
	 * the running transaction open, so a full commit can't hop
	 * in and cause problems either.
	 */
	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
	if (ret) {
		btrfs_set_log_full_commit(root->fs_info, trans);
		btrfs_abort_transaction(trans, root, ret);
		goto out_wake_log_root;
	}

	mutex_lock(&root->log_mutex);
	if (root->last_log_commit < log_transid)
		root->last_log_commit = log_transid;
	mutex_unlock(&root->log_mutex);

out_wake_log_root:
	/*
	 * We needn't get log_mutex here because we are sure all
	 * the other tasks are blocked.
	 */
	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);

	mutex_lock(&log_root_tree->log_mutex);
	log_root_tree->log_transid_committed++;
	atomic_set(&log_root_tree->log_commit[index2], 0);
	mutex_unlock(&log_root_tree->log_mutex);

	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
		wake_up(&log_root_tree->log_commit_wait[index2]);
out:
	/* See above.
 */
	btrfs_remove_all_log_ctxs(root, index1, ret);

	mutex_lock(&root->log_mutex);
	root->log_transid_committed++;
	atomic_set(&root->log_commit[index1], 0);
	mutex_unlock(&root->log_mutex);

	if (waitqueue_active(&root->log_commit_wait[index1]))
		wake_up(&root->log_commit_wait[index1]);
	return ret;
}

static void free_log_tree(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log)
{
	int ret;
	u64 start;
	u64 end;
	struct walk_control wc = {
		.free = 1,
		.process_func = process_one_buffer
	};

	ret = walk_log_tree(trans, log, &wc);
	/* I don't think this can happen but just in case */
	if (ret)
		btrfs_abort_transaction(trans, log, ret);

	while (1) {
		ret = find_first_extent_bit(&log->dirty_log_pages,
				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
				NULL);
		if (ret)
			break;

		clear_extent_bits(&log->dirty_log_pages, start, end,
				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
	}

	/*
	 * We may have short-circuited the log tree with the full commit logic
	 * and left ordered extents on our list, so clear these out to keep us
	 * from leaking inodes and memory.
	 */
	btrfs_free_logged_extents(log, 0);
	btrfs_free_logged_extents(log, 1);

	free_extent_buffer(log->node);
	kfree(log);
}

/*
 * free all the extents used by the tree log. This should be called
 * at commit time of the full transaction
 */
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
	if (root->log_root) {
		free_log_tree(trans, root->log_root);
		root->log_root = NULL;
	}
	return 0;
}

int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	if (fs_info->log_root_tree) {
		free_log_tree(trans, fs_info->log_root_tree);
		fs_info->log_root_tree = NULL;
	}
	return 0;
}

/*
 * If both a file and directory are logged, and unlinks or renames are
 * mixed in, we have a few interesting corners:
 *
 * create file X in dir Y
 * link file X to X.link in dir Y
 * fsync file X
 * unlink file X but leave X.link
 * fsync dir Y
 *
 * After a crash we would expect only X.link to exist. But file X
 * didn't get fsync'd again so the log has back refs for X and X.link.
 *
 * We solve this by removing directory entries and inode backrefs from the
 * log when a file that was logged in the current transaction is
 * unlinked. Any later fsync will include the updated log entries, and
 * we'll be able to reconstruct the proper directory items from backrefs.
 *
 * This optimization allows us to avoid relogging the entire inode
 * or the entire directory.
2841 */ 2842 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2843 struct btrfs_root *root, 2844 const char *name, int name_len, 2845 struct inode *dir, u64 index) 2846 { 2847 struct btrfs_root *log; 2848 struct btrfs_dir_item *di; 2849 struct btrfs_path *path; 2850 int ret; 2851 int err = 0; 2852 int bytes_del = 0; 2853 u64 dir_ino = btrfs_ino(dir); 2854 2855 if (BTRFS_I(dir)->logged_trans < trans->transid) 2856 return 0; 2857 2858 ret = join_running_log_trans(root); 2859 if (ret) 2860 return 0; 2861 2862 mutex_lock(&BTRFS_I(dir)->log_mutex); 2863 2864 log = root->log_root; 2865 path = btrfs_alloc_path(); 2866 if (!path) { 2867 err = -ENOMEM; 2868 goto out_unlock; 2869 } 2870 2871 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 2872 name, name_len, -1); 2873 if (IS_ERR(di)) { 2874 err = PTR_ERR(di); 2875 goto fail; 2876 } 2877 if (di) { 2878 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2879 bytes_del += name_len; 2880 if (ret) { 2881 err = ret; 2882 goto fail; 2883 } 2884 } 2885 btrfs_release_path(path); 2886 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 2887 index, name, name_len, -1); 2888 if (IS_ERR(di)) { 2889 err = PTR_ERR(di); 2890 goto fail; 2891 } 2892 if (di) { 2893 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2894 bytes_del += name_len; 2895 if (ret) { 2896 err = ret; 2897 goto fail; 2898 } 2899 } 2900 2901 /* update the directory size in the log to reflect the names 2902 * we have removed 2903 */ 2904 if (bytes_del) { 2905 struct btrfs_key key; 2906 2907 key.objectid = dir_ino; 2908 key.offset = 0; 2909 key.type = BTRFS_INODE_ITEM_KEY; 2910 btrfs_release_path(path); 2911 2912 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2913 if (ret < 0) { 2914 err = ret; 2915 goto fail; 2916 } 2917 if (ret == 0) { 2918 struct btrfs_inode_item *item; 2919 u64 i_size; 2920 2921 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2922 struct btrfs_inode_item); 2923 i_size = btrfs_inode_size(path->nodes[0], item); 2924 if (i_size > bytes_del) 2925 i_size -= bytes_del; 2926 else 2927 i_size = 0; 2928 btrfs_set_inode_size(path->nodes[0], item, i_size); 2929 btrfs_mark_buffer_dirty(path->nodes[0]); 2930 } else 2931 ret = 0; 2932 btrfs_release_path(path); 2933 } 2934 fail: 2935 btrfs_free_path(path); 2936 out_unlock: 2937 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2938 if (ret == -ENOSPC) { 2939 btrfs_set_log_full_commit(root->fs_info, trans); 2940 ret = 0; 2941 } else if (ret < 0) 2942 btrfs_abort_transaction(trans, root, ret); 2943 2944 btrfs_end_log_trans(root); 2945 2946 return err; 2947 } 2948 2949 /* see comments for btrfs_del_dir_entries_in_log */ 2950 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2951 struct btrfs_root *root, 2952 const char *name, int name_len, 2953 struct inode *inode, u64 dirid) 2954 { 2955 struct btrfs_root *log; 2956 u64 index; 2957 int ret; 2958 2959 if (BTRFS_I(inode)->logged_trans < trans->transid) 2960 return 0; 2961 2962 ret = join_running_log_trans(root); 2963 if (ret) 2964 return 0; 2965 log = root->log_root; 2966 mutex_lock(&BTRFS_I(inode)->log_mutex); 2967 2968 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 2969 dirid, &index); 2970 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2971 if (ret == -ENOSPC) { 2972 btrfs_set_log_full_commit(root->fs_info, trans); 2973 ret = 0; 2974 } else if (ret < 0 && ret != -ENOENT) 2975 btrfs_abort_transaction(trans, root, ret); 2976 btrfs_end_log_trans(root); 2977 2978 return ret; 2979 } 2980 2981 /* 2982 * creates a range 
item in the log for 'dirid'. first_offset and 2983 * last_offset tell us which parts of the key space the log should 2984 * be considered authoritative for. 2985 */ 2986 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2987 struct btrfs_root *log, 2988 struct btrfs_path *path, 2989 int key_type, u64 dirid, 2990 u64 first_offset, u64 last_offset) 2991 { 2992 int ret; 2993 struct btrfs_key key; 2994 struct btrfs_dir_log_item *item; 2995 2996 key.objectid = dirid; 2997 key.offset = first_offset; 2998 if (key_type == BTRFS_DIR_ITEM_KEY) 2999 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3000 else 3001 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3002 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3003 if (ret) 3004 return ret; 3005 3006 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3007 struct btrfs_dir_log_item); 3008 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3009 btrfs_mark_buffer_dirty(path->nodes[0]); 3010 btrfs_release_path(path); 3011 return 0; 3012 } 3013 3014 /* 3015 * log all the items included in the current transaction for a given 3016 * directory. This also creates the range items in the log tree required 3017 * to replay anything deleted before the fsync 3018 */ 3019 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3020 struct btrfs_root *root, struct inode *inode, 3021 struct btrfs_path *path, 3022 struct btrfs_path *dst_path, int key_type, 3023 u64 min_offset, u64 *last_offset_ret) 3024 { 3025 struct btrfs_key min_key; 3026 struct btrfs_root *log = root->log_root; 3027 struct extent_buffer *src; 3028 int err = 0; 3029 int ret; 3030 int i; 3031 int nritems; 3032 u64 first_offset = min_offset; 3033 u64 last_offset = (u64)-1; 3034 u64 ino = btrfs_ino(inode); 3035 3036 log = root->log_root; 3037 3038 min_key.objectid = ino; 3039 min_key.type = key_type; 3040 min_key.offset = min_offset; 3041 3042 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3043 3044 /* 3045 * we didn't find anything from this transaction, see if there 3046 * is anything at all 3047 */ 3048 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3049 min_key.objectid = ino; 3050 min_key.type = key_type; 3051 min_key.offset = (u64)-1; 3052 btrfs_release_path(path); 3053 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3054 if (ret < 0) { 3055 btrfs_release_path(path); 3056 return ret; 3057 } 3058 ret = btrfs_previous_item(root, path, ino, key_type); 3059 3060 /* if ret == 0 there are items for this type, 3061 * create a range to tell us the last key of this type. 3062 * otherwise, there are no items in this directory after 3063 * *min_offset, and we create a range to indicate that. 
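 *
 * For example, if the directory has no items of this key type at all
 * past *min_offset, first_offset stays at min_offset and last_offset
 * stays at (u64)-1, so the log claims the whole tail
 * [min_offset, (u64)-1] of the key space and replay knows that every
 * entry missing from the log in that range was deleted.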
 */
		if (ret == 0) {
			struct btrfs_key tmp;
			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
					      path->slots[0]);
			if (key_type == tmp.type)
				first_offset = max(min_offset, tmp.offset) + 1;
		}
		goto done;
	}

	/* go backward to find any previous key */
	ret = btrfs_previous_item(root, path, ino, key_type);
	if (ret == 0) {
		struct btrfs_key tmp;
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (key_type == tmp.type) {
			first_offset = tmp.offset;
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret) {
				err = ret;
				goto done;
			}
		}
	}
	btrfs_release_path(path);

	/* find the first key from this transaction again */
	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
	if (WARN_ON(ret != 0))
		goto done;

	/*
	 * we have a block from this transaction, log every item in it
	 * from our directory
	 */
	while (1) {
		struct btrfs_key tmp;
		src = path->nodes[0];
		nritems = btrfs_header_nritems(src);
		for (i = path->slots[0]; i < nritems; i++) {
			btrfs_item_key_to_cpu(src, &min_key, i);

			if (min_key.objectid != ino || min_key.type != key_type)
				goto done;
			ret = overwrite_item(trans, log, dst_path, src, i,
					     &min_key);
			if (ret) {
				err = ret;
				goto done;
			}
		}
		path->slots[0] = nritems;

		/*
		 * look ahead to the next item and see if it is also
		 * from this directory and from this transaction
		 */
		ret = btrfs_next_leaf(root, path);
		if (ret == 1) {
			last_offset = (u64)-1;
			goto done;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (tmp.objectid != ino || tmp.type != key_type) {
			last_offset = (u64)-1;
			goto done;
		}
		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret)
				err = ret;
			else
				last_offset = tmp.offset;
			goto done;
		}
	}
done:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);

	if (err == 0) {
		*last_offset_ret = last_offset;
		/*
		 * insert the log range keys to indicate where the log
		 * is valid
		 */
		ret = insert_dir_log_key(trans, log, path, key_type,
					 ino, first_offset, last_offset);
		if (ret)
			err = ret;
	}
	return err;
}

/*
 * logging directories is very similar to logging inodes. We find all the
 * items from the current transaction and write them to the log.
 *
 * The recovery code scans the directory in the subvolume, and if it finds a
 * key in the range logged that is not present in the log tree, then it means
 * that dir entry was unlinked during the transaction.
 *
 * In order for that scan to work, we must include one key smaller than
 * the smallest key logged by this transaction and one key larger than the
 * largest key logged by this transaction.
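 *
 * As an illustration, suppose this transaction logged dir index keys
 * 10 through 20 and the entry at index 15 was unlinked before the
 * fsync. Replay finds a dir item at index 15 in the subvolume, sees
 * that 15 lies inside a logged range but has no matching item in the
 * log tree, and removes it.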
3174 */ 3175 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3176 struct btrfs_root *root, struct inode *inode, 3177 struct btrfs_path *path, 3178 struct btrfs_path *dst_path) 3179 { 3180 u64 min_key; 3181 u64 max_key; 3182 int ret; 3183 int key_type = BTRFS_DIR_ITEM_KEY; 3184 3185 again: 3186 min_key = 0; 3187 max_key = 0; 3188 while (1) { 3189 ret = log_dir_items(trans, root, inode, path, 3190 dst_path, key_type, min_key, 3191 &max_key); 3192 if (ret) 3193 return ret; 3194 if (max_key == (u64)-1) 3195 break; 3196 min_key = max_key + 1; 3197 } 3198 3199 if (key_type == BTRFS_DIR_ITEM_KEY) { 3200 key_type = BTRFS_DIR_INDEX_KEY; 3201 goto again; 3202 } 3203 return 0; 3204 } 3205 3206 /* 3207 * a helper function to drop items from the log before we relog an 3208 * inode. max_key_type indicates the highest item type to remove. 3209 * This cannot be run for file data extents because it does not 3210 * free the extents they point to. 3211 */ 3212 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3213 struct btrfs_root *log, 3214 struct btrfs_path *path, 3215 u64 objectid, int max_key_type) 3216 { 3217 int ret; 3218 struct btrfs_key key; 3219 struct btrfs_key found_key; 3220 int start_slot; 3221 3222 key.objectid = objectid; 3223 key.type = max_key_type; 3224 key.offset = (u64)-1; 3225 3226 while (1) { 3227 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3228 BUG_ON(ret == 0); /* Logic error */ 3229 if (ret < 0) 3230 break; 3231 3232 if (path->slots[0] == 0) 3233 break; 3234 3235 path->slots[0]--; 3236 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3237 path->slots[0]); 3238 3239 if (found_key.objectid != objectid) 3240 break; 3241 3242 found_key.offset = 0; 3243 found_key.type = 0; 3244 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3245 &start_slot); 3246 3247 ret = btrfs_del_items(trans, log, path, start_slot, 3248 path->slots[0] - start_slot + 1); 3249 /* 3250 * If start slot isn't 0 then we don't need to re-search, we've 3251 * found the last guy with the objectid in this tree. 
 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	if (ret > 0)
		ret = 0;
	return ret;
}

static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only,
			    u64 logged_isize)
{
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	if (log_inode_only) {
		/* set the generation to zero so the recover code
		 * can tell the difference between logging
		 * just to say 'this inode exists' and logging
		 * to say 'update this inode with these values'
		 */
		btrfs_set_token_inode_generation(leaf, item, 0, &token);
		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
	} else {
		btrfs_set_token_inode_generation(leaf, item,
						 BTRFS_I(inode)->generation,
						 &token);
		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
	}

	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);

	btrfs_set_token_timespec_sec(leaf, &item->atime,
				     inode->i_atime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->atime,
				      inode->i_atime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->mtime,
				     inode->i_mtime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
				      inode->i_mtime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->ctime,
				     inode->i_ctime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
				      inode->i_ctime.tv_nsec, &token);

	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
				     &token);

	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
}

static int log_inode_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log, struct btrfs_path *path,
			  struct inode *inode)
{
	struct btrfs_inode_item *inode_item;
	int ret;

	ret = btrfs_insert_empty_item(trans, log, path,
				      &BTRFS_I(inode)->location,
				      sizeof(*inode_item));
	if (ret && ret != -EEXIST)
		return ret;
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
	btrfs_release_path(path);
	return 0;
}

static noinline int copy_items(struct btrfs_trans_handle *trans,
			       struct inode *inode,
			       struct btrfs_path *dst_path,
			       struct btrfs_path *src_path, u64 *last_extent,
			       int start_slot, int nr, int inode_only,
			       u64 logged_isize)
{
	unsigned long src_offset;
	unsigned long dst_offset;
	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
	struct btrfs_file_extent_item *extent;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *src = src_path->nodes[0];
	struct btrfs_key first_key, last_key, key;
	int ret;
	struct btrfs_key
*ins_keys; 3353 u32 *ins_sizes; 3354 char *ins_data; 3355 int i; 3356 struct list_head ordered_sums; 3357 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3358 bool has_extents = false; 3359 bool need_find_last_extent = true; 3360 bool done = false; 3361 3362 INIT_LIST_HEAD(&ordered_sums); 3363 3364 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3365 nr * sizeof(u32), GFP_NOFS); 3366 if (!ins_data) 3367 return -ENOMEM; 3368 3369 first_key.objectid = (u64)-1; 3370 3371 ins_sizes = (u32 *)ins_data; 3372 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3373 3374 for (i = 0; i < nr; i++) { 3375 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3376 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3377 } 3378 ret = btrfs_insert_empty_items(trans, log, dst_path, 3379 ins_keys, ins_sizes, nr); 3380 if (ret) { 3381 kfree(ins_data); 3382 return ret; 3383 } 3384 3385 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3386 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3387 dst_path->slots[0]); 3388 3389 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3390 3391 if ((i == (nr - 1))) 3392 last_key = ins_keys[i]; 3393 3394 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3395 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3396 dst_path->slots[0], 3397 struct btrfs_inode_item); 3398 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3399 inode, inode_only == LOG_INODE_EXISTS, 3400 logged_isize); 3401 } else { 3402 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3403 src_offset, ins_sizes[i]); 3404 } 3405 3406 /* 3407 * We set need_find_last_extent here in case we know we were 3408 * processing other items and then walk into the first extent in 3409 * the inode. If we don't hit an extent then nothing changes, 3410 * we'll do the last search the next time around. 3411 */ 3412 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3413 has_extents = true; 3414 if (first_key.objectid == (u64)-1) 3415 first_key = ins_keys[i]; 3416 } else { 3417 need_find_last_extent = false; 3418 } 3419 3420 /* take a reference on file data extents so that truncates 3421 * or deletes of this inode don't have to relog the inode 3422 * again 3423 */ 3424 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3425 !skip_csum) { 3426 int found_type; 3427 extent = btrfs_item_ptr(src, start_slot + i, 3428 struct btrfs_file_extent_item); 3429 3430 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3431 continue; 3432 3433 found_type = btrfs_file_extent_type(src, extent); 3434 if (found_type == BTRFS_FILE_EXTENT_REG) { 3435 u64 ds, dl, cs, cl; 3436 ds = btrfs_file_extent_disk_bytenr(src, 3437 extent); 3438 /* ds == 0 is a hole */ 3439 if (ds == 0) 3440 continue; 3441 3442 dl = btrfs_file_extent_disk_num_bytes(src, 3443 extent); 3444 cs = btrfs_file_extent_offset(src, extent); 3445 cl = btrfs_file_extent_num_bytes(src, 3446 extent); 3447 if (btrfs_file_extent_compression(src, 3448 extent)) { 3449 cs = 0; 3450 cl = dl; 3451 } 3452 3453 ret = btrfs_lookup_csums_range( 3454 log->fs_info->csum_root, 3455 ds + cs, ds + cs + cl - 1, 3456 &ordered_sums, 0); 3457 if (ret) { 3458 btrfs_release_path(dst_path); 3459 kfree(ins_data); 3460 return ret; 3461 } 3462 } 3463 } 3464 } 3465 3466 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3467 btrfs_release_path(dst_path); 3468 kfree(ins_data); 3469 3470 /* 3471 * we have to do this after the loop above to avoid changing the 3472 * log tree while trying to change the log tree. 
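	 * btrfs_csum_file_blocks() inserts csum items into the log tree,
	 * which can split log leaves; doing that while the copy loop still
	 * held dst_path (and its locks) on a log leaf would be modifying
	 * the tree out from under ourselves. So the sums were queued on
	 * the ordered_sums list above and are only flushed here, after
	 * dst_path has been released.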
3473 */ 3474 ret = 0; 3475 while (!list_empty(&ordered_sums)) { 3476 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3477 struct btrfs_ordered_sum, 3478 list); 3479 if (!ret) 3480 ret = btrfs_csum_file_blocks(trans, log, sums); 3481 list_del(&sums->list); 3482 kfree(sums); 3483 } 3484 3485 if (!has_extents) 3486 return ret; 3487 3488 if (need_find_last_extent && *last_extent == first_key.offset) { 3489 /* 3490 * We don't have any leafs between our current one and the one 3491 * we processed before that can have file extent items for our 3492 * inode (and have a generation number smaller than our current 3493 * transaction id). 3494 */ 3495 need_find_last_extent = false; 3496 } 3497 3498 /* 3499 * Because we use btrfs_search_forward we could skip leaves that were 3500 * not modified and then assume *last_extent is valid when it really 3501 * isn't. So back up to the previous leaf and read the end of the last 3502 * extent before we go and fill in holes. 3503 */ 3504 if (need_find_last_extent) { 3505 u64 len; 3506 3507 ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); 3508 if (ret < 0) 3509 return ret; 3510 if (ret) 3511 goto fill_holes; 3512 if (src_path->slots[0]) 3513 src_path->slots[0]--; 3514 src = src_path->nodes[0]; 3515 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3516 if (key.objectid != btrfs_ino(inode) || 3517 key.type != BTRFS_EXTENT_DATA_KEY) 3518 goto fill_holes; 3519 extent = btrfs_item_ptr(src, src_path->slots[0], 3520 struct btrfs_file_extent_item); 3521 if (btrfs_file_extent_type(src, extent) == 3522 BTRFS_FILE_EXTENT_INLINE) { 3523 len = btrfs_file_extent_inline_len(src, 3524 src_path->slots[0], 3525 extent); 3526 *last_extent = ALIGN(key.offset + len, 3527 log->sectorsize); 3528 } else { 3529 len = btrfs_file_extent_num_bytes(src, extent); 3530 *last_extent = key.offset + len; 3531 } 3532 } 3533 fill_holes: 3534 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3535 * things could have happened 3536 * 3537 * 1) A merge could have happened, so we could currently be on a leaf 3538 * that holds what we were copying in the first place. 3539 * 2) A split could have happened, and now not all of the items we want 3540 * are on the same leaf. 3541 * 3542 * So we need to adjust how we search for holes, we need to drop the 3543 * path and re-search for the first extent key we found, and then walk 3544 * forward until we hit the last one we copied. 3545 */ 3546 if (need_find_last_extent) { 3547 /* btrfs_prev_leaf could return 1 without releasing the path */ 3548 btrfs_release_path(src_path); 3549 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, 3550 src_path, 0, 0); 3551 if (ret < 0) 3552 return ret; 3553 ASSERT(ret == 0); 3554 src = src_path->nodes[0]; 3555 i = src_path->slots[0]; 3556 } else { 3557 i = start_slot; 3558 } 3559 3560 /* 3561 * Ok so here we need to go through and fill in any holes we may have 3562 * to make sure that holes are punched for those areas in case they had 3563 * extents previously. 
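 *
 * For example, if the items we copied covered [0, 4K) and [8K, 12K),
 * a file extent item with disk_bytenr 0 (a hole) is inserted for
 * [4K, 8K) so that replay cannot leave a stale extent from an older
 * transaction in that range.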
 */
	while (!done) {
		u64 offset, len;
		u64 extent_end;

		if (i >= btrfs_header_nritems(src_path->nodes[0])) {
			ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
			if (ret < 0)
				return ret;
			ASSERT(ret == 0);
			src = src_path->nodes[0];
			i = 0;
		}

		btrfs_item_key_to_cpu(src, &key, i);
		if (!btrfs_comp_cpu_keys(&key, &last_key))
			done = true;
		if (key.objectid != btrfs_ino(inode) ||
		    key.type != BTRFS_EXTENT_DATA_KEY) {
			i++;
			continue;
		}
		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(src, extent) ==
		    BTRFS_FILE_EXTENT_INLINE) {
			len = btrfs_file_extent_inline_len(src, i, extent);
			extent_end = ALIGN(key.offset + len, log->sectorsize);
		} else {
			len = btrfs_file_extent_num_bytes(src, extent);
			extent_end = key.offset + len;
		}
		i++;

		if (*last_extent == key.offset) {
			*last_extent = extent_end;
			continue;
		}
		offset = *last_extent;
		len = key.offset - *last_extent;
		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
					       offset, 0, 0, len, 0, len, 0,
					       0, 0);
		if (ret)
			break;
		*last_extent = extent_end;
	}
	/*
	 * Need to let the callers know we dropped the path so they should
	 * re-search.
	 */
	if (!ret && need_find_last_extent)
		ret = 1;
	return ret;
}

static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct extent_map *em1, *em2;

	em1 = list_entry(a, struct extent_map, list);
	em2 = list_entry(b, struct extent_map, list);

	if (em1->start < em2->start)
		return -1;
	else if (em1->start > em2->start)
		return 1;
	return 0;
}

static int wait_ordered_extents(struct btrfs_trans_handle *trans,
				struct inode *inode,
				struct btrfs_root *root,
				const struct extent_map *em,
				const struct list_head *logged_list,
				bool *ordered_io_error)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_root *log = root->log_root;
	u64 mod_start = em->mod_start;
	u64 mod_len = em->mod_len;
	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
	u64 csum_offset;
	u64 csum_len;
	LIST_HEAD(ordered_sums);
	int ret = 0;

	*ordered_io_error = false;

	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    em->block_start == EXTENT_MAP_HOLE)
		return 0;

	/*
	 * Wait for any ordered extent that covers our extent map. If it
	 * finishes without an error, first check and see if our csums are on
	 * our outstanding ordered extents.
3660 */ 3661 list_for_each_entry(ordered, logged_list, log_list) { 3662 struct btrfs_ordered_sum *sum; 3663 3664 if (!mod_len) 3665 break; 3666 3667 if (ordered->file_offset + ordered->len <= mod_start || 3668 mod_start + mod_len <= ordered->file_offset) 3669 continue; 3670 3671 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 3672 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 3673 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 3674 const u64 start = ordered->file_offset; 3675 const u64 end = ordered->file_offset + ordered->len - 1; 3676 3677 WARN_ON(ordered->inode != inode); 3678 filemap_fdatawrite_range(inode->i_mapping, start, end); 3679 } 3680 3681 wait_event(ordered->wait, 3682 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 3683 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3684 3685 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3686 /* 3687 * Clear the AS_EIO/AS_ENOSPC flags from the inode's 3688 * i_mapping flags, so that the next fsync won't get 3689 * an outdated io error too. 3690 */ 3691 btrfs_inode_check_errors(inode); 3692 *ordered_io_error = true; 3693 break; 3694 } 3695 /* 3696 * We are going to copy all the csums on this ordered extent, so 3697 * go ahead and adjust mod_start and mod_len in case this 3698 * ordered extent has already been logged. 3699 */ 3700 if (ordered->file_offset > mod_start) { 3701 if (ordered->file_offset + ordered->len >= 3702 mod_start + mod_len) 3703 mod_len = ordered->file_offset - mod_start; 3704 /* 3705 * If we have this case 3706 * 3707 * |--------- logged extent ---------| 3708 * |----- ordered extent ----| 3709 * 3710 * Just don't mess with mod_start and mod_len, we'll 3711 * just end up logging more csums than we need and it 3712 * will be ok. 3713 */ 3714 } else { 3715 if (ordered->file_offset + ordered->len < 3716 mod_start + mod_len) { 3717 mod_len = (mod_start + mod_len) - 3718 (ordered->file_offset + ordered->len); 3719 mod_start = ordered->file_offset + 3720 ordered->len; 3721 } else { 3722 mod_len = 0; 3723 } 3724 } 3725 3726 if (skip_csum) 3727 continue; 3728 3729 /* 3730 * To keep us from looping for the above case of an ordered 3731 * extent that falls inside of the logged extent. 3732 */ 3733 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3734 &ordered->flags)) 3735 continue; 3736 3737 if (ordered->csum_bytes_left) { 3738 btrfs_start_ordered_extent(inode, ordered, 0); 3739 wait_event(ordered->wait, 3740 ordered->csum_bytes_left == 0); 3741 } 3742 3743 list_for_each_entry(sum, &ordered->list, list) { 3744 ret = btrfs_csum_file_blocks(trans, log, sum); 3745 if (ret) 3746 break; 3747 } 3748 } 3749 3750 if (*ordered_io_error || !mod_len || ret || skip_csum) 3751 return ret; 3752 3753 if (em->compress_type) { 3754 csum_offset = 0; 3755 csum_len = max(em->block_len, em->orig_block_len); 3756 } else { 3757 csum_offset = mod_start - em->start; 3758 csum_len = mod_len; 3759 } 3760 3761 /* block start is already adjusted for the file extent offset. 
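	 * The csums for the logged range are therefore looked up in
	 * [em->block_start + csum_offset,
	 *  em->block_start + csum_offset + csum_len - 1] just below.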
*/ 3762 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3763 em->block_start + csum_offset, 3764 em->block_start + csum_offset + 3765 csum_len - 1, &ordered_sums, 0); 3766 if (ret) 3767 return ret; 3768 3769 while (!list_empty(&ordered_sums)) { 3770 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3771 struct btrfs_ordered_sum, 3772 list); 3773 if (!ret) 3774 ret = btrfs_csum_file_blocks(trans, log, sums); 3775 list_del(&sums->list); 3776 kfree(sums); 3777 } 3778 3779 return ret; 3780 } 3781 3782 static int log_one_extent(struct btrfs_trans_handle *trans, 3783 struct inode *inode, struct btrfs_root *root, 3784 const struct extent_map *em, 3785 struct btrfs_path *path, 3786 const struct list_head *logged_list, 3787 struct btrfs_log_ctx *ctx) 3788 { 3789 struct btrfs_root *log = root->log_root; 3790 struct btrfs_file_extent_item *fi; 3791 struct extent_buffer *leaf; 3792 struct btrfs_map_token token; 3793 struct btrfs_key key; 3794 u64 extent_offset = em->start - em->orig_start; 3795 u64 block_len; 3796 int ret; 3797 int extent_inserted = 0; 3798 bool ordered_io_err = false; 3799 3800 ret = wait_ordered_extents(trans, inode, root, em, logged_list, 3801 &ordered_io_err); 3802 if (ret) 3803 return ret; 3804 3805 if (ordered_io_err) { 3806 ctx->io_err = -EIO; 3807 return 0; 3808 } 3809 3810 btrfs_init_map_token(&token); 3811 3812 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 3813 em->start + em->len, NULL, 0, 1, 3814 sizeof(*fi), &extent_inserted); 3815 if (ret) 3816 return ret; 3817 3818 if (!extent_inserted) { 3819 key.objectid = btrfs_ino(inode); 3820 key.type = BTRFS_EXTENT_DATA_KEY; 3821 key.offset = em->start; 3822 3823 ret = btrfs_insert_empty_item(trans, log, path, &key, 3824 sizeof(*fi)); 3825 if (ret) 3826 return ret; 3827 } 3828 leaf = path->nodes[0]; 3829 fi = btrfs_item_ptr(leaf, path->slots[0], 3830 struct btrfs_file_extent_item); 3831 3832 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 3833 &token); 3834 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3835 btrfs_set_token_file_extent_type(leaf, fi, 3836 BTRFS_FILE_EXTENT_PREALLOC, 3837 &token); 3838 else 3839 btrfs_set_token_file_extent_type(leaf, fi, 3840 BTRFS_FILE_EXTENT_REG, 3841 &token); 3842 3843 block_len = max(em->block_len, em->orig_block_len); 3844 if (em->compress_type != BTRFS_COMPRESS_NONE) { 3845 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3846 em->block_start, 3847 &token); 3848 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3849 &token); 3850 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 3851 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3852 em->block_start - 3853 extent_offset, &token); 3854 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3855 &token); 3856 } else { 3857 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 3858 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 3859 &token); 3860 } 3861 3862 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 3863 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 3864 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 3865 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 3866 &token); 3867 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 3868 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 3869 btrfs_mark_buffer_dirty(leaf); 3870 3871 btrfs_release_path(path); 3872 3873 return ret; 3874 } 3875 3876 static int btrfs_log_changed_extents(struct 
btrfs_trans_handle *trans, 3877 struct btrfs_root *root, 3878 struct inode *inode, 3879 struct btrfs_path *path, 3880 struct list_head *logged_list, 3881 struct btrfs_log_ctx *ctx) 3882 { 3883 struct extent_map *em, *n; 3884 struct list_head extents; 3885 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3886 u64 test_gen; 3887 int ret = 0; 3888 int num = 0; 3889 3890 INIT_LIST_HEAD(&extents); 3891 3892 write_lock(&tree->lock); 3893 test_gen = root->fs_info->last_trans_committed; 3894 3895 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 3896 list_del_init(&em->list); 3897 3898 /* 3899 * Just an arbitrary number, this can be really CPU intensive 3900 * once we start getting a lot of extents, and really once we 3901 * have a bunch of extents we just want to commit since it will 3902 * be faster. 3903 */ 3904 if (++num > 32768) { 3905 list_del_init(&tree->modified_extents); 3906 ret = -EFBIG; 3907 goto process; 3908 } 3909 3910 if (em->generation <= test_gen) 3911 continue; 3912 /* Need a ref to keep it from getting evicted from cache */ 3913 atomic_inc(&em->refs); 3914 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 3915 list_add_tail(&em->list, &extents); 3916 num++; 3917 } 3918 3919 list_sort(NULL, &extents, extent_cmp); 3920 3921 process: 3922 while (!list_empty(&extents)) { 3923 em = list_entry(extents.next, struct extent_map, list); 3924 3925 list_del_init(&em->list); 3926 3927 /* 3928 * If we had an error we just need to delete everybody from our 3929 * private list. 3930 */ 3931 if (ret) { 3932 clear_em_logging(tree, em); 3933 free_extent_map(em); 3934 continue; 3935 } 3936 3937 write_unlock(&tree->lock); 3938 3939 ret = log_one_extent(trans, inode, root, em, path, logged_list, 3940 ctx); 3941 write_lock(&tree->lock); 3942 clear_em_logging(tree, em); 3943 free_extent_map(em); 3944 } 3945 WARN_ON(!list_empty(&extents)); 3946 write_unlock(&tree->lock); 3947 3948 btrfs_release_path(path); 3949 return ret; 3950 } 3951 3952 static int logged_inode_size(struct btrfs_root *log, struct inode *inode, 3953 struct btrfs_path *path, u64 *size_ret) 3954 { 3955 struct btrfs_key key; 3956 int ret; 3957 3958 key.objectid = btrfs_ino(inode); 3959 key.type = BTRFS_INODE_ITEM_KEY; 3960 key.offset = 0; 3961 3962 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 3963 if (ret < 0) { 3964 return ret; 3965 } else if (ret > 0) { 3966 *size_ret = i_size_read(inode); 3967 } else { 3968 struct btrfs_inode_item *item; 3969 3970 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3971 struct btrfs_inode_item); 3972 *size_ret = btrfs_inode_size(path->nodes[0], item); 3973 } 3974 3975 btrfs_release_path(path); 3976 return 0; 3977 } 3978 3979 /* log a single inode in the tree log. 3980 * At least one parent directory for this inode must exist in the tree 3981 * or be logged already. 3982 * 3983 * Any items from this inode changed by the current transaction are copied 3984 * to the log tree. An extra reference is taken on any extents in this 3985 * file, allowing us to avoid a whole pile of corner cases around logging 3986 * blocks that have been removed from the tree. 3987 * 3988 * See LOG_INODE_ALL and related defines for a description of what inode_only 3989 * does. 3990 * 3991 * This handles both files and directories. 
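 *
 * A rough sketch of the caller side (hedged, since the fsync entry
 * points live elsewhere in this file): an fsync of a regular file logs
 * the inode itself with inode_only == LOG_INODE_ALL, while ancestor
 * directories only need inode_only == LOG_INODE_EXISTS, so that replay
 * can recreate them without copying their full contents.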
3992 */ 3993 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 3994 struct btrfs_root *root, struct inode *inode, 3995 int inode_only, 3996 const loff_t start, 3997 const loff_t end, 3998 struct btrfs_log_ctx *ctx) 3999 { 4000 struct btrfs_path *path; 4001 struct btrfs_path *dst_path; 4002 struct btrfs_key min_key; 4003 struct btrfs_key max_key; 4004 struct btrfs_root *log = root->log_root; 4005 struct extent_buffer *src = NULL; 4006 LIST_HEAD(logged_list); 4007 u64 last_extent = 0; 4008 int err = 0; 4009 int ret; 4010 int nritems; 4011 int ins_start_slot = 0; 4012 int ins_nr; 4013 bool fast_search = false; 4014 u64 ino = btrfs_ino(inode); 4015 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4016 u64 logged_isize = 0; 4017 4018 path = btrfs_alloc_path(); 4019 if (!path) 4020 return -ENOMEM; 4021 dst_path = btrfs_alloc_path(); 4022 if (!dst_path) { 4023 btrfs_free_path(path); 4024 return -ENOMEM; 4025 } 4026 4027 min_key.objectid = ino; 4028 min_key.type = BTRFS_INODE_ITEM_KEY; 4029 min_key.offset = 0; 4030 4031 max_key.objectid = ino; 4032 4033 4034 /* today the code can only do partial logging of directories */ 4035 if (S_ISDIR(inode->i_mode) || 4036 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4037 &BTRFS_I(inode)->runtime_flags) && 4038 inode_only == LOG_INODE_EXISTS)) 4039 max_key.type = BTRFS_XATTR_ITEM_KEY; 4040 else 4041 max_key.type = (u8)-1; 4042 max_key.offset = (u64)-1; 4043 4044 /* 4045 * Only run delayed items if we are a dir or a new file. 4046 * Otherwise commit the delayed inode only, which is needed in 4047 * order for the log replay code to mark inodes for link count 4048 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 4049 */ 4050 if (S_ISDIR(inode->i_mode) || 4051 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) 4052 ret = btrfs_commit_inode_delayed_items(trans, inode); 4053 else 4054 ret = btrfs_commit_inode_delayed_inode(inode); 4055 4056 if (ret) { 4057 btrfs_free_path(path); 4058 btrfs_free_path(dst_path); 4059 return ret; 4060 } 4061 4062 mutex_lock(&BTRFS_I(inode)->log_mutex); 4063 4064 btrfs_get_logged_extents(inode, &logged_list, start, end); 4065 4066 /* 4067 * a brute force approach to making sure we get the most uptodate 4068 * copies of everything. 4069 */ 4070 if (S_ISDIR(inode->i_mode)) { 4071 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4072 4073 if (inode_only == LOG_INODE_EXISTS) { 4074 max_key_type = BTRFS_INODE_EXTREF_KEY; 4075 max_key.type = max_key_type; 4076 } 4077 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4078 } else { 4079 if (inode_only == LOG_INODE_EXISTS) { 4080 /* 4081 * Make sure the new inode item we write to the log has 4082 * the same isize as the current one (if it exists). 4083 * This is necessary to prevent data loss after log 4084 * replay, and also to prevent doing a wrong expanding 4085 * truncate - for e.g. create file, write 4K into offset 4086 * 0, fsync, write 4K into offset 4096, add hard link, 4087 * fsync some other file (to sync log), power fail - if 4088 * we use the inode's current i_size, after log replay 4089 * we get a 8Kb file, with the last 4Kb extent as a hole 4090 * (zeroes), as if an expanding truncate happened, 4091 * instead of getting a file of 4Kb only. 
			 */
			err = logged_inode_size(log, inode, path,
						&logged_isize);
			if (err)
				goto out_unlock;
		}
		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			     &BTRFS_I(inode)->runtime_flags)) {
			if (inode_only == LOG_INODE_EXISTS) {
				max_key.type = BTRFS_INODE_EXTREF_KEY;
				ret = drop_objectid_items(trans, log, path, ino,
							  max_key.type);
			} else {
				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
					  &BTRFS_I(inode)->runtime_flags);
				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
					  &BTRFS_I(inode)->runtime_flags);
				ret = btrfs_truncate_inode_items(trans, log,
								 inode, 0, 0);
			}
		} else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
				    &BTRFS_I(inode)->runtime_flags) ||
			   inode_only == LOG_INODE_EXISTS) {
			if (inode_only == LOG_INODE_ALL) {
				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
					  &BTRFS_I(inode)->runtime_flags);
				fast_search = true;
				max_key.type = BTRFS_XATTR_ITEM_KEY;
			} else {
				max_key.type = BTRFS_INODE_EXTREF_KEY;
			}
			ret = drop_objectid_items(trans, log, path, ino,
						  max_key.type);
		} else {
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			ret = log_inode_item(trans, log, dst_path, inode);
			if (ret) {
				err = ret;
				goto out_unlock;
			}
			goto log_extents;
		}
	}
	if (ret) {
		err = ret;
		goto out_unlock;
	}

	while (1) {
		ins_nr = 0;
		ret = btrfs_search_forward(root, &min_key,
					   path, trans->transid);
		if (ret != 0)
			break;
again:
		/* note, ins_nr might be > 0 here, cleanup outside the loop */
		if (min_key.objectid != ino)
			break;
		if (min_key.type > max_key.type)
			break;

		src = path->nodes[0];
		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
			ins_nr++;
			goto next_slot;
		} else if (!ins_nr) {
			ins_start_slot = path->slots[0];
			ins_nr = 1;
			goto next_slot;
		}

		ret = copy_items(trans, inode, dst_path, path, &last_extent,
				 ins_start_slot, ins_nr, inode_only,
				 logged_isize);
		if (ret < 0) {
			err = ret;
			goto out_unlock;
		}
		if (ret) {
			ins_nr = 0;
			btrfs_release_path(path);
			continue;
		}
		ins_nr = 1;
		ins_start_slot = path->slots[0];
next_slot:

		nritems = btrfs_header_nritems(path->nodes[0]);
		path->slots[0]++;
		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
					      path->slots[0]);
			goto again;
		}
		if (ins_nr) {
			ret = copy_items(trans, inode, dst_path, path,
					 &last_extent, ins_start_slot,
					 ins_nr, inode_only, logged_isize);
			if (ret < 0) {
				err = ret;
				goto out_unlock;
			}
			ret = 0;
			ins_nr = 0;
		}
		btrfs_release_path(path);

		if (min_key.offset < (u64)-1) {
			min_key.offset++;
		} else if (min_key.type < max_key.type) {
			min_key.type++;
			min_key.offset = 0;
		} else {
			break;
		}
	}
	if (ins_nr) {
		ret = copy_items(trans, inode, dst_path, path, &last_extent,
				 ins_start_slot, ins_nr, inode_only,
				 logged_isize);
		if (ret < 0) {
			err = ret;
			goto out_unlock;
		}
		ret = 0;
		ins_nr = 0;
	}

log_extents:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);
	if (fast_search) {
		/*
		 * Some ordered extents started by fsync might have completed
		 * before we collected the ordered extents in logged_list,
		 * which means they're gone, not in our logged_list nor in
		 * the inode's ordered tree. We want the application/user
		 * space to know an error happened while attempting to persist
		 * file data so that it can take proper action. If such an
		 * error happened, we leave without writing to the log tree,
		 * and the fsync must report the file data write error and not
		 * commit the current transaction.
		 */
		err = btrfs_inode_check_errors(inode);
		if (err) {
			ctx->io_err = err;
			goto out_unlock;
		}
		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
						&logged_list, ctx);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	} else if (inode_only == LOG_INODE_ALL) {
		struct extent_map *em, *n;

		write_lock(&em_tree->lock);
		/*
		 * We can't just remove every em if we're called for a ranged
		 * fsync - that is, one that doesn't cover the whole possible
		 * file range (0 to LLONG_MAX). This is because we can have
		 * em's that fall outside the range we're logging and therefore
		 * their ordered operations haven't completed yet
		 * (btrfs_finish_ordered_io() not invoked yet). This means we
		 * didn't get their respective file extent item in the fs/subvol
		 * tree yet, and need to let the next fast fsync (one which
		 * consults the list of modified extent maps) find the em so
		 * that it logs a matching file extent item and waits for the
		 * respective ordered operation to complete (if it's still
		 * running).
		 *
		 * Removing every em outside the range we're logging would make
		 * the next fast fsync not log their matching file extent items,
		 * therefore making us lose data after a log replay.
		 */
		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
					 list) {
			const u64 mod_end = em->mod_start + em->mod_len - 1;

			if (em->mod_start >= start && mod_end <= end)
				list_del_init(&em->list);
		}
		write_unlock(&em_tree->lock);
	}

	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
		ret = log_directory_changes(trans, root, inode, path, dst_path);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	}

	BTRFS_I(inode)->logged_trans = trans->transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
	if (unlikely(err))
		btrfs_put_logged_extents(&logged_list);
	else
		btrfs_submit_logged_extents(&logged_list, log);
	mutex_unlock(&BTRFS_I(inode)->log_mutex);

	btrfs_free_path(path);
	btrfs_free_path(dst_path);
	return err;
}

/*
 * follow the dentry parent pointers up the chain and see if any
 * of the directories along the way require a full commit before they
 * can be logged. Returns zero if nothing special needs to be done or
 * 1 if a full commit is required.
 */
static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
					       struct inode *inode,
					       struct dentry *parent,
					       struct super_block *sb,
					       u64 last_committed)
{
	int ret = 0;
	struct btrfs_root *root;
	struct dentry *old_parent = NULL;
	struct inode *orig_inode = inode;

	/*
	 * for regular files, if the inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
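	 *
	 * (Editor's illustration) e.g. a file renamed in the current
	 * transaction has last_unlink_trans == trans->transid, which is
	 * newer than last_committed (it is set by btrfs_record_unlink_dir()
	 * and btrfs_log_new_name() later in this file), so the early-out
	 * below is skipped and we walk up the parent chain.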
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
		goto out;

	if (!S_ISDIR(inode->i_mode)) {
		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			goto out;
		inode = parent->d_inode;
	}

	while (1) {
		/*
		 * If we are logging a directory then we start with our inode,
		 * not our parent's inode, so we need to skip setting the
		 * logged_trans so that further down in the log code we don't
		 * think this inode has already been logged.
		 */
		if (inode != orig_inode)
			BTRFS_I(inode)->logged_trans = trans->transid;
		smp_mb();

		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
			root = BTRFS_I(inode)->root;

			/*
			 * make sure any commits to the log are forced
			 * to be full commits
			 */
			btrfs_set_log_full_commit(root->fs_info, trans);
			ret = 1;
			break;
		}

		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			break;

		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
		inode = parent->d_inode;
	}
	dput(old_parent);
out:
	return ret;
}

/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log. Only a minimal amount (the
 * inode item and backrefs) is logged for any parent directories that are
 * older than the last committed transaction.
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root, struct inode *inode,
				  struct dentry *parent,
				  const loff_t start,
				  const loff_t end,
				  int exists_only,
				  struct btrfs_log_ctx *ctx)
{
	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
	struct super_block *sb;
	struct dentry *old_parent = NULL;
	int ret = 0;
	u64 last_committed = root->fs_info->last_trans_committed;
	const struct dentry * const first_parent = parent;
	const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
				 last_committed);

	sb = inode->i_sb;

	if (btrfs_test_opt(root, NOTREELOG)) {
		ret = 1;
		goto end_no_trans;
	}

	/*
	 * If the previous transaction commit didn't complete, we have to
	 * do the full commit by ourselves.
	 */
	if (root->fs_info->last_trans_log_full_commit >
	    root->fs_info->last_trans_committed) {
		ret = 1;
		goto end_no_trans;
	}

	if (root != BTRFS_I(inode)->root ||
	    btrfs_root_refs(&root->root_item) == 0) {
		ret = 1;
		goto end_no_trans;
	}

	ret = check_parent_dirs_for_sync(trans, inode, parent,
					 sb, last_committed);
	if (ret)
		goto end_no_trans;

	if (btrfs_inode_in_log(inode, trans->transid)) {
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
	if (ret)
		goto end_trans;

	/*
	 * for regular files, if the inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
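	 *
	 * (Editor's illustration) e.g. a file created and fsynced within the
	 * current transaction has generation > last_committed, so the early
	 * return below is skipped and the parent directories are logged as
	 * well.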
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
		ret = 0;
		goto end_trans;
	}

	while (1) {
		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			break;

		inode = parent->d_inode;
		if (root != BTRFS_I(inode)->root)
			break;

		/*
		 * On unlink we must make sure our immediate parent directory
		 * inode is fully logged. This is to prevent leaving dangling
		 * directory index entries and an incorrect i_size in the
		 * directory inode. Not doing so can result in a directory
		 * being impossible to delete after log replay (rmdir will
		 * always fail with error -ENOTEMPTY).
		 */
		if (did_unlink && parent == first_parent)
			inode_only = LOG_INODE_ALL;
		else
			inode_only = LOG_INODE_EXISTS;

		if (BTRFS_I(inode)->generation >
		    root->fs_info->last_trans_committed ||
		    inode_only == LOG_INODE_ALL) {
			ret = btrfs_log_inode(trans, root, inode, inode_only,
					      0, LLONG_MAX, ctx);
			if (ret)
				goto end_trans;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	ret = 0;
end_trans:
	dput(old_parent);
	if (ret < 0) {
		btrfs_set_log_full_commit(root->fs_info, trans);
		ret = 1;
	}

	if (ret)
		btrfs_remove_log_ctx(root, ctx);
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}

/*
 * it is not safe to log a dentry if the chunk root has added new
 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
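 *
 * Editor's sketch of the expected calling pattern; the actual caller is
 * btrfs_sync_file() in fs/btrfs/file.c and the details below are assumed:
 *
 *   ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
 *   if (ret == 0)                       sync the log (btrfs_sync_log())
 *   else if (ret != BTRFS_NO_LOG_SYNC)  fall back to a full
 *                                       btrfs_commit_transaction()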
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
			  const loff_t start,
			  const loff_t end,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
				     start, end, 0, ctx);
	dput(parent);

	return ret;
}

/*
 * should be called during mount to replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key tmp_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = 0,	/* LOG_WALK_PIN_ONLY */
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	fs_info->log_root_recovering = 1;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_error(fs_info, ret, "Failed to pin buffers while "
			    "recovering log root tree.");
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_error(fs_info, ret,
				    "Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_fs_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_error(fs_info, ret,
				    "Couldn't read tree log root.");
			goto error;
		}

		tmp_key.objectid = found_key.offset;
		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
		tmp_key.offset = (u64)-1;

		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);
			free_extent_buffer(log->node);
			free_extent_buffer(log->commit_root);
			kfree(log);
			btrfs_error(fs_info, ret, "Couldn't read target root "
				    "for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		key.offset = found_key.offset - 1;
		wc.replay_dest->log_root = NULL;
		free_extent_buffer(log->node);
		free_extent_buffer(log->commit_root);
		kfree(log);

		if (ret)
			goto error;

		if (found_key.offset == 0)
			break;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
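		/*
		 * Editor's note: the stages advance as LOG_WALK_PIN_ONLY ->
		 * LOG_WALK_REPLAY_INODES -> LOG_WALK_REPLAY_DIR_INDEX ->
		 * LOG_WALK_REPLAY_ALL, and the "goto again" here and below
		 * re-runs the whole log-root loop once per stage.
		 */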
	}
	/* steps three and four: replay dir index items, then everything else */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* final step: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans, fs_info->tree_root);
	if (ret)
		return ret;

	free_extent_buffer(log_root_tree->node);
	log_root_tree->log_root = NULL;
	fs_info->log_root_recovering = 0;
	kfree(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans, fs_info->tree_root);
	btrfs_free_path(path);
	return ret;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct inode *dir, struct inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this directory was already logged, any new
	 * names for this file/dir will get recorded
	 */
	smp_mb();
	if (BTRFS_I(dir)->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (BTRFS_I(inode)->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	BTRFS_I(dir)->last_unlink_trans = trans->transid;
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 *
 * It will return zero if all goes well, and it will return 1 if a
 * full transaction commit is required.
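 *
 * (Editor's illustration) e.g. a rename done in the same transaction as a
 * prior fsync of the file: the inode's logged_trans (or the old dir's) is
 * newer than the last committed transaction, so the early return below is
 * skipped and the inode is re-logged under its new name via
 * btrfs_log_inode_parent().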
 */
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		       struct inode *inode, struct inode *old_dir,
		       struct dentry *parent)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming
	 * it from hasn't been logged, we don't need to log it
	 */
	if (BTRFS_I(inode)->logged_trans <=
	    root->fs_info->last_trans_committed &&
	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
		    root->fs_info->last_trans_committed))
		return 0;

	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
				      LLONG_MAX, 1, NULL);
}
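/*
 * Editor's closing illustration (a sketch, not part of the original file):
 * the crash-consistency behavior btrfs_log_new_name() provides for a
 * previously fsynced file that gets renamed:
 *
 *   creat foo; fsync(foo)    the inode is in the log for this transaction
 *   rename(foo, bar)         btrfs_log_new_name() re-logs the inode under
 *                            its new name
 *   fsync(bar)               syncs the log
 *   <crash>
 *
 * After log replay the file must be reachable as "bar".
 */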