/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "hash.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics (directory index items get their own
 * replay pass just before that last stage)
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it
 * is using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
 */
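
/*
 * An illustrative sketch (not a quote of any one call chain) of the fast
 * path an fsync takes through this machinery, assuming nothing forces a
 * fall back to a full transaction commit:
 *
 *	start_log_trans()	join or create the per-subvolume log tree
 *	btrfs_log_inode()	copy the changed items into the log tree
 *	btrfs_sync_log()	write the log tree and wait on the IO
 *	btrfs_end_log_trans()	drop our writer count, wake any syncers
 *
 * Only when something unloggable happens does the caller give up and do
 * a full btrfs_commit_transaction() instead.
 */
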
/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	int index;
	int ret;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		if (btrfs_need_log_full_commit(root->fs_info, trans)) {
			ret = -EAGAIN;
			goto out;
		}
		if (!root->log_start_pid) {
			root->log_start_pid = current->pid;
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}

		atomic_inc(&root->log_batch);
		atomic_inc(&root->log_writers);
		if (ctx) {
			index = root->log_transid % 2;
			list_add_tail(&ctx->list, &root->log_ctxs[index]);
			ctx->log_transid = root->log_transid;
		}
		mutex_unlock(&root->log_mutex);
		return 0;
	}

	ret = 0;
	mutex_lock(&root->fs_info->tree_log_mutex);
	if (!root->fs_info->log_root_tree)
		ret = btrfs_init_log_root_tree(trans, root->fs_info);
	mutex_unlock(&root->fs_info->tree_log_mutex);
	if (ret)
		goto out;

	if (!root->log_root) {
		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;
	}
	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
	root->log_start_pid = current->pid;
	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx) {
		index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}
out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	/* this can't fail, so don't return a stale -ENOENT */
	int ret = 0;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		smp_mb();
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}

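/*
 * A minimal usage sketch (illustrative, not a quote of any caller): code
 * that must keep a log sync from running while it rearranges directory
 * items brackets the work with the pair above:
 *
 *	btrfs_pin_log_trans(root);
 *	... rename/unlink work that must not race with a log sync ...
 *	btrfs_end_log_trans(root);
 *
 * The rename path is the classic user of this pattern.
 */
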
/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
						      eb->start, eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(log, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

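/*
 * How the walk_control flags combine in practice (a sketch inferred from
 * the walkers in this file, not an exhaustive list): a log sync walks
 * with write=1 and then wait=1 to flush and settle dirty log blocks, log
 * replay walks with pin=1 so the extents backing the log survive until
 * replay is finished, and the post-commit free walk runs with free=1 to
 * give those blocks back.
 */
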
/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(root, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(root, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0)
			goto no_copy;

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

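/*
 * Note on the NULL convention above: read_one_inode() folds every failure
 * into NULL, and each caller picks the errno it cares about -- the extent
 * and name replay paths below treat a missing inode as -EIO, while
 * add_inode_ref() treats a missing parent directory as -ENOENT and lets
 * the link count fixup code repair things later.
 */
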
/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size, root->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_extent(root, ins.objectid,
						  ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset, 0);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root, root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						root->fs_info->csum_root,
						sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

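/*
 * Worked example for the function above (illustrative): replaying a
 * logged REG extent whose disk_bytenr already sits in the extent tree
 * takes the btrfs_inc_extent_ref() branch and only adds a backref; a
 * bytenr the extent tree has never seen goes through
 * btrfs_alloc_logged_file_extent() instead, and either way the data
 * checksums are copied from the log's csum items into the real csum
 * tree before the replay of this item is considered done.
 */
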
/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, root);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

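/*
 * In other words (a summary, not new behaviour): inode_in_dir() only
 * reports a match when both the DIR_INDEX item at 'index' and the
 * DIR_ITEM for 'name' exist and point at 'objectid'; any mismatch or
 * failed lookup reads as "not in the directory", which makes the
 * caller redo the link during replay.
 */
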
/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct inode *dir, struct inode *inode,
				  struct extent_buffer *eb,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				inc_nlink(inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir,
							 inode, victim_name,
							 victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, root);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			/* parenthesize the pointer math: the byte offset must
			 * be added before the cast, not scaled after it
			 */
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					inc_nlink(inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
								 victim_parent,
								 inode,
								 victim_name,
								 victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans, root);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
			if (ret)
				return ret;
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

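/*
 * On-disk layout the two helpers above parse (a sketch):
 *
 *	INODE_REF item:    [struct btrfs_inode_ref][name] repeated
 *	INODE_EXTREF item: [struct btrfs_inode_extref][name] repeated
 *
 * several back references can sit back to back inside one item, which
 * is why add_inode_ref() below advances ref_ptr by the struct size plus
 * the name length on every iteration.
 */
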
/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
				  ref_index, name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      dir, inode, eb,
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/* insert our name */
			ret = btrfs_add_link(trans, dir, inode, name, namelen,
					     0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 offset)
{
	int ret;
	ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
			      offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
	if (ret > 0)
		ret = btrfs_insert_orphan_item(trans, root, offset);
	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		/* restart at the first extref of each item we find */
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0)
		return ret;

	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

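/*
 * Example of what the two counters above add up (illustrative): a file
 * with two hard links in the same directory stores two names in a
 * single INODE_REF item, so count_inode_refs() reports 2; names that
 * overflowed into hashed INODE_EXTREF items are tallied separately by
 * count_inode_extrefs(), and fixup_inode_link_count() below sums both.
 */
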
/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, inode, path);
	if (ret == -ENOENT)
		ret = 0;

	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

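/*
 * The scan above is driven by ORPHAN_ITEM keys filed under
 * BTRFS_TREE_LOG_FIXUP_OBJECTID; link_to_fixup_dir() below is what
 * plants those keys during replay, one per inode whose link count may
 * have been left stale by the log.
 */
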
/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

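/*
 * Concrete consequence (illustrative): after "touch dir/a; fsync(dir)",
 * replay only recreates dir's entry for 'a' if inode 'a' itself made it
 * into the log; the -ENOENT that insert_one_name() returns for a
 * missing inode is what gives directory fsyncs their names-only
 * semantics.
 */
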
/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	return ret;

insert:
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
			      name, name_len, log_type, &log_key);
	if (ret && ret != -ENOENT)
		goto out;
	update_size = false;
	ret = 0;
	goto out;
}

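/*
 * Decision table for replay_one_name() above (a summary sketch):
 *
 *	no entry in the subvolume	-> insert (DIR_INDEX keys only)
 *	entry matches key and type	-> nothing to do
 *	entry conflicts, inode exists	-> drop the old entry, then insert
 *	entry conflicts, inode missing	-> keep the existing entry
 */
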
/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret)
			return ret;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	return 0;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

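/*
 * Example of the range logic above (illustrative): if the log holds a
 * dir_log item covering offsets [0, 100] of a directory, and the
 * subvolume still has an entry at offset 42 that the log does not,
 * then that entry was deleted before the fsync and deletion replay
 * must unlink it; an entry at offset 200 is outside every logged
 * range, so the log says nothing about it and it is left alone.
 */
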
/*
 * this looks for a given directory item in the log.  If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{
	int ret;
	struct extent_buffer *eb;
	int slot;
	u32 item_size;
	struct btrfs_dir_item *di;
	struct btrfs_dir_item *log_di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	char *name;
	struct inode *inode;
	struct btrfs_key location;

again:
	eb = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size_nr(eb, slot);
	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di)) {
			ret = -EIO;
			goto out;
		}

		name_len = btrfs_dir_name_len(eb, di);
		name = kmalloc(name_len, GFP_NOFS);
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		read_extent_buffer(eb, name, (unsigned long)(di + 1),
				   name_len);
		log_di = NULL;
		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
			log_di = btrfs_lookup_dir_item(trans, log, log_path,
						       dir_key->objectid,
						       name, name_len, 0);
		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
			log_di = btrfs_lookup_dir_index_item(trans, log,
							     log_path,
							     dir_key->objectid,
							     dir_key->offset,
							     name, name_len, 0);
		}
		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
			btrfs_dir_item_key_to_cpu(eb, di, &location);
			btrfs_release_path(path);
			btrfs_release_path(log_path);
			inode = read_one_inode(root, location.objectid);
			if (!inode) {
				kfree(name);
				return -EIO;
			}

			ret = link_to_fixup_dir(trans, root,
						path, location.objectid);
			if (ret) {
				kfree(name);
				iput(inode);
				goto out;
			}

			inc_nlink(inode);
			ret = btrfs_unlink_inode(trans, root, dir, inode,
						 name, name_len);
			if (!ret)
				ret = btrfs_run_delayed_items(trans, root);
			kfree(name);
			iput(inode);
			if (ret)
				goto out;

			/* there might still be more names under this key
			 * check and repeat if required
			 */
			ret = btrfs_search_slot(NULL, root, dir_key, path,
						0, 0);
			if (ret == 0)
				goto again;
			ret = 0;
			goto out;
		} else if (IS_ERR(log_di)) {
			kfree(name);
			return PTR_ERR(log_di);
		}
		btrfs_release_path(log_path);
		kfree(name);

		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	btrfs_release_path(log_path);
	return ret;
}

/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all)
{
	u64 range_start;
	u64 range_end;
	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_ITEM_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}
again:
	range_start = 0;
	range_end = 0;
	while (1) {
		if (del_all)
			range_end = (u64)-1;
		else {
			ret = find_dir_range(log, path, dirid, key_type,
					     &range_start, &range_end);
			if (ret != 0)
				break;
		}

		dir_key.offset = range_start;
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type)
				goto next_type;

			if (found_key.offset > range_end)
				break;

			ret = check_item_in_log(trans, root, log, path,
						log_path, dir,
						&found_key);
			if (ret)
				goto out;
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}

next_type:
	ret = 0;
	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
		key_type = BTRFS_DIR_LOG_INDEX_KEY;
		dir_key.type = BTRFS_DIR_INDEX_KEY;
		btrfs_release_path(path);
		goto again;
	}
out:
	btrfs_release_path(path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}

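/*
 * Note the two passes made above (a summary): the first walks
 * BTRFS_DIR_LOG_ITEM_KEY ranges against the directory's DIR_ITEM keys,
 * then key_type flips and the same walk runs for DIR_LOG_INDEX against
 * DIR_INDEX keys, since every directory entry lives in both key spaces
 * and either copy can be stale.
 */
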
2022 */ 2023 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2024 struct walk_control *wc, u64 gen) 2025 { 2026 int nritems; 2027 struct btrfs_path *path; 2028 struct btrfs_root *root = wc->replay_dest; 2029 struct btrfs_key key; 2030 int level; 2031 int i; 2032 int ret; 2033 2034 ret = btrfs_read_buffer(eb, gen); 2035 if (ret) 2036 return ret; 2037 2038 level = btrfs_header_level(eb); 2039 2040 if (level != 0) 2041 return 0; 2042 2043 path = btrfs_alloc_path(); 2044 if (!path) 2045 return -ENOMEM; 2046 2047 nritems = btrfs_header_nritems(eb); 2048 for (i = 0; i < nritems; i++) { 2049 btrfs_item_key_to_cpu(eb, &key, i); 2050 2051 /* inode keys are done during the first stage */ 2052 if (key.type == BTRFS_INODE_ITEM_KEY && 2053 wc->stage == LOG_WALK_REPLAY_INODES) { 2054 struct btrfs_inode_item *inode_item; 2055 u32 mode; 2056 2057 inode_item = btrfs_item_ptr(eb, i, 2058 struct btrfs_inode_item); 2059 mode = btrfs_inode_mode(eb, inode_item); 2060 if (S_ISDIR(mode)) { 2061 ret = replay_dir_deletes(wc->trans, 2062 root, log, path, key.objectid, 0); 2063 if (ret) 2064 break; 2065 } 2066 ret = overwrite_item(wc->trans, root, path, 2067 eb, i, &key); 2068 if (ret) 2069 break; 2070 2071 /* for regular files, make sure the corresponding 2072 * orphan item exists. extents past the new EOF 2073 * will be truncated later by orphan cleanup. 2074 */ 2075 if (S_ISREG(mode)) { 2076 ret = insert_orphan_item(wc->trans, root, 2077 key.objectid); 2078 if (ret) 2079 break; 2080 } 2081 2082 ret = link_to_fixup_dir(wc->trans, root, 2083 path, key.objectid); 2084 if (ret) 2085 break; 2086 } 2087 2088 if (key.type == BTRFS_DIR_INDEX_KEY && 2089 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2090 ret = replay_one_dir_item(wc->trans, root, path, 2091 eb, i, &key); 2092 if (ret) 2093 break; 2094 } 2095 2096 if (wc->stage < LOG_WALK_REPLAY_ALL) 2097 continue; 2098 2099 /* these keys are simply copied */ 2100 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2101 ret = overwrite_item(wc->trans, root, path, 2102 eb, i, &key); 2103 if (ret) 2104 break; 2105 } else if (key.type == BTRFS_INODE_REF_KEY || 2106 key.type == BTRFS_INODE_EXTREF_KEY) { 2107 ret = add_inode_ref(wc->trans, root, log, path, 2108 eb, i, &key); 2109 if (ret && ret != -ENOENT) 2110 break; 2111 ret = 0; 2112 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2113 ret = replay_one_extent(wc->trans, root, path, 2114 eb, i, &key); 2115 if (ret) 2116 break; 2117 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2118 ret = replay_one_dir_item(wc->trans, root, path, 2119 eb, i, &key); 2120 if (ret) 2121 break; 2122 } 2123 } 2124 btrfs_free_path(path); 2125 return ret; 2126 } 2127 2128 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2129 struct btrfs_root *root, 2130 struct btrfs_path *path, int *level, 2131 struct walk_control *wc) 2132 { 2133 u64 root_owner; 2134 u64 bytenr; 2135 u64 ptr_gen; 2136 struct extent_buffer *next; 2137 struct extent_buffer *cur; 2138 struct extent_buffer *parent; 2139 u32 blocksize; 2140 int ret = 0; 2141 2142 WARN_ON(*level < 0); 2143 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2144 2145 while (*level > 0) { 2146 WARN_ON(*level < 0); 2147 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2148 cur = path->nodes[*level]; 2149 2150 WARN_ON(btrfs_header_level(cur) != *level); 2151 2152 if (path->slots[*level] >= 2153 btrfs_header_nritems(cur)) 2154 break; 2155 2156 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2157 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2158 blocksize = btrfs_level_size(root, 
*level - 1); 2159 2160 parent = path->nodes[*level]; 2161 root_owner = btrfs_header_owner(parent); 2162 2163 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 2164 if (!next) 2165 return -ENOMEM; 2166 2167 if (*level == 1) { 2168 ret = wc->process_func(root, next, wc, ptr_gen); 2169 if (ret) { 2170 free_extent_buffer(next); 2171 return ret; 2172 } 2173 2174 path->slots[*level]++; 2175 if (wc->free) { 2176 ret = btrfs_read_buffer(next, ptr_gen); 2177 if (ret) { 2178 free_extent_buffer(next); 2179 return ret; 2180 } 2181 2182 if (trans) { 2183 btrfs_tree_lock(next); 2184 btrfs_set_lock_blocking(next); 2185 clean_tree_block(trans, root, next); 2186 btrfs_wait_tree_block_writeback(next); 2187 btrfs_tree_unlock(next); 2188 } 2189 2190 WARN_ON(root_owner != 2191 BTRFS_TREE_LOG_OBJECTID); 2192 ret = btrfs_free_and_pin_reserved_extent(root, 2193 bytenr, blocksize); 2194 if (ret) { 2195 free_extent_buffer(next); 2196 return ret; 2197 } 2198 } 2199 free_extent_buffer(next); 2200 continue; 2201 } 2202 ret = btrfs_read_buffer(next, ptr_gen); 2203 if (ret) { 2204 free_extent_buffer(next); 2205 return ret; 2206 } 2207 2208 WARN_ON(*level <= 0); 2209 if (path->nodes[*level-1]) 2210 free_extent_buffer(path->nodes[*level-1]); 2211 path->nodes[*level-1] = next; 2212 *level = btrfs_header_level(next); 2213 path->slots[*level] = 0; 2214 cond_resched(); 2215 } 2216 WARN_ON(*level < 0); 2217 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2218 2219 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2220 2221 cond_resched(); 2222 return 0; 2223 } 2224 2225 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2226 struct btrfs_root *root, 2227 struct btrfs_path *path, int *level, 2228 struct walk_control *wc) 2229 { 2230 u64 root_owner; 2231 int i; 2232 int slot; 2233 int ret; 2234 2235 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2236 slot = path->slots[i]; 2237 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2238 path->slots[i]++; 2239 *level = i; 2240 WARN_ON(*level == 0); 2241 return 0; 2242 } else { 2243 struct extent_buffer *parent; 2244 if (path->nodes[*level] == root->node) 2245 parent = path->nodes[*level]; 2246 else 2247 parent = path->nodes[*level + 1]; 2248 2249 root_owner = btrfs_header_owner(parent); 2250 ret = wc->process_func(root, path->nodes[*level], wc, 2251 btrfs_header_generation(path->nodes[*level])); 2252 if (ret) 2253 return ret; 2254 2255 if (wc->free) { 2256 struct extent_buffer *next; 2257 2258 next = path->nodes[*level]; 2259 2260 if (trans) { 2261 btrfs_tree_lock(next); 2262 btrfs_set_lock_blocking(next); 2263 clean_tree_block(trans, root, next); 2264 btrfs_wait_tree_block_writeback(next); 2265 btrfs_tree_unlock(next); 2266 } 2267 2268 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2269 ret = btrfs_free_and_pin_reserved_extent(root, 2270 path->nodes[*level]->start, 2271 path->nodes[*level]->len); 2272 if (ret) 2273 return ret; 2274 } 2275 free_extent_buffer(path->nodes[*level]); 2276 path->nodes[*level] = NULL; 2277 *level = i + 1; 2278 } 2279 } 2280 return 1; 2281 } 2282 2283 /* 2284 * drop the reference count on the tree rooted at 'snap'. This traverses 2285 * the tree freeing any blocks that have a ref count of zero after being 2286 * decremented. 
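 *
 * The walk is a plain iterative depth-first traversal:
 * walk_down_log_tree() descends to level 1 and hands each leaf to
 * wc->process_func, walk_up_log_tree() processes interior nodes once
 * all of their children are done, and wc->free controls whether each
 * block is also released back to the allocator as it is visited.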
2287 */ 2288 static int walk_log_tree(struct btrfs_trans_handle *trans, 2289 struct btrfs_root *log, struct walk_control *wc) 2290 { 2291 int ret = 0; 2292 int wret; 2293 int level; 2294 struct btrfs_path *path; 2295 int orig_level; 2296 2297 path = btrfs_alloc_path(); 2298 if (!path) 2299 return -ENOMEM; 2300 2301 level = btrfs_header_level(log->node); 2302 orig_level = level; 2303 path->nodes[level] = log->node; 2304 extent_buffer_get(log->node); 2305 path->slots[level] = 0; 2306 2307 while (1) { 2308 wret = walk_down_log_tree(trans, log, path, &level, wc); 2309 if (wret > 0) 2310 break; 2311 if (wret < 0) { 2312 ret = wret; 2313 goto out; 2314 } 2315 2316 wret = walk_up_log_tree(trans, log, path, &level, wc); 2317 if (wret > 0) 2318 break; 2319 if (wret < 0) { 2320 ret = wret; 2321 goto out; 2322 } 2323 } 2324 2325 /* was the root node processed? if not, catch it here */ 2326 if (path->nodes[orig_level]) { 2327 ret = wc->process_func(log, path->nodes[orig_level], wc, 2328 btrfs_header_generation(path->nodes[orig_level])); 2329 if (ret) 2330 goto out; 2331 if (wc->free) { 2332 struct extent_buffer *next; 2333 2334 next = path->nodes[orig_level]; 2335 2336 if (trans) { 2337 btrfs_tree_lock(next); 2338 btrfs_set_lock_blocking(next); 2339 clean_tree_block(trans, log, next); 2340 btrfs_wait_tree_block_writeback(next); 2341 btrfs_tree_unlock(next); 2342 } 2343 2344 WARN_ON(log->root_key.objectid != 2345 BTRFS_TREE_LOG_OBJECTID); 2346 ret = btrfs_free_and_pin_reserved_extent(log, next->start, 2347 next->len); 2348 if (ret) 2349 goto out; 2350 } 2351 } 2352 2353 out: 2354 btrfs_free_path(path); 2355 return ret; 2356 } 2357 2358 /* 2359 * helper function to update the item for a given subvolumes log root 2360 * in the tree of log roots 2361 */ 2362 static int update_log_root(struct btrfs_trans_handle *trans, 2363 struct btrfs_root *log) 2364 { 2365 int ret; 2366 2367 if (log->log_transid == 1) { 2368 /* insert root item on the first sync */ 2369 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 2370 &log->root_key, &log->root_item); 2371 } else { 2372 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2373 &log->root_key, &log->root_item); 2374 } 2375 return ret; 2376 } 2377 2378 static void wait_log_commit(struct btrfs_trans_handle *trans, 2379 struct btrfs_root *root, int transid) 2380 { 2381 DEFINE_WAIT(wait); 2382 int index = transid % 2; 2383 2384 /* 2385 * we only allow two pending log transactions at a time, 2386 * so we know that if ours is more than 2 older than the 2387 * current transaction, we're done 2388 */ 2389 do { 2390 prepare_to_wait(&root->log_commit_wait[index], 2391 &wait, TASK_UNINTERRUPTIBLE); 2392 mutex_unlock(&root->log_mutex); 2393 2394 if (root->log_transid_committed < transid && 2395 atomic_read(&root->log_commit[index])) 2396 schedule(); 2397 2398 finish_wait(&root->log_commit_wait[index], &wait); 2399 mutex_lock(&root->log_mutex); 2400 } while (root->log_transid_committed < transid && 2401 atomic_read(&root->log_commit[index])); 2402 } 2403 2404 static void wait_for_writer(struct btrfs_trans_handle *trans, 2405 struct btrfs_root *root) 2406 { 2407 DEFINE_WAIT(wait); 2408 2409 while (atomic_read(&root->log_writers)) { 2410 prepare_to_wait(&root->log_writer_wait, 2411 &wait, TASK_UNINTERRUPTIBLE); 2412 mutex_unlock(&root->log_mutex); 2413 if (atomic_read(&root->log_writers)) 2414 schedule(); 2415 mutex_lock(&root->log_mutex); 2416 finish_wait(&root->log_writer_wait, &wait); 2417 } 2418 } 2419 2420 static inline void btrfs_remove_log_ctx(struct 
btrfs_root *root, 2421 struct btrfs_log_ctx *ctx) 2422 { 2423 if (!ctx) 2424 return; 2425 2426 mutex_lock(&root->log_mutex); 2427 list_del_init(&ctx->list); 2428 mutex_unlock(&root->log_mutex); 2429 } 2430 2431 /* 2432 * Invoked in log mutex context, or when the caller is sure no other 2433 * task can access the list. 2434 */ 2435 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2436 int index, int error) 2437 { 2438 struct btrfs_log_ctx *ctx; 2439 2440 if (!error) { 2441 INIT_LIST_HEAD(&root->log_ctxs[index]); 2442 return; 2443 } 2444 2445 list_for_each_entry(ctx, &root->log_ctxs[index], list) 2446 ctx->log_ret = error; 2447 2448 INIT_LIST_HEAD(&root->log_ctxs[index]); 2449 } 2450 2451 /* 2452 * btrfs_sync_log sends a given tree log down to the disk and 2453 * updates the super blocks to record it. When this call is done, 2454 * you know that any inodes previously logged are safely on disk only 2455 * if it returns 0. 2456 * 2457 * Any other return value means you need to call btrfs_commit_transaction. 2458 * Some of the edge cases for fsyncing directories that have had unlinks 2459 * or renames done in the past mean that sometimes the only safe 2460 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2461 * that has happened. 2462 */ 2463 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2464 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2465 { 2466 int index1; 2467 int index2; 2468 int mark; 2469 int ret; 2470 struct btrfs_root *log = root->log_root; 2471 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2472 int log_transid = 0; 2473 struct btrfs_log_ctx root_log_ctx; 2474 struct blk_plug plug; 2475 2476 mutex_lock(&root->log_mutex); 2477 log_transid = ctx->log_transid; 2478 if (root->log_transid_committed >= log_transid) { 2479 mutex_unlock(&root->log_mutex); 2480 return ctx->log_ret; 2481 } 2482 2483 index1 = log_transid % 2; 2484 if (atomic_read(&root->log_commit[index1])) { 2485 wait_log_commit(trans, root, log_transid); 2486 mutex_unlock(&root->log_mutex); 2487 return ctx->log_ret; 2488 } 2489 ASSERT(log_transid == root->log_transid); 2490 atomic_set(&root->log_commit[index1], 1); 2491 2492 /* wait for previous tree log sync to complete */ 2493 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2494 wait_log_commit(trans, root, log_transid - 1); 2495 2496 while (1) { 2497 int batch = atomic_read(&root->log_batch); 2498 /* when we're on an ssd, just kick the log commit out */ 2499 if (!btrfs_test_opt(root, SSD) && 2500 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2501 mutex_unlock(&root->log_mutex); 2502 schedule_timeout_uninterruptible(1); 2503 mutex_lock(&root->log_mutex); 2504 } 2505 wait_for_writer(trans, root); 2506 if (batch == atomic_read(&root->log_batch)) 2507 break; 2508 } 2509 2510 /* bail out if we need to do a full commit */ 2511 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2512 ret = -EAGAIN; 2513 btrfs_free_logged_extents(log, log_transid); 2514 mutex_unlock(&root->log_mutex); 2515 goto out; 2516 } 2517 2518 if (log_transid % 2 == 0) 2519 mark = EXTENT_DIRTY; 2520 else 2521 mark = EXTENT_NEW; 2522 2523 /* we start IO on all the marked extents here, but we don't actually wait for them until later. 
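 * The plug lets the block layer merge those writes; the matching
 * btrfs_wait_marked_extents() calls only happen once the log root
 * tree has been written as well, so both levels of log IO stay in
 * flight at the same time.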
2525 */ 2526 blk_start_plug(&plug); 2527 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2528 if (ret) { 2529 blk_finish_plug(&plug); 2530 btrfs_abort_transaction(trans, root, ret); 2531 btrfs_free_logged_extents(log, log_transid); 2532 btrfs_set_log_full_commit(root->fs_info, trans); 2533 mutex_unlock(&root->log_mutex); 2534 goto out; 2535 } 2536 2537 btrfs_set_root_node(&log->root_item, log->node); 2538 2539 root->log_transid++; 2540 log->log_transid = root->log_transid; 2541 root->log_start_pid = 0; 2542 /* 2543 * IO has been started, blocks of the log tree have WRITTEN flag set 2544 * in their headers. new modifications of the log will be written to 2545 * new positions. so it's safe to allow log writers to go in. 2546 */ 2547 mutex_unlock(&root->log_mutex); 2548 2549 btrfs_init_log_ctx(&root_log_ctx); 2550 2551 mutex_lock(&log_root_tree->log_mutex); 2552 atomic_inc(&log_root_tree->log_batch); 2553 atomic_inc(&log_root_tree->log_writers); 2554 2555 index2 = log_root_tree->log_transid % 2; 2556 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2557 root_log_ctx.log_transid = log_root_tree->log_transid; 2558 2559 mutex_unlock(&log_root_tree->log_mutex); 2560 2561 ret = update_log_root(trans, log); 2562 2563 mutex_lock(&log_root_tree->log_mutex); 2564 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2565 smp_mb(); 2566 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2567 wake_up(&log_root_tree->log_writer_wait); 2568 } 2569 2570 if (ret) { 2571 if (!list_empty(&root_log_ctx.list)) 2572 list_del_init(&root_log_ctx.list); 2573 2574 blk_finish_plug(&plug); 2575 btrfs_set_log_full_commit(root->fs_info, trans); 2576 2577 if (ret != -ENOSPC) { 2578 btrfs_abort_transaction(trans, root, ret); 2579 mutex_unlock(&log_root_tree->log_mutex); 2580 goto out; 2581 } 2582 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2583 btrfs_free_logged_extents(log, log_transid); 2584 mutex_unlock(&log_root_tree->log_mutex); 2585 ret = -EAGAIN; 2586 goto out; 2587 } 2588 2589 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2590 mutex_unlock(&log_root_tree->log_mutex); 2591 ret = root_log_ctx.log_ret; 2592 goto out; 2593 } 2594 2595 index2 = root_log_ctx.log_transid % 2; 2596 if (atomic_read(&log_root_tree->log_commit[index2])) { 2597 blk_finish_plug(&plug); 2598 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2599 wait_log_commit(trans, log_root_tree, 2600 root_log_ctx.log_transid); 2601 btrfs_free_logged_extents(log, log_transid); 2602 mutex_unlock(&log_root_tree->log_mutex); 2603 ret = root_log_ctx.log_ret; 2604 goto out; 2605 } 2606 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2607 atomic_set(&log_root_tree->log_commit[index2], 1); 2608 2609 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2610 wait_log_commit(trans, log_root_tree, 2611 root_log_ctx.log_transid - 1); 2612 } 2613 2614 wait_for_writer(trans, log_root_tree); 2615 2616 /* 2617 * now that we've moved on to the tree of log tree roots, 2618 * check the full commit flag again 2619 */ 2620 if (btrfs_need_log_full_commit(root->fs_info, trans)) { 2621 blk_finish_plug(&plug); 2622 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2623 btrfs_free_logged_extents(log, log_transid); 2624 mutex_unlock(&log_root_tree->log_mutex); 2625 ret = -EAGAIN; 2626 goto out_wake_log_root; 2627 } 2628 2629 ret = btrfs_write_marked_extents(log_root_tree, 2630 &log_root_tree->dirty_log_pages, 2631 EXTENT_DIRTY | EXTENT_NEW); 2632 
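	/*
	 * both EXTENT_DIRTY and EXTENT_NEW are written (and waited on)
	 * for the log root tree: unlike a subvolume log, it can carry
	 * blocks tagged with either mark, since log commits from several
	 * subvolumes at different log transids may have dirtied it.
	 */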
blk_finish_plug(&plug); 2633 if (ret) { 2634 btrfs_set_log_full_commit(root->fs_info, trans); 2635 btrfs_abort_transaction(trans, root, ret); 2636 btrfs_free_logged_extents(log, log_transid); 2637 mutex_unlock(&log_root_tree->log_mutex); 2638 goto out_wake_log_root; 2639 } 2640 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2641 btrfs_wait_marked_extents(log_root_tree, 2642 &log_root_tree->dirty_log_pages, 2643 EXTENT_NEW | EXTENT_DIRTY); 2644 btrfs_wait_logged_extents(log, log_transid); 2645 2646 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2647 log_root_tree->node->start); 2648 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2649 btrfs_header_level(log_root_tree->node)); 2650 2651 log_root_tree->log_transid++; 2652 mutex_unlock(&log_root_tree->log_mutex); 2653 2654 /* 2655 * nobody else is going to jump in and write the ctree 2656 * super here because the log_commit atomic below is protecting 2657 * us. We must be called with a transaction handle pinning 2658 * the running transaction open, so a full commit can't hop 2659 * in and cause problems either. 2660 */ 2661 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2662 if (ret) { 2663 btrfs_set_log_full_commit(root->fs_info, trans); 2664 btrfs_abort_transaction(trans, root, ret); 2665 goto out_wake_log_root; 2666 } 2667 2668 mutex_lock(&root->log_mutex); 2669 if (root->last_log_commit < log_transid) 2670 root->last_log_commit = log_transid; 2671 mutex_unlock(&root->log_mutex); 2672 2673 out_wake_log_root: 2674 /* 2675 * We don't need to take the log_mutex here because we are sure 2676 * all the other tasks are blocked. 2677 */ 2678 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 2679 2680 mutex_lock(&log_root_tree->log_mutex); 2681 log_root_tree->log_transid_committed++; 2682 atomic_set(&log_root_tree->log_commit[index2], 0); 2683 mutex_unlock(&log_root_tree->log_mutex); 2684 2685 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2686 wake_up(&log_root_tree->log_commit_wait[index2]); 2687 out: 2688 /* See above. */ 2689 btrfs_remove_all_log_ctxs(root, index1, ret); 2690 2691 mutex_lock(&root->log_mutex); 2692 root->log_transid_committed++; 2693 atomic_set(&root->log_commit[index1], 0); 2694 mutex_unlock(&root->log_mutex); 2695 2696 if (waitqueue_active(&root->log_commit_wait[index1])) 2697 wake_up(&root->log_commit_wait[index1]); 2698 return ret; 2699 } 2700 2701 static void free_log_tree(struct btrfs_trans_handle *trans, 2702 struct btrfs_root *log) 2703 { 2704 int ret; 2705 u64 start; 2706 u64 end; 2707 struct walk_control wc = { 2708 .free = 1, 2709 .process_func = process_one_buffer 2710 }; 2711 2712 ret = walk_log_tree(trans, log, &wc); 2713 /* I don't think this can happen but just in case */ 2714 if (ret) 2715 btrfs_abort_transaction(trans, log, ret); 2716 2717 while (1) { 2718 ret = find_first_extent_bit(&log->dirty_log_pages, 2719 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 2720 NULL); 2721 if (ret) 2722 break; 2723 2724 clear_extent_bits(&log->dirty_log_pages, start, end, 2725 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2726 } 2727 2728 /* 2729 * We may have short-circuited the log tree with the full commit logic 2730 * and left ordered extents on our list, so clear these out to keep us 2731 * from leaking inodes and memory. 2732 */ 2733 btrfs_free_logged_extents(log, 0); 2734 btrfs_free_logged_extents(log, 1); 2735 2736 free_extent_buffer(log->node); 2737 kfree(log); 2738 } 2739 2740 /* 2741 * free all the extents used by the tree log. 
This should be called 2742 * at commit time of the full transaction 2743 */ 2744 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2745 { 2746 if (root->log_root) { 2747 free_log_tree(trans, root->log_root); 2748 root->log_root = NULL; 2749 } 2750 return 0; 2751 } 2752 2753 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 2754 struct btrfs_fs_info *fs_info) 2755 { 2756 if (fs_info->log_root_tree) { 2757 free_log_tree(trans, fs_info->log_root_tree); 2758 fs_info->log_root_tree = NULL; 2759 } 2760 return 0; 2761 } 2762 2763 /* 2764 * If both a file and directory are logged, and unlinks or renames are 2765 * mixed in, we have a few interesting corners: 2766 * 2767 * create file X in dir Y 2768 * link file X to X.link in dir Y 2769 * fsync file X 2770 * unlink file X but leave X.link 2771 * fsync dir Y 2772 * 2773 * After a crash we would expect only X.link to exist. But file X 2774 * didn't get fsync'd again so the log has back refs for X and X.link. 2775 * 2776 * We solve this by removing directory entries and inode backrefs from the 2777 * log when a file that was logged in the current transaction is 2778 * unlinked. Any later fsync will include the updated log entries, and 2779 * we'll be able to reconstruct the proper directory items from backrefs. 2780 * 2781 * This optimization allows us to avoid relogging the entire inode 2782 * or the entire directory. 2783 */ 2784 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2785 struct btrfs_root *root, 2786 const char *name, int name_len, 2787 struct inode *dir, u64 index) 2788 { 2789 struct btrfs_root *log; 2790 struct btrfs_dir_item *di; 2791 struct btrfs_path *path; 2792 int ret; 2793 int err = 0; 2794 int bytes_del = 0; 2795 u64 dir_ino = btrfs_ino(dir); 2796 2797 if (BTRFS_I(dir)->logged_trans < trans->transid) 2798 return 0; 2799 2800 ret = join_running_log_trans(root); 2801 if (ret) 2802 return 0; 2803 2804 mutex_lock(&BTRFS_I(dir)->log_mutex); 2805 2806 log = root->log_root; 2807 path = btrfs_alloc_path(); 2808 if (!path) { 2809 err = -ENOMEM; 2810 goto out_unlock; 2811 } 2812 2813 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 2814 name, name_len, -1); 2815 if (IS_ERR(di)) { 2816 err = PTR_ERR(di); 2817 goto fail; 2818 } 2819 if (di) { 2820 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2821 bytes_del += name_len; 2822 if (ret) { 2823 err = ret; 2824 goto fail; 2825 } 2826 } 2827 btrfs_release_path(path); 2828 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 2829 index, name, name_len, -1); 2830 if (IS_ERR(di)) { 2831 err = PTR_ERR(di); 2832 goto fail; 2833 } 2834 if (di) { 2835 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2836 bytes_del += name_len; 2837 if (ret) { 2838 err = ret; 2839 goto fail; 2840 } 2841 } 2842 2843 /* update the directory size in the log to reflect the names 2844 * we have removed 2845 */ 2846 if (bytes_del) { 2847 struct btrfs_key key; 2848 2849 key.objectid = dir_ino; 2850 key.offset = 0; 2851 key.type = BTRFS_INODE_ITEM_KEY; 2852 btrfs_release_path(path); 2853 2854 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2855 if (ret < 0) { 2856 err = ret; 2857 goto fail; 2858 } 2859 if (ret == 0) { 2860 struct btrfs_inode_item *item; 2861 u64 i_size; 2862 2863 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2864 struct btrfs_inode_item); 2865 i_size = btrfs_inode_size(path->nodes[0], item); 2866 if (i_size > bytes_del) 2867 i_size -= bytes_del; 2868 else 2869 i_size = 0; 2870 
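			/*
			 * the directory's inode item in the log carries the
			 * dir's i_size, which btrfs maintains as a byte
			 * count of the names it holds, so shrink it by the
			 * name bytes deleted above before writing the item
			 * back.
			 */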
btrfs_set_inode_size(path->nodes[0], item, i_size); 2871 btrfs_mark_buffer_dirty(path->nodes[0]); 2872 } else 2873 ret = 0; 2874 btrfs_release_path(path); 2875 } 2876 fail: 2877 btrfs_free_path(path); 2878 out_unlock: 2879 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2880 if (ret == -ENOSPC) { 2881 btrfs_set_log_full_commit(root->fs_info, trans); 2882 ret = 0; 2883 } else if (ret < 0) 2884 btrfs_abort_transaction(trans, root, ret); 2885 2886 btrfs_end_log_trans(root); 2887 2888 return err; 2889 } 2890 2891 /* see comments for btrfs_del_dir_entries_in_log */ 2892 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2893 struct btrfs_root *root, 2894 const char *name, int name_len, 2895 struct inode *inode, u64 dirid) 2896 { 2897 struct btrfs_root *log; 2898 u64 index; 2899 int ret; 2900 2901 if (BTRFS_I(inode)->logged_trans < trans->transid) 2902 return 0; 2903 2904 ret = join_running_log_trans(root); 2905 if (ret) 2906 return 0; 2907 log = root->log_root; 2908 mutex_lock(&BTRFS_I(inode)->log_mutex); 2909 2910 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 2911 dirid, &index); 2912 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2913 if (ret == -ENOSPC) { 2914 btrfs_set_log_full_commit(root->fs_info, trans); 2915 ret = 0; 2916 } else if (ret < 0 && ret != -ENOENT) 2917 btrfs_abort_transaction(trans, root, ret); 2918 btrfs_end_log_trans(root); 2919 2920 return ret; 2921 } 2922 2923 /* 2924 * creates a range item in the log for 'dirid'. first_offset and 2925 * last_offset tell us which parts of the key space the log should 2926 * be considered authoritative for. 2927 */ 2928 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2929 struct btrfs_root *log, 2930 struct btrfs_path *path, 2931 int key_type, u64 dirid, 2932 u64 first_offset, u64 last_offset) 2933 { 2934 int ret; 2935 struct btrfs_key key; 2936 struct btrfs_dir_log_item *item; 2937 2938 key.objectid = dirid; 2939 key.offset = first_offset; 2940 if (key_type == BTRFS_DIR_ITEM_KEY) 2941 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2942 else 2943 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2944 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2945 if (ret) 2946 return ret; 2947 2948 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2949 struct btrfs_dir_log_item); 2950 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2951 btrfs_mark_buffer_dirty(path->nodes[0]); 2952 btrfs_release_path(path); 2953 return 0; 2954 } 2955 2956 /* 2957 * log all the items included in the current transaction for a given 2958 * directory. 
This also creates the range items in the log tree required 2959 * to replay anything deleted before the fsync 2960 */ 2961 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2962 struct btrfs_root *root, struct inode *inode, 2963 struct btrfs_path *path, 2964 struct btrfs_path *dst_path, int key_type, 2965 u64 min_offset, u64 *last_offset_ret) 2966 { 2967 struct btrfs_key min_key; 2968 struct btrfs_root *log = root->log_root; 2969 struct extent_buffer *src; 2970 int err = 0; 2971 int ret; 2972 int i; 2973 int nritems; 2974 u64 first_offset = min_offset; 2975 u64 last_offset = (u64)-1; 2976 u64 ino = btrfs_ino(inode); 2977 2978 log = root->log_root; 2979 2980 min_key.objectid = ino; 2981 min_key.type = key_type; 2982 min_key.offset = min_offset; 2983 2984 path->keep_locks = 1; 2985 2986 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 2987 2988 /* 2989 * we didn't find anything from this transaction, see if there 2990 * is anything at all 2991 */ 2992 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 2993 min_key.objectid = ino; 2994 min_key.type = key_type; 2995 min_key.offset = (u64)-1; 2996 btrfs_release_path(path); 2997 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2998 if (ret < 0) { 2999 btrfs_release_path(path); 3000 return ret; 3001 } 3002 ret = btrfs_previous_item(root, path, ino, key_type); 3003 3004 /* if ret == 0 there are items for this type, 3005 * create a range to tell us the last key of this type. 3006 * otherwise, there are no items in this directory after 3007 * *min_offset, and we create a range to indicate that. 3008 */ 3009 if (ret == 0) { 3010 struct btrfs_key tmp; 3011 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3012 path->slots[0]); 3013 if (key_type == tmp.type) 3014 first_offset = max(min_offset, tmp.offset) + 1; 3015 } 3016 goto done; 3017 } 3018 3019 /* go backward to find any previous key */ 3020 ret = btrfs_previous_item(root, path, ino, key_type); 3021 if (ret == 0) { 3022 struct btrfs_key tmp; 3023 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3024 if (key_type == tmp.type) { 3025 first_offset = tmp.offset; 3026 ret = overwrite_item(trans, log, dst_path, 3027 path->nodes[0], path->slots[0], 3028 &tmp); 3029 if (ret) { 3030 err = ret; 3031 goto done; 3032 } 3033 } 3034 } 3035 btrfs_release_path(path); 3036 3037 /* find the first key from this transaction again */ 3038 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3039 if (WARN_ON(ret != 0)) 3040 goto done; 3041 3042 /* 3043 * we have a block from this transaction, log every item in it 3044 * from our directory 3045 */ 3046 while (1) { 3047 struct btrfs_key tmp; 3048 src = path->nodes[0]; 3049 nritems = btrfs_header_nritems(src); 3050 for (i = path->slots[0]; i < nritems; i++) { 3051 btrfs_item_key_to_cpu(src, &min_key, i); 3052 3053 if (min_key.objectid != ino || min_key.type != key_type) 3054 goto done; 3055 ret = overwrite_item(trans, log, dst_path, src, i, 3056 &min_key); 3057 if (ret) { 3058 err = ret; 3059 goto done; 3060 } 3061 } 3062 path->slots[0] = nritems; 3063 3064 /* 3065 * look ahead to the next item and see if it is also 3066 * from this directory and from this transaction 3067 */ 3068 ret = btrfs_next_leaf(root, path); 3069 if (ret == 1) { 3070 last_offset = (u64)-1; 3071 goto done; 3072 } 3073 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3074 if (tmp.objectid != ino || tmp.type != key_type) { 3075 last_offset = (u64)-1; 3076 goto done; 3077 } 3078 if 
(btrfs_header_generation(path->nodes[0]) != trans->transid) { 3079 ret = overwrite_item(trans, log, dst_path, 3080 path->nodes[0], path->slots[0], 3081 &tmp); 3082 if (ret) 3083 err = ret; 3084 else 3085 last_offset = tmp.offset; 3086 goto done; 3087 } 3088 } 3089 done: 3090 btrfs_release_path(path); 3091 btrfs_release_path(dst_path); 3092 3093 if (err == 0) { 3094 *last_offset_ret = last_offset; 3095 /* 3096 * insert the log range keys to indicate where the log 3097 * is valid 3098 */ 3099 ret = insert_dir_log_key(trans, log, path, key_type, 3100 ino, first_offset, last_offset); 3101 if (ret) 3102 err = ret; 3103 } 3104 return err; 3105 } 3106 3107 /* 3108 * logging directories is very similar to logging inodes. We find all the items 3109 * from the current transaction and write them to the log. 3110 * 3111 * The recovery code scans the directory in the subvolume, and if it finds a 3112 * key in the range logged that is not present in the log tree, then it means 3113 * that dir entry was unlinked during the transaction. 3114 * 3115 * In order for that scan to work, we must include one key smaller than 3116 * the smallest logged by this transaction and one key larger than the largest 3117 * key logged by this transaction. 3118 */ 3119 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3120 struct btrfs_root *root, struct inode *inode, 3121 struct btrfs_path *path, 3122 struct btrfs_path *dst_path) 3123 { 3124 u64 min_key; 3125 u64 max_key; 3126 int ret; 3127 int key_type = BTRFS_DIR_ITEM_KEY; 3128 3129 again: 3130 min_key = 0; 3131 max_key = 0; 3132 while (1) { 3133 ret = log_dir_items(trans, root, inode, path, 3134 dst_path, key_type, min_key, 3135 &max_key); 3136 if (ret) 3137 return ret; 3138 if (max_key == (u64)-1) 3139 break; 3140 min_key = max_key + 1; 3141 } 3142 3143 if (key_type == BTRFS_DIR_ITEM_KEY) { 3144 key_type = BTRFS_DIR_INDEX_KEY; 3145 goto again; 3146 } 3147 return 0; 3148 } 3149 3150 /* 3151 * a helper function to drop items from the log before we relog an 3152 * inode. max_key_type indicates the highest item type to remove. 3153 * This cannot be run for file data extents because it does not 3154 * free the extents they point to. 3155 */ 3156 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3157 struct btrfs_root *log, 3158 struct btrfs_path *path, 3159 u64 objectid, int max_key_type) 3160 { 3161 int ret; 3162 struct btrfs_key key; 3163 struct btrfs_key found_key; 3164 int start_slot; 3165 3166 key.objectid = objectid; 3167 key.type = max_key_type; 3168 key.offset = (u64)-1; 3169 3170 while (1) { 3171 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3172 BUG_ON(ret == 0); /* Logic error */ 3173 if (ret < 0) 3174 break; 3175 3176 if (path->slots[0] == 0) 3177 break; 3178 3179 path->slots[0]--; 3180 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3181 path->slots[0]); 3182 3183 if (found_key.objectid != objectid) 3184 break; 3185 3186 found_key.offset = 0; 3187 found_key.type = 0; 3188 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3189 &start_slot); 3190 3191 ret = btrfs_del_items(trans, log, path, start_slot, 3192 path->slots[0] - start_slot + 1); 3193 /* 3194 * If start slot isn't 0 then we don't need to re-search, we've 3195 * found the last guy with the objectid in this tree. 
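 * (We search from (objectid, max_key_type, -1ULL) and delete backwards
 * one leaf at a time, so each pass removes the tail of the object's
 * items; only when the deleted run began at slot 0 can more of them
 * still exist in an earlier leaf.)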
3196 */ 3197 if (ret || start_slot != 0) 3198 break; 3199 btrfs_release_path(path); 3200 } 3201 btrfs_release_path(path); 3202 if (ret > 0) 3203 ret = 0; 3204 return ret; 3205 } 3206 3207 static void fill_inode_item(struct btrfs_trans_handle *trans, 3208 struct extent_buffer *leaf, 3209 struct btrfs_inode_item *item, 3210 struct inode *inode, int log_inode_only) 3211 { 3212 struct btrfs_map_token token; 3213 3214 btrfs_init_map_token(&token); 3215 3216 if (log_inode_only) { 3217 /* set the generation to zero so the recovery code 3218 * can tell the difference between logging 3219 * just to say 'this inode exists' and logging 3220 * to say 'update this inode with these values' 3221 */ 3222 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3223 btrfs_set_token_inode_size(leaf, item, 0, &token); 3224 } else { 3225 btrfs_set_token_inode_generation(leaf, item, 3226 BTRFS_I(inode)->generation, 3227 &token); 3228 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3229 } 3230 3231 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3232 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3233 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3234 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3235 3236 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3237 inode->i_atime.tv_sec, &token); 3238 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3239 inode->i_atime.tv_nsec, &token); 3240 3241 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3242 inode->i_mtime.tv_sec, &token); 3243 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3244 inode->i_mtime.tv_nsec, &token); 3245 3246 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3247 inode->i_ctime.tv_sec, &token); 3248 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3249 inode->i_ctime.tv_nsec, &token); 3250 3251 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3252 &token); 3253 3254 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3255 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3256 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3257 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3258 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3259 } 3260 3261 static int log_inode_item(struct btrfs_trans_handle *trans, 3262 struct btrfs_root *log, struct btrfs_path *path, 3263 struct inode *inode) 3264 { 3265 struct btrfs_inode_item *inode_item; 3266 int ret; 3267 3268 ret = btrfs_insert_empty_item(trans, log, path, 3269 &BTRFS_I(inode)->location, 3270 sizeof(*inode_item)); 3271 if (ret && ret != -EEXIST) 3272 return ret; 3273 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3274 struct btrfs_inode_item); 3275 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); 3276 btrfs_release_path(path); 3277 return 0; 3278 } 3279 3280 static noinline int copy_items(struct btrfs_trans_handle *trans, 3281 struct inode *inode, 3282 struct btrfs_path *dst_path, 3283 struct btrfs_path *src_path, u64 *last_extent, 3284 int start_slot, int nr, int inode_only) 3285 { 3286 unsigned long src_offset; 3287 unsigned long dst_offset; 3288 struct btrfs_root *log = BTRFS_I(inode)->root->log_root; 3289 struct btrfs_file_extent_item *extent; 3290 struct btrfs_inode_item *inode_item; 3291 struct extent_buffer *src = src_path->nodes[0]; 3292 struct btrfs_key first_key, last_key, key; 3293 int ret; 3294 struct 
btrfs_key *ins_keys; 3295 u32 *ins_sizes; 3296 char *ins_data; 3297 int i; 3298 struct list_head ordered_sums; 3299 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3300 bool has_extents = false; 3301 bool need_find_last_extent = (*last_extent == 0); 3302 bool done = false; 3303 3304 INIT_LIST_HEAD(&ordered_sums); 3305 3306 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3307 nr * sizeof(u32), GFP_NOFS); 3308 if (!ins_data) 3309 return -ENOMEM; 3310 3311 first_key.objectid = (u64)-1; 3312 3313 ins_sizes = (u32 *)ins_data; 3314 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3315 3316 for (i = 0; i < nr; i++) { 3317 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3318 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3319 } 3320 ret = btrfs_insert_empty_items(trans, log, dst_path, 3321 ins_keys, ins_sizes, nr); 3322 if (ret) { 3323 kfree(ins_data); 3324 return ret; 3325 } 3326 3327 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3328 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3329 dst_path->slots[0]); 3330 3331 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3332 3333 if ((i == (nr - 1))) 3334 last_key = ins_keys[i]; 3335 3336 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3337 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3338 dst_path->slots[0], 3339 struct btrfs_inode_item); 3340 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3341 inode, inode_only == LOG_INODE_EXISTS); 3342 } else { 3343 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3344 src_offset, ins_sizes[i]); 3345 } 3346 3347 /* 3348 * We set need_find_last_extent here in case we know we were 3349 * processing other items and then walk into the first extent in 3350 * the inode. If we don't hit an extent then nothing changes, 3351 * we'll do the last search the next time around. 3352 */ 3353 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3354 has_extents = true; 3355 if (need_find_last_extent && 3356 first_key.objectid == (u64)-1) 3357 first_key = ins_keys[i]; 3358 } else { 3359 need_find_last_extent = false; 3360 } 3361 3362 /* take a reference on file data extents so that truncates 3363 * or deletes of this inode don't have to relog the inode 3364 * again 3365 */ 3366 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && 3367 !skip_csum) { 3368 int found_type; 3369 extent = btrfs_item_ptr(src, start_slot + i, 3370 struct btrfs_file_extent_item); 3371 3372 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3373 continue; 3374 3375 found_type = btrfs_file_extent_type(src, extent); 3376 if (found_type == BTRFS_FILE_EXTENT_REG) { 3377 u64 ds, dl, cs, cl; 3378 ds = btrfs_file_extent_disk_bytenr(src, 3379 extent); 3380 /* ds == 0 is a hole */ 3381 if (ds == 0) 3382 continue; 3383 3384 dl = btrfs_file_extent_disk_num_bytes(src, 3385 extent); 3386 cs = btrfs_file_extent_offset(src, extent); 3387 cl = btrfs_file_extent_num_bytes(src, 3388 extent); 3389 if (btrfs_file_extent_compression(src, 3390 extent)) { 3391 cs = 0; 3392 cl = dl; 3393 } 3394 3395 ret = btrfs_lookup_csums_range( 3396 log->fs_info->csum_root, 3397 ds + cs, ds + cs + cl - 1, 3398 &ordered_sums, 0); 3399 if (ret) { 3400 btrfs_release_path(dst_path); 3401 kfree(ins_data); 3402 return ret; 3403 } 3404 } 3405 } 3406 } 3407 3408 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3409 btrfs_release_path(dst_path); 3410 kfree(ins_data); 3411 3412 /* 3413 * we have to do this after the loop above to avoid changing the 3414 * log tree while trying to change the log tree. 
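 * (btrfs_csum_file_blocks() cows log tree blocks, which could relocate
 * the very leaf dst_path pointed at while the loop was still copying
 * items into it)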
3415 */ 3416 ret = 0; 3417 while (!list_empty(&ordered_sums)) { 3418 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3419 struct btrfs_ordered_sum, 3420 list); 3421 if (!ret) 3422 ret = btrfs_csum_file_blocks(trans, log, sums); 3423 list_del(&sums->list); 3424 kfree(sums); 3425 } 3426 3427 if (!has_extents) 3428 return ret; 3429 3430 /* 3431 * Because we use btrfs_search_forward we could skip leaves that were 3432 * not modified and then assume *last_extent is valid when it really 3433 * isn't. So back up to the previous leaf and read the end of the last 3434 * extent before we go and fill in holes. 3435 */ 3436 if (need_find_last_extent) { 3437 u64 len; 3438 3439 ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); 3440 if (ret < 0) 3441 return ret; 3442 if (ret) 3443 goto fill_holes; 3444 if (src_path->slots[0]) 3445 src_path->slots[0]--; 3446 src = src_path->nodes[0]; 3447 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 3448 if (key.objectid != btrfs_ino(inode) || 3449 key.type != BTRFS_EXTENT_DATA_KEY) 3450 goto fill_holes; 3451 extent = btrfs_item_ptr(src, src_path->slots[0], 3452 struct btrfs_file_extent_item); 3453 if (btrfs_file_extent_type(src, extent) == 3454 BTRFS_FILE_EXTENT_INLINE) { 3455 len = btrfs_file_extent_inline_len(src, 3456 src_path->slots[0], 3457 extent); 3458 *last_extent = ALIGN(key.offset + len, 3459 log->sectorsize); 3460 } else { 3461 len = btrfs_file_extent_num_bytes(src, extent); 3462 *last_extent = key.offset + len; 3463 } 3464 } 3465 fill_holes: 3466 /* So we did prev_leaf, now we need to move to the next leaf, but a few 3467 * things could have happened 3468 * 3469 * 1) A merge could have happened, so we could currently be on a leaf 3470 * that holds what we were copying in the first place. 3471 * 2) A split could have happened, and now not all of the items we want 3472 * are on the same leaf. 3473 * 3474 * So we need to adjust how we search for holes, we need to drop the 3475 * path and re-search for the first extent key we found, and then walk 3476 * forward until we hit the last one we copied. 3477 */ 3478 if (need_find_last_extent) { 3479 /* btrfs_prev_leaf could return 1 without releasing the path */ 3480 btrfs_release_path(src_path); 3481 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, 3482 src_path, 0, 0); 3483 if (ret < 0) 3484 return ret; 3485 ASSERT(ret == 0); 3486 src = src_path->nodes[0]; 3487 i = src_path->slots[0]; 3488 } else { 3489 i = start_slot; 3490 } 3491 3492 /* 3493 * Ok so here we need to go through and fill in any holes we may have 3494 * to make sure that holes are punched for those areas in case they had 3495 * extents previously. 
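 *
 * For illustration: if the items copied above describe extents at
 * [0, 4k) and [12k, 16k), the loop below inserts an explicit hole
 * extent (disk_bytenr 0) for [4k, 12k), so that replaying the log
 * punches the hole instead of leaving stale data there.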
3496 */ 3497 while (!done) { 3498 u64 offset, len; 3499 u64 extent_end; 3500 3501 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 3502 ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); 3503 if (ret < 0) 3504 return ret; 3505 ASSERT(ret == 0); 3506 src = src_path->nodes[0]; 3507 i = 0; 3508 } 3509 3510 btrfs_item_key_to_cpu(src, &key, i); 3511 if (!btrfs_comp_cpu_keys(&key, &last_key)) 3512 done = true; 3513 if (key.objectid != btrfs_ino(inode) || 3514 key.type != BTRFS_EXTENT_DATA_KEY) { 3515 i++; 3516 continue; 3517 } 3518 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 3519 if (btrfs_file_extent_type(src, extent) == 3520 BTRFS_FILE_EXTENT_INLINE) { 3521 len = btrfs_file_extent_inline_len(src, i, extent); 3522 extent_end = ALIGN(key.offset + len, log->sectorsize); 3523 } else { 3524 len = btrfs_file_extent_num_bytes(src, extent); 3525 extent_end = key.offset + len; 3526 } 3527 i++; 3528 3529 if (*last_extent == key.offset) { 3530 *last_extent = extent_end; 3531 continue; 3532 } 3533 offset = *last_extent; 3534 len = key.offset - *last_extent; 3535 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 3536 offset, 0, 0, len, 0, len, 0, 3537 0, 0); 3538 if (ret) 3539 break; 3540 *last_extent = offset + len; 3541 } 3542 /* 3543 * Need to let the callers know we dropped the path so they should 3544 * re-search. 3545 */ 3546 if (!ret && need_find_last_extent) 3547 ret = 1; 3548 return ret; 3549 } 3550 3551 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3552 { 3553 struct extent_map *em1, *em2; 3554 3555 em1 = list_entry(a, struct extent_map, list); 3556 em2 = list_entry(b, struct extent_map, list); 3557 3558 if (em1->start < em2->start) 3559 return -1; 3560 else if (em1->start > em2->start) 3561 return 1; 3562 return 0; 3563 } 3564 3565 static int log_one_extent(struct btrfs_trans_handle *trans, 3566 struct inode *inode, struct btrfs_root *root, 3567 struct extent_map *em, struct btrfs_path *path, 3568 struct list_head *logged_list) 3569 { 3570 struct btrfs_root *log = root->log_root; 3571 struct btrfs_file_extent_item *fi; 3572 struct extent_buffer *leaf; 3573 struct btrfs_ordered_extent *ordered; 3574 struct list_head ordered_sums; 3575 struct btrfs_map_token token; 3576 struct btrfs_key key; 3577 u64 mod_start = em->mod_start; 3578 u64 mod_len = em->mod_len; 3579 u64 csum_offset; 3580 u64 csum_len; 3581 u64 extent_offset = em->start - em->orig_start; 3582 u64 block_len; 3583 int ret; 3584 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3585 int extent_inserted = 0; 3586 3587 INIT_LIST_HEAD(&ordered_sums); 3588 btrfs_init_map_token(&token); 3589 3590 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 3591 em->start + em->len, NULL, 0, 1, 3592 sizeof(*fi), &extent_inserted); 3593 if (ret) 3594 return ret; 3595 3596 if (!extent_inserted) { 3597 key.objectid = btrfs_ino(inode); 3598 key.type = BTRFS_EXTENT_DATA_KEY; 3599 key.offset = em->start; 3600 3601 ret = btrfs_insert_empty_item(trans, log, path, &key, 3602 sizeof(*fi)); 3603 if (ret) 3604 return ret; 3605 } 3606 leaf = path->nodes[0]; 3607 fi = btrfs_item_ptr(leaf, path->slots[0], 3608 struct btrfs_file_extent_item); 3609 3610 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3611 &token); 3612 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3613 skip_csum = true; 3614 btrfs_set_token_file_extent_type(leaf, fi, 3615 BTRFS_FILE_EXTENT_PREALLOC, 3616 &token); 3617 } else { 3618 btrfs_set_token_file_extent_type(leaf, fi, 3619 
BTRFS_FILE_EXTENT_REG, 3620 &token); 3621 if (em->block_start == EXTENT_MAP_HOLE) 3622 skip_csum = true; 3623 } 3624 3625 block_len = max(em->block_len, em->orig_block_len); 3626 if (em->compress_type != BTRFS_COMPRESS_NONE) { 3627 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3628 em->block_start, 3629 &token); 3630 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3631 &token); 3632 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 3633 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3634 em->block_start - 3635 extent_offset, &token); 3636 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3637 &token); 3638 } else { 3639 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 3640 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 3641 &token); 3642 } 3643 3644 btrfs_set_token_file_extent_offset(leaf, fi, 3645 em->start - em->orig_start, 3646 &token); 3647 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 3648 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 3649 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 3650 &token); 3651 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 3652 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 3653 btrfs_mark_buffer_dirty(leaf); 3654 3655 btrfs_release_path(path); 3656 if (ret) { 3657 return ret; 3658 } 3659 3660 if (skip_csum) 3661 return 0; 3662 3663 /* 3664 * First check and see if our csums are on our outstanding ordered 3665 * extents. 3666 */ 3667 list_for_each_entry(ordered, logged_list, log_list) { 3668 struct btrfs_ordered_sum *sum; 3669 3670 if (!mod_len) 3671 break; 3672 3673 if (ordered->file_offset + ordered->len <= mod_start || 3674 mod_start + mod_len <= ordered->file_offset) 3675 continue; 3676 3677 /* 3678 * We are going to copy all the csums on this ordered extent, so 3679 * go ahead and adjust mod_start and mod_len in case this 3680 * ordered extent has already been logged. 3681 */ 3682 if (ordered->file_offset > mod_start) { 3683 if (ordered->file_offset + ordered->len >= 3684 mod_start + mod_len) 3685 mod_len = ordered->file_offset - mod_start; 3686 /* 3687 * If we have this case 3688 * 3689 * |--------- logged extent ---------| 3690 * |----- ordered extent ----| 3691 * 3692 * Just don't mess with mod_start and mod_len, we'll 3693 * just end up logging more csums than we need and it 3694 * will be ok. 3695 */ 3696 } else { 3697 if (ordered->file_offset + ordered->len < 3698 mod_start + mod_len) { 3699 mod_len = (mod_start + mod_len) - 3700 (ordered->file_offset + ordered->len); 3701 mod_start = ordered->file_offset + 3702 ordered->len; 3703 } else { 3704 mod_len = 0; 3705 } 3706 } 3707 3708 /* 3709 * To keep us from looping for the above case of an ordered 3710 * extent that falls inside of the logged extent. 
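 * (the test_and_set_bit of BTRFS_ORDERED_LOGGED_CSUM below also means
 * each ordered extent contributes its csums at most once, even when it
 * overlaps more than one logged extent)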
3711 */ 3712 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3713 &ordered->flags)) 3714 continue; 3715 3716 if (ordered->csum_bytes_left) { 3717 btrfs_start_ordered_extent(inode, ordered, 0); 3718 wait_event(ordered->wait, 3719 ordered->csum_bytes_left == 0); 3720 } 3721 3722 list_for_each_entry(sum, &ordered->list, list) { 3723 ret = btrfs_csum_file_blocks(trans, log, sum); 3724 if (ret) 3725 goto unlocked; 3726 } 3727 3728 } 3729 unlocked: 3730 3731 if (!mod_len || ret) 3732 return ret; 3733 3734 if (em->compress_type) { 3735 csum_offset = 0; 3736 csum_len = block_len; 3737 } else { 3738 csum_offset = mod_start - em->start; 3739 csum_len = mod_len; 3740 } 3741 3742 /* block start is already adjusted for the file extent offset. */ 3743 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3744 em->block_start + csum_offset, 3745 em->block_start + csum_offset + 3746 csum_len - 1, &ordered_sums, 0); 3747 if (ret) 3748 return ret; 3749 3750 while (!list_empty(&ordered_sums)) { 3751 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3752 struct btrfs_ordered_sum, 3753 list); 3754 if (!ret) 3755 ret = btrfs_csum_file_blocks(trans, log, sums); 3756 list_del(&sums->list); 3757 kfree(sums); 3758 } 3759 3760 return ret; 3761 } 3762 3763 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3764 struct btrfs_root *root, 3765 struct inode *inode, 3766 struct btrfs_path *path, 3767 struct list_head *logged_list) 3768 { 3769 struct extent_map *em, *n; 3770 struct list_head extents; 3771 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3772 u64 test_gen; 3773 int ret = 0; 3774 int num = 0; 3775 3776 INIT_LIST_HEAD(&extents); 3777 3778 write_lock(&tree->lock); 3779 test_gen = root->fs_info->last_trans_committed; 3780 3781 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 3782 list_del_init(&em->list); 3783 3784 /* 3785 * Just an arbitrary number, this can be really CPU intensive 3786 * once we start getting a lot of extents, and really once we 3787 * have a bunch of extents we just want to commit since it will 3788 * be faster. 3789 */ 3790 if (++num > 32768) { 3791 list_del_init(&tree->modified_extents); 3792 ret = -EFBIG; 3793 goto process; 3794 } 3795 3796 if (em->generation <= test_gen) 3797 continue; 3798 /* Need a ref to keep it from getting evicted from cache */ 3799 atomic_inc(&em->refs); 3800 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 3801 list_add_tail(&em->list, &extents); 3802 num++; 3803 } 3804 3805 list_sort(NULL, &extents, extent_cmp); 3806 3807 process: 3808 while (!list_empty(&extents)) { 3809 em = list_entry(extents.next, struct extent_map, list); 3810 3811 list_del_init(&em->list); 3812 3813 /* 3814 * If we had an error we just need to delete everybody from our 3815 * private list. 3816 */ 3817 if (ret) { 3818 clear_em_logging(tree, em); 3819 free_extent_map(em); 3820 continue; 3821 } 3822 3823 write_unlock(&tree->lock); 3824 3825 ret = log_one_extent(trans, inode, root, em, path, logged_list); 3826 write_lock(&tree->lock); 3827 clear_em_logging(tree, em); 3828 free_extent_map(em); 3829 } 3830 WARN_ON(!list_empty(&extents)); 3831 write_unlock(&tree->lock); 3832 3833 btrfs_release_path(path); 3834 return ret; 3835 } 3836 3837 /* log a single inode in the tree log. 3838 * At least one parent directory for this inode must exist in the tree 3839 * or be logged already. 3840 * 3841 * Any items from this inode changed by the current transaction are copied 3842 * to the log tree. 
An extra reference is taken on any extents in this 3843 * file, allowing us to avoid a whole pile of corner cases around logging 3844 * blocks that have been removed from the tree. 3845 * 3846 * See LOG_INODE_ALL and related defines for a description of what inode_only 3847 * does. 3848 * 3849 * This handles both files and directories. 3850 */ 3851 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 3852 struct btrfs_root *root, struct inode *inode, 3853 int inode_only) 3854 { 3855 struct btrfs_path *path; 3856 struct btrfs_path *dst_path; 3857 struct btrfs_key min_key; 3858 struct btrfs_key max_key; 3859 struct btrfs_root *log = root->log_root; 3860 struct extent_buffer *src = NULL; 3861 LIST_HEAD(logged_list); 3862 u64 last_extent = 0; 3863 int err = 0; 3864 int ret; 3865 int nritems; 3866 int ins_start_slot = 0; 3867 int ins_nr; 3868 bool fast_search = false; 3869 u64 ino = btrfs_ino(inode); 3870 3871 path = btrfs_alloc_path(); 3872 if (!path) 3873 return -ENOMEM; 3874 dst_path = btrfs_alloc_path(); 3875 if (!dst_path) { 3876 btrfs_free_path(path); 3877 return -ENOMEM; 3878 } 3879 3880 min_key.objectid = ino; 3881 min_key.type = BTRFS_INODE_ITEM_KEY; 3882 min_key.offset = 0; 3883 3884 max_key.objectid = ino; 3885 3886 3887 /* today the code can only do partial logging of directories */ 3888 if (S_ISDIR(inode->i_mode) || 3889 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3890 &BTRFS_I(inode)->runtime_flags) && 3891 inode_only == LOG_INODE_EXISTS)) 3892 max_key.type = BTRFS_XATTR_ITEM_KEY; 3893 else 3894 max_key.type = (u8)-1; 3895 max_key.offset = (u64)-1; 3896 3897 /* Only run delayed items if we are a dir or a new file */ 3898 if (S_ISDIR(inode->i_mode) || 3899 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { 3900 ret = btrfs_commit_inode_delayed_items(trans, inode); 3901 if (ret) { 3902 btrfs_free_path(path); 3903 btrfs_free_path(dst_path); 3904 return ret; 3905 } 3906 } 3907 3908 mutex_lock(&BTRFS_I(inode)->log_mutex); 3909 3910 btrfs_get_logged_extents(inode, &logged_list); 3911 3912 /* 3913 * a brute force approach to making sure we get the most uptodate 3914 * copies of everything. 
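 *
 * i.e. whatever this inode may already have sitting in the log from an
 * earlier fsync in this transaction is dropped (or, on the fast path,
 * just the inode item is rewritten) and then recopied from the
 * subvolume tree below, rather than trying to diff the two.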
3915 */ 3916 if (S_ISDIR(inode->i_mode)) { 3917 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 3918 3919 if (inode_only == LOG_INODE_EXISTS) 3920 max_key_type = BTRFS_XATTR_ITEM_KEY; 3921 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3922 } else { 3923 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3924 &BTRFS_I(inode)->runtime_flags)) { 3925 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3926 &BTRFS_I(inode)->runtime_flags); 3927 ret = btrfs_truncate_inode_items(trans, log, 3928 inode, 0, 0); 3929 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3930 &BTRFS_I(inode)->runtime_flags) || 3931 inode_only == LOG_INODE_EXISTS) { 3932 if (inode_only == LOG_INODE_ALL) 3933 fast_search = true; 3934 max_key.type = BTRFS_XATTR_ITEM_KEY; 3935 ret = drop_objectid_items(trans, log, path, ino, 3936 max_key.type); 3937 } else { 3938 if (inode_only == LOG_INODE_ALL) 3939 fast_search = true; 3940 ret = log_inode_item(trans, log, dst_path, inode); 3941 if (ret) { 3942 err = ret; 3943 goto out_unlock; 3944 } 3945 goto log_extents; 3946 } 3947 3948 } 3949 if (ret) { 3950 err = ret; 3951 goto out_unlock; 3952 } 3953 path->keep_locks = 1; 3954 3955 while (1) { 3956 ins_nr = 0; 3957 ret = btrfs_search_forward(root, &min_key, 3958 path, trans->transid); 3959 if (ret != 0) 3960 break; 3961 again: 3962 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 3963 if (min_key.objectid != ino) 3964 break; 3965 if (min_key.type > max_key.type) 3966 break; 3967 3968 src = path->nodes[0]; 3969 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 3970 ins_nr++; 3971 goto next_slot; 3972 } else if (!ins_nr) { 3973 ins_start_slot = path->slots[0]; 3974 ins_nr = 1; 3975 goto next_slot; 3976 } 3977 3978 ret = copy_items(trans, inode, dst_path, path, &last_extent, 3979 ins_start_slot, ins_nr, inode_only); 3980 if (ret < 0) { 3981 err = ret; 3982 goto out_unlock; 3983 } if (ret) { 3984 ins_nr = 0; 3985 btrfs_release_path(path); 3986 continue; 3987 } 3988 ins_nr = 1; 3989 ins_start_slot = path->slots[0]; 3990 next_slot: 3991 3992 nritems = btrfs_header_nritems(path->nodes[0]); 3993 path->slots[0]++; 3994 if (path->slots[0] < nritems) { 3995 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 3996 path->slots[0]); 3997 goto again; 3998 } 3999 if (ins_nr) { 4000 ret = copy_items(trans, inode, dst_path, path, 4001 &last_extent, ins_start_slot, 4002 ins_nr, inode_only); 4003 if (ret < 0) { 4004 err = ret; 4005 goto out_unlock; 4006 } 4007 ret = 0; 4008 ins_nr = 0; 4009 } 4010 btrfs_release_path(path); 4011 4012 if (min_key.offset < (u64)-1) { 4013 min_key.offset++; 4014 } else if (min_key.type < max_key.type) { 4015 min_key.type++; 4016 min_key.offset = 0; 4017 } else { 4018 break; 4019 } 4020 } 4021 if (ins_nr) { 4022 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4023 ins_start_slot, ins_nr, inode_only); 4024 if (ret < 0) { 4025 err = ret; 4026 goto out_unlock; 4027 } 4028 ret = 0; 4029 ins_nr = 0; 4030 } 4031 4032 log_extents: 4033 btrfs_release_path(path); 4034 btrfs_release_path(dst_path); 4035 if (fast_search) { 4036 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4037 &logged_list); 4038 if (ret) { 4039 err = ret; 4040 goto out_unlock; 4041 } 4042 } else if (inode_only == LOG_INODE_ALL) { 4043 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 4044 struct extent_map *em, *n; 4045 4046 write_lock(&tree->lock); 4047 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 4048 list_del_init(&em->list); 4049 write_unlock(&tree->lock); 4050 } 4051 4052 if 
	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
		ret = log_directory_changes(trans, root, inode, path, dst_path);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	}
	BTRFS_I(inode)->logged_trans = trans->transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
	if (unlikely(err))
		btrfs_put_logged_extents(&logged_list);
	else
		btrfs_submit_logged_extents(&logged_list, log);
	mutex_unlock(&BTRFS_I(inode)->log_mutex);

	btrfs_free_path(path);
	btrfs_free_path(dst_path);
	return err;
}

/*
 * follow the dentry parent pointers up the chain and see if any
 * of the directories in it require a full commit before they can
 * be logged.  Returns zero if nothing special needs to be done or 1 if
 * a full commit is required.
 */
static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
					       struct inode *inode,
					       struct dentry *parent,
					       struct super_block *sb,
					       u64 last_committed)
{
	int ret = 0;
	struct btrfs_root *root;
	struct dentry *old_parent = NULL;
	struct inode *orig_inode = inode;

	/*
	 * for a regular file, if its inode is already on disk, we don't
	 * have to worry about the parents at all.  This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
		goto out;

	if (!S_ISDIR(inode->i_mode)) {
		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			goto out;
		inode = parent->d_inode;
	}

	while (1) {
		/*
		 * If we are logging a directory then we start with our inode,
		 * not our parent's inode, so we need to skip setting the
		 * logged_trans so that further down in the log code we don't
		 * think this inode has already been logged.
		 */
		if (inode != orig_inode)
			BTRFS_I(inode)->logged_trans = trans->transid;
		smp_mb();

		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
			root = BTRFS_I(inode)->root;

			/*
			 * make sure any commits to the log are forced
			 * to be full commits
			 */
			btrfs_set_log_full_commit(root->fs_info, trans);
			ret = 1;
			break;
		}

		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			break;

		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
		inode = parent->d_inode;
	}
	dput(old_parent);
out:
	return ret;
}
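
/*
 * Example (a sketch): after "mv foo/some_dir foo2/some_dir", foo's
 * last_unlink_trans is bumped to the current transid by
 * btrfs_record_unlink_dir().  An fsync of a file inside foo before the
 * next commit then makes the walk above return 1, and the caller falls
 * back to a full transaction commit instead of a log commit.
 */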

/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log.  A minimal, inode-and-backref
 * only logging is done for any parent directory that is newer than the
 * last committed transaction.
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root, struct inode *inode,
				  struct dentry *parent, int exists_only,
				  struct btrfs_log_ctx *ctx)
{
	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
	struct super_block *sb;
	struct dentry *old_parent = NULL;
	int ret = 0;
	u64 last_committed = root->fs_info->last_trans_committed;

	sb = inode->i_sb;

	if (btrfs_test_opt(root, NOTREELOG)) {
		ret = 1;
		goto end_no_trans;
	}

	/*
	 * If the previous transaction commit didn't complete, we have
	 * to do the full commit ourselves.
	 */
	if (root->fs_info->last_trans_log_full_commit >
	    root->fs_info->last_trans_committed) {
		ret = 1;
		goto end_no_trans;
	}

	if (root != BTRFS_I(inode)->root ||
	    btrfs_root_refs(&root->root_item) == 0) {
		ret = 1;
		goto end_no_trans;
	}

	ret = check_parent_dirs_for_sync(trans, inode, parent,
					 sb, last_committed);
	if (ret)
		goto end_no_trans;

	if (btrfs_inode_in_log(inode, trans->transid)) {
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only);
	if (ret)
		goto end_trans;

	/*
	 * for a regular file, if its inode is already on disk, we don't
	 * have to worry about the parents at all.  This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
		ret = 0;
		goto end_trans;
	}

	inode_only = LOG_INODE_EXISTS;
	while (1) {
		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			break;

		inode = parent->d_inode;
		if (root != BTRFS_I(inode)->root)
			break;

		if (BTRFS_I(inode)->generation >
		    root->fs_info->last_trans_committed) {
			ret = btrfs_log_inode(trans, root, inode, inode_only);
			if (ret)
				goto end_trans;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	ret = 0;
end_trans:
	dput(old_parent);
	if (ret < 0) {
		btrfs_set_log_full_commit(root->fs_info, trans);
		ret = 1;
	}

	if (ret)
		btrfs_remove_log_ctx(root, ctx);
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}
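
/*
 * The entry point below is typically driven from the fsync path,
 * roughly like this (a sketch, not the verbatim caller):
 *
 *	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx);
 *	else if (ret != BTRFS_NO_LOG_SYNC)
 *		ret = btrfs_commit_transaction(trans, root);
 */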

/*
 * it is not safe to log dentry if the chunk root has added new
 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
				     0, ctx);
	dput(parent);

	return ret;
}

/*
 * should be called during mount to recover and replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key tmp_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = 0,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	fs_info->log_root_recovering = 1;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_error(fs_info, ret, "Failed to pin buffers while "
			    "recovering log root tree.");
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_error(fs_info, ret,
				    "Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_fs_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_error(fs_info, ret,
				    "Couldn't read tree log root.");
			goto error;
		}

		tmp_key.objectid = found_key.offset;
		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
		tmp_key.offset = (u64)-1;

		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);
			free_extent_buffer(log->node);
			free_extent_buffer(log->commit_root);
			kfree(log);
			btrfs_error(fs_info, ret, "Couldn't read target root "
				    "for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		key.offset = found_key.offset - 1;
		wc.replay_dest->log_root = NULL;
		free_extent_buffer(log->node);
		free_extent_buffer(log->commit_root);
		kfree(log);

		if (ret)
			goto error;

		if (found_key.offset == 0)
			break;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
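
	/*
	 * Each "goto again" above and below re-walks every log tree in
	 * the next stage: LOG_WALK_REPLAY_INODES, then
	 * LOG_WALK_REPLAY_DIR_INDEX, and finally LOG_WALK_REPLAY_ALL.
	 */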
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans, fs_info->tree_root);
	if (ret)
		return ret;

	free_extent_buffer(log_root_tree->node);
	log_root_tree->log_root = NULL;
	fs_info->log_root_recovering = 0;
	kfree(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans, fs_info->tree_root);
	btrfs_free_path(path);
	return ret;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct inode *dir, struct inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file.  When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this directory was already logged, any new
	 * names for this file/dir will get recorded
	 */
	smp_mb();
	if (BTRFS_I(dir)->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (BTRFS_I(inode)->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly.  So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	BTRFS_I(dir)->last_unlink_trans = trans->transid;
}
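
/*
 * Example (a sketch, assuming the usual rename path): for
 * "mv dir1/file dir2/file" the rename code calls
 * btrfs_record_unlink_dir(trans, dir1, file, 1), so dir1's
 * last_unlink_trans forces a full commit if dir1 is fsync'd before the
 * new name is safely on disk.
 */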

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 *
 * It will return zero if all goes well, and it will return 1 if a
 * full transaction commit is required.
 */
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		       struct inode *inode, struct inode *old_dir,
		       struct dentry *parent)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're
	 * renaming it from hasn't been logged, we don't need to log it.
	 * old_dir may be NULL, e.g. when the new name comes from a link
	 * rather than a rename.
	 */
	if (BTRFS_I(inode)->logged_trans <=
	    root->fs_info->last_trans_committed &&
	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
		    root->fs_info->last_trans_committed))
		return 0;

	return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
}
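
/*
 * Typical call sites (a sketch, assuming the usual inode.c callers):
 *
 *	rename:	btrfs_log_new_name(trans, old_inode, old_dir, parent);
 *	link:	btrfs_log_new_name(trans, inode, NULL, parent);
 */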