1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include "ctree.h" 21 #include "transaction.h" 22 #include "disk-io.h" 23 #include "locking.h" 24 #include "print-tree.h" 25 #include "compat.h" 26 #include "tree-log.h" 27 28 /* magic values for the inode_only field in btrfs_log_inode: 29 * 30 * LOG_INODE_ALL means to log everything 31 * LOG_INODE_EXISTS means to log just enough to recreate the inode 32 * during log replay 33 */ 34 #define LOG_INODE_ALL 0 35 #define LOG_INODE_EXISTS 1 36 37 /* 38 * stages for the tree walking. The first 39 * stage (0) is to only pin down the blocks we find 40 * the second stage (1) is to make sure that all the inodes 41 * we find in the log are created in the subvolume. 
42 * 43 * The last stage is to deal with directories and links and extents 44 * and all the other fun semantics 45 */ 46 #define LOG_WALK_PIN_ONLY 0 47 #define LOG_WALK_REPLAY_INODES 1 48 #define LOG_WALK_REPLAY_ALL 2 49 50 static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 51 struct btrfs_root *root, struct inode *inode, 52 int inode_only); 53 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 54 struct btrfs_root *root, 55 struct btrfs_path *path, u64 objectid); 56 57 /* 58 * tree logging is a special write ahead log used to make sure that 59 * fsyncs and O_SYNCs can happen without doing full tree commits. 60 * 61 * Full tree commits are expensive because they require commonly 62 * modified blocks to be recowed, creating many dirty pages in the 63 * extent tree and a 4x-6x higher write load than ext3. 64 * 65 * Instead of doing a tree commit on every fsync, we use the 66 * key ranges and transaction ids to find items for a given file or directory 67 * that have changed in this transaction. Those items are copied into 68 * a special tree (one per subvolume root), that tree is written to disk 69 * and then the fsync is considered complete. 70 * 71 * After a crash, items are copied out of the log-tree back into the 72 * subvolume tree. Any file data extents found are recorded in the extent 73 * allocation tree, and the log-tree freed. 74 * 75 * The log tree is read three times, once to pin down all the extents it is 76 * using in ram, once to create all the inodes logged in the tree 77 * and once to do all the other items. 78 */ 79 80 /* 81 * btrfs_add_log_tree adds a new per-subvolume log tree into the 82 * tree of log tree roots. This must be called with a tree log transaction 83 * running (see start_log_trans). 
84 */ 85 static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86 struct btrfs_root *root) 87 { 88 struct btrfs_key key; 89 struct btrfs_root_item root_item; 90 struct btrfs_inode_item *inode_item; 91 struct extent_buffer *leaf; 92 struct btrfs_root *new_root = root; 93 int ret; 94 u64 objectid = root->root_key.objectid; 95 96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 97 BTRFS_TREE_LOG_OBJECTID, 98 trans->transid, 0, 0, 0); 99 if (IS_ERR(leaf)) { 100 ret = PTR_ERR(leaf); 101 return ret; 102 } 103 104 btrfs_set_header_nritems(leaf, 0); 105 btrfs_set_header_level(leaf, 0); 106 btrfs_set_header_bytenr(leaf, leaf->start); 107 btrfs_set_header_generation(leaf, trans->transid); 108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); 109 110 write_extent_buffer(leaf, root->fs_info->fsid, 111 (unsigned long)btrfs_header_fsid(leaf), 112 BTRFS_FSID_SIZE); 113 btrfs_mark_buffer_dirty(leaf); 114 115 inode_item = &root_item.inode; 116 memset(inode_item, 0, sizeof(*inode_item)); 117 inode_item->generation = cpu_to_le64(1); 118 inode_item->size = cpu_to_le64(3); 119 inode_item->nlink = cpu_to_le32(1); 120 inode_item->nbytes = cpu_to_le64(root->leafsize); 121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 122 123 btrfs_set_root_bytenr(&root_item, leaf->start); 124 btrfs_set_root_generation(&root_item, trans->transid); 125 btrfs_set_root_level(&root_item, 0); 126 btrfs_set_root_refs(&root_item, 0); 127 btrfs_set_root_used(&root_item, 0); 128 129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); 130 root_item.drop_level = 0; 131 132 btrfs_tree_unlock(leaf); 133 free_extent_buffer(leaf); 134 leaf = NULL; 135 136 btrfs_set_root_dirid(&root_item, 0); 137 138 key.objectid = BTRFS_TREE_LOG_OBJECTID; 139 key.offset = objectid; 140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, 142 &root_item); 143 if (ret) 144 goto fail; 145 146 new_root = 
btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, 147 &key); 148 BUG_ON(!new_root); 149 150 WARN_ON(root->log_root); 151 root->log_root = new_root; 152 153 /* 154 * log trees do not get reference counted because they go away 155 * before a real commit is actually done. They do store pointers 156 * to file data extents, and those reference counts still get 157 * updated (along with back refs to the log tree). 158 */ 159 new_root->ref_cows = 0; 160 new_root->last_trans = trans->transid; 161 fail: 162 return ret; 163 } 164 165 /* 166 * start a sub transaction and setup the log tree 167 * this increments the log tree writer count to make the people 168 * syncing the tree wait for us to finish 169 */ 170 static int start_log_trans(struct btrfs_trans_handle *trans, 171 struct btrfs_root *root) 172 { 173 int ret; 174 mutex_lock(&root->fs_info->tree_log_mutex); 175 if (!root->fs_info->log_root_tree) { 176 ret = btrfs_init_log_root_tree(trans, root->fs_info); 177 BUG_ON(ret); 178 } 179 if (!root->log_root) { 180 ret = btrfs_add_log_tree(trans, root); 181 BUG_ON(ret); 182 } 183 atomic_inc(&root->fs_info->tree_log_writers); 184 root->fs_info->tree_log_batch++; 185 mutex_unlock(&root->fs_info->tree_log_mutex); 186 return 0; 187 } 188 189 /* 190 * returns 0 if there was a log transaction running and we were able 191 * to join, or returns -ENOENT if there were not transactions 192 * in progress 193 */ 194 static int join_running_log_trans(struct btrfs_root *root) 195 { 196 int ret = -ENOENT; 197 198 smp_mb(); 199 if (!root->log_root) 200 return -ENOENT; 201 202 mutex_lock(&root->fs_info->tree_log_mutex); 203 if (root->log_root) { 204 ret = 0; 205 atomic_inc(&root->fs_info->tree_log_writers); 206 root->fs_info->tree_log_batch++; 207 } 208 mutex_unlock(&root->fs_info->tree_log_mutex); 209 return ret; 210 } 211 212 /* 213 * indicate we're done making changes to the log tree 214 * and wake up anyone waiting to do a sync 215 */ 216 static int end_log_trans(struct btrfs_root 
*root) 217 { 218 atomic_dec(&root->fs_info->tree_log_writers); 219 smp_mb(); 220 if (waitqueue_active(&root->fs_info->tree_log_wait)) 221 wake_up(&root->fs_info->tree_log_wait); 222 return 0; 223 } 224 225 226 /* 227 * the walk control struct is used to pass state down the chain when 228 * processing the log tree. The stage field tells us which part 229 * of the log tree processing we are currently doing. The others 230 * are state fields used for that specific part 231 */ 232 struct walk_control { 233 /* should we free the extent on disk when done? This is used 234 * at transaction commit time while freeing a log tree 235 */ 236 int free; 237 238 /* should we write out the extent buffer? This is used 239 * while flushing the log tree to disk during a sync 240 */ 241 int write; 242 243 /* should we wait for the extent buffer io to finish? Also used 244 * while flushing the log tree to disk for a sync 245 */ 246 int wait; 247 248 /* pin only walk, we record which extents on disk belong to the 249 * log trees 250 */ 251 int pin; 252 253 /* what stage of the replay code we're currently in */ 254 int stage; 255 256 /* the root we are currently replaying */ 257 struct btrfs_root *replay_dest; 258 259 /* the trans handle for the current replay */ 260 struct btrfs_trans_handle *trans; 261 262 /* the function that gets used to process blocks we find in the 263 * tree. 
Note the extent_buffer might not be up to date when it is 264 * passed in, and it must be checked or read if you need the data 265 * inside it 266 */ 267 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 268 struct walk_control *wc, u64 gen); 269 }; 270 271 /* 272 * process_func used to pin down extents, write them or wait on them 273 */ 274 static int process_one_buffer(struct btrfs_root *log, 275 struct extent_buffer *eb, 276 struct walk_control *wc, u64 gen) 277 { 278 if (wc->pin) { 279 mutex_lock(&log->fs_info->pinned_mutex); 280 btrfs_update_pinned_extents(log->fs_info->extent_root, 281 eb->start, eb->len, 1); 282 mutex_unlock(&log->fs_info->pinned_mutex); 283 } 284 285 if (btrfs_buffer_uptodate(eb, gen)) { 286 if (wc->write) 287 btrfs_write_tree_block(eb); 288 if (wc->wait) 289 btrfs_wait_tree_block_writeback(eb); 290 } 291 return 0; 292 } 293 294 /* 295 * Item overwrite used by replay and tree logging. eb, slot and key all refer 296 * to the src data we are copying out. 297 * 298 * root is the tree we are copying into, and path is a scratch 299 * path for use in this function (it should be released on entry and 300 * will be released on exit). 301 * 302 * If the key is already in the destination tree the existing item is 303 * overwritten. If the existing item isn't big enough, it is extended. 304 * If it is too large, it is truncated. 305 * 306 * If the key isn't in the destination yet, a new item is inserted. 
307 */ 308 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 309 struct btrfs_root *root, 310 struct btrfs_path *path, 311 struct extent_buffer *eb, int slot, 312 struct btrfs_key *key) 313 { 314 int ret; 315 u32 item_size; 316 u64 saved_i_size = 0; 317 int save_old_i_size = 0; 318 unsigned long src_ptr; 319 unsigned long dst_ptr; 320 int overwrite_root = 0; 321 322 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 323 overwrite_root = 1; 324 325 item_size = btrfs_item_size_nr(eb, slot); 326 src_ptr = btrfs_item_ptr_offset(eb, slot); 327 328 /* look for the key in the destination tree */ 329 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 330 if (ret == 0) { 331 char *src_copy; 332 char *dst_copy; 333 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 334 path->slots[0]); 335 if (dst_size != item_size) 336 goto insert; 337 338 if (item_size == 0) { 339 btrfs_release_path(root, path); 340 return 0; 341 } 342 dst_copy = kmalloc(item_size, GFP_NOFS); 343 src_copy = kmalloc(item_size, GFP_NOFS); 344 345 read_extent_buffer(eb, src_copy, src_ptr, item_size); 346 347 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 348 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 349 item_size); 350 ret = memcmp(dst_copy, src_copy, item_size); 351 352 kfree(dst_copy); 353 kfree(src_copy); 354 /* 355 * they have the same contents, just return, this saves 356 * us from cowing blocks in the destination tree and doing 357 * extra writes that may not have been done by a previous 358 * sync 359 */ 360 if (ret == 0) { 361 btrfs_release_path(root, path); 362 return 0; 363 } 364 365 } 366 insert: 367 btrfs_release_path(root, path); 368 /* try to insert the key into the destination tree */ 369 ret = btrfs_insert_empty_item(trans, root, path, 370 key, item_size); 371 372 /* make sure any existing item is the correct size */ 373 if (ret == -EEXIST) { 374 u32 found_size; 375 found_size = btrfs_item_size_nr(path->nodes[0], 376 path->slots[0]); 377 
if (found_size > item_size) { 378 btrfs_truncate_item(trans, root, path, item_size, 1); 379 } else if (found_size < item_size) { 380 ret = btrfs_extend_item(trans, root, path, 381 item_size - found_size); 382 BUG_ON(ret); 383 } 384 } else if (ret) { 385 BUG(); 386 } 387 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 388 path->slots[0]); 389 390 /* don't overwrite an existing inode if the generation number 391 * was logged as zero. This is done when the tree logging code 392 * is just logging an inode to make sure it exists after recovery. 393 * 394 * Also, don't overwrite i_size on directories during replay. 395 * log replay inserts and removes directory items based on the 396 * state of the tree found in the subvolume, and i_size is modified 397 * as it goes 398 */ 399 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 400 struct btrfs_inode_item *src_item; 401 struct btrfs_inode_item *dst_item; 402 403 src_item = (struct btrfs_inode_item *)src_ptr; 404 dst_item = (struct btrfs_inode_item *)dst_ptr; 405 406 if (btrfs_inode_generation(eb, src_item) == 0) 407 goto no_copy; 408 409 if (overwrite_root && 410 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 411 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 412 save_old_i_size = 1; 413 saved_i_size = btrfs_inode_size(path->nodes[0], 414 dst_item); 415 } 416 } 417 418 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 419 src_ptr, item_size); 420 421 if (save_old_i_size) { 422 struct btrfs_inode_item *dst_item; 423 dst_item = (struct btrfs_inode_item *)dst_ptr; 424 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 425 } 426 427 /* make sure the generation is filled in */ 428 if (key->type == BTRFS_INODE_ITEM_KEY) { 429 struct btrfs_inode_item *dst_item; 430 dst_item = (struct btrfs_inode_item *)dst_ptr; 431 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 432 btrfs_set_inode_generation(path->nodes[0], dst_item, 433 trans->transid); 434 } 435 } 436 437 if (overwrite_root && 438 key->type 
== BTRFS_EXTENT_DATA_KEY) { 439 int extent_type; 440 struct btrfs_file_extent_item *fi; 441 442 fi = (struct btrfs_file_extent_item *)dst_ptr; 443 extent_type = btrfs_file_extent_type(path->nodes[0], fi); 444 if (extent_type == BTRFS_FILE_EXTENT_REG || 445 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 446 struct btrfs_key ins; 447 ins.objectid = btrfs_file_extent_disk_bytenr( 448 path->nodes[0], fi); 449 ins.offset = btrfs_file_extent_disk_num_bytes( 450 path->nodes[0], fi); 451 ins.type = BTRFS_EXTENT_ITEM_KEY; 452 453 /* 454 * is this extent already allocated in the extent 455 * allocation tree? If so, just add a reference 456 */ 457 ret = btrfs_lookup_extent(root, ins.objectid, 458 ins.offset); 459 if (ret == 0) { 460 ret = btrfs_inc_extent_ref(trans, root, 461 ins.objectid, ins.offset, 462 path->nodes[0]->start, 463 root->root_key.objectid, 464 trans->transid, key->objectid); 465 } else { 466 /* 467 * insert the extent pointer in the extent 468 * allocation tree 469 */ 470 ret = btrfs_alloc_logged_extent(trans, root, 471 path->nodes[0]->start, 472 root->root_key.objectid, 473 trans->transid, key->objectid, 474 &ins); 475 BUG_ON(ret); 476 } 477 } 478 } 479 no_copy: 480 btrfs_mark_buffer_dirty(path->nodes[0]); 481 btrfs_release_path(root, path); 482 return 0; 483 } 484 485 /* 486 * simple helper to read an inode off the disk from a given root 487 * This can only be called for subvolume roots and not for the log 488 */ 489 static noinline struct inode *read_one_inode(struct btrfs_root *root, 490 u64 objectid) 491 { 492 struct inode *inode; 493 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); 494 if (inode->i_state & I_NEW) { 495 BTRFS_I(inode)->root = root; 496 BTRFS_I(inode)->location.objectid = objectid; 497 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; 498 BTRFS_I(inode)->location.offset = 0; 499 btrfs_read_locked_inode(inode); 500 unlock_new_inode(inode); 501 502 } 503 if (is_bad_inode(inode)) { 504 iput(inode); 505 inode = NULL; 506 } 507 
return inode; 508 } 509 510 /* replays a single extent in 'eb' at 'slot' with 'key' into the 511 * subvolume 'root'. path is released on entry and should be released 512 * on exit. 513 * 514 * extents in the log tree have not been allocated out of the extent 515 * tree yet. So, this completes the allocation, taking a reference 516 * as required if the extent already exists or creating a new extent 517 * if it isn't in the extent allocation tree yet. 518 * 519 * The extent is inserted into the file, dropping any existing extents 520 * from the file that overlap the new one. 521 */ 522 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 523 struct btrfs_root *root, 524 struct btrfs_path *path, 525 struct extent_buffer *eb, int slot, 526 struct btrfs_key *key) 527 { 528 int found_type; 529 u64 mask = root->sectorsize - 1; 530 u64 extent_end; 531 u64 alloc_hint; 532 u64 start = key->offset; 533 struct btrfs_file_extent_item *item; 534 struct inode *inode = NULL; 535 unsigned long size; 536 int ret = 0; 537 538 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 539 found_type = btrfs_file_extent_type(eb, item); 540 541 if (found_type == BTRFS_FILE_EXTENT_REG || 542 found_type == BTRFS_FILE_EXTENT_PREALLOC) 543 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 544 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 545 size = btrfs_file_extent_inline_len(eb, item); 546 extent_end = (start + size + mask) & ~mask; 547 } else { 548 ret = 0; 549 goto out; 550 } 551 552 inode = read_one_inode(root, key->objectid); 553 if (!inode) { 554 ret = -EIO; 555 goto out; 556 } 557 558 /* 559 * first check to see if we already have this extent in the 560 * file. This must be done before the btrfs_drop_extents run 561 * so we don't try to drop this extent. 
562 */ 563 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 564 start, 0); 565 566 if (ret == 0 && 567 (found_type == BTRFS_FILE_EXTENT_REG || 568 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 569 struct btrfs_file_extent_item cmp1; 570 struct btrfs_file_extent_item cmp2; 571 struct btrfs_file_extent_item *existing; 572 struct extent_buffer *leaf; 573 574 leaf = path->nodes[0]; 575 existing = btrfs_item_ptr(leaf, path->slots[0], 576 struct btrfs_file_extent_item); 577 578 read_extent_buffer(eb, &cmp1, (unsigned long)item, 579 sizeof(cmp1)); 580 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 581 sizeof(cmp2)); 582 583 /* 584 * we already have a pointer to this exact extent, 585 * we don't have to do anything 586 */ 587 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 588 btrfs_release_path(root, path); 589 goto out; 590 } 591 } 592 btrfs_release_path(root, path); 593 594 /* drop any overlapping extents */ 595 ret = btrfs_drop_extents(trans, root, inode, 596 start, extent_end, start, &alloc_hint); 597 BUG_ON(ret); 598 599 /* insert the extent */ 600 ret = overwrite_item(trans, root, path, eb, slot, key); 601 BUG_ON(ret); 602 603 /* btrfs_drop_extents changes i_bytes & i_blocks, update it here */ 604 inode_add_bytes(inode, extent_end - start); 605 btrfs_update_inode(trans, root, inode); 606 out: 607 if (inode) 608 iput(inode); 609 return ret; 610 } 611 612 /* 613 * when cleaning up conflicts between the directory names in the 614 * subvolume, directory names in the log and directory names in the 615 * inode back references, we may have to unlink inodes from directories. 
616 * 617 * This is a helper function to do the unlink of a specific directory 618 * item 619 */ 620 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 621 struct btrfs_root *root, 622 struct btrfs_path *path, 623 struct inode *dir, 624 struct btrfs_dir_item *di) 625 { 626 struct inode *inode; 627 char *name; 628 int name_len; 629 struct extent_buffer *leaf; 630 struct btrfs_key location; 631 int ret; 632 633 leaf = path->nodes[0]; 634 635 btrfs_dir_item_key_to_cpu(leaf, di, &location); 636 name_len = btrfs_dir_name_len(leaf, di); 637 name = kmalloc(name_len, GFP_NOFS); 638 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 639 btrfs_release_path(root, path); 640 641 inode = read_one_inode(root, location.objectid); 642 BUG_ON(!inode); 643 644 ret = link_to_fixup_dir(trans, root, path, location.objectid); 645 BUG_ON(ret); 646 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 647 BUG_ON(ret); 648 kfree(name); 649 650 iput(inode); 651 return ret; 652 } 653 654 /* 655 * helper function to see if a given name and sequence number found 656 * in an inode back reference are already in a directory and correctly 657 * point to this inode 658 */ 659 static noinline int inode_in_dir(struct btrfs_root *root, 660 struct btrfs_path *path, 661 u64 dirid, u64 objectid, u64 index, 662 const char *name, int name_len) 663 { 664 struct btrfs_dir_item *di; 665 struct btrfs_key location; 666 int match = 0; 667 668 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 669 index, name, name_len, 0); 670 if (di && !IS_ERR(di)) { 671 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 672 if (location.objectid != objectid) 673 goto out; 674 } else 675 goto out; 676 btrfs_release_path(root, path); 677 678 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 679 if (di && !IS_ERR(di)) { 680 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 681 if (location.objectid != objectid) 682 goto out; 683 } else 
684 goto out; 685 match = 1; 686 out: 687 btrfs_release_path(root, path); 688 return match; 689 } 690 691 /* 692 * helper function to check a log tree for a named back reference in 693 * an inode. This is used to decide if a back reference that is 694 * found in the subvolume conflicts with what we find in the log. 695 * 696 * inode backreferences may have multiple refs in a single item, 697 * during replay we process one reference at a time, and we don't 698 * want to delete valid links to a file from the subvolume if that 699 * link is also in the log. 700 */ 701 static noinline int backref_in_log(struct btrfs_root *log, 702 struct btrfs_key *key, 703 char *name, int namelen) 704 { 705 struct btrfs_path *path; 706 struct btrfs_inode_ref *ref; 707 unsigned long ptr; 708 unsigned long ptr_end; 709 unsigned long name_ptr; 710 int found_name_len; 711 int item_size; 712 int ret; 713 int match = 0; 714 715 path = btrfs_alloc_path(); 716 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 717 if (ret != 0) 718 goto out; 719 720 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 721 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 722 ptr_end = ptr + item_size; 723 while (ptr < ptr_end) { 724 ref = (struct btrfs_inode_ref *)ptr; 725 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 726 if (found_name_len == namelen) { 727 name_ptr = (unsigned long)(ref + 1); 728 ret = memcmp_extent_buffer(path->nodes[0], name, 729 name_ptr, namelen); 730 if (ret == 0) { 731 match = 1; 732 goto out; 733 } 734 } 735 ptr = (unsigned long)(ref + 1) + found_name_len; 736 } 737 out: 738 btrfs_free_path(path); 739 return match; 740 } 741 742 743 /* 744 * replay one inode back reference item found in the log tree. 745 * eb, slot and key refer to the buffer and key found in the log tree. 746 * root is the destination we are replaying into, and path is for temp 747 * use by this function. (it should be released on return). 
748 */ 749 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 750 struct btrfs_root *root, 751 struct btrfs_root *log, 752 struct btrfs_path *path, 753 struct extent_buffer *eb, int slot, 754 struct btrfs_key *key) 755 { 756 struct inode *dir; 757 int ret; 758 struct btrfs_key location; 759 struct btrfs_inode_ref *ref; 760 struct btrfs_dir_item *di; 761 struct inode *inode; 762 char *name; 763 int namelen; 764 unsigned long ref_ptr; 765 unsigned long ref_end; 766 767 location.objectid = key->objectid; 768 location.type = BTRFS_INODE_ITEM_KEY; 769 location.offset = 0; 770 771 /* 772 * it is possible that we didn't log all the parent directories 773 * for a given inode. If we don't find the dir, just don't 774 * copy the back ref in. The link count fixup code will take 775 * care of the rest 776 */ 777 dir = read_one_inode(root, key->offset); 778 if (!dir) 779 return -ENOENT; 780 781 inode = read_one_inode(root, key->objectid); 782 BUG_ON(!dir); 783 784 ref_ptr = btrfs_item_ptr_offset(eb, slot); 785 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 786 787 again: 788 ref = (struct btrfs_inode_ref *)ref_ptr; 789 790 namelen = btrfs_inode_ref_name_len(eb, ref); 791 name = kmalloc(namelen, GFP_NOFS); 792 BUG_ON(!name); 793 794 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 795 796 /* if we already have a perfect match, we're done */ 797 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 798 btrfs_inode_ref_index(eb, ref), 799 name, namelen)) { 800 goto out; 801 } 802 803 /* 804 * look for a conflicting back reference in the metadata. 805 * if we find one we have to unlink that name of the file 806 * before we add our new link. Later on, we overwrite any 807 * existing back reference, and we don't want to create 808 * dangling pointers in the directory. 
809 */ 810 conflict_again: 811 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 812 if (ret == 0) { 813 char *victim_name; 814 int victim_name_len; 815 struct btrfs_inode_ref *victim_ref; 816 unsigned long ptr; 817 unsigned long ptr_end; 818 struct extent_buffer *leaf = path->nodes[0]; 819 820 /* are we trying to overwrite a back ref for the root directory 821 * if so, just jump out, we're done 822 */ 823 if (key->objectid == key->offset) 824 goto out_nowrite; 825 826 /* check all the names in this back reference to see 827 * if they are in the log. if so, we allow them to stay 828 * otherwise they must be unlinked as a conflict 829 */ 830 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 831 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 832 while (ptr < ptr_end) { 833 victim_ref = (struct btrfs_inode_ref *)ptr; 834 victim_name_len = btrfs_inode_ref_name_len(leaf, 835 victim_ref); 836 victim_name = kmalloc(victim_name_len, GFP_NOFS); 837 BUG_ON(!victim_name); 838 839 read_extent_buffer(leaf, victim_name, 840 (unsigned long)(victim_ref + 1), 841 victim_name_len); 842 843 if (!backref_in_log(log, key, victim_name, 844 victim_name_len)) { 845 btrfs_inc_nlink(inode); 846 btrfs_release_path(root, path); 847 ret = btrfs_unlink_inode(trans, root, dir, 848 inode, victim_name, 849 victim_name_len); 850 kfree(victim_name); 851 btrfs_release_path(root, path); 852 goto conflict_again; 853 } 854 kfree(victim_name); 855 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 856 } 857 BUG_ON(ret); 858 } 859 btrfs_release_path(root, path); 860 861 /* look for a conflicting sequence number */ 862 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 863 btrfs_inode_ref_index(eb, ref), 864 name, namelen, 0); 865 if (di && !IS_ERR(di)) { 866 ret = drop_one_dir_item(trans, root, path, dir, di); 867 BUG_ON(ret); 868 } 869 btrfs_release_path(root, path); 870 871 872 /* look for a conflicting name */ 873 di = btrfs_lookup_dir_item(trans, root, path, 
dir->i_ino, 874 name, namelen, 0); 875 if (di && !IS_ERR(di)) { 876 ret = drop_one_dir_item(trans, root, path, dir, di); 877 BUG_ON(ret); 878 } 879 btrfs_release_path(root, path); 880 881 /* insert our name */ 882 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 883 btrfs_inode_ref_index(eb, ref)); 884 BUG_ON(ret); 885 886 btrfs_update_inode(trans, root, inode); 887 888 out: 889 ref_ptr = (unsigned long)(ref + 1) + namelen; 890 kfree(name); 891 if (ref_ptr < ref_end) 892 goto again; 893 894 /* finally write the back reference in the inode */ 895 ret = overwrite_item(trans, root, path, eb, slot, key); 896 BUG_ON(ret); 897 898 out_nowrite: 899 btrfs_release_path(root, path); 900 iput(dir); 901 iput(inode); 902 return 0; 903 } 904 905 /* 906 * replay one csum item from the log tree into the subvolume 'root' 907 * eb, slot and key all refer to the log tree 908 * path is for temp use by this function and should be released on return 909 * 910 * This copies the checksums out of the log tree and inserts them into 911 * the subvolume. Any existing checksums for this range in the file 912 * are overwritten, and new items are added where required. 913 * 914 * We keep this simple by reusing the btrfs_ordered_sum code from 915 * the data=ordered mode. This basically means making a copy 916 * of all the checksums in ram, which we have to do anyway for kmap 917 * rules. 918 * 919 * The copy is then sent down to btrfs_csum_file_blocks, which 920 * does all the hard work of finding existing items in the file 921 * or adding new ones. 
922 */ 923 static noinline int replay_one_csum(struct btrfs_trans_handle *trans, 924 struct btrfs_root *root, 925 struct btrfs_path *path, 926 struct extent_buffer *eb, int slot, 927 struct btrfs_key *key) 928 { 929 int ret; 930 u32 item_size = btrfs_item_size_nr(eb, slot); 931 u64 cur_offset; 932 u16 csum_size = 933 btrfs_super_csum_size(&root->fs_info->super_copy); 934 unsigned long file_bytes; 935 struct btrfs_ordered_sum *sums; 936 struct btrfs_sector_sum *sector_sum; 937 unsigned long ptr; 938 939 file_bytes = (item_size / csum_size) * root->sectorsize; 940 sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS); 941 if (!sums) 942 return -ENOMEM; 943 944 INIT_LIST_HEAD(&sums->list); 945 sums->len = file_bytes; 946 sums->bytenr = key->offset; 947 948 /* 949 * copy all the sums into the ordered sum struct 950 */ 951 sector_sum = sums->sums; 952 cur_offset = key->offset; 953 ptr = btrfs_item_ptr_offset(eb, slot); 954 while (item_size > 0) { 955 sector_sum->bytenr = cur_offset; 956 read_extent_buffer(eb, §or_sum->sum, ptr, csum_size); 957 sector_sum++; 958 item_size -= csum_size; 959 ptr += csum_size; 960 cur_offset += root->sectorsize; 961 } 962 963 /* let btrfs_csum_file_blocks add them into the file */ 964 ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums); 965 BUG_ON(ret); 966 kfree(sums); 967 return 0; 968 } 969 /* 970 * There are a few corners where the link count of the file can't 971 * be properly maintained during replay. So, instead of adding 972 * lots of complexity to the log code, we just scan the backrefs 973 * for any file that has been through replay. 974 * 975 * The scan will update the link count on the inode to reflect the 976 * number of back refs found. If it goes down to zero, the iput 977 * will free the inode. 
978 */ 979 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 980 struct btrfs_root *root, 981 struct inode *inode) 982 { 983 struct btrfs_path *path; 984 int ret; 985 struct btrfs_key key; 986 u64 nlink = 0; 987 unsigned long ptr; 988 unsigned long ptr_end; 989 int name_len; 990 991 key.objectid = inode->i_ino; 992 key.type = BTRFS_INODE_REF_KEY; 993 key.offset = (u64)-1; 994 995 path = btrfs_alloc_path(); 996 997 while (1) { 998 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 999 if (ret < 0) 1000 break; 1001 if (ret > 0) { 1002 if (path->slots[0] == 0) 1003 break; 1004 path->slots[0]--; 1005 } 1006 btrfs_item_key_to_cpu(path->nodes[0], &key, 1007 path->slots[0]); 1008 if (key.objectid != inode->i_ino || 1009 key.type != BTRFS_INODE_REF_KEY) 1010 break; 1011 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1012 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 1013 path->slots[0]); 1014 while (ptr < ptr_end) { 1015 struct btrfs_inode_ref *ref; 1016 1017 ref = (struct btrfs_inode_ref *)ptr; 1018 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1019 ref); 1020 ptr = (unsigned long)(ref + 1) + name_len; 1021 nlink++; 1022 } 1023 1024 if (key.offset == 0) 1025 break; 1026 key.offset--; 1027 btrfs_release_path(root, path); 1028 } 1029 btrfs_free_path(path); 1030 if (nlink != inode->i_nlink) { 1031 inode->i_nlink = nlink; 1032 btrfs_update_inode(trans, root, inode); 1033 } 1034 BTRFS_I(inode)->index_cnt = (u64)-1; 1035 1036 return 0; 1037 } 1038 1039 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1040 struct btrfs_root *root, 1041 struct btrfs_path *path) 1042 { 1043 int ret; 1044 struct btrfs_key key; 1045 struct inode *inode; 1046 1047 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1048 key.type = BTRFS_ORPHAN_ITEM_KEY; 1049 key.offset = (u64)-1; 1050 while (1) { 1051 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1052 if (ret < 0) 1053 break; 1054 1055 if (ret == 1) { 1056 if 
(path->slots[0] == 0) 1057 break; 1058 path->slots[0]--; 1059 } 1060 1061 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1062 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1063 key.type != BTRFS_ORPHAN_ITEM_KEY) 1064 break; 1065 1066 ret = btrfs_del_item(trans, root, path); 1067 BUG_ON(ret); 1068 1069 btrfs_release_path(root, path); 1070 inode = read_one_inode(root, key.offset); 1071 BUG_ON(!inode); 1072 1073 ret = fixup_inode_link_count(trans, root, inode); 1074 BUG_ON(ret); 1075 1076 iput(inode); 1077 1078 if (key.offset == 0) 1079 break; 1080 key.offset--; 1081 } 1082 btrfs_release_path(root, path); 1083 return 0; 1084 } 1085 1086 1087 /* 1088 * record a given inode in the fixup dir so we can check its link 1089 * count when replay is done. The link count is incremented here 1090 * so the inode won't go away until we check it 1091 */ 1092 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1093 struct btrfs_root *root, 1094 struct btrfs_path *path, 1095 u64 objectid) 1096 { 1097 struct btrfs_key key; 1098 int ret = 0; 1099 struct inode *inode; 1100 1101 inode = read_one_inode(root, objectid); 1102 BUG_ON(!inode); 1103 1104 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1105 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1106 key.offset = objectid; 1107 1108 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1109 1110 btrfs_release_path(root, path); 1111 if (ret == 0) { 1112 btrfs_inc_nlink(inode); 1113 btrfs_update_inode(trans, root, inode); 1114 } else if (ret == -EEXIST) { 1115 ret = 0; 1116 } else { 1117 BUG(); 1118 } 1119 iput(inode); 1120 1121 return ret; 1122 } 1123 1124 /* 1125 * when replaying the log for a directory, we only insert names 1126 * for inodes that actually exist. 
This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 *
 * Returns -ENOENT when the inode named by 'location' cannot be read,
 * -EIO when the directory cannot be read, and otherwise whatever
 * btrfs_add_link returns.
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *target;
	struct inode *parent;
	int err;

	target = read_one_inode(root, location->objectid);
	if (!target)
		return -ENOENT;

	parent = read_one_inode(root, dirid);
	if (parent) {
		err = btrfs_add_link(trans, parent, target, name, name_len,
				     1, index);

		/* FIXME, put inode into FIXUP list */
	} else {
		err = -EIO;
	}

	/* drop the references taken above, target first */
	iput(target);
	if (parent)
		iput(parent);
	return err;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
1170 */ 1171 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1172 struct btrfs_root *root, 1173 struct btrfs_path *path, 1174 struct extent_buffer *eb, 1175 struct btrfs_dir_item *di, 1176 struct btrfs_key *key) 1177 { 1178 char *name; 1179 int name_len; 1180 struct btrfs_dir_item *dst_di; 1181 struct btrfs_key found_key; 1182 struct btrfs_key log_key; 1183 struct inode *dir; 1184 u8 log_type; 1185 int exists; 1186 int ret; 1187 1188 dir = read_one_inode(root, key->objectid); 1189 BUG_ON(!dir); 1190 1191 name_len = btrfs_dir_name_len(eb, di); 1192 name = kmalloc(name_len, GFP_NOFS); 1193 log_type = btrfs_dir_type(eb, di); 1194 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1195 name_len); 1196 1197 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1198 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1199 if (exists == 0) 1200 exists = 1; 1201 else 1202 exists = 0; 1203 btrfs_release_path(root, path); 1204 1205 if (key->type == BTRFS_DIR_ITEM_KEY) { 1206 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1207 name, name_len, 1); 1208 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1209 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1210 key->objectid, 1211 key->offset, name, 1212 name_len, 1); 1213 } else { 1214 BUG(); 1215 } 1216 if (!dst_di || IS_ERR(dst_di)) { 1217 /* we need a sequence number to insert, so we only 1218 * do inserts for the BTRFS_DIR_INDEX_KEY types 1219 */ 1220 if (key->type != BTRFS_DIR_INDEX_KEY) 1221 goto out; 1222 goto insert; 1223 } 1224 1225 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1226 /* the existing item matches the logged item */ 1227 if (found_key.objectid == log_key.objectid && 1228 found_key.type == log_key.type && 1229 found_key.offset == log_key.offset && 1230 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1231 goto out; 1232 } 1233 1234 /* 1235 * don't drop the conflicting directory entry if the inode 1236 * for the new entry doesn't exist 
1237 */ 1238 if (!exists) 1239 goto out; 1240 1241 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1242 BUG_ON(ret); 1243 1244 if (key->type == BTRFS_DIR_INDEX_KEY) 1245 goto insert; 1246 out: 1247 btrfs_release_path(root, path); 1248 kfree(name); 1249 iput(dir); 1250 return 0; 1251 1252 insert: 1253 btrfs_release_path(root, path); 1254 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1255 name, name_len, log_type, &log_key); 1256 1257 if (ret && ret != -ENOENT) 1258 BUG(); 1259 goto out; 1260 } 1261 1262 /* 1263 * find all the names in a directory item and reconcile them into 1264 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1265 * one name in a directory item, but the same code gets used for 1266 * both directory index types 1267 */ 1268 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1269 struct btrfs_root *root, 1270 struct btrfs_path *path, 1271 struct extent_buffer *eb, int slot, 1272 struct btrfs_key *key) 1273 { 1274 int ret; 1275 u32 item_size = btrfs_item_size_nr(eb, slot); 1276 struct btrfs_dir_item *di; 1277 int name_len; 1278 unsigned long ptr; 1279 unsigned long ptr_end; 1280 1281 ptr = btrfs_item_ptr_offset(eb, slot); 1282 ptr_end = ptr + item_size; 1283 while (ptr < ptr_end) { 1284 di = (struct btrfs_dir_item *)ptr; 1285 name_len = btrfs_dir_name_len(eb, di); 1286 ret = replay_one_name(trans, root, path, eb, di, key); 1287 BUG_ON(ret); 1288 ptr = (unsigned long)(di + 1); 1289 ptr += name_len; 1290 } 1291 return 0; 1292 } 1293 1294 /* 1295 * directory replay has two parts. There are the standard directory 1296 * items in the log copied from the subvolume, and range items 1297 * created in the log while the subvolume was logged. 1298 * 1299 * The range items tell us which parts of the key space the log 1300 * is authoritative for. 
During replay, if a key in the subvolume 1301 * directory is in a logged range item, but not actually in the log 1302 * that means it was deleted from the directory before the fsync 1303 * and should be removed. 1304 */ 1305 static noinline int find_dir_range(struct btrfs_root *root, 1306 struct btrfs_path *path, 1307 u64 dirid, int key_type, 1308 u64 *start_ret, u64 *end_ret) 1309 { 1310 struct btrfs_key key; 1311 u64 found_end; 1312 struct btrfs_dir_log_item *item; 1313 int ret; 1314 int nritems; 1315 1316 if (*start_ret == (u64)-1) 1317 return 1; 1318 1319 key.objectid = dirid; 1320 key.type = key_type; 1321 key.offset = *start_ret; 1322 1323 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1324 if (ret < 0) 1325 goto out; 1326 if (ret > 0) { 1327 if (path->slots[0] == 0) 1328 goto out; 1329 path->slots[0]--; 1330 } 1331 if (ret != 0) 1332 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1333 1334 if (key.type != key_type || key.objectid != dirid) { 1335 ret = 1; 1336 goto next; 1337 } 1338 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1339 struct btrfs_dir_log_item); 1340 found_end = btrfs_dir_log_end(path->nodes[0], item); 1341 1342 if (*start_ret >= key.offset && *start_ret <= found_end) { 1343 ret = 0; 1344 *start_ret = key.offset; 1345 *end_ret = found_end; 1346 goto out; 1347 } 1348 ret = 1; 1349 next: 1350 /* check the next slot in the tree to see if it is a valid item */ 1351 nritems = btrfs_header_nritems(path->nodes[0]); 1352 if (path->slots[0] >= nritems) { 1353 ret = btrfs_next_leaf(root, path); 1354 if (ret) 1355 goto out; 1356 } else { 1357 path->slots[0]++; 1358 } 1359 1360 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1361 1362 if (key.type != key_type || key.objectid != dirid) { 1363 ret = 1; 1364 goto out; 1365 } 1366 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1367 struct btrfs_dir_log_item); 1368 found_end = btrfs_dir_log_end(path->nodes[0], item); 1369 *start_ret = key.offset; 1370 *end_ret 
= found_end; 1371 ret = 0; 1372 out: 1373 btrfs_release_path(root, path); 1374 return ret; 1375 } 1376 1377 /* 1378 * this looks for a given directory item in the log. If the directory 1379 * item is not in the log, the item is removed and the inode it points 1380 * to is unlinked 1381 */ 1382 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1383 struct btrfs_root *root, 1384 struct btrfs_root *log, 1385 struct btrfs_path *path, 1386 struct btrfs_path *log_path, 1387 struct inode *dir, 1388 struct btrfs_key *dir_key) 1389 { 1390 int ret; 1391 struct extent_buffer *eb; 1392 int slot; 1393 u32 item_size; 1394 struct btrfs_dir_item *di; 1395 struct btrfs_dir_item *log_di; 1396 int name_len; 1397 unsigned long ptr; 1398 unsigned long ptr_end; 1399 char *name; 1400 struct inode *inode; 1401 struct btrfs_key location; 1402 1403 again: 1404 eb = path->nodes[0]; 1405 slot = path->slots[0]; 1406 item_size = btrfs_item_size_nr(eb, slot); 1407 ptr = btrfs_item_ptr_offset(eb, slot); 1408 ptr_end = ptr + item_size; 1409 while (ptr < ptr_end) { 1410 di = (struct btrfs_dir_item *)ptr; 1411 name_len = btrfs_dir_name_len(eb, di); 1412 name = kmalloc(name_len, GFP_NOFS); 1413 if (!name) { 1414 ret = -ENOMEM; 1415 goto out; 1416 } 1417 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1418 name_len); 1419 log_di = NULL; 1420 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1421 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1422 dir_key->objectid, 1423 name, name_len, 0); 1424 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1425 log_di = btrfs_lookup_dir_index_item(trans, log, 1426 log_path, 1427 dir_key->objectid, 1428 dir_key->offset, 1429 name, name_len, 0); 1430 } 1431 if (!log_di || IS_ERR(log_di)) { 1432 btrfs_dir_item_key_to_cpu(eb, di, &location); 1433 btrfs_release_path(root, path); 1434 btrfs_release_path(log, log_path); 1435 inode = read_one_inode(root, location.objectid); 1436 BUG_ON(!inode); 1437 1438 ret = link_to_fixup_dir(trans, root, 
1439 path, location.objectid); 1440 BUG_ON(ret); 1441 btrfs_inc_nlink(inode); 1442 ret = btrfs_unlink_inode(trans, root, dir, inode, 1443 name, name_len); 1444 BUG_ON(ret); 1445 kfree(name); 1446 iput(inode); 1447 1448 /* there might still be more names under this key 1449 * check and repeat if required 1450 */ 1451 ret = btrfs_search_slot(NULL, root, dir_key, path, 1452 0, 0); 1453 if (ret == 0) 1454 goto again; 1455 ret = 0; 1456 goto out; 1457 } 1458 btrfs_release_path(log, log_path); 1459 kfree(name); 1460 1461 ptr = (unsigned long)(di + 1); 1462 ptr += name_len; 1463 } 1464 ret = 0; 1465 out: 1466 btrfs_release_path(root, path); 1467 btrfs_release_path(log, log_path); 1468 return ret; 1469 } 1470 1471 /* 1472 * deletion replay happens before we copy any new directory items 1473 * out of the log or out of backreferences from inodes. It 1474 * scans the log to find ranges of keys that log is authoritative for, 1475 * and then scans the directory to find items in those ranges that are 1476 * not present in the log. 1477 * 1478 * Anything we don't find in the log is unlinked and removed from the 1479 * directory. 
*/
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid)
{
	u64 range_start;
	u64 range_end;
	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_ITEM_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}
again:
	/* one pass per key type: dir items first, then dir index items */
	range_start = 0;
	range_end = 0;
	while (1) {
		/* find the next logged range in the log tree */
		ret = find_dir_range(log, path, dirid, key_type,
				     &range_start, &range_end);
		if (ret != 0)
			break;

		dir_key.offset = range_start;
		/* check every subvolume item inside that range */
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type)
				goto next_type;

			if (found_key.offset > range_end)
				break;

			ret = check_item_in_log(trans, root, log, path,
						log_path, dir, &found_key);
			BUG_ON(ret);
			/* stop rather than wrap the offset */
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(root, path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}

next_type:
	ret = 0;
	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
		key_type = BTRFS_DIR_LOG_INDEX_KEY;
		dir_key.type = BTRFS_DIR_INDEX_KEY;
		btrfs_release_path(root, path);
		goto again;
	}
out:
	btrfs_release_path(root, path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}

/*
 * the process_func used to replay items from the log tree.  This
 * gets called in two different stages.  The first stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The second stage copies all the other item types from the log into
 * the subvolume.  The two stage approach is slower, but gets rid of
 * lots of complexity around inodes referencing other inodes that exist
 * only in the log (references come from either directory items or inode
 * back refs).
 *
 * Only leaves (level 0) are processed; other buffers return 0 at once.
 * Items are replayed into wc->replay_dest.  Always returns 0; helper
 * failures are BUG_ONs.
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	u32 item_size;
	int level;
	int i;
	int ret;

	/* NOTE(review): the return value of btrfs_read_buffer is ignored */
	btrfs_read_buffer(eb, gen);

	level = btrfs_header_level(eb);

	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		/* NOTE(review): item_size is assigned but never read */
		item_size = btrfs_item_size_nr(eb, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct inode *inode;
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
					    struct btrfs_inode_item);
			mode = btrfs_inode_mode(eb, inode_item);
			/* deletes are replayed before the inode item is copied */
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid);
				BUG_ON(ret);
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);

			/* for regular files, truncate away
			 * extents past the new EOF
			 */
			if (S_ISREG(mode)) {
				inode = read_one_inode(root,
						       key.objectid);
				BUG_ON(!inode);

				ret = btrfs_truncate_inode_items(wc->trans,
					root, inode, inode->i_size,
					BTRFS_EXTENT_DATA_KEY);
				BUG_ON(ret);
				iput(inode);
			}
			/* queue the inode for a link count check after replay */
			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			BUG_ON(ret);
		}
		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_INODE_REF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			BUG_ON(ret && ret != -ENOENT);
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
			ret = replay_one_csum(wc->trans, root, path,
					      eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
			   key.type == BTRFS_DIR_INDEX_KEY) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			BUG_ON(ret);
		}
	}
	btrfs_free_path(path);
	return 0;
}

/*
 * descend from path->nodes[*level], calling wc->process_func on each
 * block reached.  Blocks pointed to from level 1 are processed and
 * released without being added to the path.  When the descent stops,
 * the block at *level is processed, dropped from the path and *level
 * is incremented.
 *
 * When wc->free is set, each processed block is also cleaned, waited
 * on for writeback and handed to btrfs_free_reserved_extent.
 *
 * Always returns 0.
 */
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, int *level,
				   struct walk_control *wc)
{
	u64 root_owner;
	u64 root_gen;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		if (btrfs_header_level(cur) != *level)
			WARN_ON(1);

		/* every pointer in this node has been visited */
		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);

		parent = path->nodes[*level];
		root_owner = btrfs_header_owner(parent);
		/* NOTE(review): root_gen is assigned but never read */
		root_gen = btrfs_header_generation(parent);

		next = btrfs_find_create_tree_block(root, bytenr, blocksize);

		wc->process_func(root, next, wc, ptr_gen);

		if (*level == 1) {
			/* the child is at level 0: finish with it here */
			path->slots[*level]++;
			if (wc->free) {
				btrfs_read_buffer(next, ptr_gen);

				btrfs_tree_lock(next);
				clean_tree_block(trans, root, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				ret = btrfs_drop_leaf_ref(trans, root, next);
				BUG_ON(ret);

				WARN_ON(root_owner !=
					BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_reserved_extent(root,
							 bytenr, blocksize);
				BUG_ON(ret);
			}
			free_extent_buffer(next);
			continue;
		}
		btrfs_read_buffer(next, ptr_gen);

		/* step down: the child becomes the current node */
		WARN_ON(*level <= 0);
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	if (path->nodes[*level] == root->node)
		parent = path->nodes[*level];
	else
		parent = path->nodes[*level + 1];

	bytenr = path->nodes[*level]->start;

	blocksize = btrfs_level_size(root, *level);
	root_owner = btrfs_header_owner(parent);
	root_gen = btrfs_header_generation(parent);

	wc->process_func(root, path->nodes[*level], wc,
			 btrfs_header_generation(path->nodes[*level]));

	if (wc->free) {
		next = path->nodes[*level];
		btrfs_tree_lock(next);
		clean_tree_block(trans, root, next);
		btrfs_wait_tree_block_writeback(next);
		btrfs_tree_unlock(next);

		if (*level == 0) {
			ret = btrfs_drop_leaf_ref(trans, root, next);
			BUG_ON(ret);
		}
		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
		ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
		BUG_ON(ret);
	}
	free_extent_buffer(path->nodes[*level]);
	path->nodes[*level] = NULL;
	*level += 1;

	cond_resched();
	return 0;
}

/*
 * climb from *level looking for a node that still has unvisited
 * pointers.  If one is found its slot is advanced, *level is set to it
 * and 0 is returned.  Nodes passed on the way up are processed with
 * wc->process_func (and freed when wc->free is set) and dropped from
 * the path.  Returns 1 when nothing is left to visit.
 */
static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path, int *level,
				 struct walk_control *wc)
{
	u64 root_owner;
	u64 root_gen;
	int i;
	int slot;
	int ret;

	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
			/* NOTE(review): 'node' is assigned but never read */
			struct extent_buffer *node;
			node = path->nodes[i];
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			return 0;
		} else {
			struct extent_buffer *parent;
			if (path->nodes[*level] == root->node)
				parent = path->nodes[*level];
			else
				parent = path->nodes[*level + 1];

			root_owner = btrfs_header_owner(parent);
			/* NOTE(review): root_gen is assigned but never read */
			root_gen = btrfs_header_generation(parent);
			wc->process_func(root, path->nodes[*level], wc,
				 btrfs_header_generation(path->nodes[*level]));
			if (wc->free) {
				struct extent_buffer *next;

				next = path->nodes[*level];

				btrfs_tree_lock(next);
				clean_tree_block(trans, root, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				if (*level == 0) {
					ret = btrfs_drop_leaf_ref(trans, root,
								  next);
					BUG_ON(ret);
				}

				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_reserved_extent(root,
						path->nodes[*level]->start,
						path->nodes[*level]->len);
				BUG_ON(ret);
			}
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}

/*
 * walk the whole tree rooted at log->node, calling wc->process_func on
 * the blocks visited.  The walk alternates walk_down_log_tree and
 * walk_up_log_tree until one of them returns a positive value.
 *
 * When wc->free is set the blocks are cleaned and their extents are
 * handed to btrfs_free_reserved_extent as the walk goes, and a
 * reference on log->node is dropped at the end.
 */
static int walk_log_tree(struct btrfs_trans_handle *trans,
			 struct btrfs_root *log, struct walk_control *wc)
{
	int ret = 0;
	int wret;
	int level;
	struct btrfs_path *path;
	int i;
	int orig_level;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	/* start at the root; the path holds its own reference on it */
	level = btrfs_header_level(log->node);
	orig_level = level;
	path->nodes[level] = log->node;
	extent_buffer_get(log->node);
	path->slots[level] = 0;

	while (1) {
		wret = walk_down_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0)
			ret = wret;

		wret = walk_up_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0)
			ret = wret;
	}

	/* was the root node processed? if not, catch it here */
	if (path->nodes[orig_level]) {
		wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]));
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			btrfs_tree_lock(next);
			clean_tree_block(trans, log, next);
			btrfs_wait_tree_block_writeback(next);
			btrfs_tree_unlock(next);

			if (orig_level == 0) {
				ret = btrfs_drop_leaf_ref(trans, log,
							  next);
				BUG_ON(ret);
			}
			WARN_ON(log->root_key.objectid !=
				BTRFS_TREE_LOG_OBJECTID);
			ret = btrfs_free_reserved_extent(log, next->start,
							 next->len);
			BUG_ON(ret);
		}
	}

	/* drop whatever buffers are still referenced by the path */
	for (i = 0; i <= orig_level; i++) {
		if (path->nodes[i]) {
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}
	}
	btrfs_free_path(path);
	if (wc->free)
		free_extent_buffer(log->node);
	return ret;
}

/*
 * sleep until the log commit that is in progress has finished.
 * tree_log_mutex is dropped around the sleep and re-taken before the
 * exit condition is checked; btrfs_sync_log calls this with the mutex
 * held.  The loop ends once tree_log_transid has moved on or
 * tree_log_commit is clear.  Always returns 0.
 */
static int wait_log_commit(struct btrfs_root *log)
{
	DEFINE_WAIT(wait);
	u64 transid = log->fs_info->tree_log_transid;

	do {
		prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		mutex_unlock(&log->fs_info->tree_log_mutex);
		if (atomic_read(&log->fs_info->tree_log_commit))
			schedule();
		finish_wait(&log->fs_info->tree_log_wait, &wait);
		mutex_lock(&log->fs_info->tree_log_mutex);
	} while (transid == log->fs_info->tree_log_transid &&
		atomic_read(&log->fs_info->tree_log_commit));
	return 0;
}

/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it.
When this call is done, 1950 * you know that any inodes previously logged are safely on disk 1951 */ 1952 int btrfs_sync_log(struct btrfs_trans_handle *trans, 1953 struct btrfs_root *root) 1954 { 1955 int ret; 1956 unsigned long batch; 1957 struct btrfs_root *log = root->log_root; 1958 1959 mutex_lock(&log->fs_info->tree_log_mutex); 1960 if (atomic_read(&log->fs_info->tree_log_commit)) { 1961 wait_log_commit(log); 1962 goto out; 1963 } 1964 atomic_set(&log->fs_info->tree_log_commit, 1); 1965 1966 while (1) { 1967 batch = log->fs_info->tree_log_batch; 1968 mutex_unlock(&log->fs_info->tree_log_mutex); 1969 schedule_timeout_uninterruptible(1); 1970 mutex_lock(&log->fs_info->tree_log_mutex); 1971 1972 while (atomic_read(&log->fs_info->tree_log_writers)) { 1973 DEFINE_WAIT(wait); 1974 prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1975 TASK_UNINTERRUPTIBLE); 1976 mutex_unlock(&log->fs_info->tree_log_mutex); 1977 if (atomic_read(&log->fs_info->tree_log_writers)) 1978 schedule(); 1979 mutex_lock(&log->fs_info->tree_log_mutex); 1980 finish_wait(&log->fs_info->tree_log_wait, &wait); 1981 } 1982 if (batch == log->fs_info->tree_log_batch) 1983 break; 1984 } 1985 1986 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 1987 BUG_ON(ret); 1988 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, 1989 &root->fs_info->log_root_tree->dirty_log_pages); 1990 BUG_ON(ret); 1991 1992 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 1993 log->fs_info->log_root_tree->node->start); 1994 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 1995 btrfs_header_level(log->fs_info->log_root_tree->node)); 1996 1997 write_ctree_super(trans, log->fs_info->tree_root, 2); 1998 log->fs_info->tree_log_transid++; 1999 log->fs_info->tree_log_batch = 0; 2000 atomic_set(&log->fs_info->tree_log_commit, 0); 2001 smp_mb(); 2002 if (waitqueue_active(&log->fs_info->tree_log_wait)) 2003 wake_up(&log->fs_info->tree_log_wait); 2004 out: 2005 
mutex_unlock(&log->fs_info->tree_log_mutex); 2006 return 0; 2007 } 2008 2009 /* * free all the extents used by the tree log. This should be called 2010 * at commit time of the full transaction 2011 */ 2012 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2013 { 2014 int ret; 2015 struct btrfs_root *log; 2016 struct key; 2017 u64 start; 2018 u64 end; 2019 struct walk_control wc = { 2020 .free = 1, 2021 .process_func = process_one_buffer 2022 }; 2023 2024 if (!root->log_root) 2025 return 0; 2026 2027 log = root->log_root; 2028 ret = walk_log_tree(trans, log, &wc); 2029 BUG_ON(ret); 2030 2031 while (1) { 2032 ret = find_first_extent_bit(&log->dirty_log_pages, 2033 0, &start, &end, EXTENT_DIRTY); 2034 if (ret) 2035 break; 2036 2037 clear_extent_dirty(&log->dirty_log_pages, 2038 start, end, GFP_NOFS); 2039 } 2040 2041 log = root->log_root; 2042 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2043 &log->root_key); 2044 BUG_ON(ret); 2045 root->log_root = NULL; 2046 kfree(root->log_root); 2047 return 0; 2048 } 2049 2050 /* 2051 * helper function to update the item for a given subvolumes log root 2052 * in the tree of log roots 2053 */ 2054 static int update_log_root(struct btrfs_trans_handle *trans, 2055 struct btrfs_root *log) 2056 { 2057 u64 bytenr = btrfs_root_bytenr(&log->root_item); 2058 int ret; 2059 2060 if (log->node->start == bytenr) 2061 return 0; 2062 2063 btrfs_set_root_bytenr(&log->root_item, log->node->start); 2064 btrfs_set_root_generation(&log->root_item, trans->transid); 2065 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); 2066 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2067 &log->root_key, &log->root_item); 2068 BUG_ON(ret); 2069 return ret; 2070 } 2071 2072 /* 2073 * If both a file and directory are logged, and unlinks or renames are 2074 * mixed in, we have a few interesting corners: 2075 * 2076 * create file X in dir Y 2077 * link file X to X.link in dir Y 2078 * fsync file X 
2079 * unlink file X but leave X.link 2080 * fsync dir Y 2081 * 2082 * After a crash we would expect only X.link to exist. But file X 2083 * didn't get fsync'd again so the log has back refs for X and X.link. 2084 * 2085 * We solve this by removing directory entries and inode backrefs from the 2086 * log when a file that was logged in the current transaction is 2087 * unlinked. Any later fsync will include the updated log entries, and 2088 * we'll be able to reconstruct the proper directory items from backrefs. 2089 * 2090 * This optimizations allows us to avoid relogging the entire inode 2091 * or the entire directory. 2092 */ 2093 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2094 struct btrfs_root *root, 2095 const char *name, int name_len, 2096 struct inode *dir, u64 index) 2097 { 2098 struct btrfs_root *log; 2099 struct btrfs_dir_item *di; 2100 struct btrfs_path *path; 2101 int ret; 2102 int bytes_del = 0; 2103 2104 if (BTRFS_I(dir)->logged_trans < trans->transid) 2105 return 0; 2106 2107 ret = join_running_log_trans(root); 2108 if (ret) 2109 return 0; 2110 2111 mutex_lock(&BTRFS_I(dir)->log_mutex); 2112 2113 log = root->log_root; 2114 path = btrfs_alloc_path(); 2115 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2116 name, name_len, -1); 2117 if (di && !IS_ERR(di)) { 2118 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2119 bytes_del += name_len; 2120 BUG_ON(ret); 2121 } 2122 btrfs_release_path(log, path); 2123 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2124 index, name, name_len, -1); 2125 if (di && !IS_ERR(di)) { 2126 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2127 bytes_del += name_len; 2128 BUG_ON(ret); 2129 } 2130 2131 /* update the directory size in the log to reflect the names 2132 * we have removed 2133 */ 2134 if (bytes_del) { 2135 struct btrfs_key key; 2136 2137 key.objectid = dir->i_ino; 2138 key.offset = 0; 2139 key.type = BTRFS_INODE_ITEM_KEY; 2140 btrfs_release_path(log, 
path); 2141 2142 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2143 if (ret == 0) { 2144 struct btrfs_inode_item *item; 2145 u64 i_size; 2146 2147 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2148 struct btrfs_inode_item); 2149 i_size = btrfs_inode_size(path->nodes[0], item); 2150 if (i_size > bytes_del) 2151 i_size -= bytes_del; 2152 else 2153 i_size = 0; 2154 btrfs_set_inode_size(path->nodes[0], item, i_size); 2155 btrfs_mark_buffer_dirty(path->nodes[0]); 2156 } else 2157 ret = 0; 2158 btrfs_release_path(log, path); 2159 } 2160 2161 btrfs_free_path(path); 2162 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2163 end_log_trans(root); 2164 2165 return 0; 2166 } 2167 2168 /* see comments for btrfs_del_dir_entries_in_log */ 2169 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2170 struct btrfs_root *root, 2171 const char *name, int name_len, 2172 struct inode *inode, u64 dirid) 2173 { 2174 struct btrfs_root *log; 2175 u64 index; 2176 int ret; 2177 2178 if (BTRFS_I(inode)->logged_trans < trans->transid) 2179 return 0; 2180 2181 ret = join_running_log_trans(root); 2182 if (ret) 2183 return 0; 2184 log = root->log_root; 2185 mutex_lock(&BTRFS_I(inode)->log_mutex); 2186 2187 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2188 dirid, &index); 2189 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2190 end_log_trans(root); 2191 2192 return ret; 2193 } 2194 2195 /* 2196 * creates a range item in the log for 'dirid'. first_offset and 2197 * last_offset tell us which parts of the key space the log should 2198 * be considered authoritative for. 
2199 */ 2200 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2201 struct btrfs_root *log, 2202 struct btrfs_path *path, 2203 int key_type, u64 dirid, 2204 u64 first_offset, u64 last_offset) 2205 { 2206 int ret; 2207 struct btrfs_key key; 2208 struct btrfs_dir_log_item *item; 2209 2210 key.objectid = dirid; 2211 key.offset = first_offset; 2212 if (key_type == BTRFS_DIR_ITEM_KEY) 2213 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2214 else 2215 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2216 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2217 BUG_ON(ret); 2218 2219 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2220 struct btrfs_dir_log_item); 2221 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2222 btrfs_mark_buffer_dirty(path->nodes[0]); 2223 btrfs_release_path(log, path); 2224 return 0; 2225 } 2226 2227 /* 2228 * log all the items included in the current transaction for a given 2229 * directory. This also creates the range items in the log tree required 2230 * to replay anything deleted before the fsync 2231 */ 2232 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2233 struct btrfs_root *root, struct inode *inode, 2234 struct btrfs_path *path, 2235 struct btrfs_path *dst_path, int key_type, 2236 u64 min_offset, u64 *last_offset_ret) 2237 { 2238 struct btrfs_key min_key; 2239 struct btrfs_key max_key; 2240 struct btrfs_root *log = root->log_root; 2241 struct extent_buffer *src; 2242 int ret; 2243 int i; 2244 int nritems; 2245 u64 first_offset = min_offset; 2246 u64 last_offset = (u64)-1; 2247 2248 log = root->log_root; 2249 max_key.objectid = inode->i_ino; 2250 max_key.offset = (u64)-1; 2251 max_key.type = key_type; 2252 2253 min_key.objectid = inode->i_ino; 2254 min_key.type = key_type; 2255 min_key.offset = min_offset; 2256 2257 path->keep_locks = 1; 2258 2259 ret = btrfs_search_forward(root, &min_key, &max_key, 2260 path, 0, trans->transid); 2261 2262 /* 2263 * we didn't find anything from this 
transaction, see if there 2264 * is anything at all 2265 */ 2266 if (ret != 0 || min_key.objectid != inode->i_ino || 2267 min_key.type != key_type) { 2268 min_key.objectid = inode->i_ino; 2269 min_key.type = key_type; 2270 min_key.offset = (u64)-1; 2271 btrfs_release_path(root, path); 2272 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2273 if (ret < 0) { 2274 btrfs_release_path(root, path); 2275 return ret; 2276 } 2277 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2278 2279 /* if ret == 0 there are items for this type, 2280 * create a range to tell us the last key of this type. 2281 * otherwise, there are no items in this directory after 2282 * *min_offset, and we create a range to indicate that. 2283 */ 2284 if (ret == 0) { 2285 struct btrfs_key tmp; 2286 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2287 path->slots[0]); 2288 if (key_type == tmp.type) 2289 first_offset = max(min_offset, tmp.offset) + 1; 2290 } 2291 goto done; 2292 } 2293 2294 /* go backward to find any previous key */ 2295 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2296 if (ret == 0) { 2297 struct btrfs_key tmp; 2298 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2299 if (key_type == tmp.type) { 2300 first_offset = tmp.offset; 2301 ret = overwrite_item(trans, log, dst_path, 2302 path->nodes[0], path->slots[0], 2303 &tmp); 2304 } 2305 } 2306 btrfs_release_path(root, path); 2307 2308 /* find the first key from this transaction again */ 2309 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2310 if (ret != 0) { 2311 WARN_ON(1); 2312 goto done; 2313 } 2314 2315 /* 2316 * we have a block from this transaction, log every item in it 2317 * from our directory 2318 */ 2319 while (1) { 2320 struct btrfs_key tmp; 2321 src = path->nodes[0]; 2322 nritems = btrfs_header_nritems(src); 2323 for (i = path->slots[0]; i < nritems; i++) { 2324 btrfs_item_key_to_cpu(src, &min_key, i); 2325 2326 if (min_key.objectid != inode->i_ino || 2327 
min_key.type != key_type) 2328 goto done; 2329 ret = overwrite_item(trans, log, dst_path, src, i, 2330 &min_key); 2331 BUG_ON(ret); 2332 } 2333 path->slots[0] = nritems; 2334 2335 /* 2336 * look ahead to the next item and see if it is also 2337 * from this directory and from this transaction 2338 */ 2339 ret = btrfs_next_leaf(root, path); 2340 if (ret == 1) { 2341 last_offset = (u64)-1; 2342 goto done; 2343 } 2344 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2345 if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2346 last_offset = (u64)-1; 2347 goto done; 2348 } 2349 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2350 ret = overwrite_item(trans, log, dst_path, 2351 path->nodes[0], path->slots[0], 2352 &tmp); 2353 2354 BUG_ON(ret); 2355 last_offset = tmp.offset; 2356 goto done; 2357 } 2358 } 2359 done: 2360 *last_offset_ret = last_offset; 2361 btrfs_release_path(root, path); 2362 btrfs_release_path(log, dst_path); 2363 2364 /* insert the log range keys to indicate where the log is valid */ 2365 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2366 first_offset, last_offset); 2367 BUG_ON(ret); 2368 return 0; 2369 } 2370 2371 /* 2372 * logging directories is very similar to logging inodes, We find all the items 2373 * from the current transaction and write them to the log. 2374 * 2375 * The recovery code scans the directory in the subvolume, and if it finds a 2376 * key in the range logged that is not present in the log tree, then it means 2377 * that dir entry was unlinked during the transaction. 2378 * 2379 * In order for that scan to work, we must include one key smaller than 2380 * the smallest logged by this transaction and one key larger than the largest 2381 * key logged by this transaction. 
2382 */ 2383 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2384 struct btrfs_root *root, struct inode *inode, 2385 struct btrfs_path *path, 2386 struct btrfs_path *dst_path) 2387 { 2388 u64 min_key; 2389 u64 max_key; 2390 int ret; 2391 int key_type = BTRFS_DIR_ITEM_KEY; 2392 2393 again: 2394 min_key = 0; 2395 max_key = 0; 2396 while (1) { 2397 ret = log_dir_items(trans, root, inode, path, 2398 dst_path, key_type, min_key, 2399 &max_key); 2400 BUG_ON(ret); 2401 if (max_key == (u64)-1) 2402 break; 2403 min_key = max_key + 1; 2404 } 2405 2406 if (key_type == BTRFS_DIR_ITEM_KEY) { 2407 key_type = BTRFS_DIR_INDEX_KEY; 2408 goto again; 2409 } 2410 return 0; 2411 } 2412 2413 /* 2414 * a helper function to drop items from the log before we relog an 2415 * inode. max_key_type indicates the highest item type to remove. 2416 * This cannot be run for file data extents because it does not 2417 * free the extents they point to. 2418 */ 2419 static int drop_objectid_items(struct btrfs_trans_handle *trans, 2420 struct btrfs_root *log, 2421 struct btrfs_path *path, 2422 u64 objectid, int max_key_type) 2423 { 2424 int ret; 2425 struct btrfs_key key; 2426 struct btrfs_key found_key; 2427 2428 key.objectid = objectid; 2429 key.type = max_key_type; 2430 key.offset = (u64)-1; 2431 2432 while (1) { 2433 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2434 2435 if (ret != 1) 2436 break; 2437 2438 if (path->slots[0] == 0) 2439 break; 2440 2441 path->slots[0]--; 2442 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2443 path->slots[0]); 2444 2445 if (found_key.objectid != objectid) 2446 break; 2447 2448 ret = btrfs_del_item(trans, log, path); 2449 BUG_ON(ret); 2450 btrfs_release_path(log, path); 2451 } 2452 btrfs_release_path(log, path); 2453 return 0; 2454 } 2455 2456 static noinline int copy_extent_csums(struct btrfs_trans_handle *trans, 2457 struct list_head *list, 2458 struct btrfs_root *root, 2459 u64 disk_bytenr, u64 len) 2460 { 2461 struct 
btrfs_ordered_sum *sums; 2462 struct btrfs_sector_sum *sector_sum; 2463 int ret; 2464 struct btrfs_path *path; 2465 struct btrfs_csum_item *item = NULL; 2466 u64 end = disk_bytenr + len; 2467 u64 item_start_offset = 0; 2468 u64 item_last_offset = 0; 2469 u32 diff; 2470 u32 sum; 2471 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 2472 2473 sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS); 2474 2475 sector_sum = sums->sums; 2476 sums->bytenr = disk_bytenr; 2477 sums->len = len; 2478 list_add_tail(&sums->list, list); 2479 2480 path = btrfs_alloc_path(); 2481 while (disk_bytenr < end) { 2482 if (!item || disk_bytenr < item_start_offset || 2483 disk_bytenr >= item_last_offset) { 2484 struct btrfs_key found_key; 2485 u32 item_size; 2486 2487 if (item) 2488 btrfs_release_path(root, path); 2489 item = btrfs_lookup_csum(NULL, root, path, 2490 disk_bytenr, 0); 2491 if (IS_ERR(item)) { 2492 ret = PTR_ERR(item); 2493 if (ret == -ENOENT || ret == -EFBIG) 2494 ret = 0; 2495 sum = 0; 2496 printk(KERN_INFO "log no csum found for " 2497 "byte %llu\n", 2498 (unsigned long long)disk_bytenr); 2499 item = NULL; 2500 btrfs_release_path(root, path); 2501 goto found; 2502 } 2503 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2504 path->slots[0]); 2505 2506 item_start_offset = found_key.offset; 2507 item_size = btrfs_item_size_nr(path->nodes[0], 2508 path->slots[0]); 2509 item_last_offset = item_start_offset + 2510 (item_size / csum_size) * 2511 root->sectorsize; 2512 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2513 struct btrfs_csum_item); 2514 } 2515 /* 2516 * this byte range must be able to fit inside 2517 * a single leaf so it will also fit inside a u32 2518 */ 2519 diff = disk_bytenr - item_start_offset; 2520 diff = diff / root->sectorsize; 2521 diff = diff * csum_size; 2522 2523 read_extent_buffer(path->nodes[0], &sum, 2524 ((unsigned long)item) + diff, 2525 csum_size); 2526 found: 2527 sector_sum->bytenr = disk_bytenr; 2528 sector_sum->sum 
= sum; 2529 disk_bytenr += root->sectorsize; 2530 sector_sum++; 2531 } 2532 btrfs_free_path(path); 2533 return 0; 2534 } 2535 2536 static noinline int copy_items(struct btrfs_trans_handle *trans, 2537 struct btrfs_root *log, 2538 struct btrfs_path *dst_path, 2539 struct extent_buffer *src, 2540 int start_slot, int nr, int inode_only) 2541 { 2542 unsigned long src_offset; 2543 unsigned long dst_offset; 2544 struct btrfs_file_extent_item *extent; 2545 struct btrfs_inode_item *inode_item; 2546 int ret; 2547 struct btrfs_key *ins_keys; 2548 u32 *ins_sizes; 2549 char *ins_data; 2550 int i; 2551 struct list_head ordered_sums; 2552 2553 INIT_LIST_HEAD(&ordered_sums); 2554 2555 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2556 nr * sizeof(u32), GFP_NOFS); 2557 ins_sizes = (u32 *)ins_data; 2558 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2559 2560 for (i = 0; i < nr; i++) { 2561 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 2562 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 2563 } 2564 ret = btrfs_insert_empty_items(trans, log, dst_path, 2565 ins_keys, ins_sizes, nr); 2566 BUG_ON(ret); 2567 2568 for (i = 0; i < nr; i++) { 2569 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2570 dst_path->slots[0]); 2571 2572 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 2573 2574 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 2575 src_offset, ins_sizes[i]); 2576 2577 if (inode_only == LOG_INODE_EXISTS && 2578 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 2579 inode_item = btrfs_item_ptr(dst_path->nodes[0], 2580 dst_path->slots[0], 2581 struct btrfs_inode_item); 2582 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 2583 2584 /* set the generation to zero so the recover code 2585 * can tell the difference between an logging 2586 * just to say 'this inode exists' and a logging 2587 * to say 'update this inode with these values' 2588 */ 2589 btrfs_set_inode_generation(dst_path->nodes[0], 2590 inode_item, 0); 2591 } 
2592 /* take a reference on file data extents so that truncates 2593 * or deletes of this inode don't have to relog the inode 2594 * again 2595 */ 2596 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 2597 int found_type; 2598 extent = btrfs_item_ptr(src, start_slot + i, 2599 struct btrfs_file_extent_item); 2600 2601 found_type = btrfs_file_extent_type(src, extent); 2602 if (found_type == BTRFS_FILE_EXTENT_REG || 2603 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2604 u64 ds = btrfs_file_extent_disk_bytenr(src, 2605 extent); 2606 u64 dl = btrfs_file_extent_disk_num_bytes(src, 2607 extent); 2608 u64 cs = btrfs_file_extent_offset(src, extent); 2609 u64 cl = btrfs_file_extent_num_bytes(src, 2610 extent);; 2611 if (btrfs_file_extent_compression(src, 2612 extent)) { 2613 cs = 0; 2614 cl = dl; 2615 } 2616 /* ds == 0 is a hole */ 2617 if (ds != 0) { 2618 ret = btrfs_inc_extent_ref(trans, log, 2619 ds, dl, 2620 dst_path->nodes[0]->start, 2621 BTRFS_TREE_LOG_OBJECTID, 2622 trans->transid, 2623 ins_keys[i].objectid); 2624 BUG_ON(ret); 2625 ret = copy_extent_csums(trans, 2626 &ordered_sums, 2627 log->fs_info->csum_root, 2628 ds + cs, cl); 2629 BUG_ON(ret); 2630 } 2631 } 2632 } 2633 dst_path->slots[0]++; 2634 } 2635 2636 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2637 btrfs_release_path(log, dst_path); 2638 kfree(ins_data); 2639 2640 /* 2641 * we have to do this after the loop above to avoid changing the 2642 * log tree while trying to change the log tree. 2643 */ 2644 while (!list_empty(&ordered_sums)) { 2645 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2646 struct btrfs_ordered_sum, 2647 list); 2648 ret = btrfs_csum_file_blocks(trans, log, sums); 2649 BUG_ON(ret); 2650 list_del(&sums->list); 2651 kfree(sums); 2652 } 2653 return 0; 2654 } 2655 2656 /* log a single inode in the tree log. 2657 * At least one parent directory for this inode must exist in the tree 2658 * or be logged already. 
2659 * 2660 * Any items from this inode changed by the current transaction are copied 2661 * to the log tree. An extra reference is taken on any extents in this 2662 * file, allowing us to avoid a whole pile of corner cases around logging 2663 * blocks that have been removed from the tree. 2664 * 2665 * See LOG_INODE_ALL and related defines for a description of what inode_only 2666 * does. 2667 * 2668 * This handles both files and directories. 2669 */ 2670 static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2671 struct btrfs_root *root, struct inode *inode, 2672 int inode_only) 2673 { 2674 struct btrfs_path *path; 2675 struct btrfs_path *dst_path; 2676 struct btrfs_key min_key; 2677 struct btrfs_key max_key; 2678 struct btrfs_root *log = root->log_root; 2679 struct extent_buffer *src = NULL; 2680 u32 size; 2681 int ret; 2682 int nritems; 2683 int ins_start_slot = 0; 2684 int ins_nr; 2685 2686 log = root->log_root; 2687 2688 path = btrfs_alloc_path(); 2689 dst_path = btrfs_alloc_path(); 2690 2691 min_key.objectid = inode->i_ino; 2692 min_key.type = BTRFS_INODE_ITEM_KEY; 2693 min_key.offset = 0; 2694 2695 max_key.objectid = inode->i_ino; 2696 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2697 max_key.type = BTRFS_XATTR_ITEM_KEY; 2698 else 2699 max_key.type = (u8)-1; 2700 max_key.offset = (u64)-1; 2701 2702 /* 2703 * if this inode has already been logged and we're in inode_only 2704 * mode, we don't want to delete the things that have already 2705 * been written to the log. 2706 * 2707 * But, if the inode has been through an inode_only log, 2708 * the logged_trans field is not set. 
This allows us to catch 2709 * any new names for this inode in the backrefs by logging it 2710 * again 2711 */ 2712 if (inode_only == LOG_INODE_EXISTS && 2713 BTRFS_I(inode)->logged_trans == trans->transid) { 2714 btrfs_free_path(path); 2715 btrfs_free_path(dst_path); 2716 goto out; 2717 } 2718 mutex_lock(&BTRFS_I(inode)->log_mutex); 2719 2720 /* 2721 * a brute force approach to making sure we get the most uptodate 2722 * copies of everything. 2723 */ 2724 if (S_ISDIR(inode->i_mode)) { 2725 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 2726 2727 if (inode_only == LOG_INODE_EXISTS) 2728 max_key_type = BTRFS_XATTR_ITEM_KEY; 2729 ret = drop_objectid_items(trans, log, path, 2730 inode->i_ino, max_key_type); 2731 } else { 2732 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2733 } 2734 BUG_ON(ret); 2735 path->keep_locks = 1; 2736 2737 while (1) { 2738 ins_nr = 0; 2739 ret = btrfs_search_forward(root, &min_key, &max_key, 2740 path, 0, trans->transid); 2741 if (ret != 0) 2742 break; 2743 again: 2744 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2745 if (min_key.objectid != inode->i_ino) 2746 break; 2747 if (min_key.type > max_key.type) 2748 break; 2749 2750 src = path->nodes[0]; 2751 size = btrfs_item_size_nr(src, path->slots[0]); 2752 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2753 ins_nr++; 2754 goto next_slot; 2755 } else if (!ins_nr) { 2756 ins_start_slot = path->slots[0]; 2757 ins_nr = 1; 2758 goto next_slot; 2759 } 2760 2761 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2762 ins_nr, inode_only); 2763 BUG_ON(ret); 2764 ins_nr = 1; 2765 ins_start_slot = path->slots[0]; 2766 next_slot: 2767 2768 nritems = btrfs_header_nritems(path->nodes[0]); 2769 path->slots[0]++; 2770 if (path->slots[0] < nritems) { 2771 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 2772 path->slots[0]); 2773 goto again; 2774 } 2775 if (ins_nr) { 2776 ret = copy_items(trans, log, dst_path, src, 2777 ins_start_slot, 2778 ins_nr, inode_only); 
2779 BUG_ON(ret); 2780 ins_nr = 0; 2781 } 2782 btrfs_release_path(root, path); 2783 2784 if (min_key.offset < (u64)-1) 2785 min_key.offset++; 2786 else if (min_key.type < (u8)-1) 2787 min_key.type++; 2788 else if (min_key.objectid < (u64)-1) 2789 min_key.objectid++; 2790 else 2791 break; 2792 } 2793 if (ins_nr) { 2794 ret = copy_items(trans, log, dst_path, src, 2795 ins_start_slot, 2796 ins_nr, inode_only); 2797 BUG_ON(ret); 2798 ins_nr = 0; 2799 } 2800 WARN_ON(ins_nr); 2801 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2802 btrfs_release_path(root, path); 2803 btrfs_release_path(log, dst_path); 2804 BTRFS_I(inode)->log_dirty_trans = 0; 2805 ret = log_directory_changes(trans, root, inode, path, dst_path); 2806 BUG_ON(ret); 2807 } 2808 BTRFS_I(inode)->logged_trans = trans->transid; 2809 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2810 2811 btrfs_free_path(path); 2812 btrfs_free_path(dst_path); 2813 2814 mutex_lock(&root->fs_info->tree_log_mutex); 2815 ret = update_log_root(trans, log); 2816 BUG_ON(ret); 2817 mutex_unlock(&root->fs_info->tree_log_mutex); 2818 out: 2819 return 0; 2820 } 2821 2822 int btrfs_log_inode(struct btrfs_trans_handle *trans, 2823 struct btrfs_root *root, struct inode *inode, 2824 int inode_only) 2825 { 2826 int ret; 2827 2828 start_log_trans(trans, root); 2829 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2830 end_log_trans(root); 2831 return ret; 2832 } 2833 2834 /* 2835 * helper function around btrfs_log_inode to make sure newly created 2836 * parent directories also end up in the log. 
A minimal inode and backref 2837 * only logging is done of any parent directories that are older than 2838 * the last committed transaction 2839 */ 2840 int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2841 struct btrfs_root *root, struct dentry *dentry) 2842 { 2843 int inode_only = LOG_INODE_ALL; 2844 struct super_block *sb; 2845 int ret; 2846 2847 start_log_trans(trans, root); 2848 sb = dentry->d_inode->i_sb; 2849 while (1) { 2850 ret = __btrfs_log_inode(trans, root, dentry->d_inode, 2851 inode_only); 2852 BUG_ON(ret); 2853 inode_only = LOG_INODE_EXISTS; 2854 2855 dentry = dentry->d_parent; 2856 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2857 break; 2858 2859 if (BTRFS_I(dentry->d_inode)->generation <= 2860 root->fs_info->last_trans_committed) 2861 break; 2862 } 2863 end_log_trans(root); 2864 return 0; 2865 } 2866 2867 /* 2868 * it is not safe to log dentry if the chunk root has added new 2869 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 2870 * If this returns 1, you must commit the transaction to safely get your 2871 * data on disk. 
2872 */ 2873 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2874 struct btrfs_root *root, struct dentry *dentry) 2875 { 2876 u64 gen; 2877 gen = root->fs_info->last_trans_new_blockgroup; 2878 if (gen > root->fs_info->last_trans_committed) 2879 return 1; 2880 else 2881 return btrfs_log_dentry(trans, root, dentry); 2882 } 2883 2884 /* 2885 * should be called during mount to recover any replay any log trees 2886 * from the FS 2887 */ 2888 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 2889 { 2890 int ret; 2891 struct btrfs_path *path; 2892 struct btrfs_trans_handle *trans; 2893 struct btrfs_key key; 2894 struct btrfs_key found_key; 2895 struct btrfs_key tmp_key; 2896 struct btrfs_root *log; 2897 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 2898 u64 highest_inode; 2899 struct walk_control wc = { 2900 .process_func = process_one_buffer, 2901 .stage = 0, 2902 }; 2903 2904 fs_info->log_root_recovering = 1; 2905 path = btrfs_alloc_path(); 2906 BUG_ON(!path); 2907 2908 trans = btrfs_start_transaction(fs_info->tree_root, 1); 2909 2910 wc.trans = trans; 2911 wc.pin = 1; 2912 2913 walk_log_tree(trans, log_root_tree, &wc); 2914 2915 again: 2916 key.objectid = BTRFS_TREE_LOG_OBJECTID; 2917 key.offset = (u64)-1; 2918 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 2919 2920 while (1) { 2921 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 2922 if (ret < 0) 2923 break; 2924 if (ret > 0) { 2925 if (path->slots[0] == 0) 2926 break; 2927 path->slots[0]--; 2928 } 2929 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2930 path->slots[0]); 2931 btrfs_release_path(log_root_tree, path); 2932 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 2933 break; 2934 2935 log = btrfs_read_fs_root_no_radix(log_root_tree, 2936 &found_key); 2937 BUG_ON(!log); 2938 2939 2940 tmp_key.objectid = found_key.offset; 2941 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 2942 tmp_key.offset = (u64)-1; 2943 2944 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, 
&tmp_key); 2945 2946 BUG_ON(!wc.replay_dest); 2947 2948 btrfs_record_root_in_trans(wc.replay_dest); 2949 ret = walk_log_tree(trans, log, &wc); 2950 BUG_ON(ret); 2951 2952 if (wc.stage == LOG_WALK_REPLAY_ALL) { 2953 ret = fixup_inode_link_counts(trans, wc.replay_dest, 2954 path); 2955 BUG_ON(ret); 2956 } 2957 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); 2958 if (ret == 0) { 2959 wc.replay_dest->highest_inode = highest_inode; 2960 wc.replay_dest->last_inode_alloc = highest_inode; 2961 } 2962 2963 key.offset = found_key.offset - 1; 2964 free_extent_buffer(log->node); 2965 kfree(log); 2966 2967 if (found_key.offset == 0) 2968 break; 2969 } 2970 btrfs_release_path(log_root_tree, path); 2971 2972 /* step one is to pin it all, step two is to replay just inodes */ 2973 if (wc.pin) { 2974 wc.pin = 0; 2975 wc.process_func = replay_one_buffer; 2976 wc.stage = LOG_WALK_REPLAY_INODES; 2977 goto again; 2978 } 2979 /* step three is to replay everything */ 2980 if (wc.stage < LOG_WALK_REPLAY_ALL) { 2981 wc.stage++; 2982 goto again; 2983 } 2984 2985 btrfs_free_path(path); 2986 2987 free_extent_buffer(log_root_tree->node); 2988 log_root_tree->log_root = NULL; 2989 fs_info->log_root_recovering = 0; 2990 2991 /* step 4: commit the transaction, which also unpins the blocks */ 2992 btrfs_commit_transaction(trans, fs_info->tree_root); 2993 2994 kfree(log_root_tree); 2995 return 0; 2996 } 2997