1e02119d5SChris Mason /* 2e02119d5SChris Mason * Copyright (C) 2008 Oracle. All rights reserved. 3e02119d5SChris Mason * 4e02119d5SChris Mason * This program is free software; you can redistribute it and/or 5e02119d5SChris Mason * modify it under the terms of the GNU General Public 6e02119d5SChris Mason * License v2 as published by the Free Software Foundation. 7e02119d5SChris Mason * 8e02119d5SChris Mason * This program is distributed in the hope that it will be useful, 9e02119d5SChris Mason * but WITHOUT ANY WARRANTY; without even the implied warranty of 10e02119d5SChris Mason * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11e02119d5SChris Mason * General Public License for more details. 12e02119d5SChris Mason * 13e02119d5SChris Mason * You should have received a copy of the GNU General Public 14e02119d5SChris Mason * License along with this program; if not, write to the 15e02119d5SChris Mason * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16e02119d5SChris Mason * Boston, MA 021110-1307, USA. 17e02119d5SChris Mason */ 18e02119d5SChris Mason 19e02119d5SChris Mason #include <linux/sched.h> 205a0e3ad6STejun Heo #include <linux/slab.h> 21e02119d5SChris Mason #include "ctree.h" 22e02119d5SChris Mason #include "transaction.h" 23e02119d5SChris Mason #include "disk-io.h" 24e02119d5SChris Mason #include "locking.h" 25e02119d5SChris Mason #include "print-tree.h" 26e02119d5SChris Mason #include "compat.h" 27b2950863SChristoph Hellwig #include "tree-log.h" 28e02119d5SChris Mason 29e02119d5SChris Mason /* magic values for the inode_only field in btrfs_log_inode: 30e02119d5SChris Mason * 31e02119d5SChris Mason * LOG_INODE_ALL means to log everything 32e02119d5SChris Mason * LOG_INODE_EXISTS means to log just enough to recreate the inode 33e02119d5SChris Mason * during log replay 34e02119d5SChris Mason */ 35e02119d5SChris Mason #define LOG_INODE_ALL 0 36e02119d5SChris Mason #define LOG_INODE_EXISTS 1 37e02119d5SChris Mason 38e02119d5SChris Mason /* 3912fcfd22SChris Mason * directory trouble cases 4012fcfd22SChris Mason * 4112fcfd22SChris Mason * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 4212fcfd22SChris Mason * log, we must force a full commit before doing an fsync of the directory 4312fcfd22SChris Mason * where the unlink was done. 4412fcfd22SChris Mason * ---> record transid of last unlink/rename per directory 4512fcfd22SChris Mason * 4612fcfd22SChris Mason * mkdir foo/some_dir 4712fcfd22SChris Mason * normal commit 4812fcfd22SChris Mason * rename foo/some_dir foo2/some_dir 4912fcfd22SChris Mason * mkdir foo/some_dir 5012fcfd22SChris Mason * fsync foo/some_dir/some_file 5112fcfd22SChris Mason * 5212fcfd22SChris Mason * The fsync above will unlink the original some_dir without recording 5312fcfd22SChris Mason * it in its new location (foo2). After a crash, some_dir will be gone 5412fcfd22SChris Mason * unless the fsync of some_file forces a full commit 5512fcfd22SChris Mason * 5612fcfd22SChris Mason * 2) we must log any new names for any file or dir that is in the fsync 5712fcfd22SChris Mason * log. ---> check inode while renaming/linking. 5812fcfd22SChris Mason * 5912fcfd22SChris Mason * 2a) we must log any new names for any file or dir during rename 6012fcfd22SChris Mason * when the directory they are being removed from was logged. 6112fcfd22SChris Mason * ---> check inode and old parent dir during rename 6212fcfd22SChris Mason * 6312fcfd22SChris Mason * 2a is actually the more important variant. With the extra logging 6412fcfd22SChris Mason * a crash might unlink the old name without recreating the new one 6512fcfd22SChris Mason * 6612fcfd22SChris Mason * 3) after a crash, we must go through any directories with a link count 6712fcfd22SChris Mason * of zero and redo the rm -rf 6812fcfd22SChris Mason * 6912fcfd22SChris Mason * mkdir f1/foo 7012fcfd22SChris Mason * normal commit 7112fcfd22SChris Mason * rm -rf f1/foo 7212fcfd22SChris Mason * fsync(f1) 7312fcfd22SChris Mason * 7412fcfd22SChris Mason * The directory f1 was fully removed from the FS, but fsync was never 7512fcfd22SChris Mason * called on f1, only its parent dir. After a crash the rm -rf must 7612fcfd22SChris Mason * be replayed. This must be able to recurse down the entire 7712fcfd22SChris Mason * directory tree. The inode link count fixup code takes care of the 7812fcfd22SChris Mason * ugly details. 7912fcfd22SChris Mason */ 8012fcfd22SChris Mason 8112fcfd22SChris Mason /* 82e02119d5SChris Mason * stages for the tree walking. The first 83e02119d5SChris Mason * stage (0) is to only pin down the blocks we find 84e02119d5SChris Mason * the second stage (1) is to make sure that all the inodes 85e02119d5SChris Mason * we find in the log are created in the subvolume. 86e02119d5SChris Mason * 87e02119d5SChris Mason * The last stage is to deal with directories and links and extents 88e02119d5SChris Mason * and all the other fun semantics 89e02119d5SChris Mason */ 90e02119d5SChris Mason #define LOG_WALK_PIN_ONLY 0 91e02119d5SChris Mason #define LOG_WALK_REPLAY_INODES 1 92e02119d5SChris Mason #define LOG_WALK_REPLAY_ALL 2 93e02119d5SChris Mason 9412fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans, 95e02119d5SChris Mason struct btrfs_root *root, struct inode *inode, 96e02119d5SChris Mason int inode_only); 97ec051c0fSYan Zheng static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 98ec051c0fSYan Zheng struct btrfs_root *root, 99ec051c0fSYan Zheng struct btrfs_path *path, u64 objectid); 10012fcfd22SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 10112fcfd22SChris Mason struct btrfs_root *root, 10212fcfd22SChris Mason struct btrfs_root *log, 10312fcfd22SChris Mason struct btrfs_path *path, 10412fcfd22SChris Mason u64 dirid, int del_all); 105e02119d5SChris Mason 106e02119d5SChris Mason /* 107e02119d5SChris Mason * tree logging is a special write ahead log used to make sure that 108e02119d5SChris Mason * fsyncs and O_SYNCs can happen without doing full tree commits. 109e02119d5SChris Mason * 110e02119d5SChris Mason * Full tree commits are expensive because they require commonly 111e02119d5SChris Mason * modified blocks to be recowed, creating many dirty pages in the 112e02119d5SChris Mason * extent tree an 4x-6x higher write load than ext3. 113e02119d5SChris Mason * 114e02119d5SChris Mason * Instead of doing a tree commit on every fsync, we use the 115e02119d5SChris Mason * key ranges and transaction ids to find items for a given file or directory 116e02119d5SChris Mason * that have changed in this transaction. Those items are copied into 117e02119d5SChris Mason * a special tree (one per subvolume root), that tree is written to disk 118e02119d5SChris Mason * and then the fsync is considered complete. 119e02119d5SChris Mason * 120e02119d5SChris Mason * After a crash, items are copied out of the log-tree back into the 121e02119d5SChris Mason * subvolume tree. Any file data extents found are recorded in the extent 122e02119d5SChris Mason * allocation tree, and the log-tree freed. 123e02119d5SChris Mason * 124e02119d5SChris Mason * The log tree is read three times, once to pin down all the extents it is 125e02119d5SChris Mason * using in ram and once, once to create all the inodes logged in the tree 126e02119d5SChris Mason * and once to do all the other items. 127e02119d5SChris Mason */ 128e02119d5SChris Mason 129e02119d5SChris Mason /* 130e02119d5SChris Mason * start a sub transaction and setup the log tree 131e02119d5SChris Mason * this increments the log tree writer count to make the people 132e02119d5SChris Mason * syncing the tree wait for us to finish 133e02119d5SChris Mason */ 134e02119d5SChris Mason static int start_log_trans(struct btrfs_trans_handle *trans, 135e02119d5SChris Mason struct btrfs_root *root) 136e02119d5SChris Mason { 137e02119d5SChris Mason int ret; 1384a500fd1SYan, Zheng int err = 0; 1397237f183SYan Zheng 1407237f183SYan Zheng mutex_lock(&root->log_mutex); 1417237f183SYan Zheng if (root->log_root) { 142ff782e0aSJosef Bacik if (!root->log_start_pid) { 143ff782e0aSJosef Bacik root->log_start_pid = current->pid; 144ff782e0aSJosef Bacik root->log_multiple_pids = false; 145ff782e0aSJosef Bacik } else if (root->log_start_pid != current->pid) { 146ff782e0aSJosef Bacik root->log_multiple_pids = true; 147ff782e0aSJosef Bacik } 148ff782e0aSJosef Bacik 1497237f183SYan Zheng root->log_batch++; 1507237f183SYan Zheng atomic_inc(&root->log_writers); 1517237f183SYan Zheng mutex_unlock(&root->log_mutex); 1527237f183SYan Zheng return 0; 1537237f183SYan Zheng } 154ff782e0aSJosef Bacik root->log_multiple_pids = false; 155ff782e0aSJosef Bacik root->log_start_pid = current->pid; 156e02119d5SChris Mason mutex_lock(&root->fs_info->tree_log_mutex); 157e02119d5SChris Mason if (!root->fs_info->log_root_tree) { 158e02119d5SChris Mason ret = btrfs_init_log_root_tree(trans, root->fs_info); 1594a500fd1SYan, Zheng if (ret) 1604a500fd1SYan, Zheng err = ret; 161e02119d5SChris Mason } 1624a500fd1SYan, Zheng if (err == 0 && !root->log_root) { 163e02119d5SChris Mason ret = btrfs_add_log_tree(trans, root); 1644a500fd1SYan, Zheng if (ret) 1654a500fd1SYan, Zheng err = ret; 166e02119d5SChris Mason } 167e02119d5SChris Mason mutex_unlock(&root->fs_info->tree_log_mutex); 1687237f183SYan Zheng root->log_batch++; 1697237f183SYan Zheng atomic_inc(&root->log_writers); 1707237f183SYan Zheng mutex_unlock(&root->log_mutex); 1714a500fd1SYan, Zheng return err; 172e02119d5SChris Mason } 173e02119d5SChris Mason 174e02119d5SChris Mason /* 175e02119d5SChris Mason * returns 0 if there was a log transaction running and we were able 176e02119d5SChris Mason * to join, or returns -ENOENT if there were not transactions 177e02119d5SChris Mason * in progress 178e02119d5SChris Mason */ 179e02119d5SChris Mason static int join_running_log_trans(struct btrfs_root *root) 180e02119d5SChris Mason { 181e02119d5SChris Mason int ret = -ENOENT; 182e02119d5SChris Mason 183e02119d5SChris Mason smp_mb(); 184e02119d5SChris Mason if (!root->log_root) 185e02119d5SChris Mason return -ENOENT; 186e02119d5SChris Mason 1877237f183SYan Zheng mutex_lock(&root->log_mutex); 188e02119d5SChris Mason if (root->log_root) { 189e02119d5SChris Mason ret = 0; 1907237f183SYan Zheng atomic_inc(&root->log_writers); 191e02119d5SChris Mason } 1927237f183SYan Zheng mutex_unlock(&root->log_mutex); 193e02119d5SChris Mason return ret; 194e02119d5SChris Mason } 195e02119d5SChris Mason 196e02119d5SChris Mason /* 19712fcfd22SChris Mason * This either makes the current running log transaction wait 19812fcfd22SChris Mason * until you call btrfs_end_log_trans() or it makes any future 19912fcfd22SChris Mason * log transactions wait until you call btrfs_end_log_trans() 20012fcfd22SChris Mason */ 20112fcfd22SChris Mason int btrfs_pin_log_trans(struct btrfs_root *root) 20212fcfd22SChris Mason { 20312fcfd22SChris Mason int ret = -ENOENT; 20412fcfd22SChris Mason 20512fcfd22SChris Mason mutex_lock(&root->log_mutex); 20612fcfd22SChris Mason atomic_inc(&root->log_writers); 20712fcfd22SChris Mason mutex_unlock(&root->log_mutex); 20812fcfd22SChris Mason return ret; 20912fcfd22SChris Mason } 21012fcfd22SChris Mason 21112fcfd22SChris Mason /* 212e02119d5SChris Mason * indicate we're done making changes to the log tree 213e02119d5SChris Mason * and wake up anyone waiting to do a sync 214e02119d5SChris Mason */ 21512fcfd22SChris Mason int btrfs_end_log_trans(struct btrfs_root *root) 216e02119d5SChris Mason { 2177237f183SYan Zheng if (atomic_dec_and_test(&root->log_writers)) { 218e02119d5SChris Mason smp_mb(); 2197237f183SYan Zheng if (waitqueue_active(&root->log_writer_wait)) 2207237f183SYan Zheng wake_up(&root->log_writer_wait); 2217237f183SYan Zheng } 222e02119d5SChris Mason return 0; 223e02119d5SChris Mason } 224e02119d5SChris Mason 225e02119d5SChris Mason 226e02119d5SChris Mason /* 227e02119d5SChris Mason * the walk control struct is used to pass state down the chain when 228e02119d5SChris Mason * processing the log tree. The stage field tells us which part 229e02119d5SChris Mason * of the log tree processing we are currently doing. The others 230e02119d5SChris Mason * are state fields used for that specific part 231e02119d5SChris Mason */ 232e02119d5SChris Mason struct walk_control { 233e02119d5SChris Mason /* should we free the extent on disk when done? This is used 234e02119d5SChris Mason * at transaction commit time while freeing a log tree 235e02119d5SChris Mason */ 236e02119d5SChris Mason int free; 237e02119d5SChris Mason 238e02119d5SChris Mason /* should we write out the extent buffer? This is used 239e02119d5SChris Mason * while flushing the log tree to disk during a sync 240e02119d5SChris Mason */ 241e02119d5SChris Mason int write; 242e02119d5SChris Mason 243e02119d5SChris Mason /* should we wait for the extent buffer io to finish? Also used 244e02119d5SChris Mason * while flushing the log tree to disk for a sync 245e02119d5SChris Mason */ 246e02119d5SChris Mason int wait; 247e02119d5SChris Mason 248e02119d5SChris Mason /* pin only walk, we record which extents on disk belong to the 249e02119d5SChris Mason * log trees 250e02119d5SChris Mason */ 251e02119d5SChris Mason int pin; 252e02119d5SChris Mason 253e02119d5SChris Mason /* what stage of the replay code we're currently in */ 254e02119d5SChris Mason int stage; 255e02119d5SChris Mason 256e02119d5SChris Mason /* the root we are currently replaying */ 257e02119d5SChris Mason struct btrfs_root *replay_dest; 258e02119d5SChris Mason 259e02119d5SChris Mason /* the trans handle for the current replay */ 260e02119d5SChris Mason struct btrfs_trans_handle *trans; 261e02119d5SChris Mason 262e02119d5SChris Mason /* the function that gets used to process blocks we find in the 263e02119d5SChris Mason * tree. Note the extent_buffer might not be up to date when it is 264e02119d5SChris Mason * passed in, and it must be checked or read if you need the data 265e02119d5SChris Mason * inside it 266e02119d5SChris Mason */ 267e02119d5SChris Mason int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 268e02119d5SChris Mason struct walk_control *wc, u64 gen); 269e02119d5SChris Mason }; 270e02119d5SChris Mason 271e02119d5SChris Mason /* 272e02119d5SChris Mason * process_func used to pin down extents, write them or wait on them 273e02119d5SChris Mason */ 274e02119d5SChris Mason static int process_one_buffer(struct btrfs_root *log, 275e02119d5SChris Mason struct extent_buffer *eb, 276e02119d5SChris Mason struct walk_control *wc, u64 gen) 277e02119d5SChris Mason { 27804018de5SJosef Bacik if (wc->pin) 27911833d66SYan Zheng btrfs_pin_extent(log->fs_info->extent_root, 28011833d66SYan Zheng eb->start, eb->len, 0); 281e02119d5SChris Mason 282e02119d5SChris Mason if (btrfs_buffer_uptodate(eb, gen)) { 283e02119d5SChris Mason if (wc->write) 284e02119d5SChris Mason btrfs_write_tree_block(eb); 285e02119d5SChris Mason if (wc->wait) 286e02119d5SChris Mason btrfs_wait_tree_block_writeback(eb); 287e02119d5SChris Mason } 288e02119d5SChris Mason return 0; 289e02119d5SChris Mason } 290e02119d5SChris Mason 291e02119d5SChris Mason /* 292e02119d5SChris Mason * Item overwrite used by replay and tree logging. eb, slot and key all refer 293e02119d5SChris Mason * to the src data we are copying out. 294e02119d5SChris Mason * 295e02119d5SChris Mason * root is the tree we are copying into, and path is a scratch 296e02119d5SChris Mason * path for use in this function (it should be released on entry and 297e02119d5SChris Mason * will be released on exit). 298e02119d5SChris Mason * 299e02119d5SChris Mason * If the key is already in the destination tree the existing item is 300e02119d5SChris Mason * overwritten. If the existing item isn't big enough, it is extended. 301e02119d5SChris Mason * If it is too large, it is truncated. 302e02119d5SChris Mason * 303e02119d5SChris Mason * If the key isn't in the destination yet, a new item is inserted. 304e02119d5SChris Mason */ 305e02119d5SChris Mason static noinline int overwrite_item(struct btrfs_trans_handle *trans, 306e02119d5SChris Mason struct btrfs_root *root, 307e02119d5SChris Mason struct btrfs_path *path, 308e02119d5SChris Mason struct extent_buffer *eb, int slot, 309e02119d5SChris Mason struct btrfs_key *key) 310e02119d5SChris Mason { 311e02119d5SChris Mason int ret; 312e02119d5SChris Mason u32 item_size; 313e02119d5SChris Mason u64 saved_i_size = 0; 314e02119d5SChris Mason int save_old_i_size = 0; 315e02119d5SChris Mason unsigned long src_ptr; 316e02119d5SChris Mason unsigned long dst_ptr; 317e02119d5SChris Mason int overwrite_root = 0; 318e02119d5SChris Mason 319e02119d5SChris Mason if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 320e02119d5SChris Mason overwrite_root = 1; 321e02119d5SChris Mason 322e02119d5SChris Mason item_size = btrfs_item_size_nr(eb, slot); 323e02119d5SChris Mason src_ptr = btrfs_item_ptr_offset(eb, slot); 324e02119d5SChris Mason 325e02119d5SChris Mason /* look for the key in the destination tree */ 326e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 327e02119d5SChris Mason if (ret == 0) { 328e02119d5SChris Mason char *src_copy; 329e02119d5SChris Mason char *dst_copy; 330e02119d5SChris Mason u32 dst_size = btrfs_item_size_nr(path->nodes[0], 331e02119d5SChris Mason path->slots[0]); 332e02119d5SChris Mason if (dst_size != item_size) 333e02119d5SChris Mason goto insert; 334e02119d5SChris Mason 335e02119d5SChris Mason if (item_size == 0) { 336e02119d5SChris Mason btrfs_release_path(root, path); 337e02119d5SChris Mason return 0; 338e02119d5SChris Mason } 339e02119d5SChris Mason dst_copy = kmalloc(item_size, GFP_NOFS); 340e02119d5SChris Mason src_copy = kmalloc(item_size, GFP_NOFS); 3412a29edc6Sliubo if (!dst_copy || !src_copy) { 3422a29edc6Sliubo btrfs_release_path(root, path); 3432a29edc6Sliubo kfree(dst_copy); 3442a29edc6Sliubo kfree(src_copy); 3452a29edc6Sliubo return -ENOMEM; 3462a29edc6Sliubo } 347e02119d5SChris Mason 348e02119d5SChris Mason read_extent_buffer(eb, src_copy, src_ptr, item_size); 349e02119d5SChris Mason 350e02119d5SChris Mason dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 351e02119d5SChris Mason read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 352e02119d5SChris Mason item_size); 353e02119d5SChris Mason ret = memcmp(dst_copy, src_copy, item_size); 354e02119d5SChris Mason 355e02119d5SChris Mason kfree(dst_copy); 356e02119d5SChris Mason kfree(src_copy); 357e02119d5SChris Mason /* 358e02119d5SChris Mason * they have the same contents, just return, this saves 359e02119d5SChris Mason * us from cowing blocks in the destination tree and doing 360e02119d5SChris Mason * extra writes that may not have been done by a previous 361e02119d5SChris Mason * sync 362e02119d5SChris Mason */ 363e02119d5SChris Mason if (ret == 0) { 364e02119d5SChris Mason btrfs_release_path(root, path); 365e02119d5SChris Mason return 0; 366e02119d5SChris Mason } 367e02119d5SChris Mason 368e02119d5SChris Mason } 369e02119d5SChris Mason insert: 370e02119d5SChris Mason btrfs_release_path(root, path); 371e02119d5SChris Mason /* try to insert the key into the destination tree */ 372e02119d5SChris Mason ret = btrfs_insert_empty_item(trans, root, path, 373e02119d5SChris Mason key, item_size); 374e02119d5SChris Mason 375e02119d5SChris Mason /* make sure any existing item is the correct size */ 376e02119d5SChris Mason if (ret == -EEXIST) { 377e02119d5SChris Mason u32 found_size; 378e02119d5SChris Mason found_size = btrfs_item_size_nr(path->nodes[0], 379e02119d5SChris Mason path->slots[0]); 380e02119d5SChris Mason if (found_size > item_size) { 381e02119d5SChris Mason btrfs_truncate_item(trans, root, path, item_size, 1); 382e02119d5SChris Mason } else if (found_size < item_size) { 38387b29b20SYan Zheng ret = btrfs_extend_item(trans, root, path, 38487b29b20SYan Zheng item_size - found_size); 385e02119d5SChris Mason } 386e02119d5SChris Mason } else if (ret) { 3874a500fd1SYan, Zheng return ret; 388e02119d5SChris Mason } 389e02119d5SChris Mason dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 390e02119d5SChris Mason path->slots[0]); 391e02119d5SChris Mason 392e02119d5SChris Mason /* don't overwrite an existing inode if the generation number 393e02119d5SChris Mason * was logged as zero. This is done when the tree logging code 394e02119d5SChris Mason * is just logging an inode to make sure it exists after recovery. 395e02119d5SChris Mason * 396e02119d5SChris Mason * Also, don't overwrite i_size on directories during replay. 397e02119d5SChris Mason * log replay inserts and removes directory items based on the 398e02119d5SChris Mason * state of the tree found in the subvolume, and i_size is modified 399e02119d5SChris Mason * as it goes 400e02119d5SChris Mason */ 401e02119d5SChris Mason if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 402e02119d5SChris Mason struct btrfs_inode_item *src_item; 403e02119d5SChris Mason struct btrfs_inode_item *dst_item; 404e02119d5SChris Mason 405e02119d5SChris Mason src_item = (struct btrfs_inode_item *)src_ptr; 406e02119d5SChris Mason dst_item = (struct btrfs_inode_item *)dst_ptr; 407e02119d5SChris Mason 408e02119d5SChris Mason if (btrfs_inode_generation(eb, src_item) == 0) 409e02119d5SChris Mason goto no_copy; 410e02119d5SChris Mason 411e02119d5SChris Mason if (overwrite_root && 412e02119d5SChris Mason S_ISDIR(btrfs_inode_mode(eb, src_item)) && 413e02119d5SChris Mason S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 414e02119d5SChris Mason save_old_i_size = 1; 415e02119d5SChris Mason saved_i_size = btrfs_inode_size(path->nodes[0], 416e02119d5SChris Mason dst_item); 417e02119d5SChris Mason } 418e02119d5SChris Mason } 419e02119d5SChris Mason 420e02119d5SChris Mason copy_extent_buffer(path->nodes[0], eb, dst_ptr, 421e02119d5SChris Mason src_ptr, item_size); 422e02119d5SChris Mason 423e02119d5SChris Mason if (save_old_i_size) { 424e02119d5SChris Mason struct btrfs_inode_item *dst_item; 425e02119d5SChris Mason dst_item = (struct btrfs_inode_item *)dst_ptr; 426e02119d5SChris Mason btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 427e02119d5SChris Mason } 428e02119d5SChris Mason 429e02119d5SChris Mason /* make sure the generation is filled in */ 430e02119d5SChris Mason if (key->type == BTRFS_INODE_ITEM_KEY) { 431e02119d5SChris Mason struct btrfs_inode_item *dst_item; 432e02119d5SChris Mason dst_item = (struct btrfs_inode_item *)dst_ptr; 433e02119d5SChris Mason if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 434e02119d5SChris Mason btrfs_set_inode_generation(path->nodes[0], dst_item, 435e02119d5SChris Mason trans->transid); 436e02119d5SChris Mason } 437e02119d5SChris Mason } 438e02119d5SChris Mason no_copy: 439e02119d5SChris Mason btrfs_mark_buffer_dirty(path->nodes[0]); 440e02119d5SChris Mason btrfs_release_path(root, path); 441e02119d5SChris Mason return 0; 442e02119d5SChris Mason } 443e02119d5SChris Mason 444e02119d5SChris Mason /* 445e02119d5SChris Mason * simple helper to read an inode off the disk from a given root 446e02119d5SChris Mason * This can only be called for subvolume roots and not for the log 447e02119d5SChris Mason */ 448e02119d5SChris Mason static noinline struct inode *read_one_inode(struct btrfs_root *root, 449e02119d5SChris Mason u64 objectid) 450e02119d5SChris Mason { 4515d4f98a2SYan Zheng struct btrfs_key key; 452e02119d5SChris Mason struct inode *inode; 453e02119d5SChris Mason 4545d4f98a2SYan Zheng key.objectid = objectid; 4555d4f98a2SYan Zheng key.type = BTRFS_INODE_ITEM_KEY; 4565d4f98a2SYan Zheng key.offset = 0; 45773f73415SJosef Bacik inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); 4585d4f98a2SYan Zheng if (IS_ERR(inode)) { 4595d4f98a2SYan Zheng inode = NULL; 4605d4f98a2SYan Zheng } else if (is_bad_inode(inode)) { 461e02119d5SChris Mason iput(inode); 462e02119d5SChris Mason inode = NULL; 463e02119d5SChris Mason } 464e02119d5SChris Mason return inode; 465e02119d5SChris Mason } 466e02119d5SChris Mason 467e02119d5SChris Mason /* replays a single extent in 'eb' at 'slot' with 'key' into the 468e02119d5SChris Mason * subvolume 'root'. path is released on entry and should be released 469e02119d5SChris Mason * on exit. 470e02119d5SChris Mason * 471e02119d5SChris Mason * extents in the log tree have not been allocated out of the extent 472e02119d5SChris Mason * tree yet. So, this completes the allocation, taking a reference 473e02119d5SChris Mason * as required if the extent already exists or creating a new extent 474e02119d5SChris Mason * if it isn't in the extent allocation tree yet. 475e02119d5SChris Mason * 476e02119d5SChris Mason * The extent is inserted into the file, dropping any existing extents 477e02119d5SChris Mason * from the file that overlap the new one. 478e02119d5SChris Mason */ 479e02119d5SChris Mason static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 480e02119d5SChris Mason struct btrfs_root *root, 481e02119d5SChris Mason struct btrfs_path *path, 482e02119d5SChris Mason struct extent_buffer *eb, int slot, 483e02119d5SChris Mason struct btrfs_key *key) 484e02119d5SChris Mason { 485e02119d5SChris Mason int found_type; 486e02119d5SChris Mason u64 mask = root->sectorsize - 1; 487e02119d5SChris Mason u64 extent_end; 488e02119d5SChris Mason u64 alloc_hint; 489e02119d5SChris Mason u64 start = key->offset; 49007d400a6SYan Zheng u64 saved_nbytes; 491e02119d5SChris Mason struct btrfs_file_extent_item *item; 492e02119d5SChris Mason struct inode *inode = NULL; 493e02119d5SChris Mason unsigned long size; 494e02119d5SChris Mason int ret = 0; 495e02119d5SChris Mason 496e02119d5SChris Mason item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 497e02119d5SChris Mason found_type = btrfs_file_extent_type(eb, item); 498e02119d5SChris Mason 499d899e052SYan Zheng if (found_type == BTRFS_FILE_EXTENT_REG || 500d899e052SYan Zheng found_type == BTRFS_FILE_EXTENT_PREALLOC) 501e02119d5SChris Mason extent_end = start + btrfs_file_extent_num_bytes(eb, item); 502e02119d5SChris Mason else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 503c8b97818SChris Mason size = btrfs_file_extent_inline_len(eb, item); 504e02119d5SChris Mason extent_end = (start + size + mask) & ~mask; 505e02119d5SChris Mason } else { 506e02119d5SChris Mason ret = 0; 507e02119d5SChris Mason goto out; 508e02119d5SChris Mason } 509e02119d5SChris Mason 510e02119d5SChris Mason inode = read_one_inode(root, key->objectid); 511e02119d5SChris Mason if (!inode) { 512e02119d5SChris Mason ret = -EIO; 513e02119d5SChris Mason goto out; 514e02119d5SChris Mason } 515e02119d5SChris Mason 516e02119d5SChris Mason /* 517e02119d5SChris Mason * first check to see if we already have this extent in the 518e02119d5SChris Mason * file. This must be done before the btrfs_drop_extents run 519e02119d5SChris Mason * so we don't try to drop this extent. 520e02119d5SChris Mason */ 521e02119d5SChris Mason ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 522e02119d5SChris Mason start, 0); 523e02119d5SChris Mason 524d899e052SYan Zheng if (ret == 0 && 525d899e052SYan Zheng (found_type == BTRFS_FILE_EXTENT_REG || 526d899e052SYan Zheng found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 527e02119d5SChris Mason struct btrfs_file_extent_item cmp1; 528e02119d5SChris Mason struct btrfs_file_extent_item cmp2; 529e02119d5SChris Mason struct btrfs_file_extent_item *existing; 530e02119d5SChris Mason struct extent_buffer *leaf; 531e02119d5SChris Mason 532e02119d5SChris Mason leaf = path->nodes[0]; 533e02119d5SChris Mason existing = btrfs_item_ptr(leaf, path->slots[0], 534e02119d5SChris Mason struct btrfs_file_extent_item); 535e02119d5SChris Mason 536e02119d5SChris Mason read_extent_buffer(eb, &cmp1, (unsigned long)item, 537e02119d5SChris Mason sizeof(cmp1)); 538e02119d5SChris Mason read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 539e02119d5SChris Mason sizeof(cmp2)); 540e02119d5SChris Mason 541e02119d5SChris Mason /* 542e02119d5SChris Mason * we already have a pointer to this exact extent, 543e02119d5SChris Mason * we don't have to do anything 544e02119d5SChris Mason */ 545e02119d5SChris Mason if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 546e02119d5SChris Mason btrfs_release_path(root, path); 547e02119d5SChris Mason goto out; 548e02119d5SChris Mason } 549e02119d5SChris Mason } 550e02119d5SChris Mason btrfs_release_path(root, path); 551e02119d5SChris Mason 55207d400a6SYan Zheng saved_nbytes = inode_get_bytes(inode); 553e02119d5SChris Mason /* drop any overlapping extents */ 554920bbbfbSYan, Zheng ret = btrfs_drop_extents(trans, inode, start, extent_end, 555920bbbfbSYan, Zheng &alloc_hint, 1); 556e02119d5SChris Mason BUG_ON(ret); 557e02119d5SChris Mason 55807d400a6SYan Zheng if (found_type == BTRFS_FILE_EXTENT_REG || 55907d400a6SYan Zheng found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5605d4f98a2SYan Zheng u64 offset; 56107d400a6SYan Zheng unsigned long dest_offset; 56207d400a6SYan Zheng struct btrfs_key ins; 56307d400a6SYan Zheng 56407d400a6SYan Zheng ret = btrfs_insert_empty_item(trans, root, path, key, 56507d400a6SYan Zheng sizeof(*item)); 56607d400a6SYan Zheng BUG_ON(ret); 56707d400a6SYan Zheng dest_offset = btrfs_item_ptr_offset(path->nodes[0], 56807d400a6SYan Zheng path->slots[0]); 56907d400a6SYan Zheng copy_extent_buffer(path->nodes[0], eb, dest_offset, 57007d400a6SYan Zheng (unsigned long)item, sizeof(*item)); 57107d400a6SYan Zheng 57207d400a6SYan Zheng ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 57307d400a6SYan Zheng ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 57407d400a6SYan Zheng ins.type = BTRFS_EXTENT_ITEM_KEY; 5755d4f98a2SYan Zheng offset = key->offset - btrfs_file_extent_offset(eb, item); 57607d400a6SYan Zheng 57707d400a6SYan Zheng if (ins.objectid > 0) { 57807d400a6SYan Zheng u64 csum_start; 57907d400a6SYan Zheng u64 csum_end; 58007d400a6SYan Zheng LIST_HEAD(ordered_sums); 58107d400a6SYan Zheng /* 58207d400a6SYan Zheng * is this extent already allocated in the extent 58307d400a6SYan Zheng * allocation tree? If so, just add a reference 58407d400a6SYan Zheng */ 58507d400a6SYan Zheng ret = btrfs_lookup_extent(root, ins.objectid, 58607d400a6SYan Zheng ins.offset); 58707d400a6SYan Zheng if (ret == 0) { 58807d400a6SYan Zheng ret = btrfs_inc_extent_ref(trans, root, 58907d400a6SYan Zheng ins.objectid, ins.offset, 5905d4f98a2SYan Zheng 0, root->root_key.objectid, 5915d4f98a2SYan Zheng key->objectid, offset); 59207d400a6SYan Zheng } else { 59307d400a6SYan Zheng /* 59407d400a6SYan Zheng * insert the extent pointer in the extent 59507d400a6SYan Zheng * allocation tree 59607d400a6SYan Zheng */ 5975d4f98a2SYan Zheng ret = btrfs_alloc_logged_file_extent(trans, 5985d4f98a2SYan Zheng root, root->root_key.objectid, 5995d4f98a2SYan Zheng key->objectid, offset, &ins); 60007d400a6SYan Zheng BUG_ON(ret); 60107d400a6SYan Zheng } 60207d400a6SYan Zheng btrfs_release_path(root, path); 60307d400a6SYan Zheng 60407d400a6SYan Zheng if (btrfs_file_extent_compression(eb, item)) { 60507d400a6SYan Zheng csum_start = ins.objectid; 60607d400a6SYan Zheng csum_end = csum_start + ins.offset; 60707d400a6SYan Zheng } else { 60807d400a6SYan Zheng csum_start = ins.objectid + 60907d400a6SYan Zheng btrfs_file_extent_offset(eb, item); 61007d400a6SYan Zheng csum_end = csum_start + 61107d400a6SYan Zheng btrfs_file_extent_num_bytes(eb, item); 61207d400a6SYan Zheng } 61307d400a6SYan Zheng 61407d400a6SYan Zheng ret = btrfs_lookup_csums_range(root->log_root, 61507d400a6SYan Zheng csum_start, csum_end - 1, 61607d400a6SYan Zheng &ordered_sums); 61707d400a6SYan Zheng BUG_ON(ret); 61807d400a6SYan Zheng while (!list_empty(&ordered_sums)) { 61907d400a6SYan Zheng struct btrfs_ordered_sum *sums; 62007d400a6SYan Zheng sums = list_entry(ordered_sums.next, 62107d400a6SYan Zheng struct btrfs_ordered_sum, 62207d400a6SYan Zheng list); 62307d400a6SYan Zheng ret = btrfs_csum_file_blocks(trans, 62407d400a6SYan Zheng root->fs_info->csum_root, 62507d400a6SYan Zheng sums); 62607d400a6SYan Zheng BUG_ON(ret); 62707d400a6SYan Zheng list_del(&sums->list); 62807d400a6SYan Zheng kfree(sums); 62907d400a6SYan Zheng } 63007d400a6SYan Zheng } else { 63107d400a6SYan Zheng btrfs_release_path(root, path); 63207d400a6SYan Zheng } 63307d400a6SYan Zheng } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 63407d400a6SYan Zheng /* inline extents are easy, we just overwrite them */ 635e02119d5SChris Mason ret = overwrite_item(trans, root, path, eb, slot, key); 636e02119d5SChris Mason BUG_ON(ret); 63707d400a6SYan Zheng } 638e02119d5SChris Mason 63907d400a6SYan Zheng inode_set_bytes(inode, saved_nbytes); 640e02119d5SChris Mason btrfs_update_inode(trans, root, inode); 641e02119d5SChris Mason out: 642e02119d5SChris Mason if (inode) 643e02119d5SChris Mason iput(inode); 644e02119d5SChris Mason return ret; 645e02119d5SChris Mason } 646e02119d5SChris Mason 647e02119d5SChris Mason /* 648e02119d5SChris Mason * when cleaning up conflicts between the directory names in the 649e02119d5SChris Mason * subvolume, directory names in the log and directory names in the 650e02119d5SChris Mason * inode back references, we may have to unlink inodes from directories. 651e02119d5SChris Mason * 652e02119d5SChris Mason * This is a helper function to do the unlink of a specific directory 653e02119d5SChris Mason * item 654e02119d5SChris Mason */ 655e02119d5SChris Mason static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 656e02119d5SChris Mason struct btrfs_root *root, 657e02119d5SChris Mason struct btrfs_path *path, 658e02119d5SChris Mason struct inode *dir, 659e02119d5SChris Mason struct btrfs_dir_item *di) 660e02119d5SChris Mason { 661e02119d5SChris Mason struct inode *inode; 662e02119d5SChris Mason char *name; 663e02119d5SChris Mason int name_len; 664e02119d5SChris Mason struct extent_buffer *leaf; 665e02119d5SChris Mason struct btrfs_key location; 666e02119d5SChris Mason int ret; 667e02119d5SChris Mason 668e02119d5SChris Mason leaf = path->nodes[0]; 669e02119d5SChris Mason 670e02119d5SChris Mason btrfs_dir_item_key_to_cpu(leaf, di, &location); 671e02119d5SChris Mason name_len = btrfs_dir_name_len(leaf, di); 672e02119d5SChris Mason name = kmalloc(name_len, GFP_NOFS); 6732a29edc6Sliubo if (!name) 6742a29edc6Sliubo return -ENOMEM; 6752a29edc6Sliubo 676e02119d5SChris Mason read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 677e02119d5SChris Mason btrfs_release_path(root, path); 678e02119d5SChris Mason 679e02119d5SChris Mason inode = read_one_inode(root, location.objectid); 680*c00e9493STsutomu Itoh if (!inode) { 681*c00e9493STsutomu Itoh kfree(name); 682*c00e9493STsutomu Itoh return -EIO; 683*c00e9493STsutomu Itoh } 684e02119d5SChris Mason 685ec051c0fSYan Zheng ret = link_to_fixup_dir(trans, root, path, location.objectid); 686ec051c0fSYan Zheng BUG_ON(ret); 68712fcfd22SChris Mason 688e02119d5SChris Mason ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 689ec051c0fSYan Zheng BUG_ON(ret); 690e02119d5SChris Mason kfree(name); 691e02119d5SChris Mason 692e02119d5SChris Mason iput(inode); 693e02119d5SChris Mason return ret; 694e02119d5SChris Mason } 695e02119d5SChris Mason 696e02119d5SChris Mason /* 697e02119d5SChris Mason * helper function to see if a given name and sequence number found 698e02119d5SChris Mason * in an inode back reference are already in a directory and correctly 699e02119d5SChris Mason * point to this inode 700e02119d5SChris Mason */ 701e02119d5SChris Mason static noinline int inode_in_dir(struct btrfs_root *root, 702e02119d5SChris Mason struct btrfs_path *path, 703e02119d5SChris Mason u64 dirid, u64 objectid, u64 index, 704e02119d5SChris Mason const char *name, int name_len) 705e02119d5SChris Mason { 706e02119d5SChris Mason struct btrfs_dir_item *di; 707e02119d5SChris Mason struct btrfs_key location; 708e02119d5SChris Mason int match = 0; 709e02119d5SChris Mason 710e02119d5SChris Mason di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 711e02119d5SChris Mason index, name, name_len, 0); 712e02119d5SChris Mason if (di && !IS_ERR(di)) { 713e02119d5SChris Mason btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 714e02119d5SChris Mason if (location.objectid != objectid) 715e02119d5SChris Mason goto out; 716e02119d5SChris Mason } else 717e02119d5SChris Mason goto out; 718e02119d5SChris Mason btrfs_release_path(root, path); 719e02119d5SChris Mason 720e02119d5SChris Mason di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 721e02119d5SChris Mason if (di && !IS_ERR(di)) { 722e02119d5SChris Mason btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 723e02119d5SChris Mason if (location.objectid != objectid) 724e02119d5SChris Mason goto out; 725e02119d5SChris Mason } else 726e02119d5SChris Mason goto out; 727e02119d5SChris Mason match = 1; 728e02119d5SChris Mason out: 729e02119d5SChris Mason btrfs_release_path(root, path); 730e02119d5SChris Mason return match; 731e02119d5SChris Mason } 732e02119d5SChris Mason 733e02119d5SChris Mason /* 734e02119d5SChris Mason * helper function to check a log tree for a named back reference in 735e02119d5SChris Mason * an inode. This is used to decide if a back reference that is 736e02119d5SChris Mason * found in the subvolume conflicts with what we find in the log. 737e02119d5SChris Mason * 738e02119d5SChris Mason * inode backreferences may have multiple refs in a single item, 739e02119d5SChris Mason * during replay we process one reference at a time, and we don't 740e02119d5SChris Mason * want to delete valid links to a file from the subvolume if that 741e02119d5SChris Mason * link is also in the log. 742e02119d5SChris Mason */ 743e02119d5SChris Mason static noinline int backref_in_log(struct btrfs_root *log, 744e02119d5SChris Mason struct btrfs_key *key, 745e02119d5SChris Mason char *name, int namelen) 746e02119d5SChris Mason { 747e02119d5SChris Mason struct btrfs_path *path; 748e02119d5SChris Mason struct btrfs_inode_ref *ref; 749e02119d5SChris Mason unsigned long ptr; 750e02119d5SChris Mason unsigned long ptr_end; 751e02119d5SChris Mason unsigned long name_ptr; 752e02119d5SChris Mason int found_name_len; 753e02119d5SChris Mason int item_size; 754e02119d5SChris Mason int ret; 755e02119d5SChris Mason int match = 0; 756e02119d5SChris Mason 757e02119d5SChris Mason path = btrfs_alloc_path(); 7582a29edc6Sliubo if (!path) 7592a29edc6Sliubo return -ENOMEM; 7602a29edc6Sliubo 761e02119d5SChris Mason ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 762e02119d5SChris Mason if (ret != 0) 763e02119d5SChris Mason goto out; 764e02119d5SChris Mason 765e02119d5SChris Mason item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 766e02119d5SChris Mason ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 767e02119d5SChris Mason ptr_end = ptr + item_size; 768e02119d5SChris Mason while (ptr < ptr_end) { 769e02119d5SChris Mason ref = (struct btrfs_inode_ref *)ptr; 770e02119d5SChris Mason found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 771e02119d5SChris Mason if (found_name_len == namelen) { 772e02119d5SChris Mason name_ptr = (unsigned long)(ref + 1); 773e02119d5SChris Mason ret = memcmp_extent_buffer(path->nodes[0], name, 774e02119d5SChris Mason name_ptr, namelen); 775e02119d5SChris Mason if (ret == 0) { 776e02119d5SChris Mason match = 1; 777e02119d5SChris Mason goto out; 778e02119d5SChris Mason } 779e02119d5SChris Mason } 780e02119d5SChris Mason ptr = (unsigned long)(ref + 1) + found_name_len; 781e02119d5SChris Mason } 782e02119d5SChris Mason out: 783e02119d5SChris Mason btrfs_free_path(path); 784e02119d5SChris Mason return match; 785e02119d5SChris Mason } 786e02119d5SChris Mason 787e02119d5SChris Mason 788e02119d5SChris Mason /* 789e02119d5SChris Mason * replay one inode back reference item found in the log tree. 790e02119d5SChris Mason * eb, slot and key refer to the buffer and key found in the log tree. 791e02119d5SChris Mason * root is the destination we are replaying into, and path is for temp 792e02119d5SChris Mason * use by this function. (it should be released on return). 793e02119d5SChris Mason */ 794e02119d5SChris Mason static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 795e02119d5SChris Mason struct btrfs_root *root, 796e02119d5SChris Mason struct btrfs_root *log, 797e02119d5SChris Mason struct btrfs_path *path, 798e02119d5SChris Mason struct extent_buffer *eb, int slot, 799e02119d5SChris Mason struct btrfs_key *key) 800e02119d5SChris Mason { 801e02119d5SChris Mason struct inode *dir; 802e02119d5SChris Mason int ret; 803e02119d5SChris Mason struct btrfs_inode_ref *ref; 804e02119d5SChris Mason struct inode *inode; 805e02119d5SChris Mason char *name; 806e02119d5SChris Mason int namelen; 807e02119d5SChris Mason unsigned long ref_ptr; 808e02119d5SChris Mason unsigned long ref_end; 809c622ae60Sliubo int search_done = 0; 810e02119d5SChris Mason 811e02119d5SChris Mason /* 812e02119d5SChris Mason * it is possible that we didn't log all the parent directories 813e02119d5SChris Mason * for a given inode. If we don't find the dir, just don't 814e02119d5SChris Mason * copy the back ref in. The link count fixup code will take 815e02119d5SChris Mason * care of the rest 816e02119d5SChris Mason */ 817e02119d5SChris Mason dir = read_one_inode(root, key->offset); 818e02119d5SChris Mason if (!dir) 819e02119d5SChris Mason return -ENOENT; 820e02119d5SChris Mason 821e02119d5SChris Mason inode = read_one_inode(root, key->objectid); 822*c00e9493STsutomu Itoh if (!inode) { 823*c00e9493STsutomu Itoh iput(dir); 824*c00e9493STsutomu Itoh return -EIO; 825*c00e9493STsutomu Itoh } 826e02119d5SChris Mason 827e02119d5SChris Mason ref_ptr = btrfs_item_ptr_offset(eb, slot); 828e02119d5SChris Mason ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 829e02119d5SChris Mason 830e02119d5SChris Mason again: 831e02119d5SChris Mason ref = (struct btrfs_inode_ref *)ref_ptr; 832e02119d5SChris Mason 833e02119d5SChris Mason namelen = btrfs_inode_ref_name_len(eb, ref); 834e02119d5SChris Mason name = kmalloc(namelen, GFP_NOFS); 835e02119d5SChris Mason BUG_ON(!name); 836e02119d5SChris Mason 837e02119d5SChris Mason read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 838e02119d5SChris Mason 839e02119d5SChris Mason /* if we already have a perfect match, we're done */ 840e02119d5SChris Mason if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 841e02119d5SChris Mason btrfs_inode_ref_index(eb, ref), 842e02119d5SChris Mason name, namelen)) { 843e02119d5SChris Mason goto out; 844e02119d5SChris Mason } 845e02119d5SChris Mason 846e02119d5SChris Mason /* 847e02119d5SChris Mason * look for a conflicting back reference in the metadata. 848e02119d5SChris Mason * if we find one we have to unlink that name of the file 849e02119d5SChris Mason * before we add our new link. Later on, we overwrite any 850e02119d5SChris Mason * existing back reference, and we don't want to create 851e02119d5SChris Mason * dangling pointers in the directory. 852e02119d5SChris Mason */ 853c622ae60Sliubo 854c622ae60Sliubo if (search_done) 855c622ae60Sliubo goto insert; 856c622ae60Sliubo 857e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 858e02119d5SChris Mason if (ret == 0) { 859e02119d5SChris Mason char *victim_name; 860e02119d5SChris Mason int victim_name_len; 861e02119d5SChris Mason struct btrfs_inode_ref *victim_ref; 862e02119d5SChris Mason unsigned long ptr; 863e02119d5SChris Mason unsigned long ptr_end; 864e02119d5SChris Mason struct extent_buffer *leaf = path->nodes[0]; 865e02119d5SChris Mason 866e02119d5SChris Mason /* are we trying to overwrite a back ref for the root directory 867e02119d5SChris Mason * if so, just jump out, we're done 868e02119d5SChris Mason */ 869e02119d5SChris Mason if (key->objectid == key->offset) 870e02119d5SChris Mason goto out_nowrite; 871e02119d5SChris Mason 872e02119d5SChris Mason /* check all the names in this back reference to see 873e02119d5SChris Mason * if they are in the log. if so, we allow them to stay 874e02119d5SChris Mason * otherwise they must be unlinked as a conflict 875e02119d5SChris Mason */ 876e02119d5SChris Mason ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 877e02119d5SChris Mason ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 878e02119d5SChris Mason while (ptr < ptr_end) { 879e02119d5SChris Mason victim_ref = (struct btrfs_inode_ref *)ptr; 880e02119d5SChris Mason victim_name_len = btrfs_inode_ref_name_len(leaf, 881e02119d5SChris Mason victim_ref); 882e02119d5SChris Mason victim_name = kmalloc(victim_name_len, GFP_NOFS); 883e02119d5SChris Mason BUG_ON(!victim_name); 884e02119d5SChris Mason 885e02119d5SChris Mason read_extent_buffer(leaf, victim_name, 886e02119d5SChris Mason (unsigned long)(victim_ref + 1), 887e02119d5SChris Mason victim_name_len); 888e02119d5SChris Mason 889e02119d5SChris Mason if (!backref_in_log(log, key, victim_name, 890e02119d5SChris Mason victim_name_len)) { 891e02119d5SChris Mason btrfs_inc_nlink(inode); 892e02119d5SChris Mason btrfs_release_path(root, path); 89312fcfd22SChris Mason 894e02119d5SChris Mason ret = btrfs_unlink_inode(trans, root, dir, 895e02119d5SChris Mason inode, victim_name, 896e02119d5SChris Mason victim_name_len); 897e02119d5SChris Mason } 898e02119d5SChris Mason kfree(victim_name); 899e02119d5SChris Mason ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 900e02119d5SChris Mason } 901e02119d5SChris Mason BUG_ON(ret); 902c622ae60Sliubo 903c622ae60Sliubo /* 904c622ae60Sliubo * NOTE: we have searched root tree and checked the 905c622ae60Sliubo * coresponding ref, it does not need to check again. 906c622ae60Sliubo */ 907c622ae60Sliubo search_done = 1; 908e02119d5SChris Mason } 909e02119d5SChris Mason btrfs_release_path(root, path); 910e02119d5SChris Mason 911c622ae60Sliubo insert: 912e02119d5SChris Mason /* insert our name */ 913e02119d5SChris Mason ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 914e02119d5SChris Mason btrfs_inode_ref_index(eb, ref)); 915e02119d5SChris Mason BUG_ON(ret); 916e02119d5SChris Mason 917e02119d5SChris Mason btrfs_update_inode(trans, root, inode); 918e02119d5SChris Mason 919e02119d5SChris Mason out: 920e02119d5SChris Mason ref_ptr = (unsigned long)(ref + 1) + namelen; 921e02119d5SChris Mason kfree(name); 922e02119d5SChris Mason if (ref_ptr < ref_end) 923e02119d5SChris Mason goto again; 924e02119d5SChris Mason 925e02119d5SChris Mason /* finally write the back reference in the inode */ 926e02119d5SChris Mason ret = overwrite_item(trans, root, path, eb, slot, key); 927e02119d5SChris Mason BUG_ON(ret); 928e02119d5SChris Mason 929e02119d5SChris Mason out_nowrite: 930e02119d5SChris Mason btrfs_release_path(root, path); 931e02119d5SChris Mason iput(dir); 932e02119d5SChris Mason iput(inode); 933e02119d5SChris Mason return 0; 934e02119d5SChris Mason } 935e02119d5SChris Mason 936c71bf099SYan, Zheng static int insert_orphan_item(struct btrfs_trans_handle *trans, 937c71bf099SYan, Zheng struct btrfs_root *root, u64 offset) 938c71bf099SYan, Zheng { 939c71bf099SYan, Zheng int ret; 940c71bf099SYan, Zheng ret = btrfs_find_orphan_item(root, offset); 941c71bf099SYan, Zheng if (ret > 0) 942c71bf099SYan, Zheng ret = btrfs_insert_orphan_item(trans, root, offset); 943c71bf099SYan, Zheng return ret; 944c71bf099SYan, Zheng } 945c71bf099SYan, Zheng 946c71bf099SYan, Zheng 947e02119d5SChris Mason /* 948e02119d5SChris Mason * There are a few corners where the link count of the file can't 949e02119d5SChris Mason * be properly maintained during replay. So, instead of adding 950e02119d5SChris Mason * lots of complexity to the log code, we just scan the backrefs 951e02119d5SChris Mason * for any file that has been through replay. 952e02119d5SChris Mason * 953e02119d5SChris Mason * The scan will update the link count on the inode to reflect the 954e02119d5SChris Mason * number of back refs found. If it goes down to zero, the iput 955e02119d5SChris Mason * will free the inode. 956e02119d5SChris Mason */ 957e02119d5SChris Mason static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 958e02119d5SChris Mason struct btrfs_root *root, 959e02119d5SChris Mason struct inode *inode) 960e02119d5SChris Mason { 961e02119d5SChris Mason struct btrfs_path *path; 962e02119d5SChris Mason int ret; 963e02119d5SChris Mason struct btrfs_key key; 964e02119d5SChris Mason u64 nlink = 0; 965e02119d5SChris Mason unsigned long ptr; 966e02119d5SChris Mason unsigned long ptr_end; 967e02119d5SChris Mason int name_len; 968e02119d5SChris Mason 969e02119d5SChris Mason key.objectid = inode->i_ino; 970e02119d5SChris Mason key.type = BTRFS_INODE_REF_KEY; 971e02119d5SChris Mason key.offset = (u64)-1; 972e02119d5SChris Mason 973e02119d5SChris Mason path = btrfs_alloc_path(); 9742a29edc6Sliubo if (!path) 9752a29edc6Sliubo return -ENOMEM; 976e02119d5SChris Mason 977e02119d5SChris Mason while (1) { 978e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 979e02119d5SChris Mason if (ret < 0) 980e02119d5SChris Mason break; 981e02119d5SChris Mason if (ret > 0) { 982e02119d5SChris Mason if (path->slots[0] == 0) 983e02119d5SChris Mason break; 984e02119d5SChris Mason path->slots[0]--; 985e02119d5SChris Mason } 986e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &key, 987e02119d5SChris Mason path->slots[0]); 988e02119d5SChris Mason if (key.objectid != inode->i_ino || 989e02119d5SChris Mason key.type != BTRFS_INODE_REF_KEY) 990e02119d5SChris Mason break; 991e02119d5SChris Mason ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 992e02119d5SChris Mason ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 993e02119d5SChris Mason path->slots[0]); 994e02119d5SChris Mason while (ptr < ptr_end) { 995e02119d5SChris Mason struct btrfs_inode_ref *ref; 996e02119d5SChris Mason 997e02119d5SChris Mason ref = (struct btrfs_inode_ref *)ptr; 998e02119d5SChris Mason name_len = btrfs_inode_ref_name_len(path->nodes[0], 999e02119d5SChris Mason ref); 1000e02119d5SChris Mason ptr = (unsigned long)(ref + 1) + name_len; 1001e02119d5SChris Mason nlink++; 1002e02119d5SChris Mason } 1003e02119d5SChris Mason 1004e02119d5SChris Mason if (key.offset == 0) 1005e02119d5SChris Mason break; 1006e02119d5SChris Mason key.offset--; 1007e02119d5SChris Mason btrfs_release_path(root, path); 1008e02119d5SChris Mason } 100912fcfd22SChris Mason btrfs_release_path(root, path); 1010e02119d5SChris Mason if (nlink != inode->i_nlink) { 1011e02119d5SChris Mason inode->i_nlink = nlink; 1012e02119d5SChris Mason btrfs_update_inode(trans, root, inode); 1013e02119d5SChris Mason } 10148d5bf1cbSChris Mason BTRFS_I(inode)->index_cnt = (u64)-1; 1015e02119d5SChris Mason 1016c71bf099SYan, Zheng if (inode->i_nlink == 0) { 1017c71bf099SYan, Zheng if (S_ISDIR(inode->i_mode)) { 101812fcfd22SChris Mason ret = replay_dir_deletes(trans, root, NULL, path, 101912fcfd22SChris Mason inode->i_ino, 1); 102012fcfd22SChris Mason BUG_ON(ret); 102112fcfd22SChris Mason } 1022c71bf099SYan, Zheng ret = insert_orphan_item(trans, root, inode->i_ino); 1023c71bf099SYan, Zheng BUG_ON(ret); 1024c71bf099SYan, Zheng } 102512fcfd22SChris Mason btrfs_free_path(path); 102612fcfd22SChris Mason 1027e02119d5SChris Mason return 0; 1028e02119d5SChris Mason } 1029e02119d5SChris Mason 1030e02119d5SChris Mason static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1031e02119d5SChris Mason struct btrfs_root *root, 1032e02119d5SChris Mason struct btrfs_path *path) 1033e02119d5SChris Mason { 1034e02119d5SChris Mason int ret; 1035e02119d5SChris Mason struct btrfs_key key; 1036e02119d5SChris Mason struct inode *inode; 1037e02119d5SChris Mason 1038e02119d5SChris Mason key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1039e02119d5SChris Mason key.type = BTRFS_ORPHAN_ITEM_KEY; 1040e02119d5SChris Mason key.offset = (u64)-1; 1041e02119d5SChris Mason while (1) { 1042e02119d5SChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1043e02119d5SChris Mason if (ret < 0) 1044e02119d5SChris Mason break; 1045e02119d5SChris Mason 1046e02119d5SChris Mason if (ret == 1) { 1047e02119d5SChris Mason if (path->slots[0] == 0) 1048e02119d5SChris Mason break; 1049e02119d5SChris Mason path->slots[0]--; 1050e02119d5SChris Mason } 1051e02119d5SChris Mason 1052e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1053e02119d5SChris Mason if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1054e02119d5SChris Mason key.type != BTRFS_ORPHAN_ITEM_KEY) 1055e02119d5SChris Mason break; 1056e02119d5SChris Mason 1057e02119d5SChris Mason ret = btrfs_del_item(trans, root, path); 105865a246c5STsutomu Itoh if (ret) 105965a246c5STsutomu Itoh goto out; 1060e02119d5SChris Mason 1061e02119d5SChris Mason btrfs_release_path(root, path); 1062e02119d5SChris Mason inode = read_one_inode(root, key.offset); 1063*c00e9493STsutomu Itoh if (!inode) 1064*c00e9493STsutomu Itoh return -EIO; 1065e02119d5SChris Mason 1066e02119d5SChris Mason ret = fixup_inode_link_count(trans, root, inode); 1067e02119d5SChris Mason BUG_ON(ret); 1068e02119d5SChris Mason 1069e02119d5SChris Mason iput(inode); 1070e02119d5SChris Mason 107112fcfd22SChris Mason /* 107212fcfd22SChris Mason * fixup on a directory may create new entries, 107312fcfd22SChris Mason * make sure we always look for the highset possible 107412fcfd22SChris Mason * offset 107512fcfd22SChris Mason */ 107612fcfd22SChris Mason key.offset = (u64)-1; 1077e02119d5SChris Mason } 107865a246c5STsutomu Itoh ret = 0; 107965a246c5STsutomu Itoh out: 1080e02119d5SChris Mason btrfs_release_path(root, path); 108165a246c5STsutomu Itoh return ret; 1082e02119d5SChris Mason } 1083e02119d5SChris Mason 1084e02119d5SChris Mason 1085e02119d5SChris Mason /* 1086e02119d5SChris Mason * record a given inode in the fixup dir so we can check its link 1087e02119d5SChris Mason * count when replay is done. The link count is incremented here 1088e02119d5SChris Mason * so the inode won't go away until we check it 1089e02119d5SChris Mason */ 1090e02119d5SChris Mason static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1091e02119d5SChris Mason struct btrfs_root *root, 1092e02119d5SChris Mason struct btrfs_path *path, 1093e02119d5SChris Mason u64 objectid) 1094e02119d5SChris Mason { 1095e02119d5SChris Mason struct btrfs_key key; 1096e02119d5SChris Mason int ret = 0; 1097e02119d5SChris Mason struct inode *inode; 1098e02119d5SChris Mason 1099e02119d5SChris Mason inode = read_one_inode(root, objectid); 1100*c00e9493STsutomu Itoh if (!inode) 1101*c00e9493STsutomu Itoh return -EIO; 1102e02119d5SChris Mason 1103e02119d5SChris Mason key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1104e02119d5SChris Mason btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1105e02119d5SChris Mason key.offset = objectid; 1106e02119d5SChris Mason 1107e02119d5SChris Mason ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1108e02119d5SChris Mason 1109e02119d5SChris Mason btrfs_release_path(root, path); 1110e02119d5SChris Mason if (ret == 0) { 1111e02119d5SChris Mason btrfs_inc_nlink(inode); 1112e02119d5SChris Mason btrfs_update_inode(trans, root, inode); 1113e02119d5SChris Mason } else if (ret == -EEXIST) { 1114e02119d5SChris Mason ret = 0; 1115e02119d5SChris Mason } else { 1116e02119d5SChris Mason BUG(); 1117e02119d5SChris Mason } 1118e02119d5SChris Mason iput(inode); 1119e02119d5SChris Mason 1120e02119d5SChris Mason return ret; 1121e02119d5SChris Mason } 1122e02119d5SChris Mason 1123e02119d5SChris Mason /* 1124e02119d5SChris Mason * when replaying the log for a directory, we only insert names 1125e02119d5SChris Mason * for inodes that actually exist. This means an fsync on a directory 1126e02119d5SChris Mason * does not implicitly fsync all the new files in it 1127e02119d5SChris Mason */ 1128e02119d5SChris Mason static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1129e02119d5SChris Mason struct btrfs_root *root, 1130e02119d5SChris Mason struct btrfs_path *path, 1131e02119d5SChris Mason u64 dirid, u64 index, 1132e02119d5SChris Mason char *name, int name_len, u8 type, 1133e02119d5SChris Mason struct btrfs_key *location) 1134e02119d5SChris Mason { 1135e02119d5SChris Mason struct inode *inode; 1136e02119d5SChris Mason struct inode *dir; 1137e02119d5SChris Mason int ret; 1138e02119d5SChris Mason 1139e02119d5SChris Mason inode = read_one_inode(root, location->objectid); 1140e02119d5SChris Mason if (!inode) 1141e02119d5SChris Mason return -ENOENT; 1142e02119d5SChris Mason 1143e02119d5SChris Mason dir = read_one_inode(root, dirid); 1144e02119d5SChris Mason if (!dir) { 1145e02119d5SChris Mason iput(inode); 1146e02119d5SChris Mason return -EIO; 1147e02119d5SChris Mason } 1148e02119d5SChris Mason ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1149e02119d5SChris Mason 1150e02119d5SChris Mason /* FIXME, put inode into FIXUP list */ 1151e02119d5SChris Mason 1152e02119d5SChris Mason iput(inode); 1153e02119d5SChris Mason iput(dir); 1154e02119d5SChris Mason return ret; 1155e02119d5SChris Mason } 1156e02119d5SChris Mason 1157e02119d5SChris Mason /* 1158e02119d5SChris Mason * take a single entry in a log directory item and replay it into 1159e02119d5SChris Mason * the subvolume. 1160e02119d5SChris Mason * 1161e02119d5SChris Mason * if a conflicting item exists in the subdirectory already, 1162e02119d5SChris Mason * the inode it points to is unlinked and put into the link count 1163e02119d5SChris Mason * fix up tree. 1164e02119d5SChris Mason * 1165e02119d5SChris Mason * If a name from the log points to a file or directory that does 1166e02119d5SChris Mason * not exist in the FS, it is skipped. fsyncs on directories 1167e02119d5SChris Mason * do not force down inodes inside that directory, just changes to the 1168e02119d5SChris Mason * names or unlinks in a directory. 1169e02119d5SChris Mason */ 1170e02119d5SChris Mason static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1171e02119d5SChris Mason struct btrfs_root *root, 1172e02119d5SChris Mason struct btrfs_path *path, 1173e02119d5SChris Mason struct extent_buffer *eb, 1174e02119d5SChris Mason struct btrfs_dir_item *di, 1175e02119d5SChris Mason struct btrfs_key *key) 1176e02119d5SChris Mason { 1177e02119d5SChris Mason char *name; 1178e02119d5SChris Mason int name_len; 1179e02119d5SChris Mason struct btrfs_dir_item *dst_di; 1180e02119d5SChris Mason struct btrfs_key found_key; 1181e02119d5SChris Mason struct btrfs_key log_key; 1182e02119d5SChris Mason struct inode *dir; 1183e02119d5SChris Mason u8 log_type; 11844bef0848SChris Mason int exists; 1185e02119d5SChris Mason int ret; 1186e02119d5SChris Mason 1187e02119d5SChris Mason dir = read_one_inode(root, key->objectid); 1188*c00e9493STsutomu Itoh if (!dir) 1189*c00e9493STsutomu Itoh return -EIO; 1190e02119d5SChris Mason 1191e02119d5SChris Mason name_len = btrfs_dir_name_len(eb, di); 1192e02119d5SChris Mason name = kmalloc(name_len, GFP_NOFS); 11932a29edc6Sliubo if (!name) 11942a29edc6Sliubo return -ENOMEM; 11952a29edc6Sliubo 1196e02119d5SChris Mason log_type = btrfs_dir_type(eb, di); 1197e02119d5SChris Mason read_extent_buffer(eb, name, (unsigned long)(di + 1), 1198e02119d5SChris Mason name_len); 1199e02119d5SChris Mason 1200e02119d5SChris Mason btrfs_dir_item_key_to_cpu(eb, di, &log_key); 12014bef0848SChris Mason exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 12024bef0848SChris Mason if (exists == 0) 12034bef0848SChris Mason exists = 1; 12044bef0848SChris Mason else 12054bef0848SChris Mason exists = 0; 12064bef0848SChris Mason btrfs_release_path(root, path); 12074bef0848SChris Mason 1208e02119d5SChris Mason if (key->type == BTRFS_DIR_ITEM_KEY) { 1209e02119d5SChris Mason dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1210e02119d5SChris Mason name, name_len, 1); 1211d397712bSChris Mason } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1212e02119d5SChris Mason dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1213e02119d5SChris Mason key->objectid, 1214e02119d5SChris Mason key->offset, name, 1215e02119d5SChris Mason name_len, 1); 1216e02119d5SChris Mason } else { 1217e02119d5SChris Mason BUG(); 1218e02119d5SChris Mason } 1219e02119d5SChris Mason if (!dst_di || IS_ERR(dst_di)) { 1220e02119d5SChris Mason /* we need a sequence number to insert, so we only 1221e02119d5SChris Mason * do inserts for the BTRFS_DIR_INDEX_KEY types 1222e02119d5SChris Mason */ 1223e02119d5SChris Mason if (key->type != BTRFS_DIR_INDEX_KEY) 1224e02119d5SChris Mason goto out; 1225e02119d5SChris Mason goto insert; 1226e02119d5SChris Mason } 1227e02119d5SChris Mason 1228e02119d5SChris Mason btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1229e02119d5SChris Mason /* the existing item matches the logged item */ 1230e02119d5SChris Mason if (found_key.objectid == log_key.objectid && 1231e02119d5SChris Mason found_key.type == log_key.type && 1232e02119d5SChris Mason found_key.offset == log_key.offset && 1233e02119d5SChris Mason btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1234e02119d5SChris Mason goto out; 1235e02119d5SChris Mason } 1236e02119d5SChris Mason 1237e02119d5SChris Mason /* 1238e02119d5SChris Mason * don't drop the conflicting directory entry if the inode 1239e02119d5SChris Mason * for the new entry doesn't exist 1240e02119d5SChris Mason */ 12414bef0848SChris Mason if (!exists) 1242e02119d5SChris Mason goto out; 1243e02119d5SChris Mason 1244e02119d5SChris Mason ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1245e02119d5SChris Mason BUG_ON(ret); 1246e02119d5SChris Mason 1247e02119d5SChris Mason if (key->type == BTRFS_DIR_INDEX_KEY) 1248e02119d5SChris Mason goto insert; 1249e02119d5SChris Mason out: 1250e02119d5SChris Mason btrfs_release_path(root, path); 1251e02119d5SChris Mason kfree(name); 1252e02119d5SChris Mason iput(dir); 1253e02119d5SChris Mason return 0; 1254e02119d5SChris Mason 1255e02119d5SChris Mason insert: 1256e02119d5SChris Mason btrfs_release_path(root, path); 1257e02119d5SChris Mason ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1258e02119d5SChris Mason name, name_len, log_type, &log_key); 1259e02119d5SChris Mason 1260c293498bSStoyan Gaydarov BUG_ON(ret && ret != -ENOENT); 1261e02119d5SChris Mason goto out; 1262e02119d5SChris Mason } 1263e02119d5SChris Mason 1264e02119d5SChris Mason /* 1265e02119d5SChris Mason * find all the names in a directory item and reconcile them into 1266e02119d5SChris Mason * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1267e02119d5SChris Mason * one name in a directory item, but the same code gets used for 1268e02119d5SChris Mason * both directory index types 1269e02119d5SChris Mason */ 1270e02119d5SChris Mason static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1271e02119d5SChris Mason struct btrfs_root *root, 1272e02119d5SChris Mason struct btrfs_path *path, 1273e02119d5SChris Mason struct extent_buffer *eb, int slot, 1274e02119d5SChris Mason struct btrfs_key *key) 1275e02119d5SChris Mason { 1276e02119d5SChris Mason int ret; 1277e02119d5SChris Mason u32 item_size = btrfs_item_size_nr(eb, slot); 1278e02119d5SChris Mason struct btrfs_dir_item *di; 1279e02119d5SChris Mason int name_len; 1280e02119d5SChris Mason unsigned long ptr; 1281e02119d5SChris Mason unsigned long ptr_end; 1282e02119d5SChris Mason 1283e02119d5SChris Mason ptr = btrfs_item_ptr_offset(eb, slot); 1284e02119d5SChris Mason ptr_end = ptr + item_size; 1285e02119d5SChris Mason while (ptr < ptr_end) { 1286e02119d5SChris Mason di = (struct btrfs_dir_item *)ptr; 128722a94d44SJosef Bacik if (verify_dir_item(root, eb, di)) 128822a94d44SJosef Bacik return -EIO; 1289e02119d5SChris Mason name_len = btrfs_dir_name_len(eb, di); 1290e02119d5SChris Mason ret = replay_one_name(trans, root, path, eb, di, key); 1291e02119d5SChris Mason BUG_ON(ret); 1292e02119d5SChris Mason ptr = (unsigned long)(di + 1); 1293e02119d5SChris Mason ptr += name_len; 1294e02119d5SChris Mason } 1295e02119d5SChris Mason return 0; 1296e02119d5SChris Mason } 1297e02119d5SChris Mason 1298e02119d5SChris Mason /* 1299e02119d5SChris Mason * directory replay has two parts. There are the standard directory 1300e02119d5SChris Mason * items in the log copied from the subvolume, and range items 1301e02119d5SChris Mason * created in the log while the subvolume was logged. 1302e02119d5SChris Mason * 1303e02119d5SChris Mason * The range items tell us which parts of the key space the log 1304e02119d5SChris Mason * is authoritative for. During replay, if a key in the subvolume 1305e02119d5SChris Mason * directory is in a logged range item, but not actually in the log 1306e02119d5SChris Mason * that means it was deleted from the directory before the fsync 1307e02119d5SChris Mason * and should be removed. 1308e02119d5SChris Mason */ 1309e02119d5SChris Mason static noinline int find_dir_range(struct btrfs_root *root, 1310e02119d5SChris Mason struct btrfs_path *path, 1311e02119d5SChris Mason u64 dirid, int key_type, 1312e02119d5SChris Mason u64 *start_ret, u64 *end_ret) 1313e02119d5SChris Mason { 1314e02119d5SChris Mason struct btrfs_key key; 1315e02119d5SChris Mason u64 found_end; 1316e02119d5SChris Mason struct btrfs_dir_log_item *item; 1317e02119d5SChris Mason int ret; 1318e02119d5SChris Mason int nritems; 1319e02119d5SChris Mason 1320e02119d5SChris Mason if (*start_ret == (u64)-1) 1321e02119d5SChris Mason return 1; 1322e02119d5SChris Mason 1323e02119d5SChris Mason key.objectid = dirid; 1324e02119d5SChris Mason key.type = key_type; 1325e02119d5SChris Mason key.offset = *start_ret; 1326e02119d5SChris Mason 1327e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1328e02119d5SChris Mason if (ret < 0) 1329e02119d5SChris Mason goto out; 1330e02119d5SChris Mason if (ret > 0) { 1331e02119d5SChris Mason if (path->slots[0] == 0) 1332e02119d5SChris Mason goto out; 1333e02119d5SChris Mason path->slots[0]--; 1334e02119d5SChris Mason } 1335e02119d5SChris Mason if (ret != 0) 1336e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1337e02119d5SChris Mason 1338e02119d5SChris Mason if (key.type != key_type || key.objectid != dirid) { 1339e02119d5SChris Mason ret = 1; 1340e02119d5SChris Mason goto next; 1341e02119d5SChris Mason } 1342e02119d5SChris Mason item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1343e02119d5SChris Mason struct btrfs_dir_log_item); 1344e02119d5SChris Mason found_end = btrfs_dir_log_end(path->nodes[0], item); 1345e02119d5SChris Mason 1346e02119d5SChris Mason if (*start_ret >= key.offset && *start_ret <= found_end) { 1347e02119d5SChris Mason ret = 0; 1348e02119d5SChris Mason *start_ret = key.offset; 1349e02119d5SChris Mason *end_ret = found_end; 1350e02119d5SChris Mason goto out; 1351e02119d5SChris Mason } 1352e02119d5SChris Mason ret = 1; 1353e02119d5SChris Mason next: 1354e02119d5SChris Mason /* check the next slot in the tree to see if it is a valid item */ 1355e02119d5SChris Mason nritems = btrfs_header_nritems(path->nodes[0]); 1356e02119d5SChris Mason if (path->slots[0] >= nritems) { 1357e02119d5SChris Mason ret = btrfs_next_leaf(root, path); 1358e02119d5SChris Mason if (ret) 1359e02119d5SChris Mason goto out; 1360e02119d5SChris Mason } else { 1361e02119d5SChris Mason path->slots[0]++; 1362e02119d5SChris Mason } 1363e02119d5SChris Mason 1364e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1365e02119d5SChris Mason 1366e02119d5SChris Mason if (key.type != key_type || key.objectid != dirid) { 1367e02119d5SChris Mason ret = 1; 1368e02119d5SChris Mason goto out; 1369e02119d5SChris Mason } 1370e02119d5SChris Mason item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1371e02119d5SChris Mason struct btrfs_dir_log_item); 1372e02119d5SChris Mason found_end = btrfs_dir_log_end(path->nodes[0], item); 1373e02119d5SChris Mason *start_ret = key.offset; 1374e02119d5SChris Mason *end_ret = found_end; 1375e02119d5SChris Mason ret = 0; 1376e02119d5SChris Mason out: 1377e02119d5SChris Mason btrfs_release_path(root, path); 1378e02119d5SChris Mason return ret; 1379e02119d5SChris Mason } 1380e02119d5SChris Mason 1381e02119d5SChris Mason /* 1382e02119d5SChris Mason * this looks for a given directory item in the log. If the directory 1383e02119d5SChris Mason * item is not in the log, the item is removed and the inode it points 1384e02119d5SChris Mason * to is unlinked 1385e02119d5SChris Mason */ 1386e02119d5SChris Mason static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1387e02119d5SChris Mason struct btrfs_root *root, 1388e02119d5SChris Mason struct btrfs_root *log, 1389e02119d5SChris Mason struct btrfs_path *path, 1390e02119d5SChris Mason struct btrfs_path *log_path, 1391e02119d5SChris Mason struct inode *dir, 1392e02119d5SChris Mason struct btrfs_key *dir_key) 1393e02119d5SChris Mason { 1394e02119d5SChris Mason int ret; 1395e02119d5SChris Mason struct extent_buffer *eb; 1396e02119d5SChris Mason int slot; 1397e02119d5SChris Mason u32 item_size; 1398e02119d5SChris Mason struct btrfs_dir_item *di; 1399e02119d5SChris Mason struct btrfs_dir_item *log_di; 1400e02119d5SChris Mason int name_len; 1401e02119d5SChris Mason unsigned long ptr; 1402e02119d5SChris Mason unsigned long ptr_end; 1403e02119d5SChris Mason char *name; 1404e02119d5SChris Mason struct inode *inode; 1405e02119d5SChris Mason struct btrfs_key location; 1406e02119d5SChris Mason 1407e02119d5SChris Mason again: 1408e02119d5SChris Mason eb = path->nodes[0]; 1409e02119d5SChris Mason slot = path->slots[0]; 1410e02119d5SChris Mason item_size = btrfs_item_size_nr(eb, slot); 1411e02119d5SChris Mason ptr = btrfs_item_ptr_offset(eb, slot); 1412e02119d5SChris Mason ptr_end = ptr + item_size; 1413e02119d5SChris Mason while (ptr < ptr_end) { 1414e02119d5SChris Mason di = (struct btrfs_dir_item *)ptr; 141522a94d44SJosef Bacik if (verify_dir_item(root, eb, di)) { 141622a94d44SJosef Bacik ret = -EIO; 141722a94d44SJosef Bacik goto out; 141822a94d44SJosef Bacik } 141922a94d44SJosef Bacik 1420e02119d5SChris Mason name_len = btrfs_dir_name_len(eb, di); 1421e02119d5SChris Mason name = kmalloc(name_len, GFP_NOFS); 1422e02119d5SChris Mason if (!name) { 1423e02119d5SChris Mason ret = -ENOMEM; 1424e02119d5SChris Mason goto out; 1425e02119d5SChris Mason } 1426e02119d5SChris Mason read_extent_buffer(eb, name, (unsigned long)(di + 1), 1427e02119d5SChris Mason name_len); 1428e02119d5SChris Mason log_di = NULL; 142912fcfd22SChris Mason if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1430e02119d5SChris Mason log_di = btrfs_lookup_dir_item(trans, log, log_path, 1431e02119d5SChris Mason dir_key->objectid, 1432e02119d5SChris Mason name, name_len, 0); 143312fcfd22SChris Mason } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1434e02119d5SChris Mason log_di = btrfs_lookup_dir_index_item(trans, log, 1435e02119d5SChris Mason log_path, 1436e02119d5SChris Mason dir_key->objectid, 1437e02119d5SChris Mason dir_key->offset, 1438e02119d5SChris Mason name, name_len, 0); 1439e02119d5SChris Mason } 1440e02119d5SChris Mason if (!log_di || IS_ERR(log_di)) { 1441e02119d5SChris Mason btrfs_dir_item_key_to_cpu(eb, di, &location); 1442e02119d5SChris Mason btrfs_release_path(root, path); 1443e02119d5SChris Mason btrfs_release_path(log, log_path); 1444e02119d5SChris Mason inode = read_one_inode(root, location.objectid); 1445*c00e9493STsutomu Itoh if (!inode) { 1446*c00e9493STsutomu Itoh kfree(name); 1447*c00e9493STsutomu Itoh return -EIO; 1448*c00e9493STsutomu Itoh } 1449e02119d5SChris Mason 1450e02119d5SChris Mason ret = link_to_fixup_dir(trans, root, 1451e02119d5SChris Mason path, location.objectid); 1452e02119d5SChris Mason BUG_ON(ret); 1453e02119d5SChris Mason btrfs_inc_nlink(inode); 1454e02119d5SChris Mason ret = btrfs_unlink_inode(trans, root, dir, inode, 1455e02119d5SChris Mason name, name_len); 1456e02119d5SChris Mason BUG_ON(ret); 1457e02119d5SChris Mason kfree(name); 1458e02119d5SChris Mason iput(inode); 1459e02119d5SChris Mason 1460e02119d5SChris Mason /* there might still be more names under this key 1461e02119d5SChris Mason * check and repeat if required 1462e02119d5SChris Mason */ 1463e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, dir_key, path, 1464e02119d5SChris Mason 0, 0); 1465e02119d5SChris Mason if (ret == 0) 1466e02119d5SChris Mason goto again; 1467e02119d5SChris Mason ret = 0; 1468e02119d5SChris Mason goto out; 1469e02119d5SChris Mason } 1470e02119d5SChris Mason btrfs_release_path(log, log_path); 1471e02119d5SChris Mason kfree(name); 1472e02119d5SChris Mason 1473e02119d5SChris Mason ptr = (unsigned long)(di + 1); 1474e02119d5SChris Mason ptr += name_len; 1475e02119d5SChris Mason } 1476e02119d5SChris Mason ret = 0; 1477e02119d5SChris Mason out: 1478e02119d5SChris Mason btrfs_release_path(root, path); 1479e02119d5SChris Mason btrfs_release_path(log, log_path); 1480e02119d5SChris Mason return ret; 1481e02119d5SChris Mason } 1482e02119d5SChris Mason 1483e02119d5SChris Mason /* 1484e02119d5SChris Mason * deletion replay happens before we copy any new directory items 1485e02119d5SChris Mason * out of the log or out of backreferences from inodes. It 1486e02119d5SChris Mason * scans the log to find ranges of keys that log is authoritative for, 1487e02119d5SChris Mason * and then scans the directory to find items in those ranges that are 1488e02119d5SChris Mason * not present in the log. 1489e02119d5SChris Mason * 1490e02119d5SChris Mason * Anything we don't find in the log is unlinked and removed from the 1491e02119d5SChris Mason * directory. 1492e02119d5SChris Mason */ 1493e02119d5SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1494e02119d5SChris Mason struct btrfs_root *root, 1495e02119d5SChris Mason struct btrfs_root *log, 1496e02119d5SChris Mason struct btrfs_path *path, 149712fcfd22SChris Mason u64 dirid, int del_all) 1498e02119d5SChris Mason { 1499e02119d5SChris Mason u64 range_start; 1500e02119d5SChris Mason u64 range_end; 1501e02119d5SChris Mason int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1502e02119d5SChris Mason int ret = 0; 1503e02119d5SChris Mason struct btrfs_key dir_key; 1504e02119d5SChris Mason struct btrfs_key found_key; 1505e02119d5SChris Mason struct btrfs_path *log_path; 1506e02119d5SChris Mason struct inode *dir; 1507e02119d5SChris Mason 1508e02119d5SChris Mason dir_key.objectid = dirid; 1509e02119d5SChris Mason dir_key.type = BTRFS_DIR_ITEM_KEY; 1510e02119d5SChris Mason log_path = btrfs_alloc_path(); 1511e02119d5SChris Mason if (!log_path) 1512e02119d5SChris Mason return -ENOMEM; 1513e02119d5SChris Mason 1514e02119d5SChris Mason dir = read_one_inode(root, dirid); 1515e02119d5SChris Mason /* it isn't an error if the inode isn't there, that can happen 1516e02119d5SChris Mason * because we replay the deletes before we copy in the inode item 1517e02119d5SChris Mason * from the log 1518e02119d5SChris Mason */ 1519e02119d5SChris Mason if (!dir) { 1520e02119d5SChris Mason btrfs_free_path(log_path); 1521e02119d5SChris Mason return 0; 1522e02119d5SChris Mason } 1523e02119d5SChris Mason again: 1524e02119d5SChris Mason range_start = 0; 1525e02119d5SChris Mason range_end = 0; 1526e02119d5SChris Mason while (1) { 152712fcfd22SChris Mason if (del_all) 152812fcfd22SChris Mason range_end = (u64)-1; 152912fcfd22SChris Mason else { 1530e02119d5SChris Mason ret = find_dir_range(log, path, dirid, key_type, 1531e02119d5SChris Mason &range_start, &range_end); 1532e02119d5SChris Mason if (ret != 0) 1533e02119d5SChris Mason break; 153412fcfd22SChris Mason } 1535e02119d5SChris Mason 1536e02119d5SChris Mason dir_key.offset = range_start; 1537e02119d5SChris Mason while (1) { 1538e02119d5SChris Mason int nritems; 1539e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, &dir_key, path, 1540e02119d5SChris Mason 0, 0); 1541e02119d5SChris Mason if (ret < 0) 1542e02119d5SChris Mason goto out; 1543e02119d5SChris Mason 1544e02119d5SChris Mason nritems = btrfs_header_nritems(path->nodes[0]); 1545e02119d5SChris Mason if (path->slots[0] >= nritems) { 1546e02119d5SChris Mason ret = btrfs_next_leaf(root, path); 1547e02119d5SChris Mason if (ret) 1548e02119d5SChris Mason break; 1549e02119d5SChris Mason } 1550e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1551e02119d5SChris Mason path->slots[0]); 1552e02119d5SChris Mason if (found_key.objectid != dirid || 1553e02119d5SChris Mason found_key.type != dir_key.type) 1554e02119d5SChris Mason goto next_type; 1555e02119d5SChris Mason 1556e02119d5SChris Mason if (found_key.offset > range_end) 1557e02119d5SChris Mason break; 1558e02119d5SChris Mason 1559e02119d5SChris Mason ret = check_item_in_log(trans, root, log, path, 156012fcfd22SChris Mason log_path, dir, 156112fcfd22SChris Mason &found_key); 1562e02119d5SChris Mason BUG_ON(ret); 1563e02119d5SChris Mason if (found_key.offset == (u64)-1) 1564e02119d5SChris Mason break; 1565e02119d5SChris Mason dir_key.offset = found_key.offset + 1; 1566e02119d5SChris Mason } 1567e02119d5SChris Mason btrfs_release_path(root, path); 1568e02119d5SChris Mason if (range_end == (u64)-1) 1569e02119d5SChris Mason break; 1570e02119d5SChris Mason range_start = range_end + 1; 1571e02119d5SChris Mason } 1572e02119d5SChris Mason 1573e02119d5SChris Mason next_type: 1574e02119d5SChris Mason ret = 0; 1575e02119d5SChris Mason if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1576e02119d5SChris Mason key_type = BTRFS_DIR_LOG_INDEX_KEY; 1577e02119d5SChris Mason dir_key.type = BTRFS_DIR_INDEX_KEY; 1578e02119d5SChris Mason btrfs_release_path(root, path); 1579e02119d5SChris Mason goto again; 1580e02119d5SChris Mason } 1581e02119d5SChris Mason out: 1582e02119d5SChris Mason btrfs_release_path(root, path); 1583e02119d5SChris Mason btrfs_free_path(log_path); 1584e02119d5SChris Mason iput(dir); 1585e02119d5SChris Mason return ret; 1586e02119d5SChris Mason } 1587e02119d5SChris Mason 1588e02119d5SChris Mason /* 1589e02119d5SChris Mason * the process_func used to replay items from the log tree. This 1590e02119d5SChris Mason * gets called in two different stages. The first stage just looks 1591e02119d5SChris Mason * for inodes and makes sure they are all copied into the subvolume. 1592e02119d5SChris Mason * 1593e02119d5SChris Mason * The second stage copies all the other item types from the log into 1594e02119d5SChris Mason * the subvolume. The two stage approach is slower, but gets rid of 1595e02119d5SChris Mason * lots of complexity around inodes referencing other inodes that exist 1596e02119d5SChris Mason * only in the log (references come from either directory items or inode 1597e02119d5SChris Mason * back refs). 1598e02119d5SChris Mason */ 1599e02119d5SChris Mason static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 1600e02119d5SChris Mason struct walk_control *wc, u64 gen) 1601e02119d5SChris Mason { 1602e02119d5SChris Mason int nritems; 1603e02119d5SChris Mason struct btrfs_path *path; 1604e02119d5SChris Mason struct btrfs_root *root = wc->replay_dest; 1605e02119d5SChris Mason struct btrfs_key key; 1606e02119d5SChris Mason int level; 1607e02119d5SChris Mason int i; 1608e02119d5SChris Mason int ret; 1609e02119d5SChris Mason 1610e02119d5SChris Mason btrfs_read_buffer(eb, gen); 1611e02119d5SChris Mason 1612e02119d5SChris Mason level = btrfs_header_level(eb); 1613e02119d5SChris Mason 1614e02119d5SChris Mason if (level != 0) 1615e02119d5SChris Mason return 0; 1616e02119d5SChris Mason 1617e02119d5SChris Mason path = btrfs_alloc_path(); 1618e02119d5SChris Mason BUG_ON(!path); 1619e02119d5SChris Mason 1620e02119d5SChris Mason nritems = btrfs_header_nritems(eb); 1621e02119d5SChris Mason for (i = 0; i < nritems; i++) { 1622e02119d5SChris Mason btrfs_item_key_to_cpu(eb, &key, i); 1623e02119d5SChris Mason 1624e02119d5SChris Mason /* inode keys are done during the first stage */ 1625e02119d5SChris Mason if (key.type == BTRFS_INODE_ITEM_KEY && 1626e02119d5SChris Mason wc->stage == LOG_WALK_REPLAY_INODES) { 1627e02119d5SChris Mason struct btrfs_inode_item *inode_item; 1628e02119d5SChris Mason u32 mode; 1629e02119d5SChris Mason 1630e02119d5SChris Mason inode_item = btrfs_item_ptr(eb, i, 1631e02119d5SChris Mason struct btrfs_inode_item); 1632e02119d5SChris Mason mode = btrfs_inode_mode(eb, inode_item); 1633e02119d5SChris Mason if (S_ISDIR(mode)) { 1634e02119d5SChris Mason ret = replay_dir_deletes(wc->trans, 163512fcfd22SChris Mason root, log, path, key.objectid, 0); 1636e02119d5SChris Mason BUG_ON(ret); 1637e02119d5SChris Mason } 1638e02119d5SChris Mason ret = overwrite_item(wc->trans, root, path, 1639e02119d5SChris Mason eb, i, &key); 1640e02119d5SChris Mason BUG_ON(ret); 1641e02119d5SChris Mason 1642c71bf099SYan, Zheng /* for regular files, make sure corresponding 1643c71bf099SYan, Zheng * orhpan item exist. extents past the new EOF 1644c71bf099SYan, Zheng * will be truncated later by orphan cleanup. 1645e02119d5SChris Mason */ 1646e02119d5SChris Mason if (S_ISREG(mode)) { 1647c71bf099SYan, Zheng ret = insert_orphan_item(wc->trans, root, 1648e02119d5SChris Mason key.objectid); 1649e02119d5SChris Mason BUG_ON(ret); 1650c71bf099SYan, Zheng } 1651a74ac322SChris Mason 1652e02119d5SChris Mason ret = link_to_fixup_dir(wc->trans, root, 1653e02119d5SChris Mason path, key.objectid); 1654e02119d5SChris Mason BUG_ON(ret); 1655e02119d5SChris Mason } 1656e02119d5SChris Mason if (wc->stage < LOG_WALK_REPLAY_ALL) 1657e02119d5SChris Mason continue; 1658e02119d5SChris Mason 1659e02119d5SChris Mason /* these keys are simply copied */ 1660e02119d5SChris Mason if (key.type == BTRFS_XATTR_ITEM_KEY) { 1661e02119d5SChris Mason ret = overwrite_item(wc->trans, root, path, 1662e02119d5SChris Mason eb, i, &key); 1663e02119d5SChris Mason BUG_ON(ret); 1664e02119d5SChris Mason } else if (key.type == BTRFS_INODE_REF_KEY) { 1665e02119d5SChris Mason ret = add_inode_ref(wc->trans, root, log, path, 1666e02119d5SChris Mason eb, i, &key); 1667e02119d5SChris Mason BUG_ON(ret && ret != -ENOENT); 1668e02119d5SChris Mason } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1669e02119d5SChris Mason ret = replay_one_extent(wc->trans, root, path, 1670e02119d5SChris Mason eb, i, &key); 1671e02119d5SChris Mason BUG_ON(ret); 1672e02119d5SChris Mason } else if (key.type == BTRFS_DIR_ITEM_KEY || 1673e02119d5SChris Mason key.type == BTRFS_DIR_INDEX_KEY) { 1674e02119d5SChris Mason ret = replay_one_dir_item(wc->trans, root, path, 1675e02119d5SChris Mason eb, i, &key); 1676e02119d5SChris Mason BUG_ON(ret); 1677e02119d5SChris Mason } 1678e02119d5SChris Mason } 1679e02119d5SChris Mason btrfs_free_path(path); 1680e02119d5SChris Mason return 0; 1681e02119d5SChris Mason } 1682e02119d5SChris Mason 1683d397712bSChris Mason static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 1684e02119d5SChris Mason struct btrfs_root *root, 1685e02119d5SChris Mason struct btrfs_path *path, int *level, 1686e02119d5SChris Mason struct walk_control *wc) 1687e02119d5SChris Mason { 1688e02119d5SChris Mason u64 root_owner; 1689e02119d5SChris Mason u64 bytenr; 1690e02119d5SChris Mason u64 ptr_gen; 1691e02119d5SChris Mason struct extent_buffer *next; 1692e02119d5SChris Mason struct extent_buffer *cur; 1693e02119d5SChris Mason struct extent_buffer *parent; 1694e02119d5SChris Mason u32 blocksize; 1695e02119d5SChris Mason int ret = 0; 1696e02119d5SChris Mason 1697e02119d5SChris Mason WARN_ON(*level < 0); 1698e02119d5SChris Mason WARN_ON(*level >= BTRFS_MAX_LEVEL); 1699e02119d5SChris Mason 1700e02119d5SChris Mason while (*level > 0) { 1701e02119d5SChris Mason WARN_ON(*level < 0); 1702e02119d5SChris Mason WARN_ON(*level >= BTRFS_MAX_LEVEL); 1703e02119d5SChris Mason cur = path->nodes[*level]; 1704e02119d5SChris Mason 1705e02119d5SChris Mason if (btrfs_header_level(cur) != *level) 1706e02119d5SChris Mason WARN_ON(1); 1707e02119d5SChris Mason 1708e02119d5SChris Mason if (path->slots[*level] >= 1709e02119d5SChris Mason btrfs_header_nritems(cur)) 1710e02119d5SChris Mason break; 1711e02119d5SChris Mason 1712e02119d5SChris Mason bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 1713e02119d5SChris Mason ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 1714e02119d5SChris Mason blocksize = btrfs_level_size(root, *level - 1); 1715e02119d5SChris Mason 1716e02119d5SChris Mason parent = path->nodes[*level]; 1717e02119d5SChris Mason root_owner = btrfs_header_owner(parent); 1718e02119d5SChris Mason 1719e02119d5SChris Mason next = btrfs_find_create_tree_block(root, bytenr, blocksize); 17202a29edc6Sliubo if (!next) 17212a29edc6Sliubo return -ENOMEM; 1722e02119d5SChris Mason 17234a500fd1SYan, Zheng if (*level == 1) { 1724e02119d5SChris Mason wc->process_func(root, next, wc, ptr_gen); 1725e02119d5SChris Mason 1726e02119d5SChris Mason path->slots[*level]++; 1727e02119d5SChris Mason if (wc->free) { 1728e02119d5SChris Mason btrfs_read_buffer(next, ptr_gen); 1729e02119d5SChris Mason 1730e02119d5SChris Mason btrfs_tree_lock(next); 1731e02119d5SChris Mason clean_tree_block(trans, root, next); 1732b4ce94deSChris Mason btrfs_set_lock_blocking(next); 1733e02119d5SChris Mason btrfs_wait_tree_block_writeback(next); 1734e02119d5SChris Mason btrfs_tree_unlock(next); 1735e02119d5SChris Mason 1736e02119d5SChris Mason WARN_ON(root_owner != 1737e02119d5SChris Mason BTRFS_TREE_LOG_OBJECTID); 1738d00aff00SChris Mason ret = btrfs_free_reserved_extent(root, 1739d00aff00SChris Mason bytenr, blocksize); 1740e02119d5SChris Mason BUG_ON(ret); 1741e02119d5SChris Mason } 1742e02119d5SChris Mason free_extent_buffer(next); 1743e02119d5SChris Mason continue; 1744e02119d5SChris Mason } 1745e02119d5SChris Mason btrfs_read_buffer(next, ptr_gen); 1746e02119d5SChris Mason 1747e02119d5SChris Mason WARN_ON(*level <= 0); 1748e02119d5SChris Mason if (path->nodes[*level-1]) 1749e02119d5SChris Mason free_extent_buffer(path->nodes[*level-1]); 1750e02119d5SChris Mason path->nodes[*level-1] = next; 1751e02119d5SChris Mason *level = btrfs_header_level(next); 1752e02119d5SChris Mason path->slots[*level] = 0; 1753e02119d5SChris Mason cond_resched(); 1754e02119d5SChris Mason } 1755e02119d5SChris Mason WARN_ON(*level < 0); 1756e02119d5SChris Mason WARN_ON(*level >= BTRFS_MAX_LEVEL); 1757e02119d5SChris Mason 17584a500fd1SYan, Zheng path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 1759e02119d5SChris Mason 1760e02119d5SChris Mason cond_resched(); 1761e02119d5SChris Mason return 0; 1762e02119d5SChris Mason } 1763e02119d5SChris Mason 1764d397712bSChris Mason static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 1765e02119d5SChris Mason struct btrfs_root *root, 1766e02119d5SChris Mason struct btrfs_path *path, int *level, 1767e02119d5SChris Mason struct walk_control *wc) 1768e02119d5SChris Mason { 1769e02119d5SChris Mason u64 root_owner; 1770e02119d5SChris Mason int i; 1771e02119d5SChris Mason int slot; 1772e02119d5SChris Mason int ret; 1773e02119d5SChris Mason 1774e02119d5SChris Mason for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1775e02119d5SChris Mason slot = path->slots[i]; 17764a500fd1SYan, Zheng if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 1777e02119d5SChris Mason path->slots[i]++; 1778e02119d5SChris Mason *level = i; 1779e02119d5SChris Mason WARN_ON(*level == 0); 1780e02119d5SChris Mason return 0; 1781e02119d5SChris Mason } else { 178231840ae1SZheng Yan struct extent_buffer *parent; 178331840ae1SZheng Yan if (path->nodes[*level] == root->node) 178431840ae1SZheng Yan parent = path->nodes[*level]; 178531840ae1SZheng Yan else 178631840ae1SZheng Yan parent = path->nodes[*level + 1]; 178731840ae1SZheng Yan 178831840ae1SZheng Yan root_owner = btrfs_header_owner(parent); 1789e02119d5SChris Mason wc->process_func(root, path->nodes[*level], wc, 1790e02119d5SChris Mason btrfs_header_generation(path->nodes[*level])); 1791e02119d5SChris Mason if (wc->free) { 1792e02119d5SChris Mason struct extent_buffer *next; 1793e02119d5SChris Mason 1794e02119d5SChris Mason next = path->nodes[*level]; 1795e02119d5SChris Mason 1796e02119d5SChris Mason btrfs_tree_lock(next); 1797e02119d5SChris Mason clean_tree_block(trans, root, next); 1798b4ce94deSChris Mason btrfs_set_lock_blocking(next); 1799e02119d5SChris Mason btrfs_wait_tree_block_writeback(next); 1800e02119d5SChris Mason btrfs_tree_unlock(next); 1801e02119d5SChris Mason 1802e02119d5SChris Mason WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1803d00aff00SChris Mason ret = btrfs_free_reserved_extent(root, 1804e02119d5SChris Mason path->nodes[*level]->start, 1805d00aff00SChris Mason path->nodes[*level]->len); 1806e02119d5SChris Mason BUG_ON(ret); 1807e02119d5SChris Mason } 1808e02119d5SChris Mason free_extent_buffer(path->nodes[*level]); 1809e02119d5SChris Mason path->nodes[*level] = NULL; 1810e02119d5SChris Mason *level = i + 1; 1811e02119d5SChris Mason } 1812e02119d5SChris Mason } 1813e02119d5SChris Mason return 1; 1814e02119d5SChris Mason } 1815e02119d5SChris Mason 1816e02119d5SChris Mason /* 1817e02119d5SChris Mason * drop the reference count on the tree rooted at 'snap'. This traverses 1818e02119d5SChris Mason * the tree freeing any blocks that have a ref count of zero after being 1819e02119d5SChris Mason * decremented. 1820e02119d5SChris Mason */ 1821e02119d5SChris Mason static int walk_log_tree(struct btrfs_trans_handle *trans, 1822e02119d5SChris Mason struct btrfs_root *log, struct walk_control *wc) 1823e02119d5SChris Mason { 1824e02119d5SChris Mason int ret = 0; 1825e02119d5SChris Mason int wret; 1826e02119d5SChris Mason int level; 1827e02119d5SChris Mason struct btrfs_path *path; 1828e02119d5SChris Mason int i; 1829e02119d5SChris Mason int orig_level; 1830e02119d5SChris Mason 1831e02119d5SChris Mason path = btrfs_alloc_path(); 1832db5b493aSTsutomu Itoh if (!path) 1833db5b493aSTsutomu Itoh return -ENOMEM; 1834e02119d5SChris Mason 1835e02119d5SChris Mason level = btrfs_header_level(log->node); 1836e02119d5SChris Mason orig_level = level; 1837e02119d5SChris Mason path->nodes[level] = log->node; 1838e02119d5SChris Mason extent_buffer_get(log->node); 1839e02119d5SChris Mason path->slots[level] = 0; 1840e02119d5SChris Mason 1841e02119d5SChris Mason while (1) { 1842e02119d5SChris Mason wret = walk_down_log_tree(trans, log, path, &level, wc); 1843e02119d5SChris Mason if (wret > 0) 1844e02119d5SChris Mason break; 1845e02119d5SChris Mason if (wret < 0) 1846e02119d5SChris Mason ret = wret; 1847e02119d5SChris Mason 1848e02119d5SChris Mason wret = walk_up_log_tree(trans, log, path, &level, wc); 1849e02119d5SChris Mason if (wret > 0) 1850e02119d5SChris Mason break; 1851e02119d5SChris Mason if (wret < 0) 1852e02119d5SChris Mason ret = wret; 1853e02119d5SChris Mason } 1854e02119d5SChris Mason 1855e02119d5SChris Mason /* was the root node processed? if not, catch it here */ 1856e02119d5SChris Mason if (path->nodes[orig_level]) { 1857e02119d5SChris Mason wc->process_func(log, path->nodes[orig_level], wc, 1858e02119d5SChris Mason btrfs_header_generation(path->nodes[orig_level])); 1859e02119d5SChris Mason if (wc->free) { 1860e02119d5SChris Mason struct extent_buffer *next; 1861e02119d5SChris Mason 1862e02119d5SChris Mason next = path->nodes[orig_level]; 1863e02119d5SChris Mason 1864e02119d5SChris Mason btrfs_tree_lock(next); 1865e02119d5SChris Mason clean_tree_block(trans, log, next); 1866b4ce94deSChris Mason btrfs_set_lock_blocking(next); 1867e02119d5SChris Mason btrfs_wait_tree_block_writeback(next); 1868e02119d5SChris Mason btrfs_tree_unlock(next); 1869e02119d5SChris Mason 1870e02119d5SChris Mason WARN_ON(log->root_key.objectid != 1871e02119d5SChris Mason BTRFS_TREE_LOG_OBJECTID); 1872d00aff00SChris Mason ret = btrfs_free_reserved_extent(log, next->start, 1873d00aff00SChris Mason next->len); 1874e02119d5SChris Mason BUG_ON(ret); 1875e02119d5SChris Mason } 1876e02119d5SChris Mason } 1877e02119d5SChris Mason 1878e02119d5SChris Mason for (i = 0; i <= orig_level; i++) { 1879e02119d5SChris Mason if (path->nodes[i]) { 1880e02119d5SChris Mason free_extent_buffer(path->nodes[i]); 1881e02119d5SChris Mason path->nodes[i] = NULL; 1882e02119d5SChris Mason } 1883e02119d5SChris Mason } 1884e02119d5SChris Mason btrfs_free_path(path); 1885e02119d5SChris Mason return ret; 1886e02119d5SChris Mason } 1887e02119d5SChris Mason 18887237f183SYan Zheng /* 18897237f183SYan Zheng * helper function to update the item for a given subvolumes log root 18907237f183SYan Zheng * in the tree of log roots 18917237f183SYan Zheng */ 18927237f183SYan Zheng static int update_log_root(struct btrfs_trans_handle *trans, 18937237f183SYan Zheng struct btrfs_root *log) 18947237f183SYan Zheng { 18957237f183SYan Zheng int ret; 18967237f183SYan Zheng 18977237f183SYan Zheng if (log->log_transid == 1) { 18987237f183SYan Zheng /* insert root item on the first sync */ 18997237f183SYan Zheng ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 19007237f183SYan Zheng &log->root_key, &log->root_item); 19017237f183SYan Zheng } else { 19027237f183SYan Zheng ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 19037237f183SYan Zheng &log->root_key, &log->root_item); 19047237f183SYan Zheng } 19057237f183SYan Zheng return ret; 19067237f183SYan Zheng } 19077237f183SYan Zheng 190812fcfd22SChris Mason static int wait_log_commit(struct btrfs_trans_handle *trans, 190912fcfd22SChris Mason struct btrfs_root *root, unsigned long transid) 1910e02119d5SChris Mason { 1911e02119d5SChris Mason DEFINE_WAIT(wait); 19127237f183SYan Zheng int index = transid % 2; 1913e02119d5SChris Mason 19147237f183SYan Zheng /* 19157237f183SYan Zheng * we only allow two pending log transactions at a time, 19167237f183SYan Zheng * so we know that if ours is more than 2 older than the 19177237f183SYan Zheng * current transaction, we're done 19187237f183SYan Zheng */ 1919e02119d5SChris Mason do { 19207237f183SYan Zheng prepare_to_wait(&root->log_commit_wait[index], 19217237f183SYan Zheng &wait, TASK_UNINTERRUPTIBLE); 19227237f183SYan Zheng mutex_unlock(&root->log_mutex); 192312fcfd22SChris Mason 192412fcfd22SChris Mason if (root->fs_info->last_trans_log_full_commit != 192512fcfd22SChris Mason trans->transid && root->log_transid < transid + 2 && 19267237f183SYan Zheng atomic_read(&root->log_commit[index])) 1927e02119d5SChris Mason schedule(); 192812fcfd22SChris Mason 19297237f183SYan Zheng finish_wait(&root->log_commit_wait[index], &wait); 19307237f183SYan Zheng mutex_lock(&root->log_mutex); 19317237f183SYan Zheng } while (root->log_transid < transid + 2 && 19327237f183SYan Zheng atomic_read(&root->log_commit[index])); 19337237f183SYan Zheng return 0; 19347237f183SYan Zheng } 19357237f183SYan Zheng 193612fcfd22SChris Mason static int wait_for_writer(struct btrfs_trans_handle *trans, 193712fcfd22SChris Mason struct btrfs_root *root) 19387237f183SYan Zheng { 19397237f183SYan Zheng DEFINE_WAIT(wait); 19407237f183SYan Zheng while (atomic_read(&root->log_writers)) { 19417237f183SYan Zheng prepare_to_wait(&root->log_writer_wait, 19427237f183SYan Zheng &wait, TASK_UNINTERRUPTIBLE); 19437237f183SYan Zheng mutex_unlock(&root->log_mutex); 194412fcfd22SChris Mason if (root->fs_info->last_trans_log_full_commit != 194512fcfd22SChris Mason trans->transid && atomic_read(&root->log_writers)) 19467237f183SYan Zheng schedule(); 19477237f183SYan Zheng mutex_lock(&root->log_mutex); 19487237f183SYan Zheng finish_wait(&root->log_writer_wait, &wait); 19497237f183SYan Zheng } 1950e02119d5SChris Mason return 0; 1951e02119d5SChris Mason } 1952e02119d5SChris Mason 1953e02119d5SChris Mason /* 1954e02119d5SChris Mason * btrfs_sync_log does sends a given tree log down to the disk and 1955e02119d5SChris Mason * updates the super blocks to record it. When this call is done, 195612fcfd22SChris Mason * you know that any inodes previously logged are safely on disk only 195712fcfd22SChris Mason * if it returns 0. 195812fcfd22SChris Mason * 195912fcfd22SChris Mason * Any other return value means you need to call btrfs_commit_transaction. 196012fcfd22SChris Mason * Some of the edge cases for fsyncing directories that have had unlinks 196112fcfd22SChris Mason * or renames done in the past mean that sometimes the only safe 196212fcfd22SChris Mason * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 196312fcfd22SChris Mason * that has happened. 1964e02119d5SChris Mason */ 1965e02119d5SChris Mason int btrfs_sync_log(struct btrfs_trans_handle *trans, 1966e02119d5SChris Mason struct btrfs_root *root) 1967e02119d5SChris Mason { 19687237f183SYan Zheng int index1; 19697237f183SYan Zheng int index2; 19708cef4e16SYan, Zheng int mark; 1971e02119d5SChris Mason int ret; 1972e02119d5SChris Mason struct btrfs_root *log = root->log_root; 19737237f183SYan Zheng struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 19748cef4e16SYan, Zheng unsigned long log_transid = 0; 1975e02119d5SChris Mason 19767237f183SYan Zheng mutex_lock(&root->log_mutex); 19777237f183SYan Zheng index1 = root->log_transid % 2; 19787237f183SYan Zheng if (atomic_read(&root->log_commit[index1])) { 197912fcfd22SChris Mason wait_log_commit(trans, root, root->log_transid); 19807237f183SYan Zheng mutex_unlock(&root->log_mutex); 19817237f183SYan Zheng return 0; 1982e02119d5SChris Mason } 19837237f183SYan Zheng atomic_set(&root->log_commit[index1], 1); 19847237f183SYan Zheng 19857237f183SYan Zheng /* wait for previous tree log sync to complete */ 19867237f183SYan Zheng if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 198712fcfd22SChris Mason wait_log_commit(trans, root, root->log_transid - 1); 1988e02119d5SChris Mason 198986df7eb9SYan, Zheng while (1) { 19907237f183SYan Zheng unsigned long batch = root->log_batch; 199186df7eb9SYan, Zheng if (root->log_multiple_pids) { 19927237f183SYan Zheng mutex_unlock(&root->log_mutex); 1993e02119d5SChris Mason schedule_timeout_uninterruptible(1); 19947237f183SYan Zheng mutex_lock(&root->log_mutex); 199586df7eb9SYan, Zheng } 199612fcfd22SChris Mason wait_for_writer(trans, root); 19977237f183SYan Zheng if (batch == root->log_batch) 1998e02119d5SChris Mason break; 1999e02119d5SChris Mason } 2000d0c803c4SChris Mason 200112fcfd22SChris Mason /* bail out if we need to do a full commit */ 200212fcfd22SChris Mason if (root->fs_info->last_trans_log_full_commit == trans->transid) { 200312fcfd22SChris Mason ret = -EAGAIN; 200412fcfd22SChris Mason mutex_unlock(&root->log_mutex); 200512fcfd22SChris Mason goto out; 200612fcfd22SChris Mason } 200712fcfd22SChris Mason 20088cef4e16SYan, Zheng log_transid = root->log_transid; 20098cef4e16SYan, Zheng if (log_transid % 2 == 0) 20108cef4e16SYan, Zheng mark = EXTENT_DIRTY; 20118cef4e16SYan, Zheng else 20128cef4e16SYan, Zheng mark = EXTENT_NEW; 20138cef4e16SYan, Zheng 2014690587d1SChris Mason /* we start IO on all the marked extents here, but we don't actually 2015690587d1SChris Mason * wait for them until later. 2016690587d1SChris Mason */ 20178cef4e16SYan, Zheng ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2018e02119d5SChris Mason BUG_ON(ret); 20197237f183SYan Zheng 20205d4f98a2SYan Zheng btrfs_set_root_node(&log->root_item, log->node); 20217237f183SYan Zheng 20227237f183SYan Zheng root->log_batch = 0; 20237237f183SYan Zheng root->log_transid++; 20247237f183SYan Zheng log->log_transid = root->log_transid; 2025ff782e0aSJosef Bacik root->log_start_pid = 0; 20267237f183SYan Zheng smp_mb(); 20277237f183SYan Zheng /* 20288cef4e16SYan, Zheng * IO has been started, blocks of the log tree have WRITTEN flag set 20298cef4e16SYan, Zheng * in their headers. new modifications of the log will be written to 20308cef4e16SYan, Zheng * new positions. so it's safe to allow log writers to go in. 20317237f183SYan Zheng */ 20327237f183SYan Zheng mutex_unlock(&root->log_mutex); 20337237f183SYan Zheng 20347237f183SYan Zheng mutex_lock(&log_root_tree->log_mutex); 20357237f183SYan Zheng log_root_tree->log_batch++; 20367237f183SYan Zheng atomic_inc(&log_root_tree->log_writers); 20377237f183SYan Zheng mutex_unlock(&log_root_tree->log_mutex); 20387237f183SYan Zheng 20397237f183SYan Zheng ret = update_log_root(trans, log); 20407237f183SYan Zheng 20417237f183SYan Zheng mutex_lock(&log_root_tree->log_mutex); 20427237f183SYan Zheng if (atomic_dec_and_test(&log_root_tree->log_writers)) { 20437237f183SYan Zheng smp_mb(); 20447237f183SYan Zheng if (waitqueue_active(&log_root_tree->log_writer_wait)) 20457237f183SYan Zheng wake_up(&log_root_tree->log_writer_wait); 20467237f183SYan Zheng } 20477237f183SYan Zheng 20484a500fd1SYan, Zheng if (ret) { 20494a500fd1SYan, Zheng BUG_ON(ret != -ENOSPC); 20504a500fd1SYan, Zheng root->fs_info->last_trans_log_full_commit = trans->transid; 20514a500fd1SYan, Zheng btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 20524a500fd1SYan, Zheng mutex_unlock(&log_root_tree->log_mutex); 20534a500fd1SYan, Zheng ret = -EAGAIN; 20544a500fd1SYan, Zheng goto out; 20554a500fd1SYan, Zheng } 20564a500fd1SYan, Zheng 20577237f183SYan Zheng index2 = log_root_tree->log_transid % 2; 20587237f183SYan Zheng if (atomic_read(&log_root_tree->log_commit[index2])) { 20598cef4e16SYan, Zheng btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 206012fcfd22SChris Mason wait_log_commit(trans, log_root_tree, 206112fcfd22SChris Mason log_root_tree->log_transid); 20627237f183SYan Zheng mutex_unlock(&log_root_tree->log_mutex); 2063b31eabd8SChris Mason ret = 0; 20647237f183SYan Zheng goto out; 20657237f183SYan Zheng } 20667237f183SYan Zheng atomic_set(&log_root_tree->log_commit[index2], 1); 20677237f183SYan Zheng 206812fcfd22SChris Mason if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 206912fcfd22SChris Mason wait_log_commit(trans, log_root_tree, 207012fcfd22SChris Mason log_root_tree->log_transid - 1); 207112fcfd22SChris Mason } 20727237f183SYan Zheng 207312fcfd22SChris Mason wait_for_writer(trans, log_root_tree); 207412fcfd22SChris Mason 207512fcfd22SChris Mason /* 207612fcfd22SChris Mason * now that we've moved on to the tree of log tree roots, 207712fcfd22SChris Mason * check the full commit flag again 207812fcfd22SChris Mason */ 207912fcfd22SChris Mason if (root->fs_info->last_trans_log_full_commit == trans->transid) { 20808cef4e16SYan, Zheng btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 208112fcfd22SChris Mason mutex_unlock(&log_root_tree->log_mutex); 208212fcfd22SChris Mason ret = -EAGAIN; 208312fcfd22SChris Mason goto out_wake_log_root; 208412fcfd22SChris Mason } 20857237f183SYan Zheng 20867237f183SYan Zheng ret = btrfs_write_and_wait_marked_extents(log_root_tree, 20878cef4e16SYan, Zheng &log_root_tree->dirty_log_pages, 20888cef4e16SYan, Zheng EXTENT_DIRTY | EXTENT_NEW); 2089e02119d5SChris Mason BUG_ON(ret); 20908cef4e16SYan, Zheng btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2091e02119d5SChris Mason 2092e02119d5SChris Mason btrfs_set_super_log_root(&root->fs_info->super_for_commit, 20937237f183SYan Zheng log_root_tree->node->start); 2094e02119d5SChris Mason btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 20957237f183SYan Zheng btrfs_header_level(log_root_tree->node)); 2096e02119d5SChris Mason 20977237f183SYan Zheng log_root_tree->log_batch = 0; 20987237f183SYan Zheng log_root_tree->log_transid++; 2099e02119d5SChris Mason smp_mb(); 21007237f183SYan Zheng 21017237f183SYan Zheng mutex_unlock(&log_root_tree->log_mutex); 21027237f183SYan Zheng 21037237f183SYan Zheng /* 21047237f183SYan Zheng * nobody else is going to jump in and write the the ctree 21057237f183SYan Zheng * super here because the log_commit atomic below is protecting 21067237f183SYan Zheng * us. We must be called with a transaction handle pinning 21077237f183SYan Zheng * the running transaction open, so a full commit can't hop 21087237f183SYan Zheng * in and cause problems either. 21097237f183SYan Zheng */ 21104722607dSChris Mason write_ctree_super(trans, root->fs_info->tree_root, 1); 211112fcfd22SChris Mason ret = 0; 21127237f183SYan Zheng 2113257c62e1SChris Mason mutex_lock(&root->log_mutex); 2114257c62e1SChris Mason if (root->last_log_commit < log_transid) 2115257c62e1SChris Mason root->last_log_commit = log_transid; 2116257c62e1SChris Mason mutex_unlock(&root->log_mutex); 2117257c62e1SChris Mason 211812fcfd22SChris Mason out_wake_log_root: 21197237f183SYan Zheng atomic_set(&log_root_tree->log_commit[index2], 0); 21207237f183SYan Zheng smp_mb(); 21217237f183SYan Zheng if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 21227237f183SYan Zheng wake_up(&log_root_tree->log_commit_wait[index2]); 2123e02119d5SChris Mason out: 21247237f183SYan Zheng atomic_set(&root->log_commit[index1], 0); 21257237f183SYan Zheng smp_mb(); 21267237f183SYan Zheng if (waitqueue_active(&root->log_commit_wait[index1])) 21277237f183SYan Zheng wake_up(&root->log_commit_wait[index1]); 2128b31eabd8SChris Mason return ret; 2129e02119d5SChris Mason } 2130e02119d5SChris Mason 21314a500fd1SYan, Zheng static void free_log_tree(struct btrfs_trans_handle *trans, 21324a500fd1SYan, Zheng struct btrfs_root *log) 2133e02119d5SChris Mason { 2134e02119d5SChris Mason int ret; 2135d0c803c4SChris Mason u64 start; 2136d0c803c4SChris Mason u64 end; 2137e02119d5SChris Mason struct walk_control wc = { 2138e02119d5SChris Mason .free = 1, 2139e02119d5SChris Mason .process_func = process_one_buffer 2140e02119d5SChris Mason }; 2141e02119d5SChris Mason 2142e02119d5SChris Mason ret = walk_log_tree(trans, log, &wc); 2143e02119d5SChris Mason BUG_ON(ret); 2144e02119d5SChris Mason 2145d0c803c4SChris Mason while (1) { 2146d0c803c4SChris Mason ret = find_first_extent_bit(&log->dirty_log_pages, 21478cef4e16SYan, Zheng 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); 2148d0c803c4SChris Mason if (ret) 2149d0c803c4SChris Mason break; 2150d0c803c4SChris Mason 21518cef4e16SYan, Zheng clear_extent_bits(&log->dirty_log_pages, start, end, 21528cef4e16SYan, Zheng EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2153d0c803c4SChris Mason } 2154d0c803c4SChris Mason 21557237f183SYan Zheng free_extent_buffer(log->node); 21567237f183SYan Zheng kfree(log); 21574a500fd1SYan, Zheng } 21584a500fd1SYan, Zheng 21594a500fd1SYan, Zheng /* 21604a500fd1SYan, Zheng * free all the extents used by the tree log. This should be called 21614a500fd1SYan, Zheng * at commit time of the full transaction 21624a500fd1SYan, Zheng */ 21634a500fd1SYan, Zheng int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 21644a500fd1SYan, Zheng { 21654a500fd1SYan, Zheng if (root->log_root) { 21664a500fd1SYan, Zheng free_log_tree(trans, root->log_root); 21674a500fd1SYan, Zheng root->log_root = NULL; 21684a500fd1SYan, Zheng } 21694a500fd1SYan, Zheng return 0; 21704a500fd1SYan, Zheng } 21714a500fd1SYan, Zheng 21724a500fd1SYan, Zheng int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 21734a500fd1SYan, Zheng struct btrfs_fs_info *fs_info) 21744a500fd1SYan, Zheng { 21754a500fd1SYan, Zheng if (fs_info->log_root_tree) { 21764a500fd1SYan, Zheng free_log_tree(trans, fs_info->log_root_tree); 21774a500fd1SYan, Zheng fs_info->log_root_tree = NULL; 21784a500fd1SYan, Zheng } 2179e02119d5SChris Mason return 0; 2180e02119d5SChris Mason } 2181e02119d5SChris Mason 2182e02119d5SChris Mason /* 2183e02119d5SChris Mason * If both a file and directory are logged, and unlinks or renames are 2184e02119d5SChris Mason * mixed in, we have a few interesting corners: 2185e02119d5SChris Mason * 2186e02119d5SChris Mason * create file X in dir Y 2187e02119d5SChris Mason * link file X to X.link in dir Y 2188e02119d5SChris Mason * fsync file X 2189e02119d5SChris Mason * unlink file X but leave X.link 2190e02119d5SChris Mason * fsync dir Y 2191e02119d5SChris Mason * 2192e02119d5SChris Mason * After a crash we would expect only X.link to exist. But file X 2193e02119d5SChris Mason * didn't get fsync'd again so the log has back refs for X and X.link. 2194e02119d5SChris Mason * 2195e02119d5SChris Mason * We solve this by removing directory entries and inode backrefs from the 2196e02119d5SChris Mason * log when a file that was logged in the current transaction is 2197e02119d5SChris Mason * unlinked. Any later fsync will include the updated log entries, and 2198e02119d5SChris Mason * we'll be able to reconstruct the proper directory items from backrefs. 2199e02119d5SChris Mason * 2200e02119d5SChris Mason * This optimizations allows us to avoid relogging the entire inode 2201e02119d5SChris Mason * or the entire directory. 2202e02119d5SChris Mason */ 2203e02119d5SChris Mason int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2204e02119d5SChris Mason struct btrfs_root *root, 2205e02119d5SChris Mason const char *name, int name_len, 2206e02119d5SChris Mason struct inode *dir, u64 index) 2207e02119d5SChris Mason { 2208e02119d5SChris Mason struct btrfs_root *log; 2209e02119d5SChris Mason struct btrfs_dir_item *di; 2210e02119d5SChris Mason struct btrfs_path *path; 2211e02119d5SChris Mason int ret; 22124a500fd1SYan, Zheng int err = 0; 2213e02119d5SChris Mason int bytes_del = 0; 2214e02119d5SChris Mason 22153a5f1d45SChris Mason if (BTRFS_I(dir)->logged_trans < trans->transid) 22163a5f1d45SChris Mason return 0; 22173a5f1d45SChris Mason 2218e02119d5SChris Mason ret = join_running_log_trans(root); 2219e02119d5SChris Mason if (ret) 2220e02119d5SChris Mason return 0; 2221e02119d5SChris Mason 2222e02119d5SChris Mason mutex_lock(&BTRFS_I(dir)->log_mutex); 2223e02119d5SChris Mason 2224e02119d5SChris Mason log = root->log_root; 2225e02119d5SChris Mason path = btrfs_alloc_path(); 2226a62f44a5STsutomu Itoh if (!path) { 2227a62f44a5STsutomu Itoh err = -ENOMEM; 2228a62f44a5STsutomu Itoh goto out_unlock; 2229a62f44a5STsutomu Itoh } 22302a29edc6Sliubo 2231e02119d5SChris Mason di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2232e02119d5SChris Mason name, name_len, -1); 22334a500fd1SYan, Zheng if (IS_ERR(di)) { 22344a500fd1SYan, Zheng err = PTR_ERR(di); 22354a500fd1SYan, Zheng goto fail; 22364a500fd1SYan, Zheng } 22374a500fd1SYan, Zheng if (di) { 2238e02119d5SChris Mason ret = btrfs_delete_one_dir_name(trans, log, path, di); 2239e02119d5SChris Mason bytes_del += name_len; 2240e02119d5SChris Mason BUG_ON(ret); 2241e02119d5SChris Mason } 2242e02119d5SChris Mason btrfs_release_path(log, path); 2243e02119d5SChris Mason di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2244e02119d5SChris Mason index, name, name_len, -1); 22454a500fd1SYan, Zheng if (IS_ERR(di)) { 22464a500fd1SYan, Zheng err = PTR_ERR(di); 22474a500fd1SYan, Zheng goto fail; 22484a500fd1SYan, Zheng } 22494a500fd1SYan, Zheng if (di) { 2250e02119d5SChris Mason ret = btrfs_delete_one_dir_name(trans, log, path, di); 2251e02119d5SChris Mason bytes_del += name_len; 2252e02119d5SChris Mason BUG_ON(ret); 2253e02119d5SChris Mason } 2254e02119d5SChris Mason 2255e02119d5SChris Mason /* update the directory size in the log to reflect the names 2256e02119d5SChris Mason * we have removed 2257e02119d5SChris Mason */ 2258e02119d5SChris Mason if (bytes_del) { 2259e02119d5SChris Mason struct btrfs_key key; 2260e02119d5SChris Mason 2261e02119d5SChris Mason key.objectid = dir->i_ino; 2262e02119d5SChris Mason key.offset = 0; 2263e02119d5SChris Mason key.type = BTRFS_INODE_ITEM_KEY; 2264e02119d5SChris Mason btrfs_release_path(log, path); 2265e02119d5SChris Mason 2266e02119d5SChris Mason ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 22674a500fd1SYan, Zheng if (ret < 0) { 22684a500fd1SYan, Zheng err = ret; 22694a500fd1SYan, Zheng goto fail; 22704a500fd1SYan, Zheng } 2271e02119d5SChris Mason if (ret == 0) { 2272e02119d5SChris Mason struct btrfs_inode_item *item; 2273e02119d5SChris Mason u64 i_size; 2274e02119d5SChris Mason 2275e02119d5SChris Mason item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2276e02119d5SChris Mason struct btrfs_inode_item); 2277e02119d5SChris Mason i_size = btrfs_inode_size(path->nodes[0], item); 2278e02119d5SChris Mason if (i_size > bytes_del) 2279e02119d5SChris Mason i_size -= bytes_del; 2280e02119d5SChris Mason else 2281e02119d5SChris Mason i_size = 0; 2282e02119d5SChris Mason btrfs_set_inode_size(path->nodes[0], item, i_size); 2283e02119d5SChris Mason btrfs_mark_buffer_dirty(path->nodes[0]); 2284e02119d5SChris Mason } else 2285e02119d5SChris Mason ret = 0; 2286e02119d5SChris Mason btrfs_release_path(log, path); 2287e02119d5SChris Mason } 22884a500fd1SYan, Zheng fail: 2289e02119d5SChris Mason btrfs_free_path(path); 2290a62f44a5STsutomu Itoh out_unlock: 2291e02119d5SChris Mason mutex_unlock(&BTRFS_I(dir)->log_mutex); 22924a500fd1SYan, Zheng if (ret == -ENOSPC) { 22934a500fd1SYan, Zheng root->fs_info->last_trans_log_full_commit = trans->transid; 22944a500fd1SYan, Zheng ret = 0; 22954a500fd1SYan, Zheng } 229612fcfd22SChris Mason btrfs_end_log_trans(root); 2297e02119d5SChris Mason 2298411fc6bcSAndi Kleen return err; 2299e02119d5SChris Mason } 2300e02119d5SChris Mason 2301e02119d5SChris Mason /* see comments for btrfs_del_dir_entries_in_log */ 2302e02119d5SChris Mason int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2303e02119d5SChris Mason struct btrfs_root *root, 2304e02119d5SChris Mason const char *name, int name_len, 2305e02119d5SChris Mason struct inode *inode, u64 dirid) 2306e02119d5SChris Mason { 2307e02119d5SChris Mason struct btrfs_root *log; 2308e02119d5SChris Mason u64 index; 2309e02119d5SChris Mason int ret; 2310e02119d5SChris Mason 23113a5f1d45SChris Mason if (BTRFS_I(inode)->logged_trans < trans->transid) 23123a5f1d45SChris Mason return 0; 23133a5f1d45SChris Mason 2314e02119d5SChris Mason ret = join_running_log_trans(root); 2315e02119d5SChris Mason if (ret) 2316e02119d5SChris Mason return 0; 2317e02119d5SChris Mason log = root->log_root; 2318e02119d5SChris Mason mutex_lock(&BTRFS_I(inode)->log_mutex); 2319e02119d5SChris Mason 2320e02119d5SChris Mason ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2321e02119d5SChris Mason dirid, &index); 2322e02119d5SChris Mason mutex_unlock(&BTRFS_I(inode)->log_mutex); 23234a500fd1SYan, Zheng if (ret == -ENOSPC) { 23244a500fd1SYan, Zheng root->fs_info->last_trans_log_full_commit = trans->transid; 23254a500fd1SYan, Zheng ret = 0; 23264a500fd1SYan, Zheng } 232712fcfd22SChris Mason btrfs_end_log_trans(root); 2328e02119d5SChris Mason 2329e02119d5SChris Mason return ret; 2330e02119d5SChris Mason } 2331e02119d5SChris Mason 2332e02119d5SChris Mason /* 2333e02119d5SChris Mason * creates a range item in the log for 'dirid'. first_offset and 2334e02119d5SChris Mason * last_offset tell us which parts of the key space the log should 2335e02119d5SChris Mason * be considered authoritative for. 2336e02119d5SChris Mason */ 2337e02119d5SChris Mason static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2338e02119d5SChris Mason struct btrfs_root *log, 2339e02119d5SChris Mason struct btrfs_path *path, 2340e02119d5SChris Mason int key_type, u64 dirid, 2341e02119d5SChris Mason u64 first_offset, u64 last_offset) 2342e02119d5SChris Mason { 2343e02119d5SChris Mason int ret; 2344e02119d5SChris Mason struct btrfs_key key; 2345e02119d5SChris Mason struct btrfs_dir_log_item *item; 2346e02119d5SChris Mason 2347e02119d5SChris Mason key.objectid = dirid; 2348e02119d5SChris Mason key.offset = first_offset; 2349e02119d5SChris Mason if (key_type == BTRFS_DIR_ITEM_KEY) 2350e02119d5SChris Mason key.type = BTRFS_DIR_LOG_ITEM_KEY; 2351e02119d5SChris Mason else 2352e02119d5SChris Mason key.type = BTRFS_DIR_LOG_INDEX_KEY; 2353e02119d5SChris Mason ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 23544a500fd1SYan, Zheng if (ret) 23554a500fd1SYan, Zheng return ret; 2356e02119d5SChris Mason 2357e02119d5SChris Mason item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2358e02119d5SChris Mason struct btrfs_dir_log_item); 2359e02119d5SChris Mason btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2360e02119d5SChris Mason btrfs_mark_buffer_dirty(path->nodes[0]); 2361e02119d5SChris Mason btrfs_release_path(log, path); 2362e02119d5SChris Mason return 0; 2363e02119d5SChris Mason } 2364e02119d5SChris Mason 2365e02119d5SChris Mason /* 2366e02119d5SChris Mason * log all the items included in the current transaction for a given 2367e02119d5SChris Mason * directory. This also creates the range items in the log tree required 2368e02119d5SChris Mason * to replay anything deleted before the fsync 2369e02119d5SChris Mason */ 2370e02119d5SChris Mason static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2371e02119d5SChris Mason struct btrfs_root *root, struct inode *inode, 2372e02119d5SChris Mason struct btrfs_path *path, 2373e02119d5SChris Mason struct btrfs_path *dst_path, int key_type, 2374e02119d5SChris Mason u64 min_offset, u64 *last_offset_ret) 2375e02119d5SChris Mason { 2376e02119d5SChris Mason struct btrfs_key min_key; 2377e02119d5SChris Mason struct btrfs_key max_key; 2378e02119d5SChris Mason struct btrfs_root *log = root->log_root; 2379e02119d5SChris Mason struct extent_buffer *src; 23804a500fd1SYan, Zheng int err = 0; 2381e02119d5SChris Mason int ret; 2382e02119d5SChris Mason int i; 2383e02119d5SChris Mason int nritems; 2384e02119d5SChris Mason u64 first_offset = min_offset; 2385e02119d5SChris Mason u64 last_offset = (u64)-1; 2386e02119d5SChris Mason 2387e02119d5SChris Mason log = root->log_root; 2388e02119d5SChris Mason max_key.objectid = inode->i_ino; 2389e02119d5SChris Mason max_key.offset = (u64)-1; 2390e02119d5SChris Mason max_key.type = key_type; 2391e02119d5SChris Mason 2392e02119d5SChris Mason min_key.objectid = inode->i_ino; 2393e02119d5SChris Mason min_key.type = key_type; 2394e02119d5SChris Mason min_key.offset = min_offset; 2395e02119d5SChris Mason 2396e02119d5SChris Mason path->keep_locks = 1; 2397e02119d5SChris Mason 2398e02119d5SChris Mason ret = btrfs_search_forward(root, &min_key, &max_key, 2399e02119d5SChris Mason path, 0, trans->transid); 2400e02119d5SChris Mason 2401e02119d5SChris Mason /* 2402e02119d5SChris Mason * we didn't find anything from this transaction, see if there 2403e02119d5SChris Mason * is anything at all 2404e02119d5SChris Mason */ 2405e02119d5SChris Mason if (ret != 0 || min_key.objectid != inode->i_ino || 2406e02119d5SChris Mason min_key.type != key_type) { 2407e02119d5SChris Mason min_key.objectid = inode->i_ino; 2408e02119d5SChris Mason min_key.type = key_type; 2409e02119d5SChris Mason min_key.offset = (u64)-1; 2410e02119d5SChris Mason btrfs_release_path(root, path); 2411e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2412e02119d5SChris Mason if (ret < 0) { 2413e02119d5SChris Mason btrfs_release_path(root, path); 2414e02119d5SChris Mason return ret; 2415e02119d5SChris Mason } 2416e02119d5SChris Mason ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2417e02119d5SChris Mason 2418e02119d5SChris Mason /* if ret == 0 there are items for this type, 2419e02119d5SChris Mason * create a range to tell us the last key of this type. 2420e02119d5SChris Mason * otherwise, there are no items in this directory after 2421e02119d5SChris Mason * *min_offset, and we create a range to indicate that. 2422e02119d5SChris Mason */ 2423e02119d5SChris Mason if (ret == 0) { 2424e02119d5SChris Mason struct btrfs_key tmp; 2425e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2426e02119d5SChris Mason path->slots[0]); 2427d397712bSChris Mason if (key_type == tmp.type) 2428e02119d5SChris Mason first_offset = max(min_offset, tmp.offset) + 1; 2429e02119d5SChris Mason } 2430e02119d5SChris Mason goto done; 2431e02119d5SChris Mason } 2432e02119d5SChris Mason 2433e02119d5SChris Mason /* go backward to find any previous key */ 2434e02119d5SChris Mason ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2435e02119d5SChris Mason if (ret == 0) { 2436e02119d5SChris Mason struct btrfs_key tmp; 2437e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2438e02119d5SChris Mason if (key_type == tmp.type) { 2439e02119d5SChris Mason first_offset = tmp.offset; 2440e02119d5SChris Mason ret = overwrite_item(trans, log, dst_path, 2441e02119d5SChris Mason path->nodes[0], path->slots[0], 2442e02119d5SChris Mason &tmp); 24434a500fd1SYan, Zheng if (ret) { 24444a500fd1SYan, Zheng err = ret; 24454a500fd1SYan, Zheng goto done; 24464a500fd1SYan, Zheng } 2447e02119d5SChris Mason } 2448e02119d5SChris Mason } 2449e02119d5SChris Mason btrfs_release_path(root, path); 2450e02119d5SChris Mason 2451e02119d5SChris Mason /* find the first key from this transaction again */ 2452e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2453e02119d5SChris Mason if (ret != 0) { 2454e02119d5SChris Mason WARN_ON(1); 2455e02119d5SChris Mason goto done; 2456e02119d5SChris Mason } 2457e02119d5SChris Mason 2458e02119d5SChris Mason /* 2459e02119d5SChris Mason * we have a block from this transaction, log every item in it 2460e02119d5SChris Mason * from our directory 2461e02119d5SChris Mason */ 2462e02119d5SChris Mason while (1) { 2463e02119d5SChris Mason struct btrfs_key tmp; 2464e02119d5SChris Mason src = path->nodes[0]; 2465e02119d5SChris Mason nritems = btrfs_header_nritems(src); 2466e02119d5SChris Mason for (i = path->slots[0]; i < nritems; i++) { 2467e02119d5SChris Mason btrfs_item_key_to_cpu(src, &min_key, i); 2468e02119d5SChris Mason 2469e02119d5SChris Mason if (min_key.objectid != inode->i_ino || 2470e02119d5SChris Mason min_key.type != key_type) 2471e02119d5SChris Mason goto done; 2472e02119d5SChris Mason ret = overwrite_item(trans, log, dst_path, src, i, 2473e02119d5SChris Mason &min_key); 24744a500fd1SYan, Zheng if (ret) { 24754a500fd1SYan, Zheng err = ret; 24764a500fd1SYan, Zheng goto done; 24774a500fd1SYan, Zheng } 2478e02119d5SChris Mason } 2479e02119d5SChris Mason path->slots[0] = nritems; 2480e02119d5SChris Mason 2481e02119d5SChris Mason /* 2482e02119d5SChris Mason * look ahead to the next item and see if it is also 2483e02119d5SChris Mason * from this directory and from this transaction 2484e02119d5SChris Mason */ 2485e02119d5SChris Mason ret = btrfs_next_leaf(root, path); 2486e02119d5SChris Mason if (ret == 1) { 2487e02119d5SChris Mason last_offset = (u64)-1; 2488e02119d5SChris Mason goto done; 2489e02119d5SChris Mason } 2490e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2491e02119d5SChris Mason if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2492e02119d5SChris Mason last_offset = (u64)-1; 2493e02119d5SChris Mason goto done; 2494e02119d5SChris Mason } 2495e02119d5SChris Mason if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2496e02119d5SChris Mason ret = overwrite_item(trans, log, dst_path, 2497e02119d5SChris Mason path->nodes[0], path->slots[0], 2498e02119d5SChris Mason &tmp); 24994a500fd1SYan, Zheng if (ret) 25004a500fd1SYan, Zheng err = ret; 25014a500fd1SYan, Zheng else 2502e02119d5SChris Mason last_offset = tmp.offset; 2503e02119d5SChris Mason goto done; 2504e02119d5SChris Mason } 2505e02119d5SChris Mason } 2506e02119d5SChris Mason done: 2507e02119d5SChris Mason btrfs_release_path(root, path); 2508e02119d5SChris Mason btrfs_release_path(log, dst_path); 2509e02119d5SChris Mason 25104a500fd1SYan, Zheng if (err == 0) { 25114a500fd1SYan, Zheng *last_offset_ret = last_offset; 25124a500fd1SYan, Zheng /* 25134a500fd1SYan, Zheng * insert the log range keys to indicate where the log 25144a500fd1SYan, Zheng * is valid 25154a500fd1SYan, Zheng */ 25164a500fd1SYan, Zheng ret = insert_dir_log_key(trans, log, path, key_type, 25174a500fd1SYan, Zheng inode->i_ino, first_offset, 25184a500fd1SYan, Zheng last_offset); 25194a500fd1SYan, Zheng if (ret) 25204a500fd1SYan, Zheng err = ret; 25214a500fd1SYan, Zheng } 25224a500fd1SYan, Zheng return err; 2523e02119d5SChris Mason } 2524e02119d5SChris Mason 2525e02119d5SChris Mason /* 2526e02119d5SChris Mason * logging directories is very similar to logging inodes, We find all the items 2527e02119d5SChris Mason * from the current transaction and write them to the log. 2528e02119d5SChris Mason * 2529e02119d5SChris Mason * The recovery code scans the directory in the subvolume, and if it finds a 2530e02119d5SChris Mason * key in the range logged that is not present in the log tree, then it means 2531e02119d5SChris Mason * that dir entry was unlinked during the transaction. 2532e02119d5SChris Mason * 2533e02119d5SChris Mason * In order for that scan to work, we must include one key smaller than 2534e02119d5SChris Mason * the smallest logged by this transaction and one key larger than the largest 2535e02119d5SChris Mason * key logged by this transaction. 2536e02119d5SChris Mason */ 2537e02119d5SChris Mason static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2538e02119d5SChris Mason struct btrfs_root *root, struct inode *inode, 2539e02119d5SChris Mason struct btrfs_path *path, 2540e02119d5SChris Mason struct btrfs_path *dst_path) 2541e02119d5SChris Mason { 2542e02119d5SChris Mason u64 min_key; 2543e02119d5SChris Mason u64 max_key; 2544e02119d5SChris Mason int ret; 2545e02119d5SChris Mason int key_type = BTRFS_DIR_ITEM_KEY; 2546e02119d5SChris Mason 2547e02119d5SChris Mason again: 2548e02119d5SChris Mason min_key = 0; 2549e02119d5SChris Mason max_key = 0; 2550e02119d5SChris Mason while (1) { 2551e02119d5SChris Mason ret = log_dir_items(trans, root, inode, path, 2552e02119d5SChris Mason dst_path, key_type, min_key, 2553e02119d5SChris Mason &max_key); 25544a500fd1SYan, Zheng if (ret) 25554a500fd1SYan, Zheng return ret; 2556e02119d5SChris Mason if (max_key == (u64)-1) 2557e02119d5SChris Mason break; 2558e02119d5SChris Mason min_key = max_key + 1; 2559e02119d5SChris Mason } 2560e02119d5SChris Mason 2561e02119d5SChris Mason if (key_type == BTRFS_DIR_ITEM_KEY) { 2562e02119d5SChris Mason key_type = BTRFS_DIR_INDEX_KEY; 2563e02119d5SChris Mason goto again; 2564e02119d5SChris Mason } 2565e02119d5SChris Mason return 0; 2566e02119d5SChris Mason } 2567e02119d5SChris Mason 2568e02119d5SChris Mason /* 2569e02119d5SChris Mason * a helper function to drop items from the log before we relog an 2570e02119d5SChris Mason * inode. max_key_type indicates the highest item type to remove. 2571e02119d5SChris Mason * This cannot be run for file data extents because it does not 2572e02119d5SChris Mason * free the extents they point to. 2573e02119d5SChris Mason */ 2574e02119d5SChris Mason static int drop_objectid_items(struct btrfs_trans_handle *trans, 2575e02119d5SChris Mason struct btrfs_root *log, 2576e02119d5SChris Mason struct btrfs_path *path, 2577e02119d5SChris Mason u64 objectid, int max_key_type) 2578e02119d5SChris Mason { 2579e02119d5SChris Mason int ret; 2580e02119d5SChris Mason struct btrfs_key key; 2581e02119d5SChris Mason struct btrfs_key found_key; 2582e02119d5SChris Mason 2583e02119d5SChris Mason key.objectid = objectid; 2584e02119d5SChris Mason key.type = max_key_type; 2585e02119d5SChris Mason key.offset = (u64)-1; 2586e02119d5SChris Mason 2587e02119d5SChris Mason while (1) { 2588e02119d5SChris Mason ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 25894a500fd1SYan, Zheng BUG_ON(ret == 0); 25904a500fd1SYan, Zheng if (ret < 0) 2591e02119d5SChris Mason break; 2592e02119d5SChris Mason 2593e02119d5SChris Mason if (path->slots[0] == 0) 2594e02119d5SChris Mason break; 2595e02119d5SChris Mason 2596e02119d5SChris Mason path->slots[0]--; 2597e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2598e02119d5SChris Mason path->slots[0]); 2599e02119d5SChris Mason 2600e02119d5SChris Mason if (found_key.objectid != objectid) 2601e02119d5SChris Mason break; 2602e02119d5SChris Mason 2603e02119d5SChris Mason ret = btrfs_del_item(trans, log, path); 260465a246c5STsutomu Itoh if (ret) 260565a246c5STsutomu Itoh break; 2606e02119d5SChris Mason btrfs_release_path(log, path); 2607e02119d5SChris Mason } 2608e02119d5SChris Mason btrfs_release_path(log, path); 26094a500fd1SYan, Zheng return ret; 2610e02119d5SChris Mason } 2611e02119d5SChris Mason 261231ff1cd2SChris Mason static noinline int copy_items(struct btrfs_trans_handle *trans, 261331ff1cd2SChris Mason struct btrfs_root *log, 261431ff1cd2SChris Mason struct btrfs_path *dst_path, 261531ff1cd2SChris Mason struct extent_buffer *src, 261631ff1cd2SChris Mason int start_slot, int nr, int inode_only) 261731ff1cd2SChris Mason { 261831ff1cd2SChris Mason unsigned long src_offset; 261931ff1cd2SChris Mason unsigned long dst_offset; 262031ff1cd2SChris Mason struct btrfs_file_extent_item *extent; 262131ff1cd2SChris Mason struct btrfs_inode_item *inode_item; 262231ff1cd2SChris Mason int ret; 262331ff1cd2SChris Mason struct btrfs_key *ins_keys; 262431ff1cd2SChris Mason u32 *ins_sizes; 262531ff1cd2SChris Mason char *ins_data; 262631ff1cd2SChris Mason int i; 2627d20f7043SChris Mason struct list_head ordered_sums; 2628d20f7043SChris Mason 2629d20f7043SChris Mason INIT_LIST_HEAD(&ordered_sums); 263031ff1cd2SChris Mason 263131ff1cd2SChris Mason ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 263231ff1cd2SChris Mason nr * sizeof(u32), GFP_NOFS); 26332a29edc6Sliubo if (!ins_data) 26342a29edc6Sliubo return -ENOMEM; 26352a29edc6Sliubo 263631ff1cd2SChris Mason ins_sizes = (u32 *)ins_data; 263731ff1cd2SChris Mason ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 263831ff1cd2SChris Mason 263931ff1cd2SChris Mason for (i = 0; i < nr; i++) { 264031ff1cd2SChris Mason ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 264131ff1cd2SChris Mason btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 264231ff1cd2SChris Mason } 264331ff1cd2SChris Mason ret = btrfs_insert_empty_items(trans, log, dst_path, 264431ff1cd2SChris Mason ins_keys, ins_sizes, nr); 26454a500fd1SYan, Zheng if (ret) { 26464a500fd1SYan, Zheng kfree(ins_data); 26474a500fd1SYan, Zheng return ret; 26484a500fd1SYan, Zheng } 264931ff1cd2SChris Mason 26505d4f98a2SYan Zheng for (i = 0; i < nr; i++, dst_path->slots[0]++) { 265131ff1cd2SChris Mason dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 265231ff1cd2SChris Mason dst_path->slots[0]); 265331ff1cd2SChris Mason 265431ff1cd2SChris Mason src_offset = btrfs_item_ptr_offset(src, start_slot + i); 265531ff1cd2SChris Mason 265631ff1cd2SChris Mason copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 265731ff1cd2SChris Mason src_offset, ins_sizes[i]); 265831ff1cd2SChris Mason 265931ff1cd2SChris Mason if (inode_only == LOG_INODE_EXISTS && 266031ff1cd2SChris Mason ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 266131ff1cd2SChris Mason inode_item = btrfs_item_ptr(dst_path->nodes[0], 266231ff1cd2SChris Mason dst_path->slots[0], 266331ff1cd2SChris Mason struct btrfs_inode_item); 266431ff1cd2SChris Mason btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 266531ff1cd2SChris Mason 266631ff1cd2SChris Mason /* set the generation to zero so the recover code 266731ff1cd2SChris Mason * can tell the difference between an logging 266831ff1cd2SChris Mason * just to say 'this inode exists' and a logging 266931ff1cd2SChris Mason * to say 'update this inode with these values' 267031ff1cd2SChris Mason */ 267131ff1cd2SChris Mason btrfs_set_inode_generation(dst_path->nodes[0], 267231ff1cd2SChris Mason inode_item, 0); 267331ff1cd2SChris Mason } 267431ff1cd2SChris Mason /* take a reference on file data extents so that truncates 267531ff1cd2SChris Mason * or deletes of this inode don't have to relog the inode 267631ff1cd2SChris Mason * again 267731ff1cd2SChris Mason */ 267831ff1cd2SChris Mason if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 267931ff1cd2SChris Mason int found_type; 268031ff1cd2SChris Mason extent = btrfs_item_ptr(src, start_slot + i, 268131ff1cd2SChris Mason struct btrfs_file_extent_item); 268231ff1cd2SChris Mason 268331ff1cd2SChris Mason found_type = btrfs_file_extent_type(src, extent); 2684d899e052SYan Zheng if (found_type == BTRFS_FILE_EXTENT_REG || 2685d899e052SYan Zheng found_type == BTRFS_FILE_EXTENT_PREALLOC) { 26865d4f98a2SYan Zheng u64 ds, dl, cs, cl; 26875d4f98a2SYan Zheng ds = btrfs_file_extent_disk_bytenr(src, 268831ff1cd2SChris Mason extent); 26895d4f98a2SYan Zheng /* ds == 0 is a hole */ 26905d4f98a2SYan Zheng if (ds == 0) 26915d4f98a2SYan Zheng continue; 26925d4f98a2SYan Zheng 26935d4f98a2SYan Zheng dl = btrfs_file_extent_disk_num_bytes(src, 269431ff1cd2SChris Mason extent); 26955d4f98a2SYan Zheng cs = btrfs_file_extent_offset(src, extent); 26965d4f98a2SYan Zheng cl = btrfs_file_extent_num_bytes(src, 2697a419aef8SJoe Perches extent); 2698580afd76SChris Mason if (btrfs_file_extent_compression(src, 2699580afd76SChris Mason extent)) { 2700580afd76SChris Mason cs = 0; 2701580afd76SChris Mason cl = dl; 2702580afd76SChris Mason } 27035d4f98a2SYan Zheng 270407d400a6SYan Zheng ret = btrfs_lookup_csums_range( 2705d20f7043SChris Mason log->fs_info->csum_root, 270607d400a6SYan Zheng ds + cs, ds + cs + cl - 1, 270707d400a6SYan Zheng &ordered_sums); 2708d20f7043SChris Mason BUG_ON(ret); 270931ff1cd2SChris Mason } 271031ff1cd2SChris Mason } 271131ff1cd2SChris Mason } 271231ff1cd2SChris Mason 271331ff1cd2SChris Mason btrfs_mark_buffer_dirty(dst_path->nodes[0]); 271431ff1cd2SChris Mason btrfs_release_path(log, dst_path); 271531ff1cd2SChris Mason kfree(ins_data); 2716d20f7043SChris Mason 2717d20f7043SChris Mason /* 2718d20f7043SChris Mason * we have to do this after the loop above to avoid changing the 2719d20f7043SChris Mason * log tree while trying to change the log tree. 2720d20f7043SChris Mason */ 27214a500fd1SYan, Zheng ret = 0; 2722d20f7043SChris Mason while (!list_empty(&ordered_sums)) { 2723d20f7043SChris Mason struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2724d20f7043SChris Mason struct btrfs_ordered_sum, 2725d20f7043SChris Mason list); 27264a500fd1SYan, Zheng if (!ret) 2727d20f7043SChris Mason ret = btrfs_csum_file_blocks(trans, log, sums); 2728d20f7043SChris Mason list_del(&sums->list); 2729d20f7043SChris Mason kfree(sums); 2730d20f7043SChris Mason } 27314a500fd1SYan, Zheng return ret; 273231ff1cd2SChris Mason } 273331ff1cd2SChris Mason 2734e02119d5SChris Mason /* log a single inode in the tree log. 2735e02119d5SChris Mason * At least one parent directory for this inode must exist in the tree 2736e02119d5SChris Mason * or be logged already. 2737e02119d5SChris Mason * 2738e02119d5SChris Mason * Any items from this inode changed by the current transaction are copied 2739e02119d5SChris Mason * to the log tree. An extra reference is taken on any extents in this 2740e02119d5SChris Mason * file, allowing us to avoid a whole pile of corner cases around logging 2741e02119d5SChris Mason * blocks that have been removed from the tree. 2742e02119d5SChris Mason * 2743e02119d5SChris Mason * See LOG_INODE_ALL and related defines for a description of what inode_only 2744e02119d5SChris Mason * does. 2745e02119d5SChris Mason * 2746e02119d5SChris Mason * This handles both files and directories. 2747e02119d5SChris Mason */ 274812fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans, 2749e02119d5SChris Mason struct btrfs_root *root, struct inode *inode, 2750e02119d5SChris Mason int inode_only) 2751e02119d5SChris Mason { 2752e02119d5SChris Mason struct btrfs_path *path; 2753e02119d5SChris Mason struct btrfs_path *dst_path; 2754e02119d5SChris Mason struct btrfs_key min_key; 2755e02119d5SChris Mason struct btrfs_key max_key; 2756e02119d5SChris Mason struct btrfs_root *log = root->log_root; 275731ff1cd2SChris Mason struct extent_buffer *src = NULL; 27584a500fd1SYan, Zheng int err = 0; 2759e02119d5SChris Mason int ret; 27603a5f1d45SChris Mason int nritems; 276131ff1cd2SChris Mason int ins_start_slot = 0; 276231ff1cd2SChris Mason int ins_nr; 2763e02119d5SChris Mason 2764e02119d5SChris Mason log = root->log_root; 2765e02119d5SChris Mason 2766e02119d5SChris Mason path = btrfs_alloc_path(); 27675df67083STsutomu Itoh if (!path) 27685df67083STsutomu Itoh return -ENOMEM; 2769e02119d5SChris Mason dst_path = btrfs_alloc_path(); 27705df67083STsutomu Itoh if (!dst_path) { 27715df67083STsutomu Itoh btrfs_free_path(path); 27725df67083STsutomu Itoh return -ENOMEM; 27735df67083STsutomu Itoh } 2774e02119d5SChris Mason 2775e02119d5SChris Mason min_key.objectid = inode->i_ino; 2776e02119d5SChris Mason min_key.type = BTRFS_INODE_ITEM_KEY; 2777e02119d5SChris Mason min_key.offset = 0; 2778e02119d5SChris Mason 2779e02119d5SChris Mason max_key.objectid = inode->i_ino; 278012fcfd22SChris Mason 278112fcfd22SChris Mason /* today the code can only do partial logging of directories */ 278212fcfd22SChris Mason if (!S_ISDIR(inode->i_mode)) 278312fcfd22SChris Mason inode_only = LOG_INODE_ALL; 278412fcfd22SChris Mason 2785e02119d5SChris Mason if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2786e02119d5SChris Mason max_key.type = BTRFS_XATTR_ITEM_KEY; 2787e02119d5SChris Mason else 2788e02119d5SChris Mason max_key.type = (u8)-1; 2789e02119d5SChris Mason max_key.offset = (u64)-1; 2790e02119d5SChris Mason 2791e02119d5SChris Mason mutex_lock(&BTRFS_I(inode)->log_mutex); 2792e02119d5SChris Mason 2793e02119d5SChris Mason /* 2794e02119d5SChris Mason * a brute force approach to making sure we get the most uptodate 2795e02119d5SChris Mason * copies of everything. 2796e02119d5SChris Mason */ 2797e02119d5SChris Mason if (S_ISDIR(inode->i_mode)) { 2798e02119d5SChris Mason int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 2799e02119d5SChris Mason 2800e02119d5SChris Mason if (inode_only == LOG_INODE_EXISTS) 2801e02119d5SChris Mason max_key_type = BTRFS_XATTR_ITEM_KEY; 2802e02119d5SChris Mason ret = drop_objectid_items(trans, log, path, 2803e02119d5SChris Mason inode->i_ino, max_key_type); 2804e02119d5SChris Mason } else { 2805e02119d5SChris Mason ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2806e02119d5SChris Mason } 28074a500fd1SYan, Zheng if (ret) { 28084a500fd1SYan, Zheng err = ret; 28094a500fd1SYan, Zheng goto out_unlock; 28104a500fd1SYan, Zheng } 2811e02119d5SChris Mason path->keep_locks = 1; 2812e02119d5SChris Mason 2813e02119d5SChris Mason while (1) { 281431ff1cd2SChris Mason ins_nr = 0; 2815e02119d5SChris Mason ret = btrfs_search_forward(root, &min_key, &max_key, 2816e02119d5SChris Mason path, 0, trans->transid); 2817e02119d5SChris Mason if (ret != 0) 2818e02119d5SChris Mason break; 28193a5f1d45SChris Mason again: 282031ff1cd2SChris Mason /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2821e02119d5SChris Mason if (min_key.objectid != inode->i_ino) 2822e02119d5SChris Mason break; 2823e02119d5SChris Mason if (min_key.type > max_key.type) 2824e02119d5SChris Mason break; 282531ff1cd2SChris Mason 2826e02119d5SChris Mason src = path->nodes[0]; 282731ff1cd2SChris Mason if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 282831ff1cd2SChris Mason ins_nr++; 282931ff1cd2SChris Mason goto next_slot; 283031ff1cd2SChris Mason } else if (!ins_nr) { 283131ff1cd2SChris Mason ins_start_slot = path->slots[0]; 283231ff1cd2SChris Mason ins_nr = 1; 283331ff1cd2SChris Mason goto next_slot; 2834e02119d5SChris Mason } 2835e02119d5SChris Mason 283631ff1cd2SChris Mason ret = copy_items(trans, log, dst_path, src, ins_start_slot, 283731ff1cd2SChris Mason ins_nr, inode_only); 28384a500fd1SYan, Zheng if (ret) { 28394a500fd1SYan, Zheng err = ret; 28404a500fd1SYan, Zheng goto out_unlock; 28414a500fd1SYan, Zheng } 284231ff1cd2SChris Mason ins_nr = 1; 284331ff1cd2SChris Mason ins_start_slot = path->slots[0]; 284431ff1cd2SChris Mason next_slot: 2845e02119d5SChris Mason 28463a5f1d45SChris Mason nritems = btrfs_header_nritems(path->nodes[0]); 28473a5f1d45SChris Mason path->slots[0]++; 28483a5f1d45SChris Mason if (path->slots[0] < nritems) { 28493a5f1d45SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &min_key, 28503a5f1d45SChris Mason path->slots[0]); 28513a5f1d45SChris Mason goto again; 28523a5f1d45SChris Mason } 285331ff1cd2SChris Mason if (ins_nr) { 285431ff1cd2SChris Mason ret = copy_items(trans, log, dst_path, src, 285531ff1cd2SChris Mason ins_start_slot, 285631ff1cd2SChris Mason ins_nr, inode_only); 28574a500fd1SYan, Zheng if (ret) { 28584a500fd1SYan, Zheng err = ret; 28594a500fd1SYan, Zheng goto out_unlock; 28604a500fd1SYan, Zheng } 286131ff1cd2SChris Mason ins_nr = 0; 286231ff1cd2SChris Mason } 28633a5f1d45SChris Mason btrfs_release_path(root, path); 28643a5f1d45SChris Mason 2865e02119d5SChris Mason if (min_key.offset < (u64)-1) 2866e02119d5SChris Mason min_key.offset++; 2867e02119d5SChris Mason else if (min_key.type < (u8)-1) 2868e02119d5SChris Mason min_key.type++; 2869e02119d5SChris Mason else if (min_key.objectid < (u64)-1) 2870e02119d5SChris Mason min_key.objectid++; 2871e02119d5SChris Mason else 2872e02119d5SChris Mason break; 2873e02119d5SChris Mason } 287431ff1cd2SChris Mason if (ins_nr) { 287531ff1cd2SChris Mason ret = copy_items(trans, log, dst_path, src, 287631ff1cd2SChris Mason ins_start_slot, 287731ff1cd2SChris Mason ins_nr, inode_only); 28784a500fd1SYan, Zheng if (ret) { 28794a500fd1SYan, Zheng err = ret; 28804a500fd1SYan, Zheng goto out_unlock; 28814a500fd1SYan, Zheng } 288231ff1cd2SChris Mason ins_nr = 0; 288331ff1cd2SChris Mason } 288431ff1cd2SChris Mason WARN_ON(ins_nr); 28859623f9a3SChris Mason if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2886e02119d5SChris Mason btrfs_release_path(root, path); 2887e02119d5SChris Mason btrfs_release_path(log, dst_path); 2888e02119d5SChris Mason ret = log_directory_changes(trans, root, inode, path, dst_path); 28894a500fd1SYan, Zheng if (ret) { 28904a500fd1SYan, Zheng err = ret; 28914a500fd1SYan, Zheng goto out_unlock; 28924a500fd1SYan, Zheng } 2893e02119d5SChris Mason } 28943a5f1d45SChris Mason BTRFS_I(inode)->logged_trans = trans->transid; 28954a500fd1SYan, Zheng out_unlock: 2896e02119d5SChris Mason mutex_unlock(&BTRFS_I(inode)->log_mutex); 2897e02119d5SChris Mason 2898e02119d5SChris Mason btrfs_free_path(path); 2899e02119d5SChris Mason btrfs_free_path(dst_path); 29004a500fd1SYan, Zheng return err; 2901e02119d5SChris Mason } 2902e02119d5SChris Mason 290312fcfd22SChris Mason /* 290412fcfd22SChris Mason * follow the dentry parent pointers up the chain and see if any 290512fcfd22SChris Mason * of the directories in it require a full commit before they can 290612fcfd22SChris Mason * be logged. Returns zero if nothing special needs to be done or 1 if 290712fcfd22SChris Mason * a full commit is required. 290812fcfd22SChris Mason */ 290912fcfd22SChris Mason static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 291012fcfd22SChris Mason struct inode *inode, 291112fcfd22SChris Mason struct dentry *parent, 291212fcfd22SChris Mason struct super_block *sb, 291312fcfd22SChris Mason u64 last_committed) 2914e02119d5SChris Mason { 291512fcfd22SChris Mason int ret = 0; 291612fcfd22SChris Mason struct btrfs_root *root; 29176a912213SJosef Bacik struct dentry *old_parent = NULL; 2918e02119d5SChris Mason 2919af4176b4SChris Mason /* 2920af4176b4SChris Mason * for regular files, if its inode is already on disk, we don't 2921af4176b4SChris Mason * have to worry about the parents at all. This is because 2922af4176b4SChris Mason * we can use the last_unlink_trans field to record renames 2923af4176b4SChris Mason * and other fun in this file. 2924af4176b4SChris Mason */ 2925af4176b4SChris Mason if (S_ISREG(inode->i_mode) && 2926af4176b4SChris Mason BTRFS_I(inode)->generation <= last_committed && 2927af4176b4SChris Mason BTRFS_I(inode)->last_unlink_trans <= last_committed) 2928af4176b4SChris Mason goto out; 2929af4176b4SChris Mason 293012fcfd22SChris Mason if (!S_ISDIR(inode->i_mode)) { 293112fcfd22SChris Mason if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 293212fcfd22SChris Mason goto out; 293312fcfd22SChris Mason inode = parent->d_inode; 293412fcfd22SChris Mason } 293512fcfd22SChris Mason 293612fcfd22SChris Mason while (1) { 293712fcfd22SChris Mason BTRFS_I(inode)->logged_trans = trans->transid; 293812fcfd22SChris Mason smp_mb(); 293912fcfd22SChris Mason 294012fcfd22SChris Mason if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 294112fcfd22SChris Mason root = BTRFS_I(inode)->root; 294212fcfd22SChris Mason 294312fcfd22SChris Mason /* 294412fcfd22SChris Mason * make sure any commits to the log are forced 294512fcfd22SChris Mason * to be full commits 294612fcfd22SChris Mason */ 294712fcfd22SChris Mason root->fs_info->last_trans_log_full_commit = 294812fcfd22SChris Mason trans->transid; 294912fcfd22SChris Mason ret = 1; 295012fcfd22SChris Mason break; 295112fcfd22SChris Mason } 295212fcfd22SChris Mason 295312fcfd22SChris Mason if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 295412fcfd22SChris Mason break; 295512fcfd22SChris Mason 295676dda93cSYan, Zheng if (IS_ROOT(parent)) 295712fcfd22SChris Mason break; 295812fcfd22SChris Mason 29596a912213SJosef Bacik parent = dget_parent(parent); 29606a912213SJosef Bacik dput(old_parent); 29616a912213SJosef Bacik old_parent = parent; 296212fcfd22SChris Mason inode = parent->d_inode; 296312fcfd22SChris Mason 296412fcfd22SChris Mason } 29656a912213SJosef Bacik dput(old_parent); 296612fcfd22SChris Mason out: 2967e02119d5SChris Mason return ret; 2968e02119d5SChris Mason } 2969e02119d5SChris Mason 2970257c62e1SChris Mason static int inode_in_log(struct btrfs_trans_handle *trans, 2971257c62e1SChris Mason struct inode *inode) 2972257c62e1SChris Mason { 2973257c62e1SChris Mason struct btrfs_root *root = BTRFS_I(inode)->root; 2974257c62e1SChris Mason int ret = 0; 2975257c62e1SChris Mason 2976257c62e1SChris Mason mutex_lock(&root->log_mutex); 2977257c62e1SChris Mason if (BTRFS_I(inode)->logged_trans == trans->transid && 2978257c62e1SChris Mason BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 2979257c62e1SChris Mason ret = 1; 2980257c62e1SChris Mason mutex_unlock(&root->log_mutex); 2981257c62e1SChris Mason return ret; 2982257c62e1SChris Mason } 2983257c62e1SChris Mason 2984257c62e1SChris Mason 2985e02119d5SChris Mason /* 2986e02119d5SChris Mason * helper function around btrfs_log_inode to make sure newly created 2987e02119d5SChris Mason * parent directories also end up in the log. A minimal inode and backref 2988e02119d5SChris Mason * only logging is done of any parent directories that are older than 2989e02119d5SChris Mason * the last committed transaction 2990e02119d5SChris Mason */ 299112fcfd22SChris Mason int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 299212fcfd22SChris Mason struct btrfs_root *root, struct inode *inode, 299312fcfd22SChris Mason struct dentry *parent, int exists_only) 2994e02119d5SChris Mason { 299512fcfd22SChris Mason int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 2996e02119d5SChris Mason struct super_block *sb; 29976a912213SJosef Bacik struct dentry *old_parent = NULL; 299812fcfd22SChris Mason int ret = 0; 299912fcfd22SChris Mason u64 last_committed = root->fs_info->last_trans_committed; 300012fcfd22SChris Mason 300112fcfd22SChris Mason sb = inode->i_sb; 300212fcfd22SChris Mason 30033a5e1404SSage Weil if (btrfs_test_opt(root, NOTREELOG)) { 30043a5e1404SSage Weil ret = 1; 30053a5e1404SSage Weil goto end_no_trans; 30063a5e1404SSage Weil } 30073a5e1404SSage Weil 300812fcfd22SChris Mason if (root->fs_info->last_trans_log_full_commit > 300912fcfd22SChris Mason root->fs_info->last_trans_committed) { 301012fcfd22SChris Mason ret = 1; 301112fcfd22SChris Mason goto end_no_trans; 301212fcfd22SChris Mason } 301312fcfd22SChris Mason 301476dda93cSYan, Zheng if (root != BTRFS_I(inode)->root || 301576dda93cSYan, Zheng btrfs_root_refs(&root->root_item) == 0) { 301676dda93cSYan, Zheng ret = 1; 301776dda93cSYan, Zheng goto end_no_trans; 301876dda93cSYan, Zheng } 301976dda93cSYan, Zheng 302012fcfd22SChris Mason ret = check_parent_dirs_for_sync(trans, inode, parent, 302112fcfd22SChris Mason sb, last_committed); 302212fcfd22SChris Mason if (ret) 302312fcfd22SChris Mason goto end_no_trans; 3024e02119d5SChris Mason 3025257c62e1SChris Mason if (inode_in_log(trans, inode)) { 3026257c62e1SChris Mason ret = BTRFS_NO_LOG_SYNC; 3027257c62e1SChris Mason goto end_no_trans; 3028257c62e1SChris Mason } 3029257c62e1SChris Mason 30304a500fd1SYan, Zheng ret = start_log_trans(trans, root); 30314a500fd1SYan, Zheng if (ret) 30324a500fd1SYan, Zheng goto end_trans; 303312fcfd22SChris Mason 303412fcfd22SChris Mason ret = btrfs_log_inode(trans, root, inode, inode_only); 30354a500fd1SYan, Zheng if (ret) 30364a500fd1SYan, Zheng goto end_trans; 3037e02119d5SChris Mason 3038af4176b4SChris Mason /* 3039af4176b4SChris Mason * for regular files, if its inode is already on disk, we don't 3040af4176b4SChris Mason * have to worry about the parents at all. This is because 3041af4176b4SChris Mason * we can use the last_unlink_trans field to record renames 3042af4176b4SChris Mason * and other fun in this file. 3043af4176b4SChris Mason */ 3044af4176b4SChris Mason if (S_ISREG(inode->i_mode) && 3045af4176b4SChris Mason BTRFS_I(inode)->generation <= last_committed && 30464a500fd1SYan, Zheng BTRFS_I(inode)->last_unlink_trans <= last_committed) { 30474a500fd1SYan, Zheng ret = 0; 30484a500fd1SYan, Zheng goto end_trans; 30494a500fd1SYan, Zheng } 3050af4176b4SChris Mason 3051af4176b4SChris Mason inode_only = LOG_INODE_EXISTS; 305212fcfd22SChris Mason while (1) { 305312fcfd22SChris Mason if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3054e02119d5SChris Mason break; 3055e02119d5SChris Mason 305612fcfd22SChris Mason inode = parent->d_inode; 305776dda93cSYan, Zheng if (root != BTRFS_I(inode)->root) 305876dda93cSYan, Zheng break; 305976dda93cSYan, Zheng 306012fcfd22SChris Mason if (BTRFS_I(inode)->generation > 306112fcfd22SChris Mason root->fs_info->last_trans_committed) { 306212fcfd22SChris Mason ret = btrfs_log_inode(trans, root, inode, inode_only); 30634a500fd1SYan, Zheng if (ret) 30644a500fd1SYan, Zheng goto end_trans; 3065e02119d5SChris Mason } 306676dda93cSYan, Zheng if (IS_ROOT(parent)) 306712fcfd22SChris Mason break; 306812fcfd22SChris Mason 30696a912213SJosef Bacik parent = dget_parent(parent); 30706a912213SJosef Bacik dput(old_parent); 30716a912213SJosef Bacik old_parent = parent; 307212fcfd22SChris Mason } 307312fcfd22SChris Mason ret = 0; 30744a500fd1SYan, Zheng end_trans: 30756a912213SJosef Bacik dput(old_parent); 30764a500fd1SYan, Zheng if (ret < 0) { 30774a500fd1SYan, Zheng BUG_ON(ret != -ENOSPC); 30784a500fd1SYan, Zheng root->fs_info->last_trans_log_full_commit = trans->transid; 30794a500fd1SYan, Zheng ret = 1; 30804a500fd1SYan, Zheng } 308112fcfd22SChris Mason btrfs_end_log_trans(root); 308212fcfd22SChris Mason end_no_trans: 308312fcfd22SChris Mason return ret; 3084e02119d5SChris Mason } 3085e02119d5SChris Mason 3086e02119d5SChris Mason /* 3087e02119d5SChris Mason * it is not safe to log dentry if the chunk root has added new 3088e02119d5SChris Mason * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 3089e02119d5SChris Mason * If this returns 1, you must commit the transaction to safely get your 3090e02119d5SChris Mason * data on disk. 3091e02119d5SChris Mason */ 3092e02119d5SChris Mason int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3093e02119d5SChris Mason struct btrfs_root *root, struct dentry *dentry) 3094e02119d5SChris Mason { 30956a912213SJosef Bacik struct dentry *parent = dget_parent(dentry); 30966a912213SJosef Bacik int ret; 30976a912213SJosef Bacik 30986a912213SJosef Bacik ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); 30996a912213SJosef Bacik dput(parent); 31006a912213SJosef Bacik 31016a912213SJosef Bacik return ret; 3102e02119d5SChris Mason } 3103e02119d5SChris Mason 3104e02119d5SChris Mason /* 3105e02119d5SChris Mason * should be called during mount to recover any replay any log trees 3106e02119d5SChris Mason * from the FS 3107e02119d5SChris Mason */ 3108e02119d5SChris Mason int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 3109e02119d5SChris Mason { 3110e02119d5SChris Mason int ret; 3111e02119d5SChris Mason struct btrfs_path *path; 3112e02119d5SChris Mason struct btrfs_trans_handle *trans; 3113e02119d5SChris Mason struct btrfs_key key; 3114e02119d5SChris Mason struct btrfs_key found_key; 3115e02119d5SChris Mason struct btrfs_key tmp_key; 3116e02119d5SChris Mason struct btrfs_root *log; 3117e02119d5SChris Mason struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 3118e02119d5SChris Mason struct walk_control wc = { 3119e02119d5SChris Mason .process_func = process_one_buffer, 3120e02119d5SChris Mason .stage = 0, 3121e02119d5SChris Mason }; 3122e02119d5SChris Mason 3123e02119d5SChris Mason path = btrfs_alloc_path(); 3124db5b493aSTsutomu Itoh if (!path) 3125db5b493aSTsutomu Itoh return -ENOMEM; 3126db5b493aSTsutomu Itoh 3127db5b493aSTsutomu Itoh fs_info->log_root_recovering = 1; 3128e02119d5SChris Mason 31294a500fd1SYan, Zheng trans = btrfs_start_transaction(fs_info->tree_root, 0); 313098d5dc13STsutomu Itoh BUG_ON(IS_ERR(trans)); 3131e02119d5SChris Mason 3132e02119d5SChris Mason wc.trans = trans; 3133e02119d5SChris Mason wc.pin = 1; 3134e02119d5SChris Mason 3135db5b493aSTsutomu Itoh ret = walk_log_tree(trans, log_root_tree, &wc); 3136db5b493aSTsutomu Itoh BUG_ON(ret); 3137e02119d5SChris Mason 3138e02119d5SChris Mason again: 3139e02119d5SChris Mason key.objectid = BTRFS_TREE_LOG_OBJECTID; 3140e02119d5SChris Mason key.offset = (u64)-1; 3141e02119d5SChris Mason btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 3142e02119d5SChris Mason 3143e02119d5SChris Mason while (1) { 3144e02119d5SChris Mason ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 3145e02119d5SChris Mason if (ret < 0) 3146e02119d5SChris Mason break; 3147e02119d5SChris Mason if (ret > 0) { 3148e02119d5SChris Mason if (path->slots[0] == 0) 3149e02119d5SChris Mason break; 3150e02119d5SChris Mason path->slots[0]--; 3151e02119d5SChris Mason } 3152e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3153e02119d5SChris Mason path->slots[0]); 3154e02119d5SChris Mason btrfs_release_path(log_root_tree, path); 3155e02119d5SChris Mason if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 3156e02119d5SChris Mason break; 3157e02119d5SChris Mason 3158e02119d5SChris Mason log = btrfs_read_fs_root_no_radix(log_root_tree, 3159e02119d5SChris Mason &found_key); 3160db5b493aSTsutomu Itoh BUG_ON(IS_ERR(log)); 3161e02119d5SChris Mason 3162e02119d5SChris Mason tmp_key.objectid = found_key.offset; 3163e02119d5SChris Mason tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3164e02119d5SChris Mason tmp_key.offset = (u64)-1; 3165e02119d5SChris Mason 3166e02119d5SChris Mason wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 3167e02119d5SChris Mason BUG_ON(!wc.replay_dest); 3168e02119d5SChris Mason 316907d400a6SYan Zheng wc.replay_dest->log_root = log; 31705d4f98a2SYan Zheng btrfs_record_root_in_trans(trans, wc.replay_dest); 3171e02119d5SChris Mason ret = walk_log_tree(trans, log, &wc); 3172e02119d5SChris Mason BUG_ON(ret); 3173e02119d5SChris Mason 3174e02119d5SChris Mason if (wc.stage == LOG_WALK_REPLAY_ALL) { 3175e02119d5SChris Mason ret = fixup_inode_link_counts(trans, wc.replay_dest, 3176e02119d5SChris Mason path); 3177e02119d5SChris Mason BUG_ON(ret); 3178e02119d5SChris Mason } 3179e02119d5SChris Mason 3180e02119d5SChris Mason key.offset = found_key.offset - 1; 318107d400a6SYan Zheng wc.replay_dest->log_root = NULL; 3182e02119d5SChris Mason free_extent_buffer(log->node); 3183b263c2c8SChris Mason free_extent_buffer(log->commit_root); 3184e02119d5SChris Mason kfree(log); 3185e02119d5SChris Mason 3186e02119d5SChris Mason if (found_key.offset == 0) 3187e02119d5SChris Mason break; 3188e02119d5SChris Mason } 3189e02119d5SChris Mason btrfs_release_path(log_root_tree, path); 3190e02119d5SChris Mason 3191e02119d5SChris Mason /* step one is to pin it all, step two is to replay just inodes */ 3192e02119d5SChris Mason if (wc.pin) { 3193e02119d5SChris Mason wc.pin = 0; 3194e02119d5SChris Mason wc.process_func = replay_one_buffer; 3195e02119d5SChris Mason wc.stage = LOG_WALK_REPLAY_INODES; 3196e02119d5SChris Mason goto again; 3197e02119d5SChris Mason } 3198e02119d5SChris Mason /* step three is to replay everything */ 3199e02119d5SChris Mason if (wc.stage < LOG_WALK_REPLAY_ALL) { 3200e02119d5SChris Mason wc.stage++; 3201e02119d5SChris Mason goto again; 3202e02119d5SChris Mason } 3203e02119d5SChris Mason 3204e02119d5SChris Mason btrfs_free_path(path); 3205e02119d5SChris Mason 3206e02119d5SChris Mason free_extent_buffer(log_root_tree->node); 3207e02119d5SChris Mason log_root_tree->log_root = NULL; 3208e02119d5SChris Mason fs_info->log_root_recovering = 0; 3209e02119d5SChris Mason 3210e02119d5SChris Mason /* step 4: commit the transaction, which also unpins the blocks */ 3211e02119d5SChris Mason btrfs_commit_transaction(trans, fs_info->tree_root); 3212e02119d5SChris Mason 3213e02119d5SChris Mason kfree(log_root_tree); 3214e02119d5SChris Mason return 0; 3215e02119d5SChris Mason } 321612fcfd22SChris Mason 321712fcfd22SChris Mason /* 321812fcfd22SChris Mason * there are some corner cases where we want to force a full 321912fcfd22SChris Mason * commit instead of allowing a directory to be logged. 322012fcfd22SChris Mason * 322112fcfd22SChris Mason * They revolve around files there were unlinked from the directory, and 322212fcfd22SChris Mason * this function updates the parent directory so that a full commit is 322312fcfd22SChris Mason * properly done if it is fsync'd later after the unlinks are done. 322412fcfd22SChris Mason */ 322512fcfd22SChris Mason void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 322612fcfd22SChris Mason struct inode *dir, struct inode *inode, 322712fcfd22SChris Mason int for_rename) 322812fcfd22SChris Mason { 322912fcfd22SChris Mason /* 3230af4176b4SChris Mason * when we're logging a file, if it hasn't been renamed 3231af4176b4SChris Mason * or unlinked, and its inode is fully committed on disk, 3232af4176b4SChris Mason * we don't have to worry about walking up the directory chain 3233af4176b4SChris Mason * to log its parents. 3234af4176b4SChris Mason * 3235af4176b4SChris Mason * So, we use the last_unlink_trans field to put this transid 3236af4176b4SChris Mason * into the file. When the file is logged we check it and 3237af4176b4SChris Mason * don't log the parents if the file is fully on disk. 3238af4176b4SChris Mason */ 3239af4176b4SChris Mason if (S_ISREG(inode->i_mode)) 3240af4176b4SChris Mason BTRFS_I(inode)->last_unlink_trans = trans->transid; 3241af4176b4SChris Mason 3242af4176b4SChris Mason /* 324312fcfd22SChris Mason * if this directory was already logged any new 324412fcfd22SChris Mason * names for this file/dir will get recorded 324512fcfd22SChris Mason */ 324612fcfd22SChris Mason smp_mb(); 324712fcfd22SChris Mason if (BTRFS_I(dir)->logged_trans == trans->transid) 324812fcfd22SChris Mason return; 324912fcfd22SChris Mason 325012fcfd22SChris Mason /* 325112fcfd22SChris Mason * if the inode we're about to unlink was logged, 325212fcfd22SChris Mason * the log will be properly updated for any new names 325312fcfd22SChris Mason */ 325412fcfd22SChris Mason if (BTRFS_I(inode)->logged_trans == trans->transid) 325512fcfd22SChris Mason return; 325612fcfd22SChris Mason 325712fcfd22SChris Mason /* 325812fcfd22SChris Mason * when renaming files across directories, if the directory 325912fcfd22SChris Mason * there we're unlinking from gets fsync'd later on, there's 326012fcfd22SChris Mason * no way to find the destination directory later and fsync it 326112fcfd22SChris Mason * properly. So, we have to be conservative and force commits 326212fcfd22SChris Mason * so the new name gets discovered. 326312fcfd22SChris Mason */ 326412fcfd22SChris Mason if (for_rename) 326512fcfd22SChris Mason goto record; 326612fcfd22SChris Mason 326712fcfd22SChris Mason /* we can safely do the unlink without any special recording */ 326812fcfd22SChris Mason return; 326912fcfd22SChris Mason 327012fcfd22SChris Mason record: 327112fcfd22SChris Mason BTRFS_I(dir)->last_unlink_trans = trans->transid; 327212fcfd22SChris Mason } 327312fcfd22SChris Mason 327412fcfd22SChris Mason /* 327512fcfd22SChris Mason * Call this after adding a new name for a file and it will properly 327612fcfd22SChris Mason * update the log to reflect the new name. 327712fcfd22SChris Mason * 327812fcfd22SChris Mason * It will return zero if all goes well, and it will return 1 if a 327912fcfd22SChris Mason * full transaction commit is required. 328012fcfd22SChris Mason */ 328112fcfd22SChris Mason int btrfs_log_new_name(struct btrfs_trans_handle *trans, 328212fcfd22SChris Mason struct inode *inode, struct inode *old_dir, 328312fcfd22SChris Mason struct dentry *parent) 328412fcfd22SChris Mason { 328512fcfd22SChris Mason struct btrfs_root * root = BTRFS_I(inode)->root; 328612fcfd22SChris Mason 328712fcfd22SChris Mason /* 3288af4176b4SChris Mason * this will force the logging code to walk the dentry chain 3289af4176b4SChris Mason * up for the file 3290af4176b4SChris Mason */ 3291af4176b4SChris Mason if (S_ISREG(inode->i_mode)) 3292af4176b4SChris Mason BTRFS_I(inode)->last_unlink_trans = trans->transid; 3293af4176b4SChris Mason 3294af4176b4SChris Mason /* 329512fcfd22SChris Mason * if this inode hasn't been logged and directory we're renaming it 329612fcfd22SChris Mason * from hasn't been logged, we don't need to log it 329712fcfd22SChris Mason */ 329812fcfd22SChris Mason if (BTRFS_I(inode)->logged_trans <= 329912fcfd22SChris Mason root->fs_info->last_trans_committed && 330012fcfd22SChris Mason (!old_dir || BTRFS_I(old_dir)->logged_trans <= 330112fcfd22SChris Mason root->fs_info->last_trans_committed)) 330212fcfd22SChris Mason return 0; 330312fcfd22SChris Mason 330412fcfd22SChris Mason return btrfs_log_inode_parent(trans, root, inode, parent, 1); 330512fcfd22SChris Mason } 330612fcfd22SChris Mason 3307