1e02119d5SChris Mason /* 2e02119d5SChris Mason * Copyright (C) 2008 Oracle. All rights reserved. 3e02119d5SChris Mason * 4e02119d5SChris Mason * This program is free software; you can redistribute it and/or 5e02119d5SChris Mason * modify it under the terms of the GNU General Public 6e02119d5SChris Mason * License v2 as published by the Free Software Foundation. 7e02119d5SChris Mason * 8e02119d5SChris Mason * This program is distributed in the hope that it will be useful, 9e02119d5SChris Mason * but WITHOUT ANY WARRANTY; without even the implied warranty of 10e02119d5SChris Mason * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11e02119d5SChris Mason * General Public License for more details. 12e02119d5SChris Mason * 13e02119d5SChris Mason * You should have received a copy of the GNU General Public 14e02119d5SChris Mason * License along with this program; if not, write to the 15e02119d5SChris Mason * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16e02119d5SChris Mason * Boston, MA 021110-1307, USA. 
17e02119d5SChris Mason */ 18e02119d5SChris Mason 19e02119d5SChris Mason #include <linux/sched.h> 205a0e3ad6STejun Heo #include <linux/slab.h> 21c6adc9ccSMiao Xie #include <linux/blkdev.h> 225dc562c5SJosef Bacik #include <linux/list_sort.h> 23995946ddSMiao Xie #include "tree-log.h" 24e02119d5SChris Mason #include "disk-io.h" 25e02119d5SChris Mason #include "locking.h" 26e02119d5SChris Mason #include "print-tree.h" 27f186373fSMark Fasheh #include "backref.h" 28f186373fSMark Fasheh #include "hash.h" 29ebb8765bSAnand Jain #include "compression.h" 30df2c95f3SQu Wenruo #include "qgroup.h" 31e02119d5SChris Mason 32e02119d5SChris Mason /* magic values for the inode_only field in btrfs_log_inode: 33e02119d5SChris Mason * 34e02119d5SChris Mason * LOG_INODE_ALL means to log everything 35e02119d5SChris Mason * LOG_INODE_EXISTS means to log just enough to recreate the inode 36e02119d5SChris Mason * during log replay 37e02119d5SChris Mason */ 38e02119d5SChris Mason #define LOG_INODE_ALL 0 39e02119d5SChris Mason #define LOG_INODE_EXISTS 1 40781feef7SLiu Bo #define LOG_OTHER_INODE 2 41e02119d5SChris Mason 42e02119d5SChris Mason /* 4312fcfd22SChris Mason * directory trouble cases 4412fcfd22SChris Mason * 4512fcfd22SChris Mason * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 4612fcfd22SChris Mason * log, we must force a full commit before doing an fsync of the directory 4712fcfd22SChris Mason * where the unlink was done. 4812fcfd22SChris Mason * ---> record transid of last unlink/rename per directory 4912fcfd22SChris Mason * 5012fcfd22SChris Mason * mkdir foo/some_dir 5112fcfd22SChris Mason * normal commit 5212fcfd22SChris Mason * rename foo/some_dir foo2/some_dir 5312fcfd22SChris Mason * mkdir foo/some_dir 5412fcfd22SChris Mason * fsync foo/some_dir/some_file 5512fcfd22SChris Mason * 5612fcfd22SChris Mason * The fsync above will unlink the original some_dir without recording 5712fcfd22SChris Mason * it in its new location (foo2). 
 * After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  With the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on f1/foo, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
90e02119d5SChris Mason * 91e02119d5SChris Mason * The last stage is to deal with directories and links and extents 92e02119d5SChris Mason * and all the other fun semantics 93e02119d5SChris Mason */ 94e02119d5SChris Mason #define LOG_WALK_PIN_ONLY 0 95e02119d5SChris Mason #define LOG_WALK_REPLAY_INODES 1 96dd8e7217SJosef Bacik #define LOG_WALK_REPLAY_DIR_INDEX 2 97dd8e7217SJosef Bacik #define LOG_WALK_REPLAY_ALL 3 98e02119d5SChris Mason 9912fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans, 100e02119d5SChris Mason struct btrfs_root *root, struct inode *inode, 10149dae1bcSFilipe Manana int inode_only, 10249dae1bcSFilipe Manana const loff_t start, 1038407f553SFilipe Manana const loff_t end, 1048407f553SFilipe Manana struct btrfs_log_ctx *ctx); 105ec051c0fSYan Zheng static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 106ec051c0fSYan Zheng struct btrfs_root *root, 107ec051c0fSYan Zheng struct btrfs_path *path, u64 objectid); 10812fcfd22SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 10912fcfd22SChris Mason struct btrfs_root *root, 11012fcfd22SChris Mason struct btrfs_root *log, 11112fcfd22SChris Mason struct btrfs_path *path, 11212fcfd22SChris Mason u64 dirid, int del_all); 113e02119d5SChris Mason 114e02119d5SChris Mason /* 115e02119d5SChris Mason * tree logging is a special write ahead log used to make sure that 116e02119d5SChris Mason * fsyncs and O_SYNCs can happen without doing full tree commits. 117e02119d5SChris Mason * 118e02119d5SChris Mason * Full tree commits are expensive because they require commonly 119e02119d5SChris Mason * modified blocks to be recowed, creating many dirty pages in the 120e02119d5SChris Mason * extent tree an 4x-6x higher write load than ext3. 
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
135e02119d5SChris Mason */ 136e02119d5SChris Mason 137e02119d5SChris Mason /* 138e02119d5SChris Mason * start a sub transaction and setup the log tree 139e02119d5SChris Mason * this increments the log tree writer count to make the people 140e02119d5SChris Mason * syncing the tree wait for us to finish 141e02119d5SChris Mason */ 142e02119d5SChris Mason static int start_log_trans(struct btrfs_trans_handle *trans, 1438b050d35SMiao Xie struct btrfs_root *root, 1448b050d35SMiao Xie struct btrfs_log_ctx *ctx) 145e02119d5SChris Mason { 1460b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 14734eb2a52SZhaolei int ret = 0; 1487237f183SYan Zheng 1497237f183SYan Zheng mutex_lock(&root->log_mutex); 15034eb2a52SZhaolei 1517237f183SYan Zheng if (root->log_root) { 1520b246afaSJeff Mahoney if (btrfs_need_log_full_commit(fs_info, trans)) { 15350471a38SMiao Xie ret = -EAGAIN; 15450471a38SMiao Xie goto out; 15550471a38SMiao Xie } 15634eb2a52SZhaolei 157ff782e0aSJosef Bacik if (!root->log_start_pid) { 15827cdeb70SMiao Xie clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 15934eb2a52SZhaolei root->log_start_pid = current->pid; 160ff782e0aSJosef Bacik } else if (root->log_start_pid != current->pid) { 16127cdeb70SMiao Xie set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 162ff782e0aSJosef Bacik } 16334eb2a52SZhaolei } else { 1640b246afaSJeff Mahoney mutex_lock(&fs_info->tree_log_mutex); 1650b246afaSJeff Mahoney if (!fs_info->log_root_tree) 1660b246afaSJeff Mahoney ret = btrfs_init_log_root_tree(trans, fs_info); 1670b246afaSJeff Mahoney mutex_unlock(&fs_info->tree_log_mutex); 1684a500fd1SYan, Zheng if (ret) 169e87ac136SMiao Xie goto out; 170e87ac136SMiao Xie 171e02119d5SChris Mason ret = btrfs_add_log_tree(trans, root); 1724a500fd1SYan, Zheng if (ret) 173e87ac136SMiao Xie goto out; 17434eb2a52SZhaolei 17527cdeb70SMiao Xie clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 176e87ac136SMiao Xie root->log_start_pid = current->pid; 17734eb2a52SZhaolei } 
17834eb2a52SZhaolei 1792ecb7923SMiao Xie atomic_inc(&root->log_batch); 1807237f183SYan Zheng atomic_inc(&root->log_writers); 1818b050d35SMiao Xie if (ctx) { 18234eb2a52SZhaolei int index = root->log_transid % 2; 1838b050d35SMiao Xie list_add_tail(&ctx->list, &root->log_ctxs[index]); 184d1433debSMiao Xie ctx->log_transid = root->log_transid; 1858b050d35SMiao Xie } 18634eb2a52SZhaolei 187e87ac136SMiao Xie out: 1887237f183SYan Zheng mutex_unlock(&root->log_mutex); 189e87ac136SMiao Xie return ret; 190e02119d5SChris Mason } 191e02119d5SChris Mason 192e02119d5SChris Mason /* 193e02119d5SChris Mason * returns 0 if there was a log transaction running and we were able 194e02119d5SChris Mason * to join, or returns -ENOENT if there were not transactions 195e02119d5SChris Mason * in progress 196e02119d5SChris Mason */ 197e02119d5SChris Mason static int join_running_log_trans(struct btrfs_root *root) 198e02119d5SChris Mason { 199e02119d5SChris Mason int ret = -ENOENT; 200e02119d5SChris Mason 201e02119d5SChris Mason smp_mb(); 202e02119d5SChris Mason if (!root->log_root) 203e02119d5SChris Mason return -ENOENT; 204e02119d5SChris Mason 2057237f183SYan Zheng mutex_lock(&root->log_mutex); 206e02119d5SChris Mason if (root->log_root) { 207e02119d5SChris Mason ret = 0; 2087237f183SYan Zheng atomic_inc(&root->log_writers); 209e02119d5SChris Mason } 2107237f183SYan Zheng mutex_unlock(&root->log_mutex); 211e02119d5SChris Mason return ret; 212e02119d5SChris Mason } 213e02119d5SChris Mason 214e02119d5SChris Mason /* 21512fcfd22SChris Mason * This either makes the current running log transaction wait 21612fcfd22SChris Mason * until you call btrfs_end_log_trans() or it makes any future 21712fcfd22SChris Mason * log transactions wait until you call btrfs_end_log_trans() 21812fcfd22SChris Mason */ 21912fcfd22SChris Mason int btrfs_pin_log_trans(struct btrfs_root *root) 22012fcfd22SChris Mason { 22112fcfd22SChris Mason int ret = -ENOENT; 22212fcfd22SChris Mason 22312fcfd22SChris Mason 
mutex_lock(&root->log_mutex); 22412fcfd22SChris Mason atomic_inc(&root->log_writers); 22512fcfd22SChris Mason mutex_unlock(&root->log_mutex); 22612fcfd22SChris Mason return ret; 22712fcfd22SChris Mason } 22812fcfd22SChris Mason 22912fcfd22SChris Mason /* 230e02119d5SChris Mason * indicate we're done making changes to the log tree 231e02119d5SChris Mason * and wake up anyone waiting to do a sync 232e02119d5SChris Mason */ 233143bede5SJeff Mahoney void btrfs_end_log_trans(struct btrfs_root *root) 234e02119d5SChris Mason { 2357237f183SYan Zheng if (atomic_dec_and_test(&root->log_writers)) { 236779adf0fSDavid Sterba /* 237779adf0fSDavid Sterba * Implicit memory barrier after atomic_dec_and_test 238779adf0fSDavid Sterba */ 2397237f183SYan Zheng if (waitqueue_active(&root->log_writer_wait)) 2407237f183SYan Zheng wake_up(&root->log_writer_wait); 2417237f183SYan Zheng } 242e02119d5SChris Mason } 243e02119d5SChris Mason 244e02119d5SChris Mason 245e02119d5SChris Mason /* 246e02119d5SChris Mason * the walk control struct is used to pass state down the chain when 247e02119d5SChris Mason * processing the log tree. The stage field tells us which part 248e02119d5SChris Mason * of the log tree processing we are currently doing. The others 249e02119d5SChris Mason * are state fields used for that specific part 250e02119d5SChris Mason */ 251e02119d5SChris Mason struct walk_control { 252e02119d5SChris Mason /* should we free the extent on disk when done? This is used 253e02119d5SChris Mason * at transaction commit time while freeing a log tree 254e02119d5SChris Mason */ 255e02119d5SChris Mason int free; 256e02119d5SChris Mason 257e02119d5SChris Mason /* should we write out the extent buffer? This is used 258e02119d5SChris Mason * while flushing the log tree to disk during a sync 259e02119d5SChris Mason */ 260e02119d5SChris Mason int write; 261e02119d5SChris Mason 262e02119d5SChris Mason /* should we wait for the extent buffer io to finish? 
Also used 263e02119d5SChris Mason * while flushing the log tree to disk for a sync 264e02119d5SChris Mason */ 265e02119d5SChris Mason int wait; 266e02119d5SChris Mason 267e02119d5SChris Mason /* pin only walk, we record which extents on disk belong to the 268e02119d5SChris Mason * log trees 269e02119d5SChris Mason */ 270e02119d5SChris Mason int pin; 271e02119d5SChris Mason 272e02119d5SChris Mason /* what stage of the replay code we're currently in */ 273e02119d5SChris Mason int stage; 274e02119d5SChris Mason 275e02119d5SChris Mason /* the root we are currently replaying */ 276e02119d5SChris Mason struct btrfs_root *replay_dest; 277e02119d5SChris Mason 278e02119d5SChris Mason /* the trans handle for the current replay */ 279e02119d5SChris Mason struct btrfs_trans_handle *trans; 280e02119d5SChris Mason 281e02119d5SChris Mason /* the function that gets used to process blocks we find in the 282e02119d5SChris Mason * tree. Note the extent_buffer might not be up to date when it is 283e02119d5SChris Mason * passed in, and it must be checked or read if you need the data 284e02119d5SChris Mason * inside it 285e02119d5SChris Mason */ 286e02119d5SChris Mason int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 287e02119d5SChris Mason struct walk_control *wc, u64 gen); 288e02119d5SChris Mason }; 289e02119d5SChris Mason 290e02119d5SChris Mason /* 291e02119d5SChris Mason * process_func used to pin down extents, write them or wait on them 292e02119d5SChris Mason */ 293e02119d5SChris Mason static int process_one_buffer(struct btrfs_root *log, 294e02119d5SChris Mason struct extent_buffer *eb, 295e02119d5SChris Mason struct walk_control *wc, u64 gen) 296e02119d5SChris Mason { 2970b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = log->fs_info; 298b50c6e25SJosef Bacik int ret = 0; 299b50c6e25SJosef Bacik 3008c2a1a30SJosef Bacik /* 3018c2a1a30SJosef Bacik * If this fs is mixed then we need to be able to process the leaves to 3028c2a1a30SJosef Bacik * pin down any 
logged extents, so we have to read the block. 3038c2a1a30SJosef Bacik */ 3040b246afaSJeff Mahoney if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 3058c2a1a30SJosef Bacik ret = btrfs_read_buffer(eb, gen); 3068c2a1a30SJosef Bacik if (ret) 3078c2a1a30SJosef Bacik return ret; 3088c2a1a30SJosef Bacik } 3098c2a1a30SJosef Bacik 31004018de5SJosef Bacik if (wc->pin) 3112ff7e61eSJeff Mahoney ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start, 3122ff7e61eSJeff Mahoney eb->len); 313e02119d5SChris Mason 314b50c6e25SJosef Bacik if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 3158c2a1a30SJosef Bacik if (wc->pin && btrfs_header_level(eb) == 0) 3162ff7e61eSJeff Mahoney ret = btrfs_exclude_logged_extents(fs_info, eb); 317e02119d5SChris Mason if (wc->write) 318e02119d5SChris Mason btrfs_write_tree_block(eb); 319e02119d5SChris Mason if (wc->wait) 320e02119d5SChris Mason btrfs_wait_tree_block_writeback(eb); 321e02119d5SChris Mason } 322b50c6e25SJosef Bacik return ret; 323e02119d5SChris Mason } 324e02119d5SChris Mason 325e02119d5SChris Mason /* 326e02119d5SChris Mason * Item overwrite used by replay and tree logging. eb, slot and key all refer 327e02119d5SChris Mason * to the src data we are copying out. 328e02119d5SChris Mason * 329e02119d5SChris Mason * root is the tree we are copying into, and path is a scratch 330e02119d5SChris Mason * path for use in this function (it should be released on entry and 331e02119d5SChris Mason * will be released on exit). 332e02119d5SChris Mason * 333e02119d5SChris Mason * If the key is already in the destination tree the existing item is 334e02119d5SChris Mason * overwritten. If the existing item isn't big enough, it is extended. 335e02119d5SChris Mason * If it is too large, it is truncated. 336e02119d5SChris Mason * 337e02119d5SChris Mason * If the key isn't in the destination yet, a new item is inserted. 
338e02119d5SChris Mason */ 339e02119d5SChris Mason static noinline int overwrite_item(struct btrfs_trans_handle *trans, 340e02119d5SChris Mason struct btrfs_root *root, 341e02119d5SChris Mason struct btrfs_path *path, 342e02119d5SChris Mason struct extent_buffer *eb, int slot, 343e02119d5SChris Mason struct btrfs_key *key) 344e02119d5SChris Mason { 3452ff7e61eSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 346e02119d5SChris Mason int ret; 347e02119d5SChris Mason u32 item_size; 348e02119d5SChris Mason u64 saved_i_size = 0; 349e02119d5SChris Mason int save_old_i_size = 0; 350e02119d5SChris Mason unsigned long src_ptr; 351e02119d5SChris Mason unsigned long dst_ptr; 352e02119d5SChris Mason int overwrite_root = 0; 3534bc4bee4SJosef Bacik bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 354e02119d5SChris Mason 355e02119d5SChris Mason if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 356e02119d5SChris Mason overwrite_root = 1; 357e02119d5SChris Mason 358e02119d5SChris Mason item_size = btrfs_item_size_nr(eb, slot); 359e02119d5SChris Mason src_ptr = btrfs_item_ptr_offset(eb, slot); 360e02119d5SChris Mason 361e02119d5SChris Mason /* look for the key in the destination tree */ 362e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 3634bc4bee4SJosef Bacik if (ret < 0) 3644bc4bee4SJosef Bacik return ret; 3654bc4bee4SJosef Bacik 366e02119d5SChris Mason if (ret == 0) { 367e02119d5SChris Mason char *src_copy; 368e02119d5SChris Mason char *dst_copy; 369e02119d5SChris Mason u32 dst_size = btrfs_item_size_nr(path->nodes[0], 370e02119d5SChris Mason path->slots[0]); 371e02119d5SChris Mason if (dst_size != item_size) 372e02119d5SChris Mason goto insert; 373e02119d5SChris Mason 374e02119d5SChris Mason if (item_size == 0) { 375b3b4aa74SDavid Sterba btrfs_release_path(path); 376e02119d5SChris Mason return 0; 377e02119d5SChris Mason } 378e02119d5SChris Mason dst_copy = kmalloc(item_size, GFP_NOFS); 379e02119d5SChris Mason src_copy = 
kmalloc(item_size, GFP_NOFS); 3802a29edc6Sliubo if (!dst_copy || !src_copy) { 381b3b4aa74SDavid Sterba btrfs_release_path(path); 3822a29edc6Sliubo kfree(dst_copy); 3832a29edc6Sliubo kfree(src_copy); 3842a29edc6Sliubo return -ENOMEM; 3852a29edc6Sliubo } 386e02119d5SChris Mason 387e02119d5SChris Mason read_extent_buffer(eb, src_copy, src_ptr, item_size); 388e02119d5SChris Mason 389e02119d5SChris Mason dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 390e02119d5SChris Mason read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 391e02119d5SChris Mason item_size); 392e02119d5SChris Mason ret = memcmp(dst_copy, src_copy, item_size); 393e02119d5SChris Mason 394e02119d5SChris Mason kfree(dst_copy); 395e02119d5SChris Mason kfree(src_copy); 396e02119d5SChris Mason /* 397e02119d5SChris Mason * they have the same contents, just return, this saves 398e02119d5SChris Mason * us from cowing blocks in the destination tree and doing 399e02119d5SChris Mason * extra writes that may not have been done by a previous 400e02119d5SChris Mason * sync 401e02119d5SChris Mason */ 402e02119d5SChris Mason if (ret == 0) { 403b3b4aa74SDavid Sterba btrfs_release_path(path); 404e02119d5SChris Mason return 0; 405e02119d5SChris Mason } 406e02119d5SChris Mason 4074bc4bee4SJosef Bacik /* 4084bc4bee4SJosef Bacik * We need to load the old nbytes into the inode so when we 4094bc4bee4SJosef Bacik * replay the extents we've logged we get the right nbytes. 
4104bc4bee4SJosef Bacik */ 4114bc4bee4SJosef Bacik if (inode_item) { 4124bc4bee4SJosef Bacik struct btrfs_inode_item *item; 4134bc4bee4SJosef Bacik u64 nbytes; 414d555438bSJosef Bacik u32 mode; 4154bc4bee4SJosef Bacik 4164bc4bee4SJosef Bacik item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4174bc4bee4SJosef Bacik struct btrfs_inode_item); 4184bc4bee4SJosef Bacik nbytes = btrfs_inode_nbytes(path->nodes[0], item); 4194bc4bee4SJosef Bacik item = btrfs_item_ptr(eb, slot, 4204bc4bee4SJosef Bacik struct btrfs_inode_item); 4214bc4bee4SJosef Bacik btrfs_set_inode_nbytes(eb, item, nbytes); 422d555438bSJosef Bacik 423d555438bSJosef Bacik /* 424d555438bSJosef Bacik * If this is a directory we need to reset the i_size to 425d555438bSJosef Bacik * 0 so that we can set it up properly when replaying 426d555438bSJosef Bacik * the rest of the items in this log. 427d555438bSJosef Bacik */ 428d555438bSJosef Bacik mode = btrfs_inode_mode(eb, item); 429d555438bSJosef Bacik if (S_ISDIR(mode)) 430d555438bSJosef Bacik btrfs_set_inode_size(eb, item, 0); 4314bc4bee4SJosef Bacik } 4324bc4bee4SJosef Bacik } else if (inode_item) { 4334bc4bee4SJosef Bacik struct btrfs_inode_item *item; 434d555438bSJosef Bacik u32 mode; 4354bc4bee4SJosef Bacik 4364bc4bee4SJosef Bacik /* 4374bc4bee4SJosef Bacik * New inode, set nbytes to 0 so that the nbytes comes out 4384bc4bee4SJosef Bacik * properly when we replay the extents. 4394bc4bee4SJosef Bacik */ 4404bc4bee4SJosef Bacik item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 4414bc4bee4SJosef Bacik btrfs_set_inode_nbytes(eb, item, 0); 442d555438bSJosef Bacik 443d555438bSJosef Bacik /* 444d555438bSJosef Bacik * If this is a directory we need to reset the i_size to 0 so 445d555438bSJosef Bacik * that we can set it up properly when replaying the rest of 446d555438bSJosef Bacik * the items in this log. 
447d555438bSJosef Bacik */ 448d555438bSJosef Bacik mode = btrfs_inode_mode(eb, item); 449d555438bSJosef Bacik if (S_ISDIR(mode)) 450d555438bSJosef Bacik btrfs_set_inode_size(eb, item, 0); 451e02119d5SChris Mason } 452e02119d5SChris Mason insert: 453b3b4aa74SDavid Sterba btrfs_release_path(path); 454e02119d5SChris Mason /* try to insert the key into the destination tree */ 455df8d116fSFilipe Manana path->skip_release_on_error = 1; 456e02119d5SChris Mason ret = btrfs_insert_empty_item(trans, root, path, 457e02119d5SChris Mason key, item_size); 458df8d116fSFilipe Manana path->skip_release_on_error = 0; 459e02119d5SChris Mason 460e02119d5SChris Mason /* make sure any existing item is the correct size */ 461df8d116fSFilipe Manana if (ret == -EEXIST || ret == -EOVERFLOW) { 462e02119d5SChris Mason u32 found_size; 463e02119d5SChris Mason found_size = btrfs_item_size_nr(path->nodes[0], 464e02119d5SChris Mason path->slots[0]); 465143bede5SJeff Mahoney if (found_size > item_size) 4662ff7e61eSJeff Mahoney btrfs_truncate_item(fs_info, path, item_size, 1); 467143bede5SJeff Mahoney else if (found_size < item_size) 4682ff7e61eSJeff Mahoney btrfs_extend_item(fs_info, path, 46987b29b20SYan Zheng item_size - found_size); 470e02119d5SChris Mason } else if (ret) { 4714a500fd1SYan, Zheng return ret; 472e02119d5SChris Mason } 473e02119d5SChris Mason dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 474e02119d5SChris Mason path->slots[0]); 475e02119d5SChris Mason 476e02119d5SChris Mason /* don't overwrite an existing inode if the generation number 477e02119d5SChris Mason * was logged as zero. This is done when the tree logging code 478e02119d5SChris Mason * is just logging an inode to make sure it exists after recovery. 479e02119d5SChris Mason * 480e02119d5SChris Mason * Also, don't overwrite i_size on directories during replay. 
481e02119d5SChris Mason * log replay inserts and removes directory items based on the 482e02119d5SChris Mason * state of the tree found in the subvolume, and i_size is modified 483e02119d5SChris Mason * as it goes 484e02119d5SChris Mason */ 485e02119d5SChris Mason if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 486e02119d5SChris Mason struct btrfs_inode_item *src_item; 487e02119d5SChris Mason struct btrfs_inode_item *dst_item; 488e02119d5SChris Mason 489e02119d5SChris Mason src_item = (struct btrfs_inode_item *)src_ptr; 490e02119d5SChris Mason dst_item = (struct btrfs_inode_item *)dst_ptr; 491e02119d5SChris Mason 4921a4bcf47SFilipe Manana if (btrfs_inode_generation(eb, src_item) == 0) { 4931a4bcf47SFilipe Manana struct extent_buffer *dst_eb = path->nodes[0]; 4942f2ff0eeSFilipe Manana const u64 ino_size = btrfs_inode_size(eb, src_item); 4951a4bcf47SFilipe Manana 4962f2ff0eeSFilipe Manana /* 4972f2ff0eeSFilipe Manana * For regular files an ino_size == 0 is used only when 4982f2ff0eeSFilipe Manana * logging that an inode exists, as part of a directory 4992f2ff0eeSFilipe Manana * fsync, and the inode wasn't fsynced before. In this 5002f2ff0eeSFilipe Manana * case don't set the size of the inode in the fs/subvol 5012f2ff0eeSFilipe Manana * tree, otherwise we would be throwing valid data away. 
5022f2ff0eeSFilipe Manana */ 5031a4bcf47SFilipe Manana if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 5042f2ff0eeSFilipe Manana S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 5052f2ff0eeSFilipe Manana ino_size != 0) { 5061a4bcf47SFilipe Manana struct btrfs_map_token token; 5071a4bcf47SFilipe Manana 5081a4bcf47SFilipe Manana btrfs_init_map_token(&token); 5091a4bcf47SFilipe Manana btrfs_set_token_inode_size(dst_eb, dst_item, 5101a4bcf47SFilipe Manana ino_size, &token); 5111a4bcf47SFilipe Manana } 512e02119d5SChris Mason goto no_copy; 5131a4bcf47SFilipe Manana } 514e02119d5SChris Mason 515e02119d5SChris Mason if (overwrite_root && 516e02119d5SChris Mason S_ISDIR(btrfs_inode_mode(eb, src_item)) && 517e02119d5SChris Mason S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 518e02119d5SChris Mason save_old_i_size = 1; 519e02119d5SChris Mason saved_i_size = btrfs_inode_size(path->nodes[0], 520e02119d5SChris Mason dst_item); 521e02119d5SChris Mason } 522e02119d5SChris Mason } 523e02119d5SChris Mason 524e02119d5SChris Mason copy_extent_buffer(path->nodes[0], eb, dst_ptr, 525e02119d5SChris Mason src_ptr, item_size); 526e02119d5SChris Mason 527e02119d5SChris Mason if (save_old_i_size) { 528e02119d5SChris Mason struct btrfs_inode_item *dst_item; 529e02119d5SChris Mason dst_item = (struct btrfs_inode_item *)dst_ptr; 530e02119d5SChris Mason btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 531e02119d5SChris Mason } 532e02119d5SChris Mason 533e02119d5SChris Mason /* make sure the generation is filled in */ 534e02119d5SChris Mason if (key->type == BTRFS_INODE_ITEM_KEY) { 535e02119d5SChris Mason struct btrfs_inode_item *dst_item; 536e02119d5SChris Mason dst_item = (struct btrfs_inode_item *)dst_ptr; 537e02119d5SChris Mason if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 538e02119d5SChris Mason btrfs_set_inode_generation(path->nodes[0], dst_item, 539e02119d5SChris Mason trans->transid); 540e02119d5SChris Mason } 541e02119d5SChris Mason } 542e02119d5SChris 
Mason no_copy: 543e02119d5SChris Mason btrfs_mark_buffer_dirty(path->nodes[0]); 544b3b4aa74SDavid Sterba btrfs_release_path(path); 545e02119d5SChris Mason return 0; 546e02119d5SChris Mason } 547e02119d5SChris Mason 548e02119d5SChris Mason /* 549e02119d5SChris Mason * simple helper to read an inode off the disk from a given root 550e02119d5SChris Mason * This can only be called for subvolume roots and not for the log 551e02119d5SChris Mason */ 552e02119d5SChris Mason static noinline struct inode *read_one_inode(struct btrfs_root *root, 553e02119d5SChris Mason u64 objectid) 554e02119d5SChris Mason { 5555d4f98a2SYan Zheng struct btrfs_key key; 556e02119d5SChris Mason struct inode *inode; 557e02119d5SChris Mason 5585d4f98a2SYan Zheng key.objectid = objectid; 5595d4f98a2SYan Zheng key.type = BTRFS_INODE_ITEM_KEY; 5605d4f98a2SYan Zheng key.offset = 0; 56173f73415SJosef Bacik inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); 5625d4f98a2SYan Zheng if (IS_ERR(inode)) { 5635d4f98a2SYan Zheng inode = NULL; 5645d4f98a2SYan Zheng } else if (is_bad_inode(inode)) { 565e02119d5SChris Mason iput(inode); 566e02119d5SChris Mason inode = NULL; 567e02119d5SChris Mason } 568e02119d5SChris Mason return inode; 569e02119d5SChris Mason } 570e02119d5SChris Mason 571e02119d5SChris Mason /* replays a single extent in 'eb' at 'slot' with 'key' into the 572e02119d5SChris Mason * subvolume 'root'. path is released on entry and should be released 573e02119d5SChris Mason * on exit. 574e02119d5SChris Mason * 575e02119d5SChris Mason * extents in the log tree have not been allocated out of the extent 576e02119d5SChris Mason * tree yet. So, this completes the allocation, taking a reference 577e02119d5SChris Mason * as required if the extent already exists or creating a new extent 578e02119d5SChris Mason * if it isn't in the extent allocation tree yet. 
579e02119d5SChris Mason * 580e02119d5SChris Mason * The extent is inserted into the file, dropping any existing extents 581e02119d5SChris Mason * from the file that overlap the new one. 582e02119d5SChris Mason */ 583e02119d5SChris Mason static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 584e02119d5SChris Mason struct btrfs_root *root, 585e02119d5SChris Mason struct btrfs_path *path, 586e02119d5SChris Mason struct extent_buffer *eb, int slot, 587e02119d5SChris Mason struct btrfs_key *key) 588e02119d5SChris Mason { 5890b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 590e02119d5SChris Mason int found_type; 591e02119d5SChris Mason u64 extent_end; 592e02119d5SChris Mason u64 start = key->offset; 5934bc4bee4SJosef Bacik u64 nbytes = 0; 594e02119d5SChris Mason struct btrfs_file_extent_item *item; 595e02119d5SChris Mason struct inode *inode = NULL; 596e02119d5SChris Mason unsigned long size; 597e02119d5SChris Mason int ret = 0; 598e02119d5SChris Mason 599e02119d5SChris Mason item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 600e02119d5SChris Mason found_type = btrfs_file_extent_type(eb, item); 601e02119d5SChris Mason 602d899e052SYan Zheng if (found_type == BTRFS_FILE_EXTENT_REG || 6034bc4bee4SJosef Bacik found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6044bc4bee4SJosef Bacik nbytes = btrfs_file_extent_num_bytes(eb, item); 6054bc4bee4SJosef Bacik extent_end = start + nbytes; 6064bc4bee4SJosef Bacik 6074bc4bee4SJosef Bacik /* 6084bc4bee4SJosef Bacik * We don't add to the inodes nbytes if we are prealloc or a 6094bc4bee4SJosef Bacik * hole. 
6104bc4bee4SJosef Bacik */ 6114bc4bee4SJosef Bacik if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 6124bc4bee4SJosef Bacik nbytes = 0; 6134bc4bee4SJosef Bacik } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 614514ac8adSChris Mason size = btrfs_file_extent_inline_len(eb, slot, item); 6154bc4bee4SJosef Bacik nbytes = btrfs_file_extent_ram_bytes(eb, item); 616da17066cSJeff Mahoney extent_end = ALIGN(start + size, 6170b246afaSJeff Mahoney fs_info->sectorsize); 618e02119d5SChris Mason } else { 619e02119d5SChris Mason ret = 0; 620e02119d5SChris Mason goto out; 621e02119d5SChris Mason } 622e02119d5SChris Mason 623e02119d5SChris Mason inode = read_one_inode(root, key->objectid); 624e02119d5SChris Mason if (!inode) { 625e02119d5SChris Mason ret = -EIO; 626e02119d5SChris Mason goto out; 627e02119d5SChris Mason } 628e02119d5SChris Mason 629e02119d5SChris Mason /* 630e02119d5SChris Mason * first check to see if we already have this extent in the 631e02119d5SChris Mason * file. This must be done before the btrfs_drop_extents run 632e02119d5SChris Mason * so we don't try to drop this extent. 
633e02119d5SChris Mason */ 6344a0cc7caSNikolay Borisov ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(BTRFS_I(inode)), 635e02119d5SChris Mason start, 0); 636e02119d5SChris Mason 637d899e052SYan Zheng if (ret == 0 && 638d899e052SYan Zheng (found_type == BTRFS_FILE_EXTENT_REG || 639d899e052SYan Zheng found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 640e02119d5SChris Mason struct btrfs_file_extent_item cmp1; 641e02119d5SChris Mason struct btrfs_file_extent_item cmp2; 642e02119d5SChris Mason struct btrfs_file_extent_item *existing; 643e02119d5SChris Mason struct extent_buffer *leaf; 644e02119d5SChris Mason 645e02119d5SChris Mason leaf = path->nodes[0]; 646e02119d5SChris Mason existing = btrfs_item_ptr(leaf, path->slots[0], 647e02119d5SChris Mason struct btrfs_file_extent_item); 648e02119d5SChris Mason 649e02119d5SChris Mason read_extent_buffer(eb, &cmp1, (unsigned long)item, 650e02119d5SChris Mason sizeof(cmp1)); 651e02119d5SChris Mason read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 652e02119d5SChris Mason sizeof(cmp2)); 653e02119d5SChris Mason 654e02119d5SChris Mason /* 655e02119d5SChris Mason * we already have a pointer to this exact extent, 656e02119d5SChris Mason * we don't have to do anything 657e02119d5SChris Mason */ 658e02119d5SChris Mason if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 659b3b4aa74SDavid Sterba btrfs_release_path(path); 660e02119d5SChris Mason goto out; 661e02119d5SChris Mason } 662e02119d5SChris Mason } 663b3b4aa74SDavid Sterba btrfs_release_path(path); 664e02119d5SChris Mason 665e02119d5SChris Mason /* drop any overlapping extents */ 6662671485dSJosef Bacik ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); 6673650860bSJosef Bacik if (ret) 6683650860bSJosef Bacik goto out; 669e02119d5SChris Mason 67007d400a6SYan Zheng if (found_type == BTRFS_FILE_EXTENT_REG || 67107d400a6SYan Zheng found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6725d4f98a2SYan Zheng u64 offset; 67307d400a6SYan Zheng unsigned long dest_offset; 
67407d400a6SYan Zheng struct btrfs_key ins; 67507d400a6SYan Zheng 67607d400a6SYan Zheng ret = btrfs_insert_empty_item(trans, root, path, key, 67707d400a6SYan Zheng sizeof(*item)); 6783650860bSJosef Bacik if (ret) 6793650860bSJosef Bacik goto out; 68007d400a6SYan Zheng dest_offset = btrfs_item_ptr_offset(path->nodes[0], 68107d400a6SYan Zheng path->slots[0]); 68207d400a6SYan Zheng copy_extent_buffer(path->nodes[0], eb, dest_offset, 68307d400a6SYan Zheng (unsigned long)item, sizeof(*item)); 68407d400a6SYan Zheng 68507d400a6SYan Zheng ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 68607d400a6SYan Zheng ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 68707d400a6SYan Zheng ins.type = BTRFS_EXTENT_ITEM_KEY; 6885d4f98a2SYan Zheng offset = key->offset - btrfs_file_extent_offset(eb, item); 68907d400a6SYan Zheng 690df2c95f3SQu Wenruo /* 691df2c95f3SQu Wenruo * Manually record dirty extent, as here we did a shallow 692df2c95f3SQu Wenruo * file extent item copy and skip normal backref update, 693df2c95f3SQu Wenruo * but modifying extent tree all by ourselves. 694df2c95f3SQu Wenruo * So need to manually record dirty extent for qgroup, 695df2c95f3SQu Wenruo * as the owner of the file extent changed from log tree 696df2c95f3SQu Wenruo * (doesn't affect qgroup) to fs/file tree(affects qgroup) 697df2c95f3SQu Wenruo */ 6980b246afaSJeff Mahoney ret = btrfs_qgroup_trace_extent(trans, fs_info, 699df2c95f3SQu Wenruo btrfs_file_extent_disk_bytenr(eb, item), 700df2c95f3SQu Wenruo btrfs_file_extent_disk_num_bytes(eb, item), 701df2c95f3SQu Wenruo GFP_NOFS); 702df2c95f3SQu Wenruo if (ret < 0) 703df2c95f3SQu Wenruo goto out; 704df2c95f3SQu Wenruo 70507d400a6SYan Zheng if (ins.objectid > 0) { 70607d400a6SYan Zheng u64 csum_start; 70707d400a6SYan Zheng u64 csum_end; 70807d400a6SYan Zheng LIST_HEAD(ordered_sums); 70907d400a6SYan Zheng /* 71007d400a6SYan Zheng * is this extent already allocated in the extent 71107d400a6SYan Zheng * allocation tree? 
If so, just add a reference 71207d400a6SYan Zheng */ 7132ff7e61eSJeff Mahoney ret = btrfs_lookup_data_extent(fs_info, ins.objectid, 71407d400a6SYan Zheng ins.offset); 71507d400a6SYan Zheng if (ret == 0) { 7162ff7e61eSJeff Mahoney ret = btrfs_inc_extent_ref(trans, fs_info, 71707d400a6SYan Zheng ins.objectid, ins.offset, 7185d4f98a2SYan Zheng 0, root->root_key.objectid, 719b06c4bf5SFilipe Manana key->objectid, offset); 720b50c6e25SJosef Bacik if (ret) 721b50c6e25SJosef Bacik goto out; 72207d400a6SYan Zheng } else { 72307d400a6SYan Zheng /* 72407d400a6SYan Zheng * insert the extent pointer in the extent 72507d400a6SYan Zheng * allocation tree 72607d400a6SYan Zheng */ 7275d4f98a2SYan Zheng ret = btrfs_alloc_logged_file_extent(trans, 7282ff7e61eSJeff Mahoney fs_info, 7292ff7e61eSJeff Mahoney root->root_key.objectid, 7305d4f98a2SYan Zheng key->objectid, offset, &ins); 731b50c6e25SJosef Bacik if (ret) 732b50c6e25SJosef Bacik goto out; 73307d400a6SYan Zheng } 734b3b4aa74SDavid Sterba btrfs_release_path(path); 73507d400a6SYan Zheng 73607d400a6SYan Zheng if (btrfs_file_extent_compression(eb, item)) { 73707d400a6SYan Zheng csum_start = ins.objectid; 73807d400a6SYan Zheng csum_end = csum_start + ins.offset; 73907d400a6SYan Zheng } else { 74007d400a6SYan Zheng csum_start = ins.objectid + 74107d400a6SYan Zheng btrfs_file_extent_offset(eb, item); 74207d400a6SYan Zheng csum_end = csum_start + 74307d400a6SYan Zheng btrfs_file_extent_num_bytes(eb, item); 74407d400a6SYan Zheng } 74507d400a6SYan Zheng 74607d400a6SYan Zheng ret = btrfs_lookup_csums_range(root->log_root, 74707d400a6SYan Zheng csum_start, csum_end - 1, 748a2de733cSArne Jansen &ordered_sums, 0); 7493650860bSJosef Bacik if (ret) 7503650860bSJosef Bacik goto out; 751b84b8390SFilipe Manana /* 752b84b8390SFilipe Manana * Now delete all existing cums in the csum root that 753b84b8390SFilipe Manana * cover our range. 
We do this because we can have an 754b84b8390SFilipe Manana * extent that is completely referenced by one file 755b84b8390SFilipe Manana * extent item and partially referenced by another 756b84b8390SFilipe Manana * file extent item (like after using the clone or 757b84b8390SFilipe Manana * extent_same ioctls). In this case if we end up doing 758b84b8390SFilipe Manana * the replay of the one that partially references the 759b84b8390SFilipe Manana * extent first, and we do not do the csum deletion 760b84b8390SFilipe Manana * below, we can get 2 csum items in the csum tree that 761b84b8390SFilipe Manana * overlap each other. For example, imagine our log has 762b84b8390SFilipe Manana * the two following file extent items: 763b84b8390SFilipe Manana * 764b84b8390SFilipe Manana * key (257 EXTENT_DATA 409600) 765b84b8390SFilipe Manana * extent data disk byte 12845056 nr 102400 766b84b8390SFilipe Manana * extent data offset 20480 nr 20480 ram 102400 767b84b8390SFilipe Manana * 768b84b8390SFilipe Manana * key (257 EXTENT_DATA 819200) 769b84b8390SFilipe Manana * extent data disk byte 12845056 nr 102400 770b84b8390SFilipe Manana * extent data offset 0 nr 102400 ram 102400 771b84b8390SFilipe Manana * 772b84b8390SFilipe Manana * Where the second one fully references the 100K extent 773b84b8390SFilipe Manana * that starts at disk byte 12845056, and the log tree 774b84b8390SFilipe Manana * has a single csum item that covers the entire range 775b84b8390SFilipe Manana * of the extent: 776b84b8390SFilipe Manana * 777b84b8390SFilipe Manana * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 778b84b8390SFilipe Manana * 779b84b8390SFilipe Manana * After the first file extent item is replayed, the 780b84b8390SFilipe Manana * csum tree gets the following csum item: 781b84b8390SFilipe Manana * 782b84b8390SFilipe Manana * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 783b84b8390SFilipe Manana * 784b84b8390SFilipe Manana * Which covers the 20K sub-range starting at offset 20K 
785b84b8390SFilipe Manana * of our extent. Now when we replay the second file 786b84b8390SFilipe Manana * extent item, if we do not delete existing csum items 787b84b8390SFilipe Manana * that cover any of its blocks, we end up getting two 788b84b8390SFilipe Manana * csum items in our csum tree that overlap each other: 789b84b8390SFilipe Manana * 790b84b8390SFilipe Manana * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 791b84b8390SFilipe Manana * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 792b84b8390SFilipe Manana * 793b84b8390SFilipe Manana * Which is a problem, because after this anyone trying 794b84b8390SFilipe Manana * to lookup up for the checksum of any block of our 795b84b8390SFilipe Manana * extent starting at an offset of 40K or higher, will 796b84b8390SFilipe Manana * end up looking at the second csum item only, which 797b84b8390SFilipe Manana * does not contain the checksum for any block starting 798b84b8390SFilipe Manana * at offset 40K or higher of our extent. 799b84b8390SFilipe Manana */ 80007d400a6SYan Zheng while (!list_empty(&ordered_sums)) { 80107d400a6SYan Zheng struct btrfs_ordered_sum *sums; 80207d400a6SYan Zheng sums = list_entry(ordered_sums.next, 80307d400a6SYan Zheng struct btrfs_ordered_sum, 80407d400a6SYan Zheng list); 8053650860bSJosef Bacik if (!ret) 8060b246afaSJeff Mahoney ret = btrfs_del_csums(trans, fs_info, 807b84b8390SFilipe Manana sums->bytenr, 808b84b8390SFilipe Manana sums->len); 809b84b8390SFilipe Manana if (!ret) 81007d400a6SYan Zheng ret = btrfs_csum_file_blocks(trans, 8110b246afaSJeff Mahoney fs_info->csum_root, sums); 81207d400a6SYan Zheng list_del(&sums->list); 81307d400a6SYan Zheng kfree(sums); 81407d400a6SYan Zheng } 8153650860bSJosef Bacik if (ret) 8163650860bSJosef Bacik goto out; 81707d400a6SYan Zheng } else { 818b3b4aa74SDavid Sterba btrfs_release_path(path); 81907d400a6SYan Zheng } 82007d400a6SYan Zheng } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 82107d400a6SYan Zheng /* inline extents are 
easy, we just overwrite them */ 822e02119d5SChris Mason ret = overwrite_item(trans, root, path, eb, slot, key); 8233650860bSJosef Bacik if (ret) 8243650860bSJosef Bacik goto out; 82507d400a6SYan Zheng } 826e02119d5SChris Mason 8274bc4bee4SJosef Bacik inode_add_bytes(inode, nbytes); 828b9959295STsutomu Itoh ret = btrfs_update_inode(trans, root, inode); 829e02119d5SChris Mason out: 830e02119d5SChris Mason if (inode) 831e02119d5SChris Mason iput(inode); 832e02119d5SChris Mason return ret; 833e02119d5SChris Mason } 834e02119d5SChris Mason 835e02119d5SChris Mason /* 836e02119d5SChris Mason * when cleaning up conflicts between the directory names in the 837e02119d5SChris Mason * subvolume, directory names in the log and directory names in the 838e02119d5SChris Mason * inode back references, we may have to unlink inodes from directories. 839e02119d5SChris Mason * 840e02119d5SChris Mason * This is a helper function to do the unlink of a specific directory 841e02119d5SChris Mason * item 842e02119d5SChris Mason */ 843e02119d5SChris Mason static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 844e02119d5SChris Mason struct btrfs_root *root, 845e02119d5SChris Mason struct btrfs_path *path, 846e02119d5SChris Mason struct inode *dir, 847e02119d5SChris Mason struct btrfs_dir_item *di) 848e02119d5SChris Mason { 8492ff7e61eSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 850e02119d5SChris Mason struct inode *inode; 851e02119d5SChris Mason char *name; 852e02119d5SChris Mason int name_len; 853e02119d5SChris Mason struct extent_buffer *leaf; 854e02119d5SChris Mason struct btrfs_key location; 855e02119d5SChris Mason int ret; 856e02119d5SChris Mason 857e02119d5SChris Mason leaf = path->nodes[0]; 858e02119d5SChris Mason 859e02119d5SChris Mason btrfs_dir_item_key_to_cpu(leaf, di, &location); 860e02119d5SChris Mason name_len = btrfs_dir_name_len(leaf, di); 861e02119d5SChris Mason name = kmalloc(name_len, GFP_NOFS); 8622a29edc6Sliubo if (!name) 8632a29edc6Sliubo 
return -ENOMEM; 8642a29edc6Sliubo 865e02119d5SChris Mason read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 866b3b4aa74SDavid Sterba btrfs_release_path(path); 867e02119d5SChris Mason 868e02119d5SChris Mason inode = read_one_inode(root, location.objectid); 869c00e9493STsutomu Itoh if (!inode) { 8703650860bSJosef Bacik ret = -EIO; 8713650860bSJosef Bacik goto out; 872c00e9493STsutomu Itoh } 873e02119d5SChris Mason 874ec051c0fSYan Zheng ret = link_to_fixup_dir(trans, root, path, location.objectid); 8753650860bSJosef Bacik if (ret) 8763650860bSJosef Bacik goto out; 87712fcfd22SChris Mason 878e02119d5SChris Mason ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 8793650860bSJosef Bacik if (ret) 8803650860bSJosef Bacik goto out; 881ada9af21SFilipe David Borba Manana else 8822ff7e61eSJeff Mahoney ret = btrfs_run_delayed_items(trans, fs_info); 8833650860bSJosef Bacik out: 8843650860bSJosef Bacik kfree(name); 8853650860bSJosef Bacik iput(inode); 886e02119d5SChris Mason return ret; 887e02119d5SChris Mason } 888e02119d5SChris Mason 889e02119d5SChris Mason /* 890e02119d5SChris Mason * helper function to see if a given name and sequence number found 891e02119d5SChris Mason * in an inode back reference are already in a directory and correctly 892e02119d5SChris Mason * point to this inode 893e02119d5SChris Mason */ 894e02119d5SChris Mason static noinline int inode_in_dir(struct btrfs_root *root, 895e02119d5SChris Mason struct btrfs_path *path, 896e02119d5SChris Mason u64 dirid, u64 objectid, u64 index, 897e02119d5SChris Mason const char *name, int name_len) 898e02119d5SChris Mason { 899e02119d5SChris Mason struct btrfs_dir_item *di; 900e02119d5SChris Mason struct btrfs_key location; 901e02119d5SChris Mason int match = 0; 902e02119d5SChris Mason 903e02119d5SChris Mason di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 904e02119d5SChris Mason index, name, name_len, 0); 905e02119d5SChris Mason if (di && !IS_ERR(di)) { 906e02119d5SChris Mason 
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 907e02119d5SChris Mason if (location.objectid != objectid) 908e02119d5SChris Mason goto out; 909e02119d5SChris Mason } else 910e02119d5SChris Mason goto out; 911b3b4aa74SDavid Sterba btrfs_release_path(path); 912e02119d5SChris Mason 913e02119d5SChris Mason di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 914e02119d5SChris Mason if (di && !IS_ERR(di)) { 915e02119d5SChris Mason btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 916e02119d5SChris Mason if (location.objectid != objectid) 917e02119d5SChris Mason goto out; 918e02119d5SChris Mason } else 919e02119d5SChris Mason goto out; 920e02119d5SChris Mason match = 1; 921e02119d5SChris Mason out: 922b3b4aa74SDavid Sterba btrfs_release_path(path); 923e02119d5SChris Mason return match; 924e02119d5SChris Mason } 925e02119d5SChris Mason 926e02119d5SChris Mason /* 927e02119d5SChris Mason * helper function to check a log tree for a named back reference in 928e02119d5SChris Mason * an inode. This is used to decide if a back reference that is 929e02119d5SChris Mason * found in the subvolume conflicts with what we find in the log. 930e02119d5SChris Mason * 931e02119d5SChris Mason * inode backreferences may have multiple refs in a single item, 932e02119d5SChris Mason * during replay we process one reference at a time, and we don't 933e02119d5SChris Mason * want to delete valid links to a file from the subvolume if that 934e02119d5SChris Mason * link is also in the log. 
935e02119d5SChris Mason */ 936e02119d5SChris Mason static noinline int backref_in_log(struct btrfs_root *log, 937e02119d5SChris Mason struct btrfs_key *key, 938f186373fSMark Fasheh u64 ref_objectid, 939df8d116fSFilipe Manana const char *name, int namelen) 940e02119d5SChris Mason { 941e02119d5SChris Mason struct btrfs_path *path; 942e02119d5SChris Mason struct btrfs_inode_ref *ref; 943e02119d5SChris Mason unsigned long ptr; 944e02119d5SChris Mason unsigned long ptr_end; 945e02119d5SChris Mason unsigned long name_ptr; 946e02119d5SChris Mason int found_name_len; 947e02119d5SChris Mason int item_size; 948e02119d5SChris Mason int ret; 949e02119d5SChris Mason int match = 0; 950e02119d5SChris Mason 951e02119d5SChris Mason path = btrfs_alloc_path(); 9522a29edc6Sliubo if (!path) 9532a29edc6Sliubo return -ENOMEM; 9542a29edc6Sliubo 955e02119d5SChris Mason ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 956e02119d5SChris Mason if (ret != 0) 957e02119d5SChris Mason goto out; 958e02119d5SChris Mason 959e02119d5SChris Mason ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 960f186373fSMark Fasheh 961f186373fSMark Fasheh if (key->type == BTRFS_INODE_EXTREF_KEY) { 962f186373fSMark Fasheh if (btrfs_find_name_in_ext_backref(path, ref_objectid, 963f186373fSMark Fasheh name, namelen, NULL)) 964f186373fSMark Fasheh match = 1; 965f186373fSMark Fasheh 966f186373fSMark Fasheh goto out; 967f186373fSMark Fasheh } 968f186373fSMark Fasheh 969f186373fSMark Fasheh item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 970e02119d5SChris Mason ptr_end = ptr + item_size; 971e02119d5SChris Mason while (ptr < ptr_end) { 972e02119d5SChris Mason ref = (struct btrfs_inode_ref *)ptr; 973e02119d5SChris Mason found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 974e02119d5SChris Mason if (found_name_len == namelen) { 975e02119d5SChris Mason name_ptr = (unsigned long)(ref + 1); 976e02119d5SChris Mason ret = memcmp_extent_buffer(path->nodes[0], name, 977e02119d5SChris 
Mason name_ptr, namelen); 978e02119d5SChris Mason if (ret == 0) { 979e02119d5SChris Mason match = 1; 980e02119d5SChris Mason goto out; 981e02119d5SChris Mason } 982e02119d5SChris Mason } 983e02119d5SChris Mason ptr = (unsigned long)(ref + 1) + found_name_len; 984e02119d5SChris Mason } 985e02119d5SChris Mason out: 986e02119d5SChris Mason btrfs_free_path(path); 987e02119d5SChris Mason return match; 988e02119d5SChris Mason } 989e02119d5SChris Mason 9905a1d7843SJan Schmidt static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 9915a1d7843SJan Schmidt struct btrfs_root *root, 9925a1d7843SJan Schmidt struct btrfs_path *path, 9935a1d7843SJan Schmidt struct btrfs_root *log_root, 9945a1d7843SJan Schmidt struct inode *dir, struct inode *inode, 9955a1d7843SJan Schmidt struct extent_buffer *eb, 996f186373fSMark Fasheh u64 inode_objectid, u64 parent_objectid, 997f186373fSMark Fasheh u64 ref_index, char *name, int namelen, 998f186373fSMark Fasheh int *search_done) 9995a1d7843SJan Schmidt { 10002ff7e61eSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 10015a1d7843SJan Schmidt int ret; 10025a1d7843SJan Schmidt char *victim_name; 10035a1d7843SJan Schmidt int victim_name_len; 1004f186373fSMark Fasheh struct extent_buffer *leaf; 1005f186373fSMark Fasheh struct btrfs_dir_item *di; 1006f186373fSMark Fasheh struct btrfs_key search_key; 1007f186373fSMark Fasheh struct btrfs_inode_extref *extref; 1008f186373fSMark Fasheh 1009f186373fSMark Fasheh again: 1010f186373fSMark Fasheh /* Search old style refs */ 1011f186373fSMark Fasheh search_key.objectid = inode_objectid; 1012f186373fSMark Fasheh search_key.type = BTRFS_INODE_REF_KEY; 1013f186373fSMark Fasheh search_key.offset = parent_objectid; 1014f186373fSMark Fasheh ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 1015f186373fSMark Fasheh if (ret == 0) { 10165a1d7843SJan Schmidt struct btrfs_inode_ref *victim_ref; 10175a1d7843SJan Schmidt unsigned long ptr; 10185a1d7843SJan Schmidt unsigned long ptr_end; 
1019f186373fSMark Fasheh 1020f186373fSMark Fasheh leaf = path->nodes[0]; 10215a1d7843SJan Schmidt 10225a1d7843SJan Schmidt /* are we trying to overwrite a back ref for the root directory 10235a1d7843SJan Schmidt * if so, just jump out, we're done 10245a1d7843SJan Schmidt */ 1025f186373fSMark Fasheh if (search_key.objectid == search_key.offset) 10265a1d7843SJan Schmidt return 1; 10275a1d7843SJan Schmidt 10285a1d7843SJan Schmidt /* check all the names in this back reference to see 10295a1d7843SJan Schmidt * if they are in the log. if so, we allow them to stay 10305a1d7843SJan Schmidt * otherwise they must be unlinked as a conflict 10315a1d7843SJan Schmidt */ 10325a1d7843SJan Schmidt ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 10335a1d7843SJan Schmidt ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 10345a1d7843SJan Schmidt while (ptr < ptr_end) { 10355a1d7843SJan Schmidt victim_ref = (struct btrfs_inode_ref *)ptr; 10365a1d7843SJan Schmidt victim_name_len = btrfs_inode_ref_name_len(leaf, 10375a1d7843SJan Schmidt victim_ref); 10385a1d7843SJan Schmidt victim_name = kmalloc(victim_name_len, GFP_NOFS); 10393650860bSJosef Bacik if (!victim_name) 10403650860bSJosef Bacik return -ENOMEM; 10415a1d7843SJan Schmidt 10425a1d7843SJan Schmidt read_extent_buffer(leaf, victim_name, 10435a1d7843SJan Schmidt (unsigned long)(victim_ref + 1), 10445a1d7843SJan Schmidt victim_name_len); 10455a1d7843SJan Schmidt 1046f186373fSMark Fasheh if (!backref_in_log(log_root, &search_key, 1047f186373fSMark Fasheh parent_objectid, 1048f186373fSMark Fasheh victim_name, 10495a1d7843SJan Schmidt victim_name_len)) { 10508b558c5fSZach Brown inc_nlink(inode); 10515a1d7843SJan Schmidt btrfs_release_path(path); 10525a1d7843SJan Schmidt 10535a1d7843SJan Schmidt ret = btrfs_unlink_inode(trans, root, dir, 10545a1d7843SJan Schmidt inode, victim_name, 10555a1d7843SJan Schmidt victim_name_len); 1056f186373fSMark Fasheh kfree(victim_name); 10573650860bSJosef Bacik if (ret) 10583650860bSJosef Bacik 
return ret; 10592ff7e61eSJeff Mahoney ret = btrfs_run_delayed_items(trans, fs_info); 1060ada9af21SFilipe David Borba Manana if (ret) 1061ada9af21SFilipe David Borba Manana return ret; 1062f186373fSMark Fasheh *search_done = 1; 1063f186373fSMark Fasheh goto again; 10645a1d7843SJan Schmidt } 10655a1d7843SJan Schmidt kfree(victim_name); 1066f186373fSMark Fasheh 10675a1d7843SJan Schmidt ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 10685a1d7843SJan Schmidt } 10695a1d7843SJan Schmidt 10705a1d7843SJan Schmidt /* 10715a1d7843SJan Schmidt * NOTE: we have searched root tree and checked the 1072bb7ab3b9SAdam Buchbinder * corresponding ref, it does not need to check again. 10735a1d7843SJan Schmidt */ 10745a1d7843SJan Schmidt *search_done = 1; 10755a1d7843SJan Schmidt } 10765a1d7843SJan Schmidt btrfs_release_path(path); 10775a1d7843SJan Schmidt 1078f186373fSMark Fasheh /* Same search but for extended refs */ 1079f186373fSMark Fasheh extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 1080f186373fSMark Fasheh inode_objectid, parent_objectid, 0, 1081f186373fSMark Fasheh 0); 1082f186373fSMark Fasheh if (!IS_ERR_OR_NULL(extref)) { 1083f186373fSMark Fasheh u32 item_size; 1084f186373fSMark Fasheh u32 cur_offset = 0; 1085f186373fSMark Fasheh unsigned long base; 1086f186373fSMark Fasheh struct inode *victim_parent; 1087f186373fSMark Fasheh 1088f186373fSMark Fasheh leaf = path->nodes[0]; 1089f186373fSMark Fasheh 1090f186373fSMark Fasheh item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1091f186373fSMark Fasheh base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1092f186373fSMark Fasheh 1093f186373fSMark Fasheh while (cur_offset < item_size) { 1094dd9ef135SQuentin Casasnovas extref = (struct btrfs_inode_extref *)(base + cur_offset); 1095f186373fSMark Fasheh 1096f186373fSMark Fasheh victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 1097f186373fSMark Fasheh 1098f186373fSMark Fasheh if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 
1099f186373fSMark Fasheh goto next; 1100f186373fSMark Fasheh 1101f186373fSMark Fasheh victim_name = kmalloc(victim_name_len, GFP_NOFS); 11023650860bSJosef Bacik if (!victim_name) 11033650860bSJosef Bacik return -ENOMEM; 1104f186373fSMark Fasheh read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 1105f186373fSMark Fasheh victim_name_len); 1106f186373fSMark Fasheh 1107f186373fSMark Fasheh search_key.objectid = inode_objectid; 1108f186373fSMark Fasheh search_key.type = BTRFS_INODE_EXTREF_KEY; 1109f186373fSMark Fasheh search_key.offset = btrfs_extref_hash(parent_objectid, 1110f186373fSMark Fasheh victim_name, 1111f186373fSMark Fasheh victim_name_len); 1112f186373fSMark Fasheh ret = 0; 1113f186373fSMark Fasheh if (!backref_in_log(log_root, &search_key, 1114f186373fSMark Fasheh parent_objectid, victim_name, 1115f186373fSMark Fasheh victim_name_len)) { 1116f186373fSMark Fasheh ret = -ENOENT; 1117f186373fSMark Fasheh victim_parent = read_one_inode(root, 1118f186373fSMark Fasheh parent_objectid); 1119f186373fSMark Fasheh if (victim_parent) { 11208b558c5fSZach Brown inc_nlink(inode); 1121f186373fSMark Fasheh btrfs_release_path(path); 1122f186373fSMark Fasheh 1123f186373fSMark Fasheh ret = btrfs_unlink_inode(trans, root, 1124f186373fSMark Fasheh victim_parent, 1125f186373fSMark Fasheh inode, 1126f186373fSMark Fasheh victim_name, 1127f186373fSMark Fasheh victim_name_len); 1128ada9af21SFilipe David Borba Manana if (!ret) 1129ada9af21SFilipe David Borba Manana ret = btrfs_run_delayed_items( 11302ff7e61eSJeff Mahoney trans, 11312ff7e61eSJeff Mahoney fs_info); 1132f186373fSMark Fasheh } 1133f186373fSMark Fasheh iput(victim_parent); 1134f186373fSMark Fasheh kfree(victim_name); 11353650860bSJosef Bacik if (ret) 11363650860bSJosef Bacik return ret; 1137f186373fSMark Fasheh *search_done = 1; 1138f186373fSMark Fasheh goto again; 1139f186373fSMark Fasheh } 1140f186373fSMark Fasheh kfree(victim_name); 11413650860bSJosef Bacik if (ret) 11423650860bSJosef Bacik return ret; 
1143f186373fSMark Fasheh next: 1144f186373fSMark Fasheh cur_offset += victim_name_len + sizeof(*extref); 1145f186373fSMark Fasheh } 1146f186373fSMark Fasheh *search_done = 1; 1147f186373fSMark Fasheh } 1148f186373fSMark Fasheh btrfs_release_path(path); 1149f186373fSMark Fasheh 11505a1d7843SJan Schmidt /* look for a conflicting sequence number */ 11514a0cc7caSNikolay Borisov di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(BTRFS_I(dir)), 1152f186373fSMark Fasheh ref_index, name, namelen, 0); 11535a1d7843SJan Schmidt if (di && !IS_ERR(di)) { 11545a1d7843SJan Schmidt ret = drop_one_dir_item(trans, root, path, dir, di); 11553650860bSJosef Bacik if (ret) 11563650860bSJosef Bacik return ret; 11575a1d7843SJan Schmidt } 11585a1d7843SJan Schmidt btrfs_release_path(path); 11595a1d7843SJan Schmidt 11605a1d7843SJan Schmidt /* look for a conflicing name */ 11614a0cc7caSNikolay Borisov di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(BTRFS_I(dir)), 11625a1d7843SJan Schmidt name, namelen, 0); 11635a1d7843SJan Schmidt if (di && !IS_ERR(di)) { 11645a1d7843SJan Schmidt ret = drop_one_dir_item(trans, root, path, dir, di); 11653650860bSJosef Bacik if (ret) 11663650860bSJosef Bacik return ret; 11675a1d7843SJan Schmidt } 11685a1d7843SJan Schmidt btrfs_release_path(path); 11695a1d7843SJan Schmidt 11705a1d7843SJan Schmidt return 0; 11715a1d7843SJan Schmidt } 1172e02119d5SChris Mason 1173f186373fSMark Fasheh static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1174f186373fSMark Fasheh u32 *namelen, char **name, u64 *index, 1175f186373fSMark Fasheh u64 *parent_objectid) 1176f186373fSMark Fasheh { 1177f186373fSMark Fasheh struct btrfs_inode_extref *extref; 1178f186373fSMark Fasheh 1179f186373fSMark Fasheh extref = (struct btrfs_inode_extref *)ref_ptr; 1180f186373fSMark Fasheh 1181f186373fSMark Fasheh *namelen = btrfs_inode_extref_name_len(eb, extref); 1182f186373fSMark Fasheh *name = kmalloc(*namelen, GFP_NOFS); 1183f186373fSMark Fasheh if (*name 
== NULL) 1184f186373fSMark Fasheh return -ENOMEM; 1185f186373fSMark Fasheh 1186f186373fSMark Fasheh read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1187f186373fSMark Fasheh *namelen); 1188f186373fSMark Fasheh 1189f186373fSMark Fasheh *index = btrfs_inode_extref_index(eb, extref); 1190f186373fSMark Fasheh if (parent_objectid) 1191f186373fSMark Fasheh *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1192f186373fSMark Fasheh 1193f186373fSMark Fasheh return 0; 1194f186373fSMark Fasheh } 1195f186373fSMark Fasheh 1196f186373fSMark Fasheh static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1197f186373fSMark Fasheh u32 *namelen, char **name, u64 *index) 1198f186373fSMark Fasheh { 1199f186373fSMark Fasheh struct btrfs_inode_ref *ref; 1200f186373fSMark Fasheh 1201f186373fSMark Fasheh ref = (struct btrfs_inode_ref *)ref_ptr; 1202f186373fSMark Fasheh 1203f186373fSMark Fasheh *namelen = btrfs_inode_ref_name_len(eb, ref); 1204f186373fSMark Fasheh *name = kmalloc(*namelen, GFP_NOFS); 1205f186373fSMark Fasheh if (*name == NULL) 1206f186373fSMark Fasheh return -ENOMEM; 1207f186373fSMark Fasheh 1208f186373fSMark Fasheh read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1209f186373fSMark Fasheh 1210f186373fSMark Fasheh *index = btrfs_inode_ref_index(eb, ref); 1211f186373fSMark Fasheh 1212f186373fSMark Fasheh return 0; 1213f186373fSMark Fasheh } 1214f186373fSMark Fasheh 1215e02119d5SChris Mason /* 1216e02119d5SChris Mason * replay one inode back reference item found in the log tree. 1217e02119d5SChris Mason * eb, slot and key refer to the buffer and key found in the log tree. 1218e02119d5SChris Mason * root is the destination we are replaying into, and path is for temp 1219e02119d5SChris Mason * use by this function. (it should be released on return). 
1220e02119d5SChris Mason */ 1221e02119d5SChris Mason static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1222e02119d5SChris Mason struct btrfs_root *root, 1223e02119d5SChris Mason struct btrfs_root *log, 1224e02119d5SChris Mason struct btrfs_path *path, 1225e02119d5SChris Mason struct extent_buffer *eb, int slot, 1226e02119d5SChris Mason struct btrfs_key *key) 1227e02119d5SChris Mason { 122803b2f08bSGeyslan G. Bem struct inode *dir = NULL; 122903b2f08bSGeyslan G. Bem struct inode *inode = NULL; 1230e02119d5SChris Mason unsigned long ref_ptr; 1231e02119d5SChris Mason unsigned long ref_end; 123203b2f08bSGeyslan G. Bem char *name = NULL; 123334f3e4f2Sliubo int namelen; 123434f3e4f2Sliubo int ret; 1235c622ae60Sliubo int search_done = 0; 1236f186373fSMark Fasheh int log_ref_ver = 0; 1237f186373fSMark Fasheh u64 parent_objectid; 1238f186373fSMark Fasheh u64 inode_objectid; 1239f46dbe3dSChris Mason u64 ref_index = 0; 1240f186373fSMark Fasheh int ref_struct_size; 1241f186373fSMark Fasheh 1242f186373fSMark Fasheh ref_ptr = btrfs_item_ptr_offset(eb, slot); 1243f186373fSMark Fasheh ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 1244f186373fSMark Fasheh 1245f186373fSMark Fasheh if (key->type == BTRFS_INODE_EXTREF_KEY) { 1246f186373fSMark Fasheh struct btrfs_inode_extref *r; 1247f186373fSMark Fasheh 1248f186373fSMark Fasheh ref_struct_size = sizeof(struct btrfs_inode_extref); 1249f186373fSMark Fasheh log_ref_ver = 1; 1250f186373fSMark Fasheh r = (struct btrfs_inode_extref *)ref_ptr; 1251f186373fSMark Fasheh parent_objectid = btrfs_inode_extref_parent(eb, r); 1252f186373fSMark Fasheh } else { 1253f186373fSMark Fasheh ref_struct_size = sizeof(struct btrfs_inode_ref); 1254f186373fSMark Fasheh parent_objectid = key->offset; 1255f186373fSMark Fasheh } 1256f186373fSMark Fasheh inode_objectid = key->objectid; 1257e02119d5SChris Mason 1258e02119d5SChris Mason /* 1259e02119d5SChris Mason * it is possible that we didn't log all the parent directories 1260e02119d5SChris 
Mason * for a given inode. If we don't find the dir, just don't 1261e02119d5SChris Mason * copy the back ref in. The link count fixup code will take 1262e02119d5SChris Mason * care of the rest 1263e02119d5SChris Mason */ 1264f186373fSMark Fasheh dir = read_one_inode(root, parent_objectid); 126503b2f08bSGeyslan G. Bem if (!dir) { 126603b2f08bSGeyslan G. Bem ret = -ENOENT; 126703b2f08bSGeyslan G. Bem goto out; 126803b2f08bSGeyslan G. Bem } 1269e02119d5SChris Mason 1270f186373fSMark Fasheh inode = read_one_inode(root, inode_objectid); 1271c00e9493STsutomu Itoh if (!inode) { 127203b2f08bSGeyslan G. Bem ret = -EIO; 127303b2f08bSGeyslan G. Bem goto out; 1274c00e9493STsutomu Itoh } 1275e02119d5SChris Mason 12765a1d7843SJan Schmidt while (ref_ptr < ref_end) { 1277f186373fSMark Fasheh if (log_ref_ver) { 1278f186373fSMark Fasheh ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1279f186373fSMark Fasheh &ref_index, &parent_objectid); 1280f186373fSMark Fasheh /* 1281f186373fSMark Fasheh * parent object can change from one array 1282f186373fSMark Fasheh * item to another. 1283f186373fSMark Fasheh */ 1284f186373fSMark Fasheh if (!dir) 1285f186373fSMark Fasheh dir = read_one_inode(root, parent_objectid); 128603b2f08bSGeyslan G. Bem if (!dir) { 128703b2f08bSGeyslan G. Bem ret = -ENOENT; 128803b2f08bSGeyslan G. Bem goto out; 128903b2f08bSGeyslan G. Bem } 1290f186373fSMark Fasheh } else { 1291f186373fSMark Fasheh ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1292f186373fSMark Fasheh &ref_index); 1293f186373fSMark Fasheh } 1294f186373fSMark Fasheh if (ret) 129503b2f08bSGeyslan G. 
Bem goto out; 1296e02119d5SChris Mason 1297e02119d5SChris Mason /* if we already have a perfect match, we're done */ 12984a0cc7caSNikolay Borisov if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), btrfs_ino(BTRFS_I(inode)), 1299f186373fSMark Fasheh ref_index, name, namelen)) { 13005a1d7843SJan Schmidt /* 13015a1d7843SJan Schmidt * look for a conflicting back reference in the 13025a1d7843SJan Schmidt * metadata. if we find one we have to unlink that name 13035a1d7843SJan Schmidt * of the file before we add our new link. Later on, we 13045a1d7843SJan Schmidt * overwrite any existing back reference, and we don't 13055a1d7843SJan Schmidt * want to create dangling pointers in the directory. 13065a1d7843SJan Schmidt */ 13075a1d7843SJan Schmidt 13085a1d7843SJan Schmidt if (!search_done) { 13095a1d7843SJan Schmidt ret = __add_inode_ref(trans, root, path, log, 1310f186373fSMark Fasheh dir, inode, eb, 1311f186373fSMark Fasheh inode_objectid, 1312f186373fSMark Fasheh parent_objectid, 1313f186373fSMark Fasheh ref_index, name, namelen, 13145a1d7843SJan Schmidt &search_done); 131503b2f08bSGeyslan G. Bem if (ret) { 131603b2f08bSGeyslan G. Bem if (ret == 1) 13173650860bSJosef Bacik ret = 0; 1318e02119d5SChris Mason goto out; 13193650860bSJosef Bacik } 132034f3e4f2Sliubo } 132134f3e4f2Sliubo 1322e02119d5SChris Mason /* insert our name */ 13235a1d7843SJan Schmidt ret = btrfs_add_link(trans, dir, inode, name, namelen, 1324f186373fSMark Fasheh 0, ref_index); 13253650860bSJosef Bacik if (ret) 13263650860bSJosef Bacik goto out; 1327e02119d5SChris Mason 1328e02119d5SChris Mason btrfs_update_inode(trans, root, inode); 13295a1d7843SJan Schmidt } 1330e02119d5SChris Mason 1331f186373fSMark Fasheh ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1332e02119d5SChris Mason kfree(name); 133303b2f08bSGeyslan G. 
Bem name = NULL; 1334f186373fSMark Fasheh if (log_ref_ver) { 1335f186373fSMark Fasheh iput(dir); 1336f186373fSMark Fasheh dir = NULL; 1337f186373fSMark Fasheh } 13385a1d7843SJan Schmidt } 1339e02119d5SChris Mason 1340e02119d5SChris Mason /* finally write the back reference in the inode */ 1341e02119d5SChris Mason ret = overwrite_item(trans, root, path, eb, slot, key); 13425a1d7843SJan Schmidt out: 1343b3b4aa74SDavid Sterba btrfs_release_path(path); 134403b2f08bSGeyslan G. Bem kfree(name); 1345e02119d5SChris Mason iput(dir); 1346e02119d5SChris Mason iput(inode); 13473650860bSJosef Bacik return ret; 1348e02119d5SChris Mason } 1349e02119d5SChris Mason 1350c71bf099SYan, Zheng static int insert_orphan_item(struct btrfs_trans_handle *trans, 13519c4f61f0SDavid Sterba struct btrfs_root *root, u64 ino) 1352c71bf099SYan, Zheng { 1353c71bf099SYan, Zheng int ret; 1354381cf658SDavid Sterba 13559c4f61f0SDavid Sterba ret = btrfs_insert_orphan_item(trans, root, ino); 13569c4f61f0SDavid Sterba if (ret == -EEXIST) 13579c4f61f0SDavid Sterba ret = 0; 1358381cf658SDavid Sterba 1359c71bf099SYan, Zheng return ret; 1360c71bf099SYan, Zheng } 1361c71bf099SYan, Zheng 1362f186373fSMark Fasheh static int count_inode_extrefs(struct btrfs_root *root, 1363f186373fSMark Fasheh struct inode *inode, struct btrfs_path *path) 1364e02119d5SChris Mason { 1365f186373fSMark Fasheh int ret = 0; 1366f186373fSMark Fasheh int name_len; 1367f186373fSMark Fasheh unsigned int nlink = 0; 1368f186373fSMark Fasheh u32 item_size; 1369f186373fSMark Fasheh u32 cur_offset = 0; 13704a0cc7caSNikolay Borisov u64 inode_objectid = btrfs_ino(BTRFS_I(inode)); 1371f186373fSMark Fasheh u64 offset = 0; 1372f186373fSMark Fasheh unsigned long ptr; 1373f186373fSMark Fasheh struct btrfs_inode_extref *extref; 1374f186373fSMark Fasheh struct extent_buffer *leaf; 1375f186373fSMark Fasheh 1376f186373fSMark Fasheh while (1) { 1377f186373fSMark Fasheh ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1378f186373fSMark Fasheh 
&extref, &offset); 1379f186373fSMark Fasheh if (ret) 1380f186373fSMark Fasheh break; 1381f186373fSMark Fasheh 1382f186373fSMark Fasheh leaf = path->nodes[0]; 1383f186373fSMark Fasheh item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1384f186373fSMark Fasheh ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 13852c2c452bSFilipe Manana cur_offset = 0; 1386f186373fSMark Fasheh 1387f186373fSMark Fasheh while (cur_offset < item_size) { 1388f186373fSMark Fasheh extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1389f186373fSMark Fasheh name_len = btrfs_inode_extref_name_len(leaf, extref); 1390f186373fSMark Fasheh 1391f186373fSMark Fasheh nlink++; 1392f186373fSMark Fasheh 1393f186373fSMark Fasheh cur_offset += name_len + sizeof(*extref); 1394f186373fSMark Fasheh } 1395f186373fSMark Fasheh 1396f186373fSMark Fasheh offset++; 1397f186373fSMark Fasheh btrfs_release_path(path); 1398f186373fSMark Fasheh } 1399f186373fSMark Fasheh btrfs_release_path(path); 1400f186373fSMark Fasheh 14012c2c452bSFilipe Manana if (ret < 0 && ret != -ENOENT) 1402f186373fSMark Fasheh return ret; 1403f186373fSMark Fasheh return nlink; 1404f186373fSMark Fasheh } 1405f186373fSMark Fasheh 1406f186373fSMark Fasheh static int count_inode_refs(struct btrfs_root *root, 1407f186373fSMark Fasheh struct inode *inode, struct btrfs_path *path) 1408f186373fSMark Fasheh { 1409e02119d5SChris Mason int ret; 1410e02119d5SChris Mason struct btrfs_key key; 1411f186373fSMark Fasheh unsigned int nlink = 0; 1412e02119d5SChris Mason unsigned long ptr; 1413e02119d5SChris Mason unsigned long ptr_end; 1414e02119d5SChris Mason int name_len; 14154a0cc7caSNikolay Borisov u64 ino = btrfs_ino(BTRFS_I(inode)); 1416e02119d5SChris Mason 141733345d01SLi Zefan key.objectid = ino; 1418e02119d5SChris Mason key.type = BTRFS_INODE_REF_KEY; 1419e02119d5SChris Mason key.offset = (u64)-1; 1420e02119d5SChris Mason 1421e02119d5SChris Mason while (1) { 1422e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 
1423e02119d5SChris Mason if (ret < 0) 1424e02119d5SChris Mason break; 1425e02119d5SChris Mason if (ret > 0) { 1426e02119d5SChris Mason if (path->slots[0] == 0) 1427e02119d5SChris Mason break; 1428e02119d5SChris Mason path->slots[0]--; 1429e02119d5SChris Mason } 1430e93ae26fSFilipe David Borba Manana process_slot: 1431e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &key, 1432e02119d5SChris Mason path->slots[0]); 143333345d01SLi Zefan if (key.objectid != ino || 1434e02119d5SChris Mason key.type != BTRFS_INODE_REF_KEY) 1435e02119d5SChris Mason break; 1436e02119d5SChris Mason ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1437e02119d5SChris Mason ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 1438e02119d5SChris Mason path->slots[0]); 1439e02119d5SChris Mason while (ptr < ptr_end) { 1440e02119d5SChris Mason struct btrfs_inode_ref *ref; 1441e02119d5SChris Mason 1442e02119d5SChris Mason ref = (struct btrfs_inode_ref *)ptr; 1443e02119d5SChris Mason name_len = btrfs_inode_ref_name_len(path->nodes[0], 1444e02119d5SChris Mason ref); 1445e02119d5SChris Mason ptr = (unsigned long)(ref + 1) + name_len; 1446e02119d5SChris Mason nlink++; 1447e02119d5SChris Mason } 1448e02119d5SChris Mason 1449e02119d5SChris Mason if (key.offset == 0) 1450e02119d5SChris Mason break; 1451e93ae26fSFilipe David Borba Manana if (path->slots[0] > 0) { 1452e93ae26fSFilipe David Borba Manana path->slots[0]--; 1453e93ae26fSFilipe David Borba Manana goto process_slot; 1454e93ae26fSFilipe David Borba Manana } 1455e02119d5SChris Mason key.offset--; 1456b3b4aa74SDavid Sterba btrfs_release_path(path); 1457e02119d5SChris Mason } 1458b3b4aa74SDavid Sterba btrfs_release_path(path); 1459f186373fSMark Fasheh 1460f186373fSMark Fasheh return nlink; 1461f186373fSMark Fasheh } 1462f186373fSMark Fasheh 1463f186373fSMark Fasheh /* 1464f186373fSMark Fasheh * There are a few corners where the link count of the file can't 1465f186373fSMark Fasheh * be properly maintained during replay. 
So, instead of adding 1466f186373fSMark Fasheh * lots of complexity to the log code, we just scan the backrefs 1467f186373fSMark Fasheh * for any file that has been through replay. 1468f186373fSMark Fasheh * 1469f186373fSMark Fasheh * The scan will update the link count on the inode to reflect the 1470f186373fSMark Fasheh * number of back refs found. If it goes down to zero, the iput 1471f186373fSMark Fasheh * will free the inode. 1472f186373fSMark Fasheh */ 1473f186373fSMark Fasheh static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1474f186373fSMark Fasheh struct btrfs_root *root, 1475f186373fSMark Fasheh struct inode *inode) 1476f186373fSMark Fasheh { 1477f186373fSMark Fasheh struct btrfs_path *path; 1478f186373fSMark Fasheh int ret; 1479f186373fSMark Fasheh u64 nlink = 0; 14804a0cc7caSNikolay Borisov u64 ino = btrfs_ino(BTRFS_I(inode)); 1481f186373fSMark Fasheh 1482f186373fSMark Fasheh path = btrfs_alloc_path(); 1483f186373fSMark Fasheh if (!path) 1484f186373fSMark Fasheh return -ENOMEM; 1485f186373fSMark Fasheh 1486f186373fSMark Fasheh ret = count_inode_refs(root, inode, path); 1487f186373fSMark Fasheh if (ret < 0) 1488f186373fSMark Fasheh goto out; 1489f186373fSMark Fasheh 1490f186373fSMark Fasheh nlink = ret; 1491f186373fSMark Fasheh 1492f186373fSMark Fasheh ret = count_inode_extrefs(root, inode, path); 1493f186373fSMark Fasheh if (ret < 0) 1494f186373fSMark Fasheh goto out; 1495f186373fSMark Fasheh 1496f186373fSMark Fasheh nlink += ret; 1497f186373fSMark Fasheh 1498f186373fSMark Fasheh ret = 0; 1499f186373fSMark Fasheh 1500e02119d5SChris Mason if (nlink != inode->i_nlink) { 1501bfe86848SMiklos Szeredi set_nlink(inode, nlink); 1502e02119d5SChris Mason btrfs_update_inode(trans, root, inode); 1503e02119d5SChris Mason } 15048d5bf1cbSChris Mason BTRFS_I(inode)->index_cnt = (u64)-1; 1505e02119d5SChris Mason 1506c71bf099SYan, Zheng if (inode->i_nlink == 0) { 1507c71bf099SYan, Zheng if (S_ISDIR(inode->i_mode)) { 150812fcfd22SChris Mason ret 
= replay_dir_deletes(trans, root, NULL, path, 150933345d01SLi Zefan ino, 1); 15103650860bSJosef Bacik if (ret) 15113650860bSJosef Bacik goto out; 151212fcfd22SChris Mason } 151333345d01SLi Zefan ret = insert_orphan_item(trans, root, ino); 1514c71bf099SYan, Zheng } 151512fcfd22SChris Mason 1516f186373fSMark Fasheh out: 1517f186373fSMark Fasheh btrfs_free_path(path); 1518f186373fSMark Fasheh return ret; 1519e02119d5SChris Mason } 1520e02119d5SChris Mason 1521e02119d5SChris Mason static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1522e02119d5SChris Mason struct btrfs_root *root, 1523e02119d5SChris Mason struct btrfs_path *path) 1524e02119d5SChris Mason { 1525e02119d5SChris Mason int ret; 1526e02119d5SChris Mason struct btrfs_key key; 1527e02119d5SChris Mason struct inode *inode; 1528e02119d5SChris Mason 1529e02119d5SChris Mason key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1530e02119d5SChris Mason key.type = BTRFS_ORPHAN_ITEM_KEY; 1531e02119d5SChris Mason key.offset = (u64)-1; 1532e02119d5SChris Mason while (1) { 1533e02119d5SChris Mason ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1534e02119d5SChris Mason if (ret < 0) 1535e02119d5SChris Mason break; 1536e02119d5SChris Mason 1537e02119d5SChris Mason if (ret == 1) { 1538e02119d5SChris Mason if (path->slots[0] == 0) 1539e02119d5SChris Mason break; 1540e02119d5SChris Mason path->slots[0]--; 1541e02119d5SChris Mason } 1542e02119d5SChris Mason 1543e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1544e02119d5SChris Mason if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1545e02119d5SChris Mason key.type != BTRFS_ORPHAN_ITEM_KEY) 1546e02119d5SChris Mason break; 1547e02119d5SChris Mason 1548e02119d5SChris Mason ret = btrfs_del_item(trans, root, path); 154965a246c5STsutomu Itoh if (ret) 155065a246c5STsutomu Itoh goto out; 1551e02119d5SChris Mason 1552b3b4aa74SDavid Sterba btrfs_release_path(path); 1553e02119d5SChris Mason inode = read_one_inode(root, 
key.offset); 1554c00e9493STsutomu Itoh if (!inode) 1555c00e9493STsutomu Itoh return -EIO; 1556e02119d5SChris Mason 1557e02119d5SChris Mason ret = fixup_inode_link_count(trans, root, inode); 1558e02119d5SChris Mason iput(inode); 15593650860bSJosef Bacik if (ret) 15603650860bSJosef Bacik goto out; 1561e02119d5SChris Mason 156212fcfd22SChris Mason /* 156312fcfd22SChris Mason * fixup on a directory may create new entries, 156412fcfd22SChris Mason * make sure we always look for the highset possible 156512fcfd22SChris Mason * offset 156612fcfd22SChris Mason */ 156712fcfd22SChris Mason key.offset = (u64)-1; 1568e02119d5SChris Mason } 156965a246c5STsutomu Itoh ret = 0; 157065a246c5STsutomu Itoh out: 1571b3b4aa74SDavid Sterba btrfs_release_path(path); 157265a246c5STsutomu Itoh return ret; 1573e02119d5SChris Mason } 1574e02119d5SChris Mason 1575e02119d5SChris Mason 1576e02119d5SChris Mason /* 1577e02119d5SChris Mason * record a given inode in the fixup dir so we can check its link 1578e02119d5SChris Mason * count when replay is done. 
The link count is incremented here 1579e02119d5SChris Mason * so the inode won't go away until we check it 1580e02119d5SChris Mason */ 1581e02119d5SChris Mason static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1582e02119d5SChris Mason struct btrfs_root *root, 1583e02119d5SChris Mason struct btrfs_path *path, 1584e02119d5SChris Mason u64 objectid) 1585e02119d5SChris Mason { 1586e02119d5SChris Mason struct btrfs_key key; 1587e02119d5SChris Mason int ret = 0; 1588e02119d5SChris Mason struct inode *inode; 1589e02119d5SChris Mason 1590e02119d5SChris Mason inode = read_one_inode(root, objectid); 1591c00e9493STsutomu Itoh if (!inode) 1592c00e9493STsutomu Itoh return -EIO; 1593e02119d5SChris Mason 1594e02119d5SChris Mason key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1595962a298fSDavid Sterba key.type = BTRFS_ORPHAN_ITEM_KEY; 1596e02119d5SChris Mason key.offset = objectid; 1597e02119d5SChris Mason 1598e02119d5SChris Mason ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1599e02119d5SChris Mason 1600b3b4aa74SDavid Sterba btrfs_release_path(path); 1601e02119d5SChris Mason if (ret == 0) { 16029bf7a489SJosef Bacik if (!inode->i_nlink) 16039bf7a489SJosef Bacik set_nlink(inode, 1); 16049bf7a489SJosef Bacik else 16058b558c5fSZach Brown inc_nlink(inode); 1606b9959295STsutomu Itoh ret = btrfs_update_inode(trans, root, inode); 1607e02119d5SChris Mason } else if (ret == -EEXIST) { 1608e02119d5SChris Mason ret = 0; 1609e02119d5SChris Mason } else { 16103650860bSJosef Bacik BUG(); /* Logic Error */ 1611e02119d5SChris Mason } 1612e02119d5SChris Mason iput(inode); 1613e02119d5SChris Mason 1614e02119d5SChris Mason return ret; 1615e02119d5SChris Mason } 1616e02119d5SChris Mason 1617e02119d5SChris Mason /* 1618e02119d5SChris Mason * when replaying the log for a directory, we only insert names 1619e02119d5SChris Mason * for inodes that actually exist. 
This means an fsync on a directory 1620e02119d5SChris Mason * does not implicitly fsync all the new files in it 1621e02119d5SChris Mason */ 1622e02119d5SChris Mason static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1623e02119d5SChris Mason struct btrfs_root *root, 1624e02119d5SChris Mason u64 dirid, u64 index, 162560d53eb3SZhaolei char *name, int name_len, 1626e02119d5SChris Mason struct btrfs_key *location) 1627e02119d5SChris Mason { 1628e02119d5SChris Mason struct inode *inode; 1629e02119d5SChris Mason struct inode *dir; 1630e02119d5SChris Mason int ret; 1631e02119d5SChris Mason 1632e02119d5SChris Mason inode = read_one_inode(root, location->objectid); 1633e02119d5SChris Mason if (!inode) 1634e02119d5SChris Mason return -ENOENT; 1635e02119d5SChris Mason 1636e02119d5SChris Mason dir = read_one_inode(root, dirid); 1637e02119d5SChris Mason if (!dir) { 1638e02119d5SChris Mason iput(inode); 1639e02119d5SChris Mason return -EIO; 1640e02119d5SChris Mason } 1641d555438bSJosef Bacik 1642e02119d5SChris Mason ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1643e02119d5SChris Mason 1644e02119d5SChris Mason /* FIXME, put inode into FIXUP list */ 1645e02119d5SChris Mason 1646e02119d5SChris Mason iput(inode); 1647e02119d5SChris Mason iput(dir); 1648e02119d5SChris Mason return ret; 1649e02119d5SChris Mason } 1650e02119d5SChris Mason 1651e02119d5SChris Mason /* 1652df8d116fSFilipe Manana * Return true if an inode reference exists in the log for the given name, 1653df8d116fSFilipe Manana * inode and parent inode. 
1654df8d116fSFilipe Manana */ 1655df8d116fSFilipe Manana static bool name_in_log_ref(struct btrfs_root *log_root, 1656df8d116fSFilipe Manana const char *name, const int name_len, 1657df8d116fSFilipe Manana const u64 dirid, const u64 ino) 1658df8d116fSFilipe Manana { 1659df8d116fSFilipe Manana struct btrfs_key search_key; 1660df8d116fSFilipe Manana 1661df8d116fSFilipe Manana search_key.objectid = ino; 1662df8d116fSFilipe Manana search_key.type = BTRFS_INODE_REF_KEY; 1663df8d116fSFilipe Manana search_key.offset = dirid; 1664df8d116fSFilipe Manana if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1665df8d116fSFilipe Manana return true; 1666df8d116fSFilipe Manana 1667df8d116fSFilipe Manana search_key.type = BTRFS_INODE_EXTREF_KEY; 1668df8d116fSFilipe Manana search_key.offset = btrfs_extref_hash(dirid, name, name_len); 1669df8d116fSFilipe Manana if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1670df8d116fSFilipe Manana return true; 1671df8d116fSFilipe Manana 1672df8d116fSFilipe Manana return false; 1673df8d116fSFilipe Manana } 1674df8d116fSFilipe Manana 1675df8d116fSFilipe Manana /* 1676e02119d5SChris Mason * take a single entry in a log directory item and replay it into 1677e02119d5SChris Mason * the subvolume. 1678e02119d5SChris Mason * 1679e02119d5SChris Mason * if a conflicting item exists in the subdirectory already, 1680e02119d5SChris Mason * the inode it points to is unlinked and put into the link count 1681e02119d5SChris Mason * fix up tree. 1682e02119d5SChris Mason * 1683e02119d5SChris Mason * If a name from the log points to a file or directory that does 1684e02119d5SChris Mason * not exist in the FS, it is skipped. fsyncs on directories 1685e02119d5SChris Mason * do not force down inodes inside that directory, just changes to the 1686e02119d5SChris Mason * names or unlinks in a directory. 
1687bb53eda9SFilipe Manana * 1688bb53eda9SFilipe Manana * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1689bb53eda9SFilipe Manana * non-existing inode) and 1 if the name was replayed. 1690e02119d5SChris Mason */ 1691e02119d5SChris Mason static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1692e02119d5SChris Mason struct btrfs_root *root, 1693e02119d5SChris Mason struct btrfs_path *path, 1694e02119d5SChris Mason struct extent_buffer *eb, 1695e02119d5SChris Mason struct btrfs_dir_item *di, 1696e02119d5SChris Mason struct btrfs_key *key) 1697e02119d5SChris Mason { 1698e02119d5SChris Mason char *name; 1699e02119d5SChris Mason int name_len; 1700e02119d5SChris Mason struct btrfs_dir_item *dst_di; 1701e02119d5SChris Mason struct btrfs_key found_key; 1702e02119d5SChris Mason struct btrfs_key log_key; 1703e02119d5SChris Mason struct inode *dir; 1704e02119d5SChris Mason u8 log_type; 17054bef0848SChris Mason int exists; 17063650860bSJosef Bacik int ret = 0; 1707d555438bSJosef Bacik bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); 1708bb53eda9SFilipe Manana bool name_added = false; 1709e02119d5SChris Mason 1710e02119d5SChris Mason dir = read_one_inode(root, key->objectid); 1711c00e9493STsutomu Itoh if (!dir) 1712c00e9493STsutomu Itoh return -EIO; 1713e02119d5SChris Mason 1714e02119d5SChris Mason name_len = btrfs_dir_name_len(eb, di); 1715e02119d5SChris Mason name = kmalloc(name_len, GFP_NOFS); 17162bac325eSFilipe David Borba Manana if (!name) { 17172bac325eSFilipe David Borba Manana ret = -ENOMEM; 17182bac325eSFilipe David Borba Manana goto out; 17192bac325eSFilipe David Borba Manana } 17202a29edc6Sliubo 1721e02119d5SChris Mason log_type = btrfs_dir_type(eb, di); 1722e02119d5SChris Mason read_extent_buffer(eb, name, (unsigned long)(di + 1), 1723e02119d5SChris Mason name_len); 1724e02119d5SChris Mason 1725e02119d5SChris Mason btrfs_dir_item_key_to_cpu(eb, di, &log_key); 17264bef0848SChris Mason exists = btrfs_lookup_inode(trans, 
root, path, &log_key, 0); 17274bef0848SChris Mason if (exists == 0) 17284bef0848SChris Mason exists = 1; 17294bef0848SChris Mason else 17304bef0848SChris Mason exists = 0; 1731b3b4aa74SDavid Sterba btrfs_release_path(path); 17324bef0848SChris Mason 1733e02119d5SChris Mason if (key->type == BTRFS_DIR_ITEM_KEY) { 1734e02119d5SChris Mason dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1735e02119d5SChris Mason name, name_len, 1); 1736d397712bSChris Mason } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1737e02119d5SChris Mason dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1738e02119d5SChris Mason key->objectid, 1739e02119d5SChris Mason key->offset, name, 1740e02119d5SChris Mason name_len, 1); 1741e02119d5SChris Mason } else { 17423650860bSJosef Bacik /* Corruption */ 17433650860bSJosef Bacik ret = -EINVAL; 17443650860bSJosef Bacik goto out; 1745e02119d5SChris Mason } 1746c704005dSDavid Sterba if (IS_ERR_OR_NULL(dst_di)) { 1747e02119d5SChris Mason /* we need a sequence number to insert, so we only 1748e02119d5SChris Mason * do inserts for the BTRFS_DIR_INDEX_KEY types 1749e02119d5SChris Mason */ 1750e02119d5SChris Mason if (key->type != BTRFS_DIR_INDEX_KEY) 1751e02119d5SChris Mason goto out; 1752e02119d5SChris Mason goto insert; 1753e02119d5SChris Mason } 1754e02119d5SChris Mason 1755e02119d5SChris Mason btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1756e02119d5SChris Mason /* the existing item matches the logged item */ 1757e02119d5SChris Mason if (found_key.objectid == log_key.objectid && 1758e02119d5SChris Mason found_key.type == log_key.type && 1759e02119d5SChris Mason found_key.offset == log_key.offset && 1760e02119d5SChris Mason btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1761a2cc11dbSFilipe Manana update_size = false; 1762e02119d5SChris Mason goto out; 1763e02119d5SChris Mason } 1764e02119d5SChris Mason 1765e02119d5SChris Mason /* 1766e02119d5SChris Mason * don't drop the conflicting directory entry if the 
inode 1767e02119d5SChris Mason * for the new entry doesn't exist 1768e02119d5SChris Mason */ 17694bef0848SChris Mason if (!exists) 1770e02119d5SChris Mason goto out; 1771e02119d5SChris Mason 1772e02119d5SChris Mason ret = drop_one_dir_item(trans, root, path, dir, dst_di); 17733650860bSJosef Bacik if (ret) 17743650860bSJosef Bacik goto out; 1775e02119d5SChris Mason 1776e02119d5SChris Mason if (key->type == BTRFS_DIR_INDEX_KEY) 1777e02119d5SChris Mason goto insert; 1778e02119d5SChris Mason out: 1779b3b4aa74SDavid Sterba btrfs_release_path(path); 1780d555438bSJosef Bacik if (!ret && update_size) { 1781d555438bSJosef Bacik btrfs_i_size_write(dir, dir->i_size + name_len * 2); 1782d555438bSJosef Bacik ret = btrfs_update_inode(trans, root, dir); 1783d555438bSJosef Bacik } 1784e02119d5SChris Mason kfree(name); 1785e02119d5SChris Mason iput(dir); 1786bb53eda9SFilipe Manana if (!ret && name_added) 1787bb53eda9SFilipe Manana ret = 1; 17883650860bSJosef Bacik return ret; 1789e02119d5SChris Mason 1790e02119d5SChris Mason insert: 1791df8d116fSFilipe Manana if (name_in_log_ref(root->log_root, name, name_len, 1792df8d116fSFilipe Manana key->objectid, log_key.objectid)) { 1793df8d116fSFilipe Manana /* The dentry will be added later. 
*/ 1794df8d116fSFilipe Manana ret = 0; 1795df8d116fSFilipe Manana update_size = false; 1796df8d116fSFilipe Manana goto out; 1797df8d116fSFilipe Manana } 1798b3b4aa74SDavid Sterba btrfs_release_path(path); 179960d53eb3SZhaolei ret = insert_one_name(trans, root, key->objectid, key->offset, 180060d53eb3SZhaolei name, name_len, &log_key); 1801df8d116fSFilipe Manana if (ret && ret != -ENOENT && ret != -EEXIST) 18023650860bSJosef Bacik goto out; 1803bb53eda9SFilipe Manana if (!ret) 1804bb53eda9SFilipe Manana name_added = true; 1805d555438bSJosef Bacik update_size = false; 18063650860bSJosef Bacik ret = 0; 1807e02119d5SChris Mason goto out; 1808e02119d5SChris Mason } 1809e02119d5SChris Mason 1810e02119d5SChris Mason /* 1811e02119d5SChris Mason * find all the names in a directory item and reconcile them into 1812e02119d5SChris Mason * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1813e02119d5SChris Mason * one name in a directory item, but the same code gets used for 1814e02119d5SChris Mason * both directory index types 1815e02119d5SChris Mason */ 1816e02119d5SChris Mason static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1817e02119d5SChris Mason struct btrfs_root *root, 1818e02119d5SChris Mason struct btrfs_path *path, 1819e02119d5SChris Mason struct extent_buffer *eb, int slot, 1820e02119d5SChris Mason struct btrfs_key *key) 1821e02119d5SChris Mason { 18222ff7e61eSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 1823bb53eda9SFilipe Manana int ret = 0; 1824e02119d5SChris Mason u32 item_size = btrfs_item_size_nr(eb, slot); 1825e02119d5SChris Mason struct btrfs_dir_item *di; 1826e02119d5SChris Mason int name_len; 1827e02119d5SChris Mason unsigned long ptr; 1828e02119d5SChris Mason unsigned long ptr_end; 1829bb53eda9SFilipe Manana struct btrfs_path *fixup_path = NULL; 1830e02119d5SChris Mason 1831e02119d5SChris Mason ptr = btrfs_item_ptr_offset(eb, slot); 1832e02119d5SChris Mason ptr_end = ptr + item_size; 
1833e02119d5SChris Mason while (ptr < ptr_end) { 1834e02119d5SChris Mason di = (struct btrfs_dir_item *)ptr; 18352ff7e61eSJeff Mahoney if (verify_dir_item(fs_info, eb, di)) 183622a94d44SJosef Bacik return -EIO; 1837e02119d5SChris Mason name_len = btrfs_dir_name_len(eb, di); 1838e02119d5SChris Mason ret = replay_one_name(trans, root, path, eb, di, key); 1839bb53eda9SFilipe Manana if (ret < 0) 1840bb53eda9SFilipe Manana break; 1841e02119d5SChris Mason ptr = (unsigned long)(di + 1); 1842e02119d5SChris Mason ptr += name_len; 1843bb53eda9SFilipe Manana 1844bb53eda9SFilipe Manana /* 1845bb53eda9SFilipe Manana * If this entry refers to a non-directory (directories can not 1846bb53eda9SFilipe Manana * have a link count > 1) and it was added in the transaction 1847bb53eda9SFilipe Manana * that was not committed, make sure we fixup the link count of 1848bb53eda9SFilipe Manana * the inode it the entry points to. Otherwise something like 1849bb53eda9SFilipe Manana * the following would result in a directory pointing to an 1850bb53eda9SFilipe Manana * inode with a wrong link that does not account for this dir 1851bb53eda9SFilipe Manana * entry: 1852bb53eda9SFilipe Manana * 1853bb53eda9SFilipe Manana * mkdir testdir 1854bb53eda9SFilipe Manana * touch testdir/foo 1855bb53eda9SFilipe Manana * touch testdir/bar 1856bb53eda9SFilipe Manana * sync 1857bb53eda9SFilipe Manana * 1858bb53eda9SFilipe Manana * ln testdir/bar testdir/bar_link 1859bb53eda9SFilipe Manana * ln testdir/foo testdir/foo_link 1860bb53eda9SFilipe Manana * xfs_io -c "fsync" testdir/bar 1861bb53eda9SFilipe Manana * 1862bb53eda9SFilipe Manana * <power failure> 1863bb53eda9SFilipe Manana * 1864bb53eda9SFilipe Manana * mount fs, log replay happens 1865bb53eda9SFilipe Manana * 1866bb53eda9SFilipe Manana * File foo would remain with a link count of 1 when it has two 1867bb53eda9SFilipe Manana * entries pointing to it in the directory testdir. 
This would 1868bb53eda9SFilipe Manana * make it impossible to ever delete the parent directory has 1869bb53eda9SFilipe Manana * it would result in stale dentries that can never be deleted. 1870bb53eda9SFilipe Manana */ 1871bb53eda9SFilipe Manana if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 1872bb53eda9SFilipe Manana struct btrfs_key di_key; 1873bb53eda9SFilipe Manana 1874bb53eda9SFilipe Manana if (!fixup_path) { 1875bb53eda9SFilipe Manana fixup_path = btrfs_alloc_path(); 1876bb53eda9SFilipe Manana if (!fixup_path) { 1877bb53eda9SFilipe Manana ret = -ENOMEM; 1878bb53eda9SFilipe Manana break; 1879e02119d5SChris Mason } 1880bb53eda9SFilipe Manana } 1881bb53eda9SFilipe Manana 1882bb53eda9SFilipe Manana btrfs_dir_item_key_to_cpu(eb, di, &di_key); 1883bb53eda9SFilipe Manana ret = link_to_fixup_dir(trans, root, fixup_path, 1884bb53eda9SFilipe Manana di_key.objectid); 1885bb53eda9SFilipe Manana if (ret) 1886bb53eda9SFilipe Manana break; 1887bb53eda9SFilipe Manana } 1888bb53eda9SFilipe Manana ret = 0; 1889bb53eda9SFilipe Manana } 1890bb53eda9SFilipe Manana btrfs_free_path(fixup_path); 1891bb53eda9SFilipe Manana return ret; 1892e02119d5SChris Mason } 1893e02119d5SChris Mason 1894e02119d5SChris Mason /* 1895e02119d5SChris Mason * directory replay has two parts. There are the standard directory 1896e02119d5SChris Mason * items in the log copied from the subvolume, and range items 1897e02119d5SChris Mason * created in the log while the subvolume was logged. 1898e02119d5SChris Mason * 1899e02119d5SChris Mason * The range items tell us which parts of the key space the log 1900e02119d5SChris Mason * is authoritative for. During replay, if a key in the subvolume 1901e02119d5SChris Mason * directory is in a logged range item, but not actually in the log 1902e02119d5SChris Mason * that means it was deleted from the directory before the fsync 1903e02119d5SChris Mason * and should be removed. 
1904e02119d5SChris Mason */ 1905e02119d5SChris Mason static noinline int find_dir_range(struct btrfs_root *root, 1906e02119d5SChris Mason struct btrfs_path *path, 1907e02119d5SChris Mason u64 dirid, int key_type, 1908e02119d5SChris Mason u64 *start_ret, u64 *end_ret) 1909e02119d5SChris Mason { 1910e02119d5SChris Mason struct btrfs_key key; 1911e02119d5SChris Mason u64 found_end; 1912e02119d5SChris Mason struct btrfs_dir_log_item *item; 1913e02119d5SChris Mason int ret; 1914e02119d5SChris Mason int nritems; 1915e02119d5SChris Mason 1916e02119d5SChris Mason if (*start_ret == (u64)-1) 1917e02119d5SChris Mason return 1; 1918e02119d5SChris Mason 1919e02119d5SChris Mason key.objectid = dirid; 1920e02119d5SChris Mason key.type = key_type; 1921e02119d5SChris Mason key.offset = *start_ret; 1922e02119d5SChris Mason 1923e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1924e02119d5SChris Mason if (ret < 0) 1925e02119d5SChris Mason goto out; 1926e02119d5SChris Mason if (ret > 0) { 1927e02119d5SChris Mason if (path->slots[0] == 0) 1928e02119d5SChris Mason goto out; 1929e02119d5SChris Mason path->slots[0]--; 1930e02119d5SChris Mason } 1931e02119d5SChris Mason if (ret != 0) 1932e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1933e02119d5SChris Mason 1934e02119d5SChris Mason if (key.type != key_type || key.objectid != dirid) { 1935e02119d5SChris Mason ret = 1; 1936e02119d5SChris Mason goto next; 1937e02119d5SChris Mason } 1938e02119d5SChris Mason item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1939e02119d5SChris Mason struct btrfs_dir_log_item); 1940e02119d5SChris Mason found_end = btrfs_dir_log_end(path->nodes[0], item); 1941e02119d5SChris Mason 1942e02119d5SChris Mason if (*start_ret >= key.offset && *start_ret <= found_end) { 1943e02119d5SChris Mason ret = 0; 1944e02119d5SChris Mason *start_ret = key.offset; 1945e02119d5SChris Mason *end_ret = found_end; 1946e02119d5SChris Mason goto out; 1947e02119d5SChris Mason 
} 1948e02119d5SChris Mason ret = 1; 1949e02119d5SChris Mason next: 1950e02119d5SChris Mason /* check the next slot in the tree to see if it is a valid item */ 1951e02119d5SChris Mason nritems = btrfs_header_nritems(path->nodes[0]); 19522a7bf53fSRobbie Ko path->slots[0]++; 1953e02119d5SChris Mason if (path->slots[0] >= nritems) { 1954e02119d5SChris Mason ret = btrfs_next_leaf(root, path); 1955e02119d5SChris Mason if (ret) 1956e02119d5SChris Mason goto out; 1957e02119d5SChris Mason } 1958e02119d5SChris Mason 1959e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1960e02119d5SChris Mason 1961e02119d5SChris Mason if (key.type != key_type || key.objectid != dirid) { 1962e02119d5SChris Mason ret = 1; 1963e02119d5SChris Mason goto out; 1964e02119d5SChris Mason } 1965e02119d5SChris Mason item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1966e02119d5SChris Mason struct btrfs_dir_log_item); 1967e02119d5SChris Mason found_end = btrfs_dir_log_end(path->nodes[0], item); 1968e02119d5SChris Mason *start_ret = key.offset; 1969e02119d5SChris Mason *end_ret = found_end; 1970e02119d5SChris Mason ret = 0; 1971e02119d5SChris Mason out: 1972b3b4aa74SDavid Sterba btrfs_release_path(path); 1973e02119d5SChris Mason return ret; 1974e02119d5SChris Mason } 1975e02119d5SChris Mason 1976e02119d5SChris Mason /* 1977e02119d5SChris Mason * this looks for a given directory item in the log. 
If the directory 1978e02119d5SChris Mason * item is not in the log, the item is removed and the inode it points 1979e02119d5SChris Mason * to is unlinked 1980e02119d5SChris Mason */ 1981e02119d5SChris Mason static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1982e02119d5SChris Mason struct btrfs_root *root, 1983e02119d5SChris Mason struct btrfs_root *log, 1984e02119d5SChris Mason struct btrfs_path *path, 1985e02119d5SChris Mason struct btrfs_path *log_path, 1986e02119d5SChris Mason struct inode *dir, 1987e02119d5SChris Mason struct btrfs_key *dir_key) 1988e02119d5SChris Mason { 19892ff7e61eSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 1990e02119d5SChris Mason int ret; 1991e02119d5SChris Mason struct extent_buffer *eb; 1992e02119d5SChris Mason int slot; 1993e02119d5SChris Mason u32 item_size; 1994e02119d5SChris Mason struct btrfs_dir_item *di; 1995e02119d5SChris Mason struct btrfs_dir_item *log_di; 1996e02119d5SChris Mason int name_len; 1997e02119d5SChris Mason unsigned long ptr; 1998e02119d5SChris Mason unsigned long ptr_end; 1999e02119d5SChris Mason char *name; 2000e02119d5SChris Mason struct inode *inode; 2001e02119d5SChris Mason struct btrfs_key location; 2002e02119d5SChris Mason 2003e02119d5SChris Mason again: 2004e02119d5SChris Mason eb = path->nodes[0]; 2005e02119d5SChris Mason slot = path->slots[0]; 2006e02119d5SChris Mason item_size = btrfs_item_size_nr(eb, slot); 2007e02119d5SChris Mason ptr = btrfs_item_ptr_offset(eb, slot); 2008e02119d5SChris Mason ptr_end = ptr + item_size; 2009e02119d5SChris Mason while (ptr < ptr_end) { 2010e02119d5SChris Mason di = (struct btrfs_dir_item *)ptr; 20112ff7e61eSJeff Mahoney if (verify_dir_item(fs_info, eb, di)) { 201222a94d44SJosef Bacik ret = -EIO; 201322a94d44SJosef Bacik goto out; 201422a94d44SJosef Bacik } 201522a94d44SJosef Bacik 2016e02119d5SChris Mason name_len = btrfs_dir_name_len(eb, di); 2017e02119d5SChris Mason name = kmalloc(name_len, GFP_NOFS); 2018e02119d5SChris Mason if 
(!name) { 2019e02119d5SChris Mason ret = -ENOMEM; 2020e02119d5SChris Mason goto out; 2021e02119d5SChris Mason } 2022e02119d5SChris Mason read_extent_buffer(eb, name, (unsigned long)(di + 1), 2023e02119d5SChris Mason name_len); 2024e02119d5SChris Mason log_di = NULL; 202512fcfd22SChris Mason if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2026e02119d5SChris Mason log_di = btrfs_lookup_dir_item(trans, log, log_path, 2027e02119d5SChris Mason dir_key->objectid, 2028e02119d5SChris Mason name, name_len, 0); 202912fcfd22SChris Mason } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2030e02119d5SChris Mason log_di = btrfs_lookup_dir_index_item(trans, log, 2031e02119d5SChris Mason log_path, 2032e02119d5SChris Mason dir_key->objectid, 2033e02119d5SChris Mason dir_key->offset, 2034e02119d5SChris Mason name, name_len, 0); 2035e02119d5SChris Mason } 2036269d040fSFilipe David Borba Manana if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 2037e02119d5SChris Mason btrfs_dir_item_key_to_cpu(eb, di, &location); 2038b3b4aa74SDavid Sterba btrfs_release_path(path); 2039b3b4aa74SDavid Sterba btrfs_release_path(log_path); 2040e02119d5SChris Mason inode = read_one_inode(root, location.objectid); 2041c00e9493STsutomu Itoh if (!inode) { 2042c00e9493STsutomu Itoh kfree(name); 2043c00e9493STsutomu Itoh return -EIO; 2044c00e9493STsutomu Itoh } 2045e02119d5SChris Mason 2046e02119d5SChris Mason ret = link_to_fixup_dir(trans, root, 2047e02119d5SChris Mason path, location.objectid); 20483650860bSJosef Bacik if (ret) { 20493650860bSJosef Bacik kfree(name); 20503650860bSJosef Bacik iput(inode); 20513650860bSJosef Bacik goto out; 20523650860bSJosef Bacik } 20533650860bSJosef Bacik 20548b558c5fSZach Brown inc_nlink(inode); 2055e02119d5SChris Mason ret = btrfs_unlink_inode(trans, root, dir, inode, 2056e02119d5SChris Mason name, name_len); 20573650860bSJosef Bacik if (!ret) 20582ff7e61eSJeff Mahoney ret = btrfs_run_delayed_items(trans, fs_info); 2059e02119d5SChris Mason 
kfree(name); 2060e02119d5SChris Mason iput(inode); 20613650860bSJosef Bacik if (ret) 20623650860bSJosef Bacik goto out; 2063e02119d5SChris Mason 2064e02119d5SChris Mason /* there might still be more names under this key 2065e02119d5SChris Mason * check and repeat if required 2066e02119d5SChris Mason */ 2067e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, dir_key, path, 2068e02119d5SChris Mason 0, 0); 2069e02119d5SChris Mason if (ret == 0) 2070e02119d5SChris Mason goto again; 2071e02119d5SChris Mason ret = 0; 2072e02119d5SChris Mason goto out; 2073269d040fSFilipe David Borba Manana } else if (IS_ERR(log_di)) { 2074269d040fSFilipe David Borba Manana kfree(name); 2075269d040fSFilipe David Borba Manana return PTR_ERR(log_di); 2076e02119d5SChris Mason } 2077b3b4aa74SDavid Sterba btrfs_release_path(log_path); 2078e02119d5SChris Mason kfree(name); 2079e02119d5SChris Mason 2080e02119d5SChris Mason ptr = (unsigned long)(di + 1); 2081e02119d5SChris Mason ptr += name_len; 2082e02119d5SChris Mason } 2083e02119d5SChris Mason ret = 0; 2084e02119d5SChris Mason out: 2085b3b4aa74SDavid Sterba btrfs_release_path(path); 2086b3b4aa74SDavid Sterba btrfs_release_path(log_path); 2087e02119d5SChris Mason return ret; 2088e02119d5SChris Mason } 2089e02119d5SChris Mason 20904f764e51SFilipe Manana static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 20914f764e51SFilipe Manana struct btrfs_root *root, 20924f764e51SFilipe Manana struct btrfs_root *log, 20934f764e51SFilipe Manana struct btrfs_path *path, 20944f764e51SFilipe Manana const u64 ino) 20954f764e51SFilipe Manana { 20964f764e51SFilipe Manana struct btrfs_key search_key; 20974f764e51SFilipe Manana struct btrfs_path *log_path; 20984f764e51SFilipe Manana int i; 20994f764e51SFilipe Manana int nritems; 21004f764e51SFilipe Manana int ret; 21014f764e51SFilipe Manana 21024f764e51SFilipe Manana log_path = btrfs_alloc_path(); 21034f764e51SFilipe Manana if (!log_path) 21044f764e51SFilipe Manana return -ENOMEM; 
21054f764e51SFilipe Manana 21064f764e51SFilipe Manana search_key.objectid = ino; 21074f764e51SFilipe Manana search_key.type = BTRFS_XATTR_ITEM_KEY; 21084f764e51SFilipe Manana search_key.offset = 0; 21094f764e51SFilipe Manana again: 21104f764e51SFilipe Manana ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 21114f764e51SFilipe Manana if (ret < 0) 21124f764e51SFilipe Manana goto out; 21134f764e51SFilipe Manana process_leaf: 21144f764e51SFilipe Manana nritems = btrfs_header_nritems(path->nodes[0]); 21154f764e51SFilipe Manana for (i = path->slots[0]; i < nritems; i++) { 21164f764e51SFilipe Manana struct btrfs_key key; 21174f764e51SFilipe Manana struct btrfs_dir_item *di; 21184f764e51SFilipe Manana struct btrfs_dir_item *log_di; 21194f764e51SFilipe Manana u32 total_size; 21204f764e51SFilipe Manana u32 cur; 21214f764e51SFilipe Manana 21224f764e51SFilipe Manana btrfs_item_key_to_cpu(path->nodes[0], &key, i); 21234f764e51SFilipe Manana if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 21244f764e51SFilipe Manana ret = 0; 21254f764e51SFilipe Manana goto out; 21264f764e51SFilipe Manana } 21274f764e51SFilipe Manana 21284f764e51SFilipe Manana di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 21294f764e51SFilipe Manana total_size = btrfs_item_size_nr(path->nodes[0], i); 21304f764e51SFilipe Manana cur = 0; 21314f764e51SFilipe Manana while (cur < total_size) { 21324f764e51SFilipe Manana u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 21334f764e51SFilipe Manana u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 21344f764e51SFilipe Manana u32 this_len = sizeof(*di) + name_len + data_len; 21354f764e51SFilipe Manana char *name; 21364f764e51SFilipe Manana 21374f764e51SFilipe Manana name = kmalloc(name_len, GFP_NOFS); 21384f764e51SFilipe Manana if (!name) { 21394f764e51SFilipe Manana ret = -ENOMEM; 21404f764e51SFilipe Manana goto out; 21414f764e51SFilipe Manana } 21424f764e51SFilipe Manana read_extent_buffer(path->nodes[0], name, 
21434f764e51SFilipe Manana (unsigned long)(di + 1), name_len); 21444f764e51SFilipe Manana 21454f764e51SFilipe Manana log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 21464f764e51SFilipe Manana name, name_len, 0); 21474f764e51SFilipe Manana btrfs_release_path(log_path); 21484f764e51SFilipe Manana if (!log_di) { 21494f764e51SFilipe Manana /* Doesn't exist in log tree, so delete it. */ 21504f764e51SFilipe Manana btrfs_release_path(path); 21514f764e51SFilipe Manana di = btrfs_lookup_xattr(trans, root, path, ino, 21524f764e51SFilipe Manana name, name_len, -1); 21534f764e51SFilipe Manana kfree(name); 21544f764e51SFilipe Manana if (IS_ERR(di)) { 21554f764e51SFilipe Manana ret = PTR_ERR(di); 21564f764e51SFilipe Manana goto out; 21574f764e51SFilipe Manana } 21584f764e51SFilipe Manana ASSERT(di); 21594f764e51SFilipe Manana ret = btrfs_delete_one_dir_name(trans, root, 21604f764e51SFilipe Manana path, di); 21614f764e51SFilipe Manana if (ret) 21624f764e51SFilipe Manana goto out; 21634f764e51SFilipe Manana btrfs_release_path(path); 21644f764e51SFilipe Manana search_key = key; 21654f764e51SFilipe Manana goto again; 21664f764e51SFilipe Manana } 21674f764e51SFilipe Manana kfree(name); 21684f764e51SFilipe Manana if (IS_ERR(log_di)) { 21694f764e51SFilipe Manana ret = PTR_ERR(log_di); 21704f764e51SFilipe Manana goto out; 21714f764e51SFilipe Manana } 21724f764e51SFilipe Manana cur += this_len; 21734f764e51SFilipe Manana di = (struct btrfs_dir_item *)((char *)di + this_len); 21744f764e51SFilipe Manana } 21754f764e51SFilipe Manana } 21764f764e51SFilipe Manana ret = btrfs_next_leaf(root, path); 21774f764e51SFilipe Manana if (ret > 0) 21784f764e51SFilipe Manana ret = 0; 21794f764e51SFilipe Manana else if (ret == 0) 21804f764e51SFilipe Manana goto process_leaf; 21814f764e51SFilipe Manana out: 21824f764e51SFilipe Manana btrfs_free_path(log_path); 21834f764e51SFilipe Manana btrfs_release_path(path); 21844f764e51SFilipe Manana return ret; 21854f764e51SFilipe Manana } 21864f764e51SFilipe 
Manana 21874f764e51SFilipe Manana 2188e02119d5SChris Mason /* 2189e02119d5SChris Mason * deletion replay happens before we copy any new directory items 2190e02119d5SChris Mason * out of the log or out of backreferences from inodes. It 2191e02119d5SChris Mason * scans the log to find ranges of keys that log is authoritative for, 2192e02119d5SChris Mason * and then scans the directory to find items in those ranges that are 2193e02119d5SChris Mason * not present in the log. 2194e02119d5SChris Mason * 2195e02119d5SChris Mason * Anything we don't find in the log is unlinked and removed from the 2196e02119d5SChris Mason * directory. 2197e02119d5SChris Mason */ 2198e02119d5SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2199e02119d5SChris Mason struct btrfs_root *root, 2200e02119d5SChris Mason struct btrfs_root *log, 2201e02119d5SChris Mason struct btrfs_path *path, 220212fcfd22SChris Mason u64 dirid, int del_all) 2203e02119d5SChris Mason { 2204e02119d5SChris Mason u64 range_start; 2205e02119d5SChris Mason u64 range_end; 2206e02119d5SChris Mason int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2207e02119d5SChris Mason int ret = 0; 2208e02119d5SChris Mason struct btrfs_key dir_key; 2209e02119d5SChris Mason struct btrfs_key found_key; 2210e02119d5SChris Mason struct btrfs_path *log_path; 2211e02119d5SChris Mason struct inode *dir; 2212e02119d5SChris Mason 2213e02119d5SChris Mason dir_key.objectid = dirid; 2214e02119d5SChris Mason dir_key.type = BTRFS_DIR_ITEM_KEY; 2215e02119d5SChris Mason log_path = btrfs_alloc_path(); 2216e02119d5SChris Mason if (!log_path) 2217e02119d5SChris Mason return -ENOMEM; 2218e02119d5SChris Mason 2219e02119d5SChris Mason dir = read_one_inode(root, dirid); 2220e02119d5SChris Mason /* it isn't an error if the inode isn't there, that can happen 2221e02119d5SChris Mason * because we replay the deletes before we copy in the inode item 2222e02119d5SChris Mason * from the log 2223e02119d5SChris Mason */ 2224e02119d5SChris 
Mason if (!dir) { 2225e02119d5SChris Mason btrfs_free_path(log_path); 2226e02119d5SChris Mason return 0; 2227e02119d5SChris Mason } 2228e02119d5SChris Mason again: 2229e02119d5SChris Mason range_start = 0; 2230e02119d5SChris Mason range_end = 0; 2231e02119d5SChris Mason while (1) { 223212fcfd22SChris Mason if (del_all) 223312fcfd22SChris Mason range_end = (u64)-1; 223412fcfd22SChris Mason else { 2235e02119d5SChris Mason ret = find_dir_range(log, path, dirid, key_type, 2236e02119d5SChris Mason &range_start, &range_end); 2237e02119d5SChris Mason if (ret != 0) 2238e02119d5SChris Mason break; 223912fcfd22SChris Mason } 2240e02119d5SChris Mason 2241e02119d5SChris Mason dir_key.offset = range_start; 2242e02119d5SChris Mason while (1) { 2243e02119d5SChris Mason int nritems; 2244e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, &dir_key, path, 2245e02119d5SChris Mason 0, 0); 2246e02119d5SChris Mason if (ret < 0) 2247e02119d5SChris Mason goto out; 2248e02119d5SChris Mason 2249e02119d5SChris Mason nritems = btrfs_header_nritems(path->nodes[0]); 2250e02119d5SChris Mason if (path->slots[0] >= nritems) { 2251e02119d5SChris Mason ret = btrfs_next_leaf(root, path); 2252e02119d5SChris Mason if (ret) 2253e02119d5SChris Mason break; 2254e02119d5SChris Mason } 2255e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2256e02119d5SChris Mason path->slots[0]); 2257e02119d5SChris Mason if (found_key.objectid != dirid || 2258e02119d5SChris Mason found_key.type != dir_key.type) 2259e02119d5SChris Mason goto next_type; 2260e02119d5SChris Mason 2261e02119d5SChris Mason if (found_key.offset > range_end) 2262e02119d5SChris Mason break; 2263e02119d5SChris Mason 2264e02119d5SChris Mason ret = check_item_in_log(trans, root, log, path, 226512fcfd22SChris Mason log_path, dir, 226612fcfd22SChris Mason &found_key); 22673650860bSJosef Bacik if (ret) 22683650860bSJosef Bacik goto out; 2269e02119d5SChris Mason if (found_key.offset == (u64)-1) 2270e02119d5SChris Mason break; 
2271e02119d5SChris Mason dir_key.offset = found_key.offset + 1; 2272e02119d5SChris Mason } 2273b3b4aa74SDavid Sterba btrfs_release_path(path); 2274e02119d5SChris Mason if (range_end == (u64)-1) 2275e02119d5SChris Mason break; 2276e02119d5SChris Mason range_start = range_end + 1; 2277e02119d5SChris Mason } 2278e02119d5SChris Mason 2279e02119d5SChris Mason next_type: 2280e02119d5SChris Mason ret = 0; 2281e02119d5SChris Mason if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2282e02119d5SChris Mason key_type = BTRFS_DIR_LOG_INDEX_KEY; 2283e02119d5SChris Mason dir_key.type = BTRFS_DIR_INDEX_KEY; 2284b3b4aa74SDavid Sterba btrfs_release_path(path); 2285e02119d5SChris Mason goto again; 2286e02119d5SChris Mason } 2287e02119d5SChris Mason out: 2288b3b4aa74SDavid Sterba btrfs_release_path(path); 2289e02119d5SChris Mason btrfs_free_path(log_path); 2290e02119d5SChris Mason iput(dir); 2291e02119d5SChris Mason return ret; 2292e02119d5SChris Mason } 2293e02119d5SChris Mason 2294e02119d5SChris Mason /* 2295e02119d5SChris Mason * the process_func used to replay items from the log tree. This 2296e02119d5SChris Mason * gets called in two different stages. The first stage just looks 2297e02119d5SChris Mason * for inodes and makes sure they are all copied into the subvolume. 2298e02119d5SChris Mason * 2299e02119d5SChris Mason * The second stage copies all the other item types from the log into 2300e02119d5SChris Mason * the subvolume. The two stage approach is slower, but gets rid of 2301e02119d5SChris Mason * lots of complexity around inodes referencing other inodes that exist 2302e02119d5SChris Mason * only in the log (references come from either directory items or inode 2303e02119d5SChris Mason * back refs). 
2304e02119d5SChris Mason */ 2305e02119d5SChris Mason static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2306e02119d5SChris Mason struct walk_control *wc, u64 gen) 2307e02119d5SChris Mason { 2308e02119d5SChris Mason int nritems; 2309e02119d5SChris Mason struct btrfs_path *path; 2310e02119d5SChris Mason struct btrfs_root *root = wc->replay_dest; 2311e02119d5SChris Mason struct btrfs_key key; 2312e02119d5SChris Mason int level; 2313e02119d5SChris Mason int i; 2314e02119d5SChris Mason int ret; 2315e02119d5SChris Mason 2316018642a1STsutomu Itoh ret = btrfs_read_buffer(eb, gen); 2317018642a1STsutomu Itoh if (ret) 2318018642a1STsutomu Itoh return ret; 2319e02119d5SChris Mason 2320e02119d5SChris Mason level = btrfs_header_level(eb); 2321e02119d5SChris Mason 2322e02119d5SChris Mason if (level != 0) 2323e02119d5SChris Mason return 0; 2324e02119d5SChris Mason 2325e02119d5SChris Mason path = btrfs_alloc_path(); 23261e5063d0SMark Fasheh if (!path) 23271e5063d0SMark Fasheh return -ENOMEM; 2328e02119d5SChris Mason 2329e02119d5SChris Mason nritems = btrfs_header_nritems(eb); 2330e02119d5SChris Mason for (i = 0; i < nritems; i++) { 2331e02119d5SChris Mason btrfs_item_key_to_cpu(eb, &key, i); 2332e02119d5SChris Mason 2333e02119d5SChris Mason /* inode keys are done during the first stage */ 2334e02119d5SChris Mason if (key.type == BTRFS_INODE_ITEM_KEY && 2335e02119d5SChris Mason wc->stage == LOG_WALK_REPLAY_INODES) { 2336e02119d5SChris Mason struct btrfs_inode_item *inode_item; 2337e02119d5SChris Mason u32 mode; 2338e02119d5SChris Mason 2339e02119d5SChris Mason inode_item = btrfs_item_ptr(eb, i, 2340e02119d5SChris Mason struct btrfs_inode_item); 23414f764e51SFilipe Manana ret = replay_xattr_deletes(wc->trans, root, log, 23424f764e51SFilipe Manana path, key.objectid); 23434f764e51SFilipe Manana if (ret) 23444f764e51SFilipe Manana break; 2345e02119d5SChris Mason mode = btrfs_inode_mode(eb, inode_item); 2346e02119d5SChris Mason if (S_ISDIR(mode)) { 
2347e02119d5SChris Mason ret = replay_dir_deletes(wc->trans, 234812fcfd22SChris Mason root, log, path, key.objectid, 0); 2349b50c6e25SJosef Bacik if (ret) 2350b50c6e25SJosef Bacik break; 2351e02119d5SChris Mason } 2352e02119d5SChris Mason ret = overwrite_item(wc->trans, root, path, 2353e02119d5SChris Mason eb, i, &key); 2354b50c6e25SJosef Bacik if (ret) 2355b50c6e25SJosef Bacik break; 2356e02119d5SChris Mason 2357c71bf099SYan, Zheng /* for regular files, make sure corresponding 235801327610SNicholas D Steeves * orphan item exist. extents past the new EOF 2359c71bf099SYan, Zheng * will be truncated later by orphan cleanup. 2360e02119d5SChris Mason */ 2361e02119d5SChris Mason if (S_ISREG(mode)) { 2362c71bf099SYan, Zheng ret = insert_orphan_item(wc->trans, root, 2363e02119d5SChris Mason key.objectid); 2364b50c6e25SJosef Bacik if (ret) 2365b50c6e25SJosef Bacik break; 2366c71bf099SYan, Zheng } 2367a74ac322SChris Mason 2368e02119d5SChris Mason ret = link_to_fixup_dir(wc->trans, root, 2369e02119d5SChris Mason path, key.objectid); 2370b50c6e25SJosef Bacik if (ret) 2371b50c6e25SJosef Bacik break; 2372e02119d5SChris Mason } 2373dd8e7217SJosef Bacik 2374dd8e7217SJosef Bacik if (key.type == BTRFS_DIR_INDEX_KEY && 2375dd8e7217SJosef Bacik wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2376dd8e7217SJosef Bacik ret = replay_one_dir_item(wc->trans, root, path, 2377dd8e7217SJosef Bacik eb, i, &key); 2378dd8e7217SJosef Bacik if (ret) 2379dd8e7217SJosef Bacik break; 2380dd8e7217SJosef Bacik } 2381dd8e7217SJosef Bacik 2382e02119d5SChris Mason if (wc->stage < LOG_WALK_REPLAY_ALL) 2383e02119d5SChris Mason continue; 2384e02119d5SChris Mason 2385e02119d5SChris Mason /* these keys are simply copied */ 2386e02119d5SChris Mason if (key.type == BTRFS_XATTR_ITEM_KEY) { 2387e02119d5SChris Mason ret = overwrite_item(wc->trans, root, path, 2388e02119d5SChris Mason eb, i, &key); 2389b50c6e25SJosef Bacik if (ret) 2390b50c6e25SJosef Bacik break; 23912da1c669SLiu Bo } else if (key.type == 
BTRFS_INODE_REF_KEY || 23922da1c669SLiu Bo key.type == BTRFS_INODE_EXTREF_KEY) { 2393f186373fSMark Fasheh ret = add_inode_ref(wc->trans, root, log, path, 2394f186373fSMark Fasheh eb, i, &key); 2395b50c6e25SJosef Bacik if (ret && ret != -ENOENT) 2396b50c6e25SJosef Bacik break; 2397b50c6e25SJosef Bacik ret = 0; 2398e02119d5SChris Mason } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2399e02119d5SChris Mason ret = replay_one_extent(wc->trans, root, path, 2400e02119d5SChris Mason eb, i, &key); 2401b50c6e25SJosef Bacik if (ret) 2402b50c6e25SJosef Bacik break; 2403dd8e7217SJosef Bacik } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2404e02119d5SChris Mason ret = replay_one_dir_item(wc->trans, root, path, 2405e02119d5SChris Mason eb, i, &key); 2406b50c6e25SJosef Bacik if (ret) 2407b50c6e25SJosef Bacik break; 2408e02119d5SChris Mason } 2409e02119d5SChris Mason } 2410e02119d5SChris Mason btrfs_free_path(path); 2411b50c6e25SJosef Bacik return ret; 2412e02119d5SChris Mason } 2413e02119d5SChris Mason 2414d397712bSChris Mason static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2415e02119d5SChris Mason struct btrfs_root *root, 2416e02119d5SChris Mason struct btrfs_path *path, int *level, 2417e02119d5SChris Mason struct walk_control *wc) 2418e02119d5SChris Mason { 24190b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 2420e02119d5SChris Mason u64 root_owner; 2421e02119d5SChris Mason u64 bytenr; 2422e02119d5SChris Mason u64 ptr_gen; 2423e02119d5SChris Mason struct extent_buffer *next; 2424e02119d5SChris Mason struct extent_buffer *cur; 2425e02119d5SChris Mason struct extent_buffer *parent; 2426e02119d5SChris Mason u32 blocksize; 2427e02119d5SChris Mason int ret = 0; 2428e02119d5SChris Mason 2429e02119d5SChris Mason WARN_ON(*level < 0); 2430e02119d5SChris Mason WARN_ON(*level >= BTRFS_MAX_LEVEL); 2431e02119d5SChris Mason 2432e02119d5SChris Mason while (*level > 0) { 2433e02119d5SChris Mason WARN_ON(*level < 0); 2434e02119d5SChris Mason 
WARN_ON(*level >= BTRFS_MAX_LEVEL); 2435e02119d5SChris Mason cur = path->nodes[*level]; 2436e02119d5SChris Mason 2437fae7f21cSDulshani Gunawardhana WARN_ON(btrfs_header_level(cur) != *level); 2438e02119d5SChris Mason 2439e02119d5SChris Mason if (path->slots[*level] >= 2440e02119d5SChris Mason btrfs_header_nritems(cur)) 2441e02119d5SChris Mason break; 2442e02119d5SChris Mason 2443e02119d5SChris Mason bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2444e02119d5SChris Mason ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 24450b246afaSJeff Mahoney blocksize = fs_info->nodesize; 2446e02119d5SChris Mason 2447e02119d5SChris Mason parent = path->nodes[*level]; 2448e02119d5SChris Mason root_owner = btrfs_header_owner(parent); 2449e02119d5SChris Mason 24502ff7e61eSJeff Mahoney next = btrfs_find_create_tree_block(fs_info, bytenr); 2451c871b0f2SLiu Bo if (IS_ERR(next)) 2452c871b0f2SLiu Bo return PTR_ERR(next); 2453e02119d5SChris Mason 24544a500fd1SYan, Zheng if (*level == 1) { 24551e5063d0SMark Fasheh ret = wc->process_func(root, next, wc, ptr_gen); 2456b50c6e25SJosef Bacik if (ret) { 2457b50c6e25SJosef Bacik free_extent_buffer(next); 24581e5063d0SMark Fasheh return ret; 2459b50c6e25SJosef Bacik } 2460e02119d5SChris Mason 2461e02119d5SChris Mason path->slots[*level]++; 2462e02119d5SChris Mason if (wc->free) { 2463018642a1STsutomu Itoh ret = btrfs_read_buffer(next, ptr_gen); 2464018642a1STsutomu Itoh if (ret) { 2465018642a1STsutomu Itoh free_extent_buffer(next); 2466018642a1STsutomu Itoh return ret; 2467018642a1STsutomu Itoh } 2468e02119d5SChris Mason 2469681ae509SJosef Bacik if (trans) { 2470e02119d5SChris Mason btrfs_tree_lock(next); 2471b4ce94deSChris Mason btrfs_set_lock_blocking(next); 24720b246afaSJeff Mahoney clean_tree_block(trans, fs_info, next); 2473e02119d5SChris Mason btrfs_wait_tree_block_writeback(next); 2474e02119d5SChris Mason btrfs_tree_unlock(next); 2475681ae509SJosef Bacik } 2476e02119d5SChris Mason 2477e02119d5SChris Mason 
WARN_ON(root_owner != 2478e02119d5SChris Mason BTRFS_TREE_LOG_OBJECTID); 24792ff7e61eSJeff Mahoney ret = btrfs_free_and_pin_reserved_extent( 24802ff7e61eSJeff Mahoney fs_info, bytenr, 24812ff7e61eSJeff Mahoney blocksize); 24823650860bSJosef Bacik if (ret) { 24833650860bSJosef Bacik free_extent_buffer(next); 24843650860bSJosef Bacik return ret; 24853650860bSJosef Bacik } 2486e02119d5SChris Mason } 2487e02119d5SChris Mason free_extent_buffer(next); 2488e02119d5SChris Mason continue; 2489e02119d5SChris Mason } 2490018642a1STsutomu Itoh ret = btrfs_read_buffer(next, ptr_gen); 2491018642a1STsutomu Itoh if (ret) { 2492018642a1STsutomu Itoh free_extent_buffer(next); 2493018642a1STsutomu Itoh return ret; 2494018642a1STsutomu Itoh } 2495e02119d5SChris Mason 2496e02119d5SChris Mason WARN_ON(*level <= 0); 2497e02119d5SChris Mason if (path->nodes[*level-1]) 2498e02119d5SChris Mason free_extent_buffer(path->nodes[*level-1]); 2499e02119d5SChris Mason path->nodes[*level-1] = next; 2500e02119d5SChris Mason *level = btrfs_header_level(next); 2501e02119d5SChris Mason path->slots[*level] = 0; 2502e02119d5SChris Mason cond_resched(); 2503e02119d5SChris Mason } 2504e02119d5SChris Mason WARN_ON(*level < 0); 2505e02119d5SChris Mason WARN_ON(*level >= BTRFS_MAX_LEVEL); 2506e02119d5SChris Mason 25074a500fd1SYan, Zheng path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2508e02119d5SChris Mason 2509e02119d5SChris Mason cond_resched(); 2510e02119d5SChris Mason return 0; 2511e02119d5SChris Mason } 2512e02119d5SChris Mason 2513d397712bSChris Mason static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2514e02119d5SChris Mason struct btrfs_root *root, 2515e02119d5SChris Mason struct btrfs_path *path, int *level, 2516e02119d5SChris Mason struct walk_control *wc) 2517e02119d5SChris Mason { 25180b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 2519e02119d5SChris Mason u64 root_owner; 2520e02119d5SChris Mason int i; 2521e02119d5SChris Mason int slot; 
2522e02119d5SChris Mason int ret; 2523e02119d5SChris Mason 2524e02119d5SChris Mason for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2525e02119d5SChris Mason slot = path->slots[i]; 25264a500fd1SYan, Zheng if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2527e02119d5SChris Mason path->slots[i]++; 2528e02119d5SChris Mason *level = i; 2529e02119d5SChris Mason WARN_ON(*level == 0); 2530e02119d5SChris Mason return 0; 2531e02119d5SChris Mason } else { 253231840ae1SZheng Yan struct extent_buffer *parent; 253331840ae1SZheng Yan if (path->nodes[*level] == root->node) 253431840ae1SZheng Yan parent = path->nodes[*level]; 253531840ae1SZheng Yan else 253631840ae1SZheng Yan parent = path->nodes[*level + 1]; 253731840ae1SZheng Yan 253831840ae1SZheng Yan root_owner = btrfs_header_owner(parent); 25391e5063d0SMark Fasheh ret = wc->process_func(root, path->nodes[*level], wc, 2540e02119d5SChris Mason btrfs_header_generation(path->nodes[*level])); 25411e5063d0SMark Fasheh if (ret) 25421e5063d0SMark Fasheh return ret; 25431e5063d0SMark Fasheh 2544e02119d5SChris Mason if (wc->free) { 2545e02119d5SChris Mason struct extent_buffer *next; 2546e02119d5SChris Mason 2547e02119d5SChris Mason next = path->nodes[*level]; 2548e02119d5SChris Mason 2549681ae509SJosef Bacik if (trans) { 2550e02119d5SChris Mason btrfs_tree_lock(next); 2551b4ce94deSChris Mason btrfs_set_lock_blocking(next); 25520b246afaSJeff Mahoney clean_tree_block(trans, fs_info, next); 2553e02119d5SChris Mason btrfs_wait_tree_block_writeback(next); 2554e02119d5SChris Mason btrfs_tree_unlock(next); 2555681ae509SJosef Bacik } 2556e02119d5SChris Mason 2557e02119d5SChris Mason WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 25582ff7e61eSJeff Mahoney ret = btrfs_free_and_pin_reserved_extent( 25592ff7e61eSJeff Mahoney fs_info, 2560e02119d5SChris Mason path->nodes[*level]->start, 2561d00aff00SChris Mason path->nodes[*level]->len); 25623650860bSJosef Bacik if (ret) 25633650860bSJosef Bacik return ret; 
2564e02119d5SChris Mason } 2565e02119d5SChris Mason free_extent_buffer(path->nodes[*level]); 2566e02119d5SChris Mason path->nodes[*level] = NULL; 2567e02119d5SChris Mason *level = i + 1; 2568e02119d5SChris Mason } 2569e02119d5SChris Mason } 2570e02119d5SChris Mason return 1; 2571e02119d5SChris Mason } 2572e02119d5SChris Mason 2573e02119d5SChris Mason /* 2574e02119d5SChris Mason * drop the reference count on the tree rooted at 'snap'. This traverses 2575e02119d5SChris Mason * the tree freeing any blocks that have a ref count of zero after being 2576e02119d5SChris Mason * decremented. 2577e02119d5SChris Mason */ 2578e02119d5SChris Mason static int walk_log_tree(struct btrfs_trans_handle *trans, 2579e02119d5SChris Mason struct btrfs_root *log, struct walk_control *wc) 2580e02119d5SChris Mason { 25812ff7e61eSJeff Mahoney struct btrfs_fs_info *fs_info = log->fs_info; 2582e02119d5SChris Mason int ret = 0; 2583e02119d5SChris Mason int wret; 2584e02119d5SChris Mason int level; 2585e02119d5SChris Mason struct btrfs_path *path; 2586e02119d5SChris Mason int orig_level; 2587e02119d5SChris Mason 2588e02119d5SChris Mason path = btrfs_alloc_path(); 2589db5b493aSTsutomu Itoh if (!path) 2590db5b493aSTsutomu Itoh return -ENOMEM; 2591e02119d5SChris Mason 2592e02119d5SChris Mason level = btrfs_header_level(log->node); 2593e02119d5SChris Mason orig_level = level; 2594e02119d5SChris Mason path->nodes[level] = log->node; 2595e02119d5SChris Mason extent_buffer_get(log->node); 2596e02119d5SChris Mason path->slots[level] = 0; 2597e02119d5SChris Mason 2598e02119d5SChris Mason while (1) { 2599e02119d5SChris Mason wret = walk_down_log_tree(trans, log, path, &level, wc); 2600e02119d5SChris Mason if (wret > 0) 2601e02119d5SChris Mason break; 260279787eaaSJeff Mahoney if (wret < 0) { 2603e02119d5SChris Mason ret = wret; 260479787eaaSJeff Mahoney goto out; 260579787eaaSJeff Mahoney } 2606e02119d5SChris Mason 2607e02119d5SChris Mason wret = walk_up_log_tree(trans, log, path, &level, wc); 
2608e02119d5SChris Mason if (wret > 0) 2609e02119d5SChris Mason break; 261079787eaaSJeff Mahoney if (wret < 0) { 2611e02119d5SChris Mason ret = wret; 261279787eaaSJeff Mahoney goto out; 261379787eaaSJeff Mahoney } 2614e02119d5SChris Mason } 2615e02119d5SChris Mason 2616e02119d5SChris Mason /* was the root node processed? if not, catch it here */ 2617e02119d5SChris Mason if (path->nodes[orig_level]) { 261879787eaaSJeff Mahoney ret = wc->process_func(log, path->nodes[orig_level], wc, 2619e02119d5SChris Mason btrfs_header_generation(path->nodes[orig_level])); 262079787eaaSJeff Mahoney if (ret) 262179787eaaSJeff Mahoney goto out; 2622e02119d5SChris Mason if (wc->free) { 2623e02119d5SChris Mason struct extent_buffer *next; 2624e02119d5SChris Mason 2625e02119d5SChris Mason next = path->nodes[orig_level]; 2626e02119d5SChris Mason 2627681ae509SJosef Bacik if (trans) { 2628e02119d5SChris Mason btrfs_tree_lock(next); 2629b4ce94deSChris Mason btrfs_set_lock_blocking(next); 26302ff7e61eSJeff Mahoney clean_tree_block(trans, fs_info, next); 2631e02119d5SChris Mason btrfs_wait_tree_block_writeback(next); 2632e02119d5SChris Mason btrfs_tree_unlock(next); 2633681ae509SJosef Bacik } 2634e02119d5SChris Mason 2635e02119d5SChris Mason WARN_ON(log->root_key.objectid != 2636e02119d5SChris Mason BTRFS_TREE_LOG_OBJECTID); 26372ff7e61eSJeff Mahoney ret = btrfs_free_and_pin_reserved_extent(fs_info, 26382ff7e61eSJeff Mahoney next->start, next->len); 26393650860bSJosef Bacik if (ret) 26403650860bSJosef Bacik goto out; 2641e02119d5SChris Mason } 2642e02119d5SChris Mason } 2643e02119d5SChris Mason 264479787eaaSJeff Mahoney out: 2645e02119d5SChris Mason btrfs_free_path(path); 2646e02119d5SChris Mason return ret; 2647e02119d5SChris Mason } 2648e02119d5SChris Mason 26497237f183SYan Zheng /* 26507237f183SYan Zheng * helper function to update the item for a given subvolumes log root 26517237f183SYan Zheng * in the tree of log roots 26527237f183SYan Zheng */ 26537237f183SYan Zheng static int 
update_log_root(struct btrfs_trans_handle *trans, 26547237f183SYan Zheng struct btrfs_root *log) 26557237f183SYan Zheng { 26560b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = log->fs_info; 26577237f183SYan Zheng int ret; 26587237f183SYan Zheng 26597237f183SYan Zheng if (log->log_transid == 1) { 26607237f183SYan Zheng /* insert root item on the first sync */ 26610b246afaSJeff Mahoney ret = btrfs_insert_root(trans, fs_info->log_root_tree, 26627237f183SYan Zheng &log->root_key, &log->root_item); 26637237f183SYan Zheng } else { 26640b246afaSJeff Mahoney ret = btrfs_update_root(trans, fs_info->log_root_tree, 26657237f183SYan Zheng &log->root_key, &log->root_item); 26667237f183SYan Zheng } 26677237f183SYan Zheng return ret; 26687237f183SYan Zheng } 26697237f183SYan Zheng 267060d53eb3SZhaolei static void wait_log_commit(struct btrfs_root *root, int transid) 2671e02119d5SChris Mason { 2672e02119d5SChris Mason DEFINE_WAIT(wait); 26737237f183SYan Zheng int index = transid % 2; 2674e02119d5SChris Mason 26757237f183SYan Zheng /* 26767237f183SYan Zheng * we only allow two pending log transactions at a time, 26777237f183SYan Zheng * so we know that if ours is more than 2 older than the 26787237f183SYan Zheng * current transaction, we're done 26797237f183SYan Zheng */ 2680e02119d5SChris Mason do { 26817237f183SYan Zheng prepare_to_wait(&root->log_commit_wait[index], 26827237f183SYan Zheng &wait, TASK_UNINTERRUPTIBLE); 26837237f183SYan Zheng mutex_unlock(&root->log_mutex); 268412fcfd22SChris Mason 2685d1433debSMiao Xie if (root->log_transid_committed < transid && 26867237f183SYan Zheng atomic_read(&root->log_commit[index])) 2687e02119d5SChris Mason schedule(); 268812fcfd22SChris Mason 26897237f183SYan Zheng finish_wait(&root->log_commit_wait[index], &wait); 26907237f183SYan Zheng mutex_lock(&root->log_mutex); 2691d1433debSMiao Xie } while (root->log_transid_committed < transid && 26927237f183SYan Zheng atomic_read(&root->log_commit[index])); 26937237f183SYan Zheng } 
26947237f183SYan Zheng 269560d53eb3SZhaolei static void wait_for_writer(struct btrfs_root *root) 26967237f183SYan Zheng { 26977237f183SYan Zheng DEFINE_WAIT(wait); 26988b050d35SMiao Xie 26998b050d35SMiao Xie while (atomic_read(&root->log_writers)) { 27007237f183SYan Zheng prepare_to_wait(&root->log_writer_wait, 27017237f183SYan Zheng &wait, TASK_UNINTERRUPTIBLE); 27027237f183SYan Zheng mutex_unlock(&root->log_mutex); 27038b050d35SMiao Xie if (atomic_read(&root->log_writers)) 27047237f183SYan Zheng schedule(); 27057237f183SYan Zheng finish_wait(&root->log_writer_wait, &wait); 2706575849ecSFilipe Manana mutex_lock(&root->log_mutex); 27077237f183SYan Zheng } 2708e02119d5SChris Mason } 2709e02119d5SChris Mason 27108b050d35SMiao Xie static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 27118b050d35SMiao Xie struct btrfs_log_ctx *ctx) 27128b050d35SMiao Xie { 27138b050d35SMiao Xie if (!ctx) 27148b050d35SMiao Xie return; 27158b050d35SMiao Xie 27168b050d35SMiao Xie mutex_lock(&root->log_mutex); 27178b050d35SMiao Xie list_del_init(&ctx->list); 27188b050d35SMiao Xie mutex_unlock(&root->log_mutex); 27198b050d35SMiao Xie } 27208b050d35SMiao Xie 27218b050d35SMiao Xie /* 27228b050d35SMiao Xie * Invoked in log mutex context, or be sure there is no other task which 27238b050d35SMiao Xie * can access the list. 
27248b050d35SMiao Xie */ 27258b050d35SMiao Xie static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 27268b050d35SMiao Xie int index, int error) 27278b050d35SMiao Xie { 27288b050d35SMiao Xie struct btrfs_log_ctx *ctx; 2729570dd450SChris Mason struct btrfs_log_ctx *safe; 27308b050d35SMiao Xie 2731570dd450SChris Mason list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2732570dd450SChris Mason list_del_init(&ctx->list); 27338b050d35SMiao Xie ctx->log_ret = error; 2734570dd450SChris Mason } 27358b050d35SMiao Xie 27368b050d35SMiao Xie INIT_LIST_HEAD(&root->log_ctxs[index]); 27378b050d35SMiao Xie } 27388b050d35SMiao Xie 2739e02119d5SChris Mason /* 2740e02119d5SChris Mason * btrfs_sync_log does sends a given tree log down to the disk and 2741e02119d5SChris Mason * updates the super blocks to record it. When this call is done, 274212fcfd22SChris Mason * you know that any inodes previously logged are safely on disk only 274312fcfd22SChris Mason * if it returns 0. 274412fcfd22SChris Mason * 274512fcfd22SChris Mason * Any other return value means you need to call btrfs_commit_transaction. 274612fcfd22SChris Mason * Some of the edge cases for fsyncing directories that have had unlinks 274712fcfd22SChris Mason * or renames done in the past mean that sometimes the only safe 274812fcfd22SChris Mason * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 274912fcfd22SChris Mason * that has happened. 
2750e02119d5SChris Mason */ 2751e02119d5SChris Mason int btrfs_sync_log(struct btrfs_trans_handle *trans, 27528b050d35SMiao Xie struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2753e02119d5SChris Mason { 27547237f183SYan Zheng int index1; 27557237f183SYan Zheng int index2; 27568cef4e16SYan, Zheng int mark; 2757e02119d5SChris Mason int ret; 27580b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 2759e02119d5SChris Mason struct btrfs_root *log = root->log_root; 27600b246afaSJeff Mahoney struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2761bb14a59bSMiao Xie int log_transid = 0; 27628b050d35SMiao Xie struct btrfs_log_ctx root_log_ctx; 2763c6adc9ccSMiao Xie struct blk_plug plug; 2764e02119d5SChris Mason 27657237f183SYan Zheng mutex_lock(&root->log_mutex); 2766d1433debSMiao Xie log_transid = ctx->log_transid; 2767d1433debSMiao Xie if (root->log_transid_committed >= log_transid) { 27687237f183SYan Zheng mutex_unlock(&root->log_mutex); 27698b050d35SMiao Xie return ctx->log_ret; 2770e02119d5SChris Mason } 2771d1433debSMiao Xie 2772d1433debSMiao Xie index1 = log_transid % 2; 2773d1433debSMiao Xie if (atomic_read(&root->log_commit[index1])) { 277460d53eb3SZhaolei wait_log_commit(root, log_transid); 2775d1433debSMiao Xie mutex_unlock(&root->log_mutex); 2776d1433debSMiao Xie return ctx->log_ret; 2777d1433debSMiao Xie } 2778d1433debSMiao Xie ASSERT(log_transid == root->log_transid); 27797237f183SYan Zheng atomic_set(&root->log_commit[index1], 1); 27807237f183SYan Zheng 27817237f183SYan Zheng /* wait for previous tree log sync to complete */ 27827237f183SYan Zheng if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 278360d53eb3SZhaolei wait_log_commit(root, log_transid - 1); 278448cab2e0SMiao Xie 278586df7eb9SYan, Zheng while (1) { 27862ecb7923SMiao Xie int batch = atomic_read(&root->log_batch); 2787cd354ad6SChris Mason /* when we're on an ssd, just kick the log commit out */ 27880b246afaSJeff Mahoney if (!btrfs_test_opt(fs_info, SSD) && 
278927cdeb70SMiao Xie test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 27907237f183SYan Zheng mutex_unlock(&root->log_mutex); 2791e02119d5SChris Mason schedule_timeout_uninterruptible(1); 27927237f183SYan Zheng mutex_lock(&root->log_mutex); 279386df7eb9SYan, Zheng } 279460d53eb3SZhaolei wait_for_writer(root); 27952ecb7923SMiao Xie if (batch == atomic_read(&root->log_batch)) 2796e02119d5SChris Mason break; 2797e02119d5SChris Mason } 2798d0c803c4SChris Mason 279912fcfd22SChris Mason /* bail out if we need to do a full commit */ 28000b246afaSJeff Mahoney if (btrfs_need_log_full_commit(fs_info, trans)) { 280112fcfd22SChris Mason ret = -EAGAIN; 28022ab28f32SJosef Bacik btrfs_free_logged_extents(log, log_transid); 280312fcfd22SChris Mason mutex_unlock(&root->log_mutex); 280412fcfd22SChris Mason goto out; 280512fcfd22SChris Mason } 280612fcfd22SChris Mason 28078cef4e16SYan, Zheng if (log_transid % 2 == 0) 28088cef4e16SYan, Zheng mark = EXTENT_DIRTY; 28098cef4e16SYan, Zheng else 28108cef4e16SYan, Zheng mark = EXTENT_NEW; 28118cef4e16SYan, Zheng 2812690587d1SChris Mason /* we start IO on all the marked extents here, but we don't actually 2813690587d1SChris Mason * wait for them until later. 
2814690587d1SChris Mason */ 2815c6adc9ccSMiao Xie blk_start_plug(&plug); 28162ff7e61eSJeff Mahoney ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 281779787eaaSJeff Mahoney if (ret) { 2818c6adc9ccSMiao Xie blk_finish_plug(&plug); 281966642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 28202ab28f32SJosef Bacik btrfs_free_logged_extents(log, log_transid); 28210b246afaSJeff Mahoney btrfs_set_log_full_commit(fs_info, trans); 282279787eaaSJeff Mahoney mutex_unlock(&root->log_mutex); 282379787eaaSJeff Mahoney goto out; 282479787eaaSJeff Mahoney } 28257237f183SYan Zheng 28265d4f98a2SYan Zheng btrfs_set_root_node(&log->root_item, log->node); 28277237f183SYan Zheng 28287237f183SYan Zheng root->log_transid++; 28297237f183SYan Zheng log->log_transid = root->log_transid; 2830ff782e0aSJosef Bacik root->log_start_pid = 0; 28317237f183SYan Zheng /* 28328cef4e16SYan, Zheng * IO has been started, blocks of the log tree have WRITTEN flag set 28338cef4e16SYan, Zheng * in their headers. new modifications of the log will be written to 28348cef4e16SYan, Zheng * new positions. so it's safe to allow log writers to go in. 
28357237f183SYan Zheng */ 28367237f183SYan Zheng mutex_unlock(&root->log_mutex); 28377237f183SYan Zheng 283828a23593SFilipe Manana btrfs_init_log_ctx(&root_log_ctx, NULL); 2839d1433debSMiao Xie 28407237f183SYan Zheng mutex_lock(&log_root_tree->log_mutex); 28412ecb7923SMiao Xie atomic_inc(&log_root_tree->log_batch); 28427237f183SYan Zheng atomic_inc(&log_root_tree->log_writers); 2843d1433debSMiao Xie 2844d1433debSMiao Xie index2 = log_root_tree->log_transid % 2; 2845d1433debSMiao Xie list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2846d1433debSMiao Xie root_log_ctx.log_transid = log_root_tree->log_transid; 2847d1433debSMiao Xie 28487237f183SYan Zheng mutex_unlock(&log_root_tree->log_mutex); 28497237f183SYan Zheng 28507237f183SYan Zheng ret = update_log_root(trans, log); 28517237f183SYan Zheng 28527237f183SYan Zheng mutex_lock(&log_root_tree->log_mutex); 28537237f183SYan Zheng if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2854779adf0fSDavid Sterba /* 2855779adf0fSDavid Sterba * Implicit memory barrier after atomic_dec_and_test 2856779adf0fSDavid Sterba */ 28577237f183SYan Zheng if (waitqueue_active(&log_root_tree->log_writer_wait)) 28587237f183SYan Zheng wake_up(&log_root_tree->log_writer_wait); 28597237f183SYan Zheng } 28607237f183SYan Zheng 28614a500fd1SYan, Zheng if (ret) { 2862d1433debSMiao Xie if (!list_empty(&root_log_ctx.list)) 2863d1433debSMiao Xie list_del_init(&root_log_ctx.list); 2864d1433debSMiao Xie 2865c6adc9ccSMiao Xie blk_finish_plug(&plug); 28660b246afaSJeff Mahoney btrfs_set_log_full_commit(fs_info, trans); 2867995946ddSMiao Xie 286879787eaaSJeff Mahoney if (ret != -ENOSPC) { 286966642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 287079787eaaSJeff Mahoney mutex_unlock(&log_root_tree->log_mutex); 287179787eaaSJeff Mahoney goto out; 287279787eaaSJeff Mahoney } 2873bf89d38fSJeff Mahoney btrfs_wait_tree_log_extents(log, mark); 28742ab28f32SJosef Bacik btrfs_free_logged_extents(log, log_transid); 28754a500fd1SYan, 
Zheng mutex_unlock(&log_root_tree->log_mutex); 28764a500fd1SYan, Zheng ret = -EAGAIN; 28774a500fd1SYan, Zheng goto out; 28784a500fd1SYan, Zheng } 28794a500fd1SYan, Zheng 2880d1433debSMiao Xie if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 28813da5ab56SForrest Liu blk_finish_plug(&plug); 2882cbd60aa7SChris Mason list_del_init(&root_log_ctx.list); 2883d1433debSMiao Xie mutex_unlock(&log_root_tree->log_mutex); 2884d1433debSMiao Xie ret = root_log_ctx.log_ret; 2885d1433debSMiao Xie goto out; 2886d1433debSMiao Xie } 28878b050d35SMiao Xie 2888d1433debSMiao Xie index2 = root_log_ctx.log_transid % 2; 28897237f183SYan Zheng if (atomic_read(&log_root_tree->log_commit[index2])) { 2890c6adc9ccSMiao Xie blk_finish_plug(&plug); 2891bf89d38fSJeff Mahoney ret = btrfs_wait_tree_log_extents(log, mark); 289250d9aa99SJosef Bacik btrfs_wait_logged_extents(trans, log, log_transid); 289360d53eb3SZhaolei wait_log_commit(log_root_tree, 2894d1433debSMiao Xie root_log_ctx.log_transid); 28957237f183SYan Zheng mutex_unlock(&log_root_tree->log_mutex); 28965ab5e44aSFilipe Manana if (!ret) 28978b050d35SMiao Xie ret = root_log_ctx.log_ret; 28987237f183SYan Zheng goto out; 28997237f183SYan Zheng } 2900d1433debSMiao Xie ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 29017237f183SYan Zheng atomic_set(&log_root_tree->log_commit[index2], 1); 29027237f183SYan Zheng 290312fcfd22SChris Mason if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 290460d53eb3SZhaolei wait_log_commit(log_root_tree, 2905d1433debSMiao Xie root_log_ctx.log_transid - 1); 290612fcfd22SChris Mason } 29077237f183SYan Zheng 290860d53eb3SZhaolei wait_for_writer(log_root_tree); 290912fcfd22SChris Mason 291012fcfd22SChris Mason /* 291112fcfd22SChris Mason * now that we've moved on to the tree of log tree roots, 291212fcfd22SChris Mason * check the full commit flag again 291312fcfd22SChris Mason */ 29140b246afaSJeff Mahoney if (btrfs_need_log_full_commit(fs_info, trans)) { 
2915c6adc9ccSMiao Xie blk_finish_plug(&plug); 2916bf89d38fSJeff Mahoney btrfs_wait_tree_log_extents(log, mark); 29172ab28f32SJosef Bacik btrfs_free_logged_extents(log, log_transid); 291812fcfd22SChris Mason mutex_unlock(&log_root_tree->log_mutex); 291912fcfd22SChris Mason ret = -EAGAIN; 292012fcfd22SChris Mason goto out_wake_log_root; 292112fcfd22SChris Mason } 29227237f183SYan Zheng 29232ff7e61eSJeff Mahoney ret = btrfs_write_marked_extents(fs_info, 29248cef4e16SYan, Zheng &log_root_tree->dirty_log_pages, 29258cef4e16SYan, Zheng EXTENT_DIRTY | EXTENT_NEW); 2926c6adc9ccSMiao Xie blk_finish_plug(&plug); 292779787eaaSJeff Mahoney if (ret) { 29280b246afaSJeff Mahoney btrfs_set_log_full_commit(fs_info, trans); 292966642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 29302ab28f32SJosef Bacik btrfs_free_logged_extents(log, log_transid); 293179787eaaSJeff Mahoney mutex_unlock(&log_root_tree->log_mutex); 293279787eaaSJeff Mahoney goto out_wake_log_root; 293379787eaaSJeff Mahoney } 2934bf89d38fSJeff Mahoney ret = btrfs_wait_tree_log_extents(log, mark); 29355ab5e44aSFilipe Manana if (!ret) 2936bf89d38fSJeff Mahoney ret = btrfs_wait_tree_log_extents(log_root_tree, 2937c6adc9ccSMiao Xie EXTENT_NEW | EXTENT_DIRTY); 29385ab5e44aSFilipe Manana if (ret) { 29390b246afaSJeff Mahoney btrfs_set_log_full_commit(fs_info, trans); 29405ab5e44aSFilipe Manana btrfs_free_logged_extents(log, log_transid); 29415ab5e44aSFilipe Manana mutex_unlock(&log_root_tree->log_mutex); 29425ab5e44aSFilipe Manana goto out_wake_log_root; 29435ab5e44aSFilipe Manana } 294450d9aa99SJosef Bacik btrfs_wait_logged_extents(trans, log, log_transid); 2945e02119d5SChris Mason 29460b246afaSJeff Mahoney btrfs_set_super_log_root(fs_info->super_for_commit, 29477237f183SYan Zheng log_root_tree->node->start); 29480b246afaSJeff Mahoney btrfs_set_super_log_root_level(fs_info->super_for_commit, 29497237f183SYan Zheng btrfs_header_level(log_root_tree->node)); 2950e02119d5SChris Mason 29517237f183SYan Zheng 
log_root_tree->log_transid++; 29527237f183SYan Zheng mutex_unlock(&log_root_tree->log_mutex); 29537237f183SYan Zheng 29547237f183SYan Zheng /* 29557237f183SYan Zheng * nobody else is going to jump in and write the the ctree 29567237f183SYan Zheng * super here because the log_commit atomic below is protecting 29577237f183SYan Zheng * us. We must be called with a transaction handle pinning 29587237f183SYan Zheng * the running transaction open, so a full commit can't hop 29597237f183SYan Zheng * in and cause problems either. 29607237f183SYan Zheng */ 29612ff7e61eSJeff Mahoney ret = write_ctree_super(trans, fs_info, 1); 29625af3e8ccSStefan Behrens if (ret) { 29630b246afaSJeff Mahoney btrfs_set_log_full_commit(fs_info, trans); 296466642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 29655af3e8ccSStefan Behrens goto out_wake_log_root; 29665af3e8ccSStefan Behrens } 29677237f183SYan Zheng 2968257c62e1SChris Mason mutex_lock(&root->log_mutex); 2969257c62e1SChris Mason if (root->last_log_commit < log_transid) 2970257c62e1SChris Mason root->last_log_commit = log_transid; 2971257c62e1SChris Mason mutex_unlock(&root->log_mutex); 2972257c62e1SChris Mason 297312fcfd22SChris Mason out_wake_log_root: 2974570dd450SChris Mason mutex_lock(&log_root_tree->log_mutex); 29758b050d35SMiao Xie btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 29768b050d35SMiao Xie 2977d1433debSMiao Xie log_root_tree->log_transid_committed++; 29787237f183SYan Zheng atomic_set(&log_root_tree->log_commit[index2], 0); 2979d1433debSMiao Xie mutex_unlock(&log_root_tree->log_mutex); 2980d1433debSMiao Xie 298133a9eca7SDavid Sterba /* 298233a9eca7SDavid Sterba * The barrier before waitqueue_active is implied by mutex_unlock 298333a9eca7SDavid Sterba */ 29847237f183SYan Zheng if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 29857237f183SYan Zheng wake_up(&log_root_tree->log_commit_wait[index2]); 2986e02119d5SChris Mason out: 2987d1433debSMiao Xie mutex_lock(&root->log_mutex); 
2988570dd450SChris Mason btrfs_remove_all_log_ctxs(root, index1, ret); 2989d1433debSMiao Xie root->log_transid_committed++; 29907237f183SYan Zheng atomic_set(&root->log_commit[index1], 0); 2991d1433debSMiao Xie mutex_unlock(&root->log_mutex); 29928b050d35SMiao Xie 299333a9eca7SDavid Sterba /* 299433a9eca7SDavid Sterba * The barrier before waitqueue_active is implied by mutex_unlock 299533a9eca7SDavid Sterba */ 29967237f183SYan Zheng if (waitqueue_active(&root->log_commit_wait[index1])) 29977237f183SYan Zheng wake_up(&root->log_commit_wait[index1]); 2998b31eabd8SChris Mason return ret; 2999e02119d5SChris Mason } 3000e02119d5SChris Mason 30014a500fd1SYan, Zheng static void free_log_tree(struct btrfs_trans_handle *trans, 30024a500fd1SYan, Zheng struct btrfs_root *log) 3003e02119d5SChris Mason { 3004e02119d5SChris Mason int ret; 3005d0c803c4SChris Mason u64 start; 3006d0c803c4SChris Mason u64 end; 3007e02119d5SChris Mason struct walk_control wc = { 3008e02119d5SChris Mason .free = 1, 3009e02119d5SChris Mason .process_func = process_one_buffer 3010e02119d5SChris Mason }; 3011e02119d5SChris Mason 3012e02119d5SChris Mason ret = walk_log_tree(trans, log, &wc); 30133650860bSJosef Bacik /* I don't think this can happen but just in case */ 30143650860bSJosef Bacik if (ret) 301566642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 3016e02119d5SChris Mason 3017d0c803c4SChris Mason while (1) { 3018d0c803c4SChris Mason ret = find_first_extent_bit(&log->dirty_log_pages, 3019e6138876SJosef Bacik 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 3020e6138876SJosef Bacik NULL); 3021d0c803c4SChris Mason if (ret) 3022d0c803c4SChris Mason break; 3023d0c803c4SChris Mason 30248cef4e16SYan, Zheng clear_extent_bits(&log->dirty_log_pages, start, end, 302591166212SDavid Sterba EXTENT_DIRTY | EXTENT_NEW); 3026d0c803c4SChris Mason } 3027d0c803c4SChris Mason 30282ab28f32SJosef Bacik /* 30292ab28f32SJosef Bacik * We may have short-circuited the log tree with the full commit logic 
30302ab28f32SJosef Bacik * and left ordered extents on our list, so clear these out to keep us 30312ab28f32SJosef Bacik * from leaking inodes and memory. 30322ab28f32SJosef Bacik */ 30332ab28f32SJosef Bacik btrfs_free_logged_extents(log, 0); 30342ab28f32SJosef Bacik btrfs_free_logged_extents(log, 1); 30352ab28f32SJosef Bacik 30367237f183SYan Zheng free_extent_buffer(log->node); 30377237f183SYan Zheng kfree(log); 30384a500fd1SYan, Zheng } 30394a500fd1SYan, Zheng 30404a500fd1SYan, Zheng /* 30414a500fd1SYan, Zheng * free all the extents used by the tree log. This should be called 30424a500fd1SYan, Zheng * at commit time of the full transaction 30434a500fd1SYan, Zheng */ 30444a500fd1SYan, Zheng int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 30454a500fd1SYan, Zheng { 30464a500fd1SYan, Zheng if (root->log_root) { 30474a500fd1SYan, Zheng free_log_tree(trans, root->log_root); 30484a500fd1SYan, Zheng root->log_root = NULL; 30494a500fd1SYan, Zheng } 30504a500fd1SYan, Zheng return 0; 30514a500fd1SYan, Zheng } 30524a500fd1SYan, Zheng 30534a500fd1SYan, Zheng int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 30544a500fd1SYan, Zheng struct btrfs_fs_info *fs_info) 30554a500fd1SYan, Zheng { 30564a500fd1SYan, Zheng if (fs_info->log_root_tree) { 30574a500fd1SYan, Zheng free_log_tree(trans, fs_info->log_root_tree); 30584a500fd1SYan, Zheng fs_info->log_root_tree = NULL; 30594a500fd1SYan, Zheng } 3060e02119d5SChris Mason return 0; 3061e02119d5SChris Mason } 3062e02119d5SChris Mason 3063e02119d5SChris Mason /* 3064e02119d5SChris Mason * If both a file and directory are logged, and unlinks or renames are 3065e02119d5SChris Mason * mixed in, we have a few interesting corners: 3066e02119d5SChris Mason * 3067e02119d5SChris Mason * create file X in dir Y 3068e02119d5SChris Mason * link file X to X.link in dir Y 3069e02119d5SChris Mason * fsync file X 3070e02119d5SChris Mason * unlink file X but leave X.link 3071e02119d5SChris Mason * fsync dir Y 
3072e02119d5SChris Mason * 3073e02119d5SChris Mason * After a crash we would expect only X.link to exist. But file X 3074e02119d5SChris Mason * didn't get fsync'd again so the log has back refs for X and X.link. 3075e02119d5SChris Mason * 3076e02119d5SChris Mason * We solve this by removing directory entries and inode backrefs from the 3077e02119d5SChris Mason * log when a file that was logged in the current transaction is 3078e02119d5SChris Mason * unlinked. Any later fsync will include the updated log entries, and 3079e02119d5SChris Mason * we'll be able to reconstruct the proper directory items from backrefs. 3080e02119d5SChris Mason * 3081e02119d5SChris Mason * This optimizations allows us to avoid relogging the entire inode 3082e02119d5SChris Mason * or the entire directory. 3083e02119d5SChris Mason */ 3084e02119d5SChris Mason int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3085e02119d5SChris Mason struct btrfs_root *root, 3086e02119d5SChris Mason const char *name, int name_len, 308749f34d1fSNikolay Borisov struct btrfs_inode *dir, u64 index) 3088e02119d5SChris Mason { 3089e02119d5SChris Mason struct btrfs_root *log; 3090e02119d5SChris Mason struct btrfs_dir_item *di; 3091e02119d5SChris Mason struct btrfs_path *path; 3092e02119d5SChris Mason int ret; 30934a500fd1SYan, Zheng int err = 0; 3094e02119d5SChris Mason int bytes_del = 0; 309549f34d1fSNikolay Borisov u64 dir_ino = btrfs_ino(dir); 3096e02119d5SChris Mason 309749f34d1fSNikolay Borisov if (dir->logged_trans < trans->transid) 30983a5f1d45SChris Mason return 0; 30993a5f1d45SChris Mason 3100e02119d5SChris Mason ret = join_running_log_trans(root); 3101e02119d5SChris Mason if (ret) 3102e02119d5SChris Mason return 0; 3103e02119d5SChris Mason 310449f34d1fSNikolay Borisov mutex_lock(&dir->log_mutex); 3105e02119d5SChris Mason 3106e02119d5SChris Mason log = root->log_root; 3107e02119d5SChris Mason path = btrfs_alloc_path(); 3108a62f44a5STsutomu Itoh if (!path) { 3109a62f44a5STsutomu Itoh err = 
-ENOMEM; 3110a62f44a5STsutomu Itoh goto out_unlock; 3111a62f44a5STsutomu Itoh } 31122a29edc6Sliubo 311333345d01SLi Zefan di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3114e02119d5SChris Mason name, name_len, -1); 31154a500fd1SYan, Zheng if (IS_ERR(di)) { 31164a500fd1SYan, Zheng err = PTR_ERR(di); 31174a500fd1SYan, Zheng goto fail; 31184a500fd1SYan, Zheng } 31194a500fd1SYan, Zheng if (di) { 3120e02119d5SChris Mason ret = btrfs_delete_one_dir_name(trans, log, path, di); 3121e02119d5SChris Mason bytes_del += name_len; 31223650860bSJosef Bacik if (ret) { 31233650860bSJosef Bacik err = ret; 31243650860bSJosef Bacik goto fail; 31253650860bSJosef Bacik } 3126e02119d5SChris Mason } 3127b3b4aa74SDavid Sterba btrfs_release_path(path); 312833345d01SLi Zefan di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3129e02119d5SChris Mason index, name, name_len, -1); 31304a500fd1SYan, Zheng if (IS_ERR(di)) { 31314a500fd1SYan, Zheng err = PTR_ERR(di); 31324a500fd1SYan, Zheng goto fail; 31334a500fd1SYan, Zheng } 31344a500fd1SYan, Zheng if (di) { 3135e02119d5SChris Mason ret = btrfs_delete_one_dir_name(trans, log, path, di); 3136e02119d5SChris Mason bytes_del += name_len; 31373650860bSJosef Bacik if (ret) { 31383650860bSJosef Bacik err = ret; 31393650860bSJosef Bacik goto fail; 31403650860bSJosef Bacik } 3141e02119d5SChris Mason } 3142e02119d5SChris Mason 3143e02119d5SChris Mason /* update the directory size in the log to reflect the names 3144e02119d5SChris Mason * we have removed 3145e02119d5SChris Mason */ 3146e02119d5SChris Mason if (bytes_del) { 3147e02119d5SChris Mason struct btrfs_key key; 3148e02119d5SChris Mason 314933345d01SLi Zefan key.objectid = dir_ino; 3150e02119d5SChris Mason key.offset = 0; 3151e02119d5SChris Mason key.type = BTRFS_INODE_ITEM_KEY; 3152b3b4aa74SDavid Sterba btrfs_release_path(path); 3153e02119d5SChris Mason 3154e02119d5SChris Mason ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 31554a500fd1SYan, Zheng if (ret < 0) { 
31564a500fd1SYan, Zheng err = ret; 31574a500fd1SYan, Zheng goto fail; 31584a500fd1SYan, Zheng } 3159e02119d5SChris Mason if (ret == 0) { 3160e02119d5SChris Mason struct btrfs_inode_item *item; 3161e02119d5SChris Mason u64 i_size; 3162e02119d5SChris Mason 3163e02119d5SChris Mason item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3164e02119d5SChris Mason struct btrfs_inode_item); 3165e02119d5SChris Mason i_size = btrfs_inode_size(path->nodes[0], item); 3166e02119d5SChris Mason if (i_size > bytes_del) 3167e02119d5SChris Mason i_size -= bytes_del; 3168e02119d5SChris Mason else 3169e02119d5SChris Mason i_size = 0; 3170e02119d5SChris Mason btrfs_set_inode_size(path->nodes[0], item, i_size); 3171e02119d5SChris Mason btrfs_mark_buffer_dirty(path->nodes[0]); 3172e02119d5SChris Mason } else 3173e02119d5SChris Mason ret = 0; 3174b3b4aa74SDavid Sterba btrfs_release_path(path); 3175e02119d5SChris Mason } 31764a500fd1SYan, Zheng fail: 3177e02119d5SChris Mason btrfs_free_path(path); 3178a62f44a5STsutomu Itoh out_unlock: 317949f34d1fSNikolay Borisov mutex_unlock(&dir->log_mutex); 31804a500fd1SYan, Zheng if (ret == -ENOSPC) { 3181995946ddSMiao Xie btrfs_set_log_full_commit(root->fs_info, trans); 31824a500fd1SYan, Zheng ret = 0; 318379787eaaSJeff Mahoney } else if (ret < 0) 318466642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 318579787eaaSJeff Mahoney 318612fcfd22SChris Mason btrfs_end_log_trans(root); 3187e02119d5SChris Mason 3188411fc6bcSAndi Kleen return err; 3189e02119d5SChris Mason } 3190e02119d5SChris Mason 3191e02119d5SChris Mason /* see comments for btrfs_del_dir_entries_in_log */ 3192e02119d5SChris Mason int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3193e02119d5SChris Mason struct btrfs_root *root, 3194e02119d5SChris Mason const char *name, int name_len, 3195a491abb2SNikolay Borisov struct btrfs_inode *inode, u64 dirid) 3196e02119d5SChris Mason { 31970b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 3198e02119d5SChris 
Mason struct btrfs_root *log; 3199e02119d5SChris Mason u64 index; 3200e02119d5SChris Mason int ret; 3201e02119d5SChris Mason 3202a491abb2SNikolay Borisov if (inode->logged_trans < trans->transid) 32033a5f1d45SChris Mason return 0; 32043a5f1d45SChris Mason 3205e02119d5SChris Mason ret = join_running_log_trans(root); 3206e02119d5SChris Mason if (ret) 3207e02119d5SChris Mason return 0; 3208e02119d5SChris Mason log = root->log_root; 3209a491abb2SNikolay Borisov mutex_lock(&inode->log_mutex); 3210e02119d5SChris Mason 3211a491abb2SNikolay Borisov ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3212e02119d5SChris Mason dirid, &index); 3213a491abb2SNikolay Borisov mutex_unlock(&inode->log_mutex); 32144a500fd1SYan, Zheng if (ret == -ENOSPC) { 32150b246afaSJeff Mahoney btrfs_set_log_full_commit(fs_info, trans); 32164a500fd1SYan, Zheng ret = 0; 321779787eaaSJeff Mahoney } else if (ret < 0 && ret != -ENOENT) 321866642832SJeff Mahoney btrfs_abort_transaction(trans, ret); 321912fcfd22SChris Mason btrfs_end_log_trans(root); 3220e02119d5SChris Mason 3221e02119d5SChris Mason return ret; 3222e02119d5SChris Mason } 3223e02119d5SChris Mason 3224e02119d5SChris Mason /* 3225e02119d5SChris Mason * creates a range item in the log for 'dirid'. first_offset and 3226e02119d5SChris Mason * last_offset tell us which parts of the key space the log should 3227e02119d5SChris Mason * be considered authoritative for. 
3228e02119d5SChris Mason */ 3229e02119d5SChris Mason static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3230e02119d5SChris Mason struct btrfs_root *log, 3231e02119d5SChris Mason struct btrfs_path *path, 3232e02119d5SChris Mason int key_type, u64 dirid, 3233e02119d5SChris Mason u64 first_offset, u64 last_offset) 3234e02119d5SChris Mason { 3235e02119d5SChris Mason int ret; 3236e02119d5SChris Mason struct btrfs_key key; 3237e02119d5SChris Mason struct btrfs_dir_log_item *item; 3238e02119d5SChris Mason 3239e02119d5SChris Mason key.objectid = dirid; 3240e02119d5SChris Mason key.offset = first_offset; 3241e02119d5SChris Mason if (key_type == BTRFS_DIR_ITEM_KEY) 3242e02119d5SChris Mason key.type = BTRFS_DIR_LOG_ITEM_KEY; 3243e02119d5SChris Mason else 3244e02119d5SChris Mason key.type = BTRFS_DIR_LOG_INDEX_KEY; 3245e02119d5SChris Mason ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 32464a500fd1SYan, Zheng if (ret) 32474a500fd1SYan, Zheng return ret; 3248e02119d5SChris Mason 3249e02119d5SChris Mason item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3250e02119d5SChris Mason struct btrfs_dir_log_item); 3251e02119d5SChris Mason btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3252e02119d5SChris Mason btrfs_mark_buffer_dirty(path->nodes[0]); 3253b3b4aa74SDavid Sterba btrfs_release_path(path); 3254e02119d5SChris Mason return 0; 3255e02119d5SChris Mason } 3256e02119d5SChris Mason 3257e02119d5SChris Mason /* 3258e02119d5SChris Mason * log all the items included in the current transaction for a given 3259e02119d5SChris Mason * directory. 
This also creates the range items in the log tree required 3260e02119d5SChris Mason * to replay anything deleted before the fsync 3261e02119d5SChris Mason */ 3262e02119d5SChris Mason static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3263e02119d5SChris Mason struct btrfs_root *root, struct inode *inode, 3264e02119d5SChris Mason struct btrfs_path *path, 3265e02119d5SChris Mason struct btrfs_path *dst_path, int key_type, 32662f2ff0eeSFilipe Manana struct btrfs_log_ctx *ctx, 3267e02119d5SChris Mason u64 min_offset, u64 *last_offset_ret) 3268e02119d5SChris Mason { 3269e02119d5SChris Mason struct btrfs_key min_key; 3270e02119d5SChris Mason struct btrfs_root *log = root->log_root; 3271e02119d5SChris Mason struct extent_buffer *src; 32724a500fd1SYan, Zheng int err = 0; 3273e02119d5SChris Mason int ret; 3274e02119d5SChris Mason int i; 3275e02119d5SChris Mason int nritems; 3276e02119d5SChris Mason u64 first_offset = min_offset; 3277e02119d5SChris Mason u64 last_offset = (u64)-1; 32784a0cc7caSNikolay Borisov u64 ino = btrfs_ino(BTRFS_I(inode)); 3279e02119d5SChris Mason 3280e02119d5SChris Mason log = root->log_root; 3281e02119d5SChris Mason 328233345d01SLi Zefan min_key.objectid = ino; 3283e02119d5SChris Mason min_key.type = key_type; 3284e02119d5SChris Mason min_key.offset = min_offset; 3285e02119d5SChris Mason 32866174d3cbSFilipe David Borba Manana ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3287e02119d5SChris Mason 3288e02119d5SChris Mason /* 3289e02119d5SChris Mason * we didn't find anything from this transaction, see if there 3290e02119d5SChris Mason * is anything at all 3291e02119d5SChris Mason */ 329233345d01SLi Zefan if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 329333345d01SLi Zefan min_key.objectid = ino; 3294e02119d5SChris Mason min_key.type = key_type; 3295e02119d5SChris Mason min_key.offset = (u64)-1; 3296b3b4aa74SDavid Sterba btrfs_release_path(path); 3297e02119d5SChris Mason ret = 
btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3298e02119d5SChris Mason if (ret < 0) { 3299b3b4aa74SDavid Sterba btrfs_release_path(path); 3300e02119d5SChris Mason return ret; 3301e02119d5SChris Mason } 330233345d01SLi Zefan ret = btrfs_previous_item(root, path, ino, key_type); 3303e02119d5SChris Mason 3304e02119d5SChris Mason /* if ret == 0 there are items for this type, 3305e02119d5SChris Mason * create a range to tell us the last key of this type. 3306e02119d5SChris Mason * otherwise, there are no items in this directory after 3307e02119d5SChris Mason * *min_offset, and we create a range to indicate that. 3308e02119d5SChris Mason */ 3309e02119d5SChris Mason if (ret == 0) { 3310e02119d5SChris Mason struct btrfs_key tmp; 3311e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3312e02119d5SChris Mason path->slots[0]); 3313d397712bSChris Mason if (key_type == tmp.type) 3314e02119d5SChris Mason first_offset = max(min_offset, tmp.offset) + 1; 3315e02119d5SChris Mason } 3316e02119d5SChris Mason goto done; 3317e02119d5SChris Mason } 3318e02119d5SChris Mason 3319e02119d5SChris Mason /* go backward to find any previous key */ 332033345d01SLi Zefan ret = btrfs_previous_item(root, path, ino, key_type); 3321e02119d5SChris Mason if (ret == 0) { 3322e02119d5SChris Mason struct btrfs_key tmp; 3323e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3324e02119d5SChris Mason if (key_type == tmp.type) { 3325e02119d5SChris Mason first_offset = tmp.offset; 3326e02119d5SChris Mason ret = overwrite_item(trans, log, dst_path, 3327e02119d5SChris Mason path->nodes[0], path->slots[0], 3328e02119d5SChris Mason &tmp); 33294a500fd1SYan, Zheng if (ret) { 33304a500fd1SYan, Zheng err = ret; 33314a500fd1SYan, Zheng goto done; 33324a500fd1SYan, Zheng } 3333e02119d5SChris Mason } 3334e02119d5SChris Mason } 3335b3b4aa74SDavid Sterba btrfs_release_path(path); 3336e02119d5SChris Mason 3337e02119d5SChris Mason /* find the first key from this transaction 
again */ 3338e02119d5SChris Mason ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3339fae7f21cSDulshani Gunawardhana if (WARN_ON(ret != 0)) 3340e02119d5SChris Mason goto done; 3341e02119d5SChris Mason 3342e02119d5SChris Mason /* 3343e02119d5SChris Mason * we have a block from this transaction, log every item in it 3344e02119d5SChris Mason * from our directory 3345e02119d5SChris Mason */ 3346e02119d5SChris Mason while (1) { 3347e02119d5SChris Mason struct btrfs_key tmp; 3348e02119d5SChris Mason src = path->nodes[0]; 3349e02119d5SChris Mason nritems = btrfs_header_nritems(src); 3350e02119d5SChris Mason for (i = path->slots[0]; i < nritems; i++) { 33512f2ff0eeSFilipe Manana struct btrfs_dir_item *di; 33522f2ff0eeSFilipe Manana 3353e02119d5SChris Mason btrfs_item_key_to_cpu(src, &min_key, i); 3354e02119d5SChris Mason 335533345d01SLi Zefan if (min_key.objectid != ino || min_key.type != key_type) 3356e02119d5SChris Mason goto done; 3357e02119d5SChris Mason ret = overwrite_item(trans, log, dst_path, src, i, 3358e02119d5SChris Mason &min_key); 33594a500fd1SYan, Zheng if (ret) { 33604a500fd1SYan, Zheng err = ret; 33614a500fd1SYan, Zheng goto done; 33624a500fd1SYan, Zheng } 33632f2ff0eeSFilipe Manana 33642f2ff0eeSFilipe Manana /* 33652f2ff0eeSFilipe Manana * We must make sure that when we log a directory entry, 33662f2ff0eeSFilipe Manana * the corresponding inode, after log replay, has a 33672f2ff0eeSFilipe Manana * matching link count. 
For example: 33682f2ff0eeSFilipe Manana * 33692f2ff0eeSFilipe Manana * touch foo 33702f2ff0eeSFilipe Manana * mkdir mydir 33712f2ff0eeSFilipe Manana * sync 33722f2ff0eeSFilipe Manana * ln foo mydir/bar 33732f2ff0eeSFilipe Manana * xfs_io -c "fsync" mydir 33742f2ff0eeSFilipe Manana * <crash> 33752f2ff0eeSFilipe Manana * <mount fs and log replay> 33762f2ff0eeSFilipe Manana * 33772f2ff0eeSFilipe Manana * Would result in a fsync log that when replayed, our 33782f2ff0eeSFilipe Manana * file inode would have a link count of 1, but we get 33792f2ff0eeSFilipe Manana * two directory entries pointing to the same inode. 33802f2ff0eeSFilipe Manana * After removing one of the names, it would not be 33812f2ff0eeSFilipe Manana * possible to remove the other name, which resulted 33822f2ff0eeSFilipe Manana * always in stale file handle errors, and would not 33832f2ff0eeSFilipe Manana * be possible to rmdir the parent directory, since 33842f2ff0eeSFilipe Manana * its i_size could never decrement to the value 33852f2ff0eeSFilipe Manana * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 
33862f2ff0eeSFilipe Manana */ 33872f2ff0eeSFilipe Manana di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 33882f2ff0eeSFilipe Manana btrfs_dir_item_key_to_cpu(src, di, &tmp); 33892f2ff0eeSFilipe Manana if (ctx && 33902f2ff0eeSFilipe Manana (btrfs_dir_transid(src, di) == trans->transid || 33912f2ff0eeSFilipe Manana btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 33922f2ff0eeSFilipe Manana tmp.type != BTRFS_ROOT_ITEM_KEY) 33932f2ff0eeSFilipe Manana ctx->log_new_dentries = true; 3394e02119d5SChris Mason } 3395e02119d5SChris Mason path->slots[0] = nritems; 3396e02119d5SChris Mason 3397e02119d5SChris Mason /* 3398e02119d5SChris Mason * look ahead to the next item and see if it is also 3399e02119d5SChris Mason * from this directory and from this transaction 3400e02119d5SChris Mason */ 3401e02119d5SChris Mason ret = btrfs_next_leaf(root, path); 3402e02119d5SChris Mason if (ret == 1) { 3403e02119d5SChris Mason last_offset = (u64)-1; 3404e02119d5SChris Mason goto done; 3405e02119d5SChris Mason } 3406e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 340733345d01SLi Zefan if (tmp.objectid != ino || tmp.type != key_type) { 3408e02119d5SChris Mason last_offset = (u64)-1; 3409e02119d5SChris Mason goto done; 3410e02119d5SChris Mason } 3411e02119d5SChris Mason if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3412e02119d5SChris Mason ret = overwrite_item(trans, log, dst_path, 3413e02119d5SChris Mason path->nodes[0], path->slots[0], 3414e02119d5SChris Mason &tmp); 34154a500fd1SYan, Zheng if (ret) 34164a500fd1SYan, Zheng err = ret; 34174a500fd1SYan, Zheng else 3418e02119d5SChris Mason last_offset = tmp.offset; 3419e02119d5SChris Mason goto done; 3420e02119d5SChris Mason } 3421e02119d5SChris Mason } 3422e02119d5SChris Mason done: 3423b3b4aa74SDavid Sterba btrfs_release_path(path); 3424b3b4aa74SDavid Sterba btrfs_release_path(dst_path); 3425e02119d5SChris Mason 34264a500fd1SYan, Zheng if (err == 0) { 34274a500fd1SYan, Zheng 
*last_offset_ret = last_offset; 34284a500fd1SYan, Zheng /* 34294a500fd1SYan, Zheng * insert the log range keys to indicate where the log 34304a500fd1SYan, Zheng * is valid 34314a500fd1SYan, Zheng */ 34324a500fd1SYan, Zheng ret = insert_dir_log_key(trans, log, path, key_type, 343333345d01SLi Zefan ino, first_offset, last_offset); 34344a500fd1SYan, Zheng if (ret) 34354a500fd1SYan, Zheng err = ret; 34364a500fd1SYan, Zheng } 34374a500fd1SYan, Zheng return err; 3438e02119d5SChris Mason } 3439e02119d5SChris Mason 3440e02119d5SChris Mason /* 3441e02119d5SChris Mason * logging directories is very similar to logging inodes, We find all the items 3442e02119d5SChris Mason * from the current transaction and write them to the log. 3443e02119d5SChris Mason * 3444e02119d5SChris Mason * The recovery code scans the directory in the subvolume, and if it finds a 3445e02119d5SChris Mason * key in the range logged that is not present in the log tree, then it means 3446e02119d5SChris Mason * that dir entry was unlinked during the transaction. 3447e02119d5SChris Mason * 3448e02119d5SChris Mason * In order for that scan to work, we must include one key smaller than 3449e02119d5SChris Mason * the smallest logged by this transaction and one key larger than the largest 3450e02119d5SChris Mason * key logged by this transaction. 
3451e02119d5SChris Mason */ 3452e02119d5SChris Mason static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3453e02119d5SChris Mason struct btrfs_root *root, struct inode *inode, 3454e02119d5SChris Mason struct btrfs_path *path, 34552f2ff0eeSFilipe Manana struct btrfs_path *dst_path, 34562f2ff0eeSFilipe Manana struct btrfs_log_ctx *ctx) 3457e02119d5SChris Mason { 3458e02119d5SChris Mason u64 min_key; 3459e02119d5SChris Mason u64 max_key; 3460e02119d5SChris Mason int ret; 3461e02119d5SChris Mason int key_type = BTRFS_DIR_ITEM_KEY; 3462e02119d5SChris Mason 3463e02119d5SChris Mason again: 3464e02119d5SChris Mason min_key = 0; 3465e02119d5SChris Mason max_key = 0; 3466e02119d5SChris Mason while (1) { 3467e02119d5SChris Mason ret = log_dir_items(trans, root, inode, path, 34682f2ff0eeSFilipe Manana dst_path, key_type, ctx, min_key, 3469e02119d5SChris Mason &max_key); 34704a500fd1SYan, Zheng if (ret) 34714a500fd1SYan, Zheng return ret; 3472e02119d5SChris Mason if (max_key == (u64)-1) 3473e02119d5SChris Mason break; 3474e02119d5SChris Mason min_key = max_key + 1; 3475e02119d5SChris Mason } 3476e02119d5SChris Mason 3477e02119d5SChris Mason if (key_type == BTRFS_DIR_ITEM_KEY) { 3478e02119d5SChris Mason key_type = BTRFS_DIR_INDEX_KEY; 3479e02119d5SChris Mason goto again; 3480e02119d5SChris Mason } 3481e02119d5SChris Mason return 0; 3482e02119d5SChris Mason } 3483e02119d5SChris Mason 3484e02119d5SChris Mason /* 3485e02119d5SChris Mason * a helper function to drop items from the log before we relog an 3486e02119d5SChris Mason * inode. max_key_type indicates the highest item type to remove. 3487e02119d5SChris Mason * This cannot be run for file data extents because it does not 3488e02119d5SChris Mason * free the extents they point to. 
3489e02119d5SChris Mason */ 3490e02119d5SChris Mason static int drop_objectid_items(struct btrfs_trans_handle *trans, 3491e02119d5SChris Mason struct btrfs_root *log, 3492e02119d5SChris Mason struct btrfs_path *path, 3493e02119d5SChris Mason u64 objectid, int max_key_type) 3494e02119d5SChris Mason { 3495e02119d5SChris Mason int ret; 3496e02119d5SChris Mason struct btrfs_key key; 3497e02119d5SChris Mason struct btrfs_key found_key; 349818ec90d6SJosef Bacik int start_slot; 3499e02119d5SChris Mason 3500e02119d5SChris Mason key.objectid = objectid; 3501e02119d5SChris Mason key.type = max_key_type; 3502e02119d5SChris Mason key.offset = (u64)-1; 3503e02119d5SChris Mason 3504e02119d5SChris Mason while (1) { 3505e02119d5SChris Mason ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 35063650860bSJosef Bacik BUG_ON(ret == 0); /* Logic error */ 35074a500fd1SYan, Zheng if (ret < 0) 3508e02119d5SChris Mason break; 3509e02119d5SChris Mason 3510e02119d5SChris Mason if (path->slots[0] == 0) 3511e02119d5SChris Mason break; 3512e02119d5SChris Mason 3513e02119d5SChris Mason path->slots[0]--; 3514e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3515e02119d5SChris Mason path->slots[0]); 3516e02119d5SChris Mason 3517e02119d5SChris Mason if (found_key.objectid != objectid) 3518e02119d5SChris Mason break; 3519e02119d5SChris Mason 352018ec90d6SJosef Bacik found_key.offset = 0; 352118ec90d6SJosef Bacik found_key.type = 0; 352218ec90d6SJosef Bacik ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 352318ec90d6SJosef Bacik &start_slot); 352418ec90d6SJosef Bacik 352518ec90d6SJosef Bacik ret = btrfs_del_items(trans, log, path, start_slot, 352618ec90d6SJosef Bacik path->slots[0] - start_slot + 1); 352718ec90d6SJosef Bacik /* 352818ec90d6SJosef Bacik * If start slot isn't 0 then we don't need to re-search, we've 352918ec90d6SJosef Bacik * found the last guy with the objectid in this tree. 
353018ec90d6SJosef Bacik */ 353118ec90d6SJosef Bacik if (ret || start_slot != 0) 353265a246c5STsutomu Itoh break; 3533b3b4aa74SDavid Sterba btrfs_release_path(path); 3534e02119d5SChris Mason } 3535b3b4aa74SDavid Sterba btrfs_release_path(path); 35365bdbeb21SJosef Bacik if (ret > 0) 35375bdbeb21SJosef Bacik ret = 0; 35384a500fd1SYan, Zheng return ret; 3539e02119d5SChris Mason } 3540e02119d5SChris Mason 354194edf4aeSJosef Bacik static void fill_inode_item(struct btrfs_trans_handle *trans, 354294edf4aeSJosef Bacik struct extent_buffer *leaf, 354394edf4aeSJosef Bacik struct btrfs_inode_item *item, 35441a4bcf47SFilipe Manana struct inode *inode, int log_inode_only, 35451a4bcf47SFilipe Manana u64 logged_isize) 354694edf4aeSJosef Bacik { 35470b1c6ccaSJosef Bacik struct btrfs_map_token token; 354894edf4aeSJosef Bacik 35490b1c6ccaSJosef Bacik btrfs_init_map_token(&token); 355094edf4aeSJosef Bacik 355194edf4aeSJosef Bacik if (log_inode_only) { 355294edf4aeSJosef Bacik /* set the generation to zero so the recover code 355394edf4aeSJosef Bacik * can tell the difference between an logging 355494edf4aeSJosef Bacik * just to say 'this inode exists' and a logging 355594edf4aeSJosef Bacik * to say 'update this inode with these values' 355694edf4aeSJosef Bacik */ 35570b1c6ccaSJosef Bacik btrfs_set_token_inode_generation(leaf, item, 0, &token); 35581a4bcf47SFilipe Manana btrfs_set_token_inode_size(leaf, item, logged_isize, &token); 355994edf4aeSJosef Bacik } else { 35600b1c6ccaSJosef Bacik btrfs_set_token_inode_generation(leaf, item, 35610b1c6ccaSJosef Bacik BTRFS_I(inode)->generation, 35620b1c6ccaSJosef Bacik &token); 35630b1c6ccaSJosef Bacik btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 356494edf4aeSJosef Bacik } 356594edf4aeSJosef Bacik 35660b1c6ccaSJosef Bacik btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 35670b1c6ccaSJosef Bacik btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 35680b1c6ccaSJosef Bacik 
btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 35690b1c6ccaSJosef Bacik btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 35700b1c6ccaSJosef Bacik 3571a937b979SDavid Sterba btrfs_set_token_timespec_sec(leaf, &item->atime, 35720b1c6ccaSJosef Bacik inode->i_atime.tv_sec, &token); 3573a937b979SDavid Sterba btrfs_set_token_timespec_nsec(leaf, &item->atime, 35740b1c6ccaSJosef Bacik inode->i_atime.tv_nsec, &token); 35750b1c6ccaSJosef Bacik 3576a937b979SDavid Sterba btrfs_set_token_timespec_sec(leaf, &item->mtime, 35770b1c6ccaSJosef Bacik inode->i_mtime.tv_sec, &token); 3578a937b979SDavid Sterba btrfs_set_token_timespec_nsec(leaf, &item->mtime, 35790b1c6ccaSJosef Bacik inode->i_mtime.tv_nsec, &token); 35800b1c6ccaSJosef Bacik 3581a937b979SDavid Sterba btrfs_set_token_timespec_sec(leaf, &item->ctime, 35820b1c6ccaSJosef Bacik inode->i_ctime.tv_sec, &token); 3583a937b979SDavid Sterba btrfs_set_token_timespec_nsec(leaf, &item->ctime, 35840b1c6ccaSJosef Bacik inode->i_ctime.tv_nsec, &token); 35850b1c6ccaSJosef Bacik 35860b1c6ccaSJosef Bacik btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 35870b1c6ccaSJosef Bacik &token); 35880b1c6ccaSJosef Bacik 35890b1c6ccaSJosef Bacik btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 35900b1c6ccaSJosef Bacik btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 35910b1c6ccaSJosef Bacik btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 35920b1c6ccaSJosef Bacik btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 35930b1c6ccaSJosef Bacik btrfs_set_token_inode_block_group(leaf, item, 0, &token); 359494edf4aeSJosef Bacik } 359594edf4aeSJosef Bacik 3596a95249b3SJosef Bacik static int log_inode_item(struct btrfs_trans_handle *trans, 3597a95249b3SJosef Bacik struct btrfs_root *log, struct btrfs_path *path, 3598a95249b3SJosef Bacik struct inode *inode) 3599a95249b3SJosef Bacik { 3600a95249b3SJosef Bacik struct btrfs_inode_item 
*inode_item; 3601a95249b3SJosef Bacik int ret; 3602a95249b3SJosef Bacik 3603efd0c405SFilipe David Borba Manana ret = btrfs_insert_empty_item(trans, log, path, 3604efd0c405SFilipe David Borba Manana &BTRFS_I(inode)->location, 3605a95249b3SJosef Bacik sizeof(*inode_item)); 3606a95249b3SJosef Bacik if (ret && ret != -EEXIST) 3607a95249b3SJosef Bacik return ret; 3608a95249b3SJosef Bacik inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3609a95249b3SJosef Bacik struct btrfs_inode_item); 36101a4bcf47SFilipe Manana fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); 3611a95249b3SJosef Bacik btrfs_release_path(path); 3612a95249b3SJosef Bacik return 0; 3613a95249b3SJosef Bacik } 3614a95249b3SJosef Bacik 361531ff1cd2SChris Mason static noinline int copy_items(struct btrfs_trans_handle *trans, 361644d70e19SNikolay Borisov struct btrfs_inode *inode, 361731ff1cd2SChris Mason struct btrfs_path *dst_path, 361816e7549fSJosef Bacik struct btrfs_path *src_path, u64 *last_extent, 36191a4bcf47SFilipe Manana int start_slot, int nr, int inode_only, 36201a4bcf47SFilipe Manana u64 logged_isize) 362131ff1cd2SChris Mason { 362244d70e19SNikolay Borisov struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 362331ff1cd2SChris Mason unsigned long src_offset; 362431ff1cd2SChris Mason unsigned long dst_offset; 362544d70e19SNikolay Borisov struct btrfs_root *log = inode->root->log_root; 362631ff1cd2SChris Mason struct btrfs_file_extent_item *extent; 362731ff1cd2SChris Mason struct btrfs_inode_item *inode_item; 362816e7549fSJosef Bacik struct extent_buffer *src = src_path->nodes[0]; 362916e7549fSJosef Bacik struct btrfs_key first_key, last_key, key; 363031ff1cd2SChris Mason int ret; 363131ff1cd2SChris Mason struct btrfs_key *ins_keys; 363231ff1cd2SChris Mason u32 *ins_sizes; 363331ff1cd2SChris Mason char *ins_data; 363431ff1cd2SChris Mason int i; 3635d20f7043SChris Mason struct list_head ordered_sums; 363644d70e19SNikolay Borisov int skip_csum = inode->flags & 
BTRFS_INODE_NODATASUM; 363716e7549fSJosef Bacik bool has_extents = false; 363874121f7cSFilipe Manana bool need_find_last_extent = true; 363916e7549fSJosef Bacik bool done = false; 3640d20f7043SChris Mason 3641d20f7043SChris Mason INIT_LIST_HEAD(&ordered_sums); 364231ff1cd2SChris Mason 364331ff1cd2SChris Mason ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 364431ff1cd2SChris Mason nr * sizeof(u32), GFP_NOFS); 36452a29edc6Sliubo if (!ins_data) 36462a29edc6Sliubo return -ENOMEM; 36472a29edc6Sliubo 364816e7549fSJosef Bacik first_key.objectid = (u64)-1; 364916e7549fSJosef Bacik 365031ff1cd2SChris Mason ins_sizes = (u32 *)ins_data; 365131ff1cd2SChris Mason ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 365231ff1cd2SChris Mason 365331ff1cd2SChris Mason for (i = 0; i < nr; i++) { 365431ff1cd2SChris Mason ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 365531ff1cd2SChris Mason btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 365631ff1cd2SChris Mason } 365731ff1cd2SChris Mason ret = btrfs_insert_empty_items(trans, log, dst_path, 365831ff1cd2SChris Mason ins_keys, ins_sizes, nr); 36594a500fd1SYan, Zheng if (ret) { 36604a500fd1SYan, Zheng kfree(ins_data); 36614a500fd1SYan, Zheng return ret; 36624a500fd1SYan, Zheng } 366331ff1cd2SChris Mason 36645d4f98a2SYan Zheng for (i = 0; i < nr; i++, dst_path->slots[0]++) { 366531ff1cd2SChris Mason dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 366631ff1cd2SChris Mason dst_path->slots[0]); 366731ff1cd2SChris Mason 366831ff1cd2SChris Mason src_offset = btrfs_item_ptr_offset(src, start_slot + i); 366931ff1cd2SChris Mason 367016e7549fSJosef Bacik if ((i == (nr - 1))) 367116e7549fSJosef Bacik last_key = ins_keys[i]; 367216e7549fSJosef Bacik 367394edf4aeSJosef Bacik if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 367431ff1cd2SChris Mason inode_item = btrfs_item_ptr(dst_path->nodes[0], 367531ff1cd2SChris Mason dst_path->slots[0], 367631ff1cd2SChris Mason struct btrfs_inode_item); 367794edf4aeSJosef 
Bacik fill_inode_item(trans, dst_path->nodes[0], inode_item, 367844d70e19SNikolay Borisov &inode->vfs_inode, inode_only == LOG_INODE_EXISTS, 36791a4bcf47SFilipe Manana logged_isize); 368094edf4aeSJosef Bacik } else { 368194edf4aeSJosef Bacik copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 368294edf4aeSJosef Bacik src_offset, ins_sizes[i]); 368331ff1cd2SChris Mason } 368494edf4aeSJosef Bacik 368516e7549fSJosef Bacik /* 368616e7549fSJosef Bacik * We set need_find_last_extent here in case we know we were 368716e7549fSJosef Bacik * processing other items and then walk into the first extent in 368816e7549fSJosef Bacik * the inode. If we don't hit an extent then nothing changes, 368916e7549fSJosef Bacik * we'll do the last search the next time around. 369016e7549fSJosef Bacik */ 369116e7549fSJosef Bacik if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 369216e7549fSJosef Bacik has_extents = true; 369374121f7cSFilipe Manana if (first_key.objectid == (u64)-1) 369416e7549fSJosef Bacik first_key = ins_keys[i]; 369516e7549fSJosef Bacik } else { 369616e7549fSJosef Bacik need_find_last_extent = false; 369716e7549fSJosef Bacik } 369816e7549fSJosef Bacik 369931ff1cd2SChris Mason /* take a reference on file data extents so that truncates 370031ff1cd2SChris Mason * or deletes of this inode don't have to relog the inode 370131ff1cd2SChris Mason * again 370231ff1cd2SChris Mason */ 3703962a298fSDavid Sterba if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 3704d2794405SLiu Bo !skip_csum) { 370531ff1cd2SChris Mason int found_type; 370631ff1cd2SChris Mason extent = btrfs_item_ptr(src, start_slot + i, 370731ff1cd2SChris Mason struct btrfs_file_extent_item); 370831ff1cd2SChris Mason 37098e531cdfSliubo if (btrfs_file_extent_generation(src, extent) < trans->transid) 37108e531cdfSliubo continue; 37118e531cdfSliubo 371231ff1cd2SChris Mason found_type = btrfs_file_extent_type(src, extent); 37136f1fed77SJosef Bacik if (found_type == BTRFS_FILE_EXTENT_REG) { 37145d4f98a2SYan Zheng u64 ds, 
dl, cs, cl; 37155d4f98a2SYan Zheng ds = btrfs_file_extent_disk_bytenr(src, 371631ff1cd2SChris Mason extent); 37175d4f98a2SYan Zheng /* ds == 0 is a hole */ 37185d4f98a2SYan Zheng if (ds == 0) 37195d4f98a2SYan Zheng continue; 37205d4f98a2SYan Zheng 37215d4f98a2SYan Zheng dl = btrfs_file_extent_disk_num_bytes(src, 372231ff1cd2SChris Mason extent); 37235d4f98a2SYan Zheng cs = btrfs_file_extent_offset(src, extent); 37245d4f98a2SYan Zheng cl = btrfs_file_extent_num_bytes(src, 3725a419aef8SJoe Perches extent); 3726580afd76SChris Mason if (btrfs_file_extent_compression(src, 3727580afd76SChris Mason extent)) { 3728580afd76SChris Mason cs = 0; 3729580afd76SChris Mason cl = dl; 3730580afd76SChris Mason } 37315d4f98a2SYan Zheng 373207d400a6SYan Zheng ret = btrfs_lookup_csums_range( 37330b246afaSJeff Mahoney fs_info->csum_root, 373407d400a6SYan Zheng ds + cs, ds + cs + cl - 1, 3735a2de733cSArne Jansen &ordered_sums, 0); 37363650860bSJosef Bacik if (ret) { 37373650860bSJosef Bacik btrfs_release_path(dst_path); 37383650860bSJosef Bacik kfree(ins_data); 37393650860bSJosef Bacik return ret; 37403650860bSJosef Bacik } 374131ff1cd2SChris Mason } 374231ff1cd2SChris Mason } 374331ff1cd2SChris Mason } 374431ff1cd2SChris Mason 374531ff1cd2SChris Mason btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3746b3b4aa74SDavid Sterba btrfs_release_path(dst_path); 374731ff1cd2SChris Mason kfree(ins_data); 3748d20f7043SChris Mason 3749d20f7043SChris Mason /* 3750d20f7043SChris Mason * we have to do this after the loop above to avoid changing the 3751d20f7043SChris Mason * log tree while trying to change the log tree. 
3752d20f7043SChris Mason */ 37534a500fd1SYan, Zheng ret = 0; 3754d20f7043SChris Mason while (!list_empty(&ordered_sums)) { 3755d20f7043SChris Mason struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3756d20f7043SChris Mason struct btrfs_ordered_sum, 3757d20f7043SChris Mason list); 37584a500fd1SYan, Zheng if (!ret) 3759d20f7043SChris Mason ret = btrfs_csum_file_blocks(trans, log, sums); 3760d20f7043SChris Mason list_del(&sums->list); 3761d20f7043SChris Mason kfree(sums); 3762d20f7043SChris Mason } 376316e7549fSJosef Bacik 376416e7549fSJosef Bacik if (!has_extents) 376516e7549fSJosef Bacik return ret; 376616e7549fSJosef Bacik 376774121f7cSFilipe Manana if (need_find_last_extent && *last_extent == first_key.offset) { 376874121f7cSFilipe Manana /* 376974121f7cSFilipe Manana * We don't have any leafs between our current one and the one 377074121f7cSFilipe Manana * we processed before that can have file extent items for our 377174121f7cSFilipe Manana * inode (and have a generation number smaller than our current 377274121f7cSFilipe Manana * transaction id). 377374121f7cSFilipe Manana */ 377474121f7cSFilipe Manana need_find_last_extent = false; 377574121f7cSFilipe Manana } 377674121f7cSFilipe Manana 377716e7549fSJosef Bacik /* 377816e7549fSJosef Bacik * Because we use btrfs_search_forward we could skip leaves that were 377916e7549fSJosef Bacik * not modified and then assume *last_extent is valid when it really 378016e7549fSJosef Bacik * isn't. So back up to the previous leaf and read the end of the last 378116e7549fSJosef Bacik * extent before we go and fill in holes. 
378216e7549fSJosef Bacik */ 378316e7549fSJosef Bacik if (need_find_last_extent) { 378416e7549fSJosef Bacik u64 len; 378516e7549fSJosef Bacik 378644d70e19SNikolay Borisov ret = btrfs_prev_leaf(inode->root, src_path); 378716e7549fSJosef Bacik if (ret < 0) 378816e7549fSJosef Bacik return ret; 378916e7549fSJosef Bacik if (ret) 379016e7549fSJosef Bacik goto fill_holes; 379116e7549fSJosef Bacik if (src_path->slots[0]) 379216e7549fSJosef Bacik src_path->slots[0]--; 379316e7549fSJosef Bacik src = src_path->nodes[0]; 379416e7549fSJosef Bacik btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); 379544d70e19SNikolay Borisov if (key.objectid != btrfs_ino(inode) || 379616e7549fSJosef Bacik key.type != BTRFS_EXTENT_DATA_KEY) 379716e7549fSJosef Bacik goto fill_holes; 379816e7549fSJosef Bacik extent = btrfs_item_ptr(src, src_path->slots[0], 379916e7549fSJosef Bacik struct btrfs_file_extent_item); 380016e7549fSJosef Bacik if (btrfs_file_extent_type(src, extent) == 380116e7549fSJosef Bacik BTRFS_FILE_EXTENT_INLINE) { 3802514ac8adSChris Mason len = btrfs_file_extent_inline_len(src, 3803514ac8adSChris Mason src_path->slots[0], 3804514ac8adSChris Mason extent); 380516e7549fSJosef Bacik *last_extent = ALIGN(key.offset + len, 38060b246afaSJeff Mahoney fs_info->sectorsize); 380716e7549fSJosef Bacik } else { 380816e7549fSJosef Bacik len = btrfs_file_extent_num_bytes(src, extent); 380916e7549fSJosef Bacik *last_extent = key.offset + len; 381016e7549fSJosef Bacik } 381116e7549fSJosef Bacik } 381216e7549fSJosef Bacik fill_holes: 381316e7549fSJosef Bacik /* So we did prev_leaf, now we need to move to the next leaf, but a few 381416e7549fSJosef Bacik * things could have happened 381516e7549fSJosef Bacik * 381616e7549fSJosef Bacik * 1) A merge could have happened, so we could currently be on a leaf 381716e7549fSJosef Bacik * that holds what we were copying in the first place. 
381816e7549fSJosef Bacik * 2) A split could have happened, and now not all of the items we want 381916e7549fSJosef Bacik * are on the same leaf. 382016e7549fSJosef Bacik * 382116e7549fSJosef Bacik * So we need to adjust how we search for holes, we need to drop the 382216e7549fSJosef Bacik * path and re-search for the first extent key we found, and then walk 382316e7549fSJosef Bacik * forward until we hit the last one we copied. 382416e7549fSJosef Bacik */ 382516e7549fSJosef Bacik if (need_find_last_extent) { 382616e7549fSJosef Bacik /* btrfs_prev_leaf could return 1 without releasing the path */ 382716e7549fSJosef Bacik btrfs_release_path(src_path); 382844d70e19SNikolay Borisov ret = btrfs_search_slot(NULL, inode->root, &first_key, src_path, 0, 0); 382916e7549fSJosef Bacik if (ret < 0) 383016e7549fSJosef Bacik return ret; 383116e7549fSJosef Bacik ASSERT(ret == 0); 383216e7549fSJosef Bacik src = src_path->nodes[0]; 383316e7549fSJosef Bacik i = src_path->slots[0]; 383416e7549fSJosef Bacik } else { 383516e7549fSJosef Bacik i = start_slot; 383616e7549fSJosef Bacik } 383716e7549fSJosef Bacik 383816e7549fSJosef Bacik /* 383916e7549fSJosef Bacik * Ok so here we need to go through and fill in any holes we may have 384016e7549fSJosef Bacik * to make sure that holes are punched for those areas in case they had 384116e7549fSJosef Bacik * extents previously. 
384216e7549fSJosef Bacik */ 384316e7549fSJosef Bacik while (!done) { 384416e7549fSJosef Bacik u64 offset, len; 384516e7549fSJosef Bacik u64 extent_end; 384616e7549fSJosef Bacik 384716e7549fSJosef Bacik if (i >= btrfs_header_nritems(src_path->nodes[0])) { 384844d70e19SNikolay Borisov ret = btrfs_next_leaf(inode->root, src_path); 384916e7549fSJosef Bacik if (ret < 0) 385016e7549fSJosef Bacik return ret; 385116e7549fSJosef Bacik ASSERT(ret == 0); 385216e7549fSJosef Bacik src = src_path->nodes[0]; 385316e7549fSJosef Bacik i = 0; 385416e7549fSJosef Bacik } 385516e7549fSJosef Bacik 385616e7549fSJosef Bacik btrfs_item_key_to_cpu(src, &key, i); 385716e7549fSJosef Bacik if (!btrfs_comp_cpu_keys(&key, &last_key)) 385816e7549fSJosef Bacik done = true; 385944d70e19SNikolay Borisov if (key.objectid != btrfs_ino(inode) || 386016e7549fSJosef Bacik key.type != BTRFS_EXTENT_DATA_KEY) { 386116e7549fSJosef Bacik i++; 386216e7549fSJosef Bacik continue; 386316e7549fSJosef Bacik } 386416e7549fSJosef Bacik extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); 386516e7549fSJosef Bacik if (btrfs_file_extent_type(src, extent) == 386616e7549fSJosef Bacik BTRFS_FILE_EXTENT_INLINE) { 3867514ac8adSChris Mason len = btrfs_file_extent_inline_len(src, i, extent); 3868da17066cSJeff Mahoney extent_end = ALIGN(key.offset + len, 38690b246afaSJeff Mahoney fs_info->sectorsize); 387016e7549fSJosef Bacik } else { 387116e7549fSJosef Bacik len = btrfs_file_extent_num_bytes(src, extent); 387216e7549fSJosef Bacik extent_end = key.offset + len; 387316e7549fSJosef Bacik } 387416e7549fSJosef Bacik i++; 387516e7549fSJosef Bacik 387616e7549fSJosef Bacik if (*last_extent == key.offset) { 387716e7549fSJosef Bacik *last_extent = extent_end; 387816e7549fSJosef Bacik continue; 387916e7549fSJosef Bacik } 388016e7549fSJosef Bacik offset = *last_extent; 388116e7549fSJosef Bacik len = key.offset - *last_extent; 388244d70e19SNikolay Borisov ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), 
388344d70e19SNikolay Borisov offset, 0, 0, len, 0, len, 0, 0, 0); 388416e7549fSJosef Bacik if (ret) 388516e7549fSJosef Bacik break; 388674121f7cSFilipe Manana *last_extent = extent_end; 388716e7549fSJosef Bacik } 388816e7549fSJosef Bacik /* 388916e7549fSJosef Bacik * Need to let the callers know we dropped the path so they should 389016e7549fSJosef Bacik * re-search. 389116e7549fSJosef Bacik */ 389216e7549fSJosef Bacik if (!ret && need_find_last_extent) 389316e7549fSJosef Bacik ret = 1; 38944a500fd1SYan, Zheng return ret; 389531ff1cd2SChris Mason } 389631ff1cd2SChris Mason 38975dc562c5SJosef Bacik static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 38985dc562c5SJosef Bacik { 38995dc562c5SJosef Bacik struct extent_map *em1, *em2; 39005dc562c5SJosef Bacik 39015dc562c5SJosef Bacik em1 = list_entry(a, struct extent_map, list); 39025dc562c5SJosef Bacik em2 = list_entry(b, struct extent_map, list); 39035dc562c5SJosef Bacik 39045dc562c5SJosef Bacik if (em1->start < em2->start) 39055dc562c5SJosef Bacik return -1; 39065dc562c5SJosef Bacik else if (em1->start > em2->start) 39075dc562c5SJosef Bacik return 1; 39085dc562c5SJosef Bacik return 0; 39095dc562c5SJosef Bacik } 39105dc562c5SJosef Bacik 39118407f553SFilipe Manana static int wait_ordered_extents(struct btrfs_trans_handle *trans, 39128407f553SFilipe Manana struct inode *inode, 39138407f553SFilipe Manana struct btrfs_root *root, 39148407f553SFilipe Manana const struct extent_map *em, 39158407f553SFilipe Manana const struct list_head *logged_list, 39168407f553SFilipe Manana bool *ordered_io_error) 39175dc562c5SJosef Bacik { 39180b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 39192ab28f32SJosef Bacik struct btrfs_ordered_extent *ordered; 39208407f553SFilipe Manana struct btrfs_root *log = root->log_root; 39212ab28f32SJosef Bacik u64 mod_start = em->mod_start; 39222ab28f32SJosef Bacik u64 mod_len = em->mod_len; 39238407f553SFilipe Manana const bool skip_csum = 
BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 39242ab28f32SJosef Bacik u64 csum_offset; 39252ab28f32SJosef Bacik u64 csum_len; 39268407f553SFilipe Manana LIST_HEAD(ordered_sums); 39278407f553SFilipe Manana int ret = 0; 392809a2a8f9SJosef Bacik 39298407f553SFilipe Manana *ordered_io_error = false; 39301acae57bSFilipe David Borba Manana 39318407f553SFilipe Manana if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 39328407f553SFilipe Manana em->block_start == EXTENT_MAP_HOLE) 393370c8a91cSJosef Bacik return 0; 393470c8a91cSJosef Bacik 39352ab28f32SJosef Bacik /* 39368407f553SFilipe Manana * Wait far any ordered extent that covers our extent map. If it 39378407f553SFilipe Manana * finishes without an error, first check and see if our csums are on 39388407f553SFilipe Manana * our outstanding ordered extents. 39392ab28f32SJosef Bacik */ 3940827463c4SMiao Xie list_for_each_entry(ordered, logged_list, log_list) { 39412ab28f32SJosef Bacik struct btrfs_ordered_sum *sum; 39422ab28f32SJosef Bacik 39432ab28f32SJosef Bacik if (!mod_len) 39442ab28f32SJosef Bacik break; 39452ab28f32SJosef Bacik 39462ab28f32SJosef Bacik if (ordered->file_offset + ordered->len <= mod_start || 39472ab28f32SJosef Bacik mod_start + mod_len <= ordered->file_offset) 39482ab28f32SJosef Bacik continue; 39492ab28f32SJosef Bacik 39508407f553SFilipe Manana if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 39518407f553SFilipe Manana !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 39528407f553SFilipe Manana !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 39538407f553SFilipe Manana const u64 start = ordered->file_offset; 39548407f553SFilipe Manana const u64 end = ordered->file_offset + ordered->len - 1; 39558407f553SFilipe Manana 39568407f553SFilipe Manana WARN_ON(ordered->inode != inode); 39578407f553SFilipe Manana filemap_fdatawrite_range(inode->i_mapping, start, end); 39588407f553SFilipe Manana } 39598407f553SFilipe Manana 39608407f553SFilipe Manana wait_event(ordered->wait, 39618407f553SFilipe 
Manana (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || 39628407f553SFilipe Manana test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 39638407f553SFilipe Manana 39648407f553SFilipe Manana if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3965b38ef71cSFilipe Manana /* 3966b38ef71cSFilipe Manana * Clear the AS_EIO/AS_ENOSPC flags from the inode's 3967b38ef71cSFilipe Manana * i_mapping flags, so that the next fsync won't get 3968b38ef71cSFilipe Manana * an outdated io error too. 3969b38ef71cSFilipe Manana */ 3970f0312210SMiklos Szeredi filemap_check_errors(inode->i_mapping); 39718407f553SFilipe Manana *ordered_io_error = true; 39728407f553SFilipe Manana break; 39738407f553SFilipe Manana } 39742ab28f32SJosef Bacik /* 39752ab28f32SJosef Bacik * We are going to copy all the csums on this ordered extent, so 39762ab28f32SJosef Bacik * go ahead and adjust mod_start and mod_len in case this 39772ab28f32SJosef Bacik * ordered extent has already been logged. 39782ab28f32SJosef Bacik */ 39792ab28f32SJosef Bacik if (ordered->file_offset > mod_start) { 39802ab28f32SJosef Bacik if (ordered->file_offset + ordered->len >= 39812ab28f32SJosef Bacik mod_start + mod_len) 39822ab28f32SJosef Bacik mod_len = ordered->file_offset - mod_start; 39832ab28f32SJosef Bacik /* 39842ab28f32SJosef Bacik * If we have this case 39852ab28f32SJosef Bacik * 39862ab28f32SJosef Bacik * |--------- logged extent ---------| 39872ab28f32SJosef Bacik * |----- ordered extent ----| 39882ab28f32SJosef Bacik * 39892ab28f32SJosef Bacik * Just don't mess with mod_start and mod_len, we'll 39902ab28f32SJosef Bacik * just end up logging more csums than we need and it 39912ab28f32SJosef Bacik * will be ok. 
39922ab28f32SJosef Bacik */ 39932ab28f32SJosef Bacik } else { 39942ab28f32SJosef Bacik if (ordered->file_offset + ordered->len < 39952ab28f32SJosef Bacik mod_start + mod_len) { 39962ab28f32SJosef Bacik mod_len = (mod_start + mod_len) - 39972ab28f32SJosef Bacik (ordered->file_offset + ordered->len); 39982ab28f32SJosef Bacik mod_start = ordered->file_offset + 39992ab28f32SJosef Bacik ordered->len; 40002ab28f32SJosef Bacik } else { 40012ab28f32SJosef Bacik mod_len = 0; 40022ab28f32SJosef Bacik } 40032ab28f32SJosef Bacik } 40042ab28f32SJosef Bacik 40058407f553SFilipe Manana if (skip_csum) 40068407f553SFilipe Manana continue; 40078407f553SFilipe Manana 40082ab28f32SJosef Bacik /* 40092ab28f32SJosef Bacik * To keep us from looping for the above case of an ordered 40102ab28f32SJosef Bacik * extent that falls inside of the logged extent. 40112ab28f32SJosef Bacik */ 40122ab28f32SJosef Bacik if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 40132ab28f32SJosef Bacik &ordered->flags)) 40142ab28f32SJosef Bacik continue; 40152ab28f32SJosef Bacik 40162ab28f32SJosef Bacik list_for_each_entry(sum, &ordered->list, list) { 40172ab28f32SJosef Bacik ret = btrfs_csum_file_blocks(trans, log, sum); 4018827463c4SMiao Xie if (ret) 40198407f553SFilipe Manana break; 40208407f553SFilipe Manana } 40212ab28f32SJosef Bacik } 40222ab28f32SJosef Bacik 40238407f553SFilipe Manana if (*ordered_io_error || !mod_len || ret || skip_csum) 40242ab28f32SJosef Bacik return ret; 40252ab28f32SJosef Bacik 4026488111aaSFilipe David Borba Manana if (em->compress_type) { 4027488111aaSFilipe David Borba Manana csum_offset = 0; 40288407f553SFilipe Manana csum_len = max(em->block_len, em->orig_block_len); 4029488111aaSFilipe David Borba Manana } else { 40302ab28f32SJosef Bacik csum_offset = mod_start - em->start; 40312ab28f32SJosef Bacik csum_len = mod_len; 4032488111aaSFilipe David Borba Manana } 40332ab28f32SJosef Bacik 403470c8a91cSJosef Bacik /* block start is already adjusted for the file extent offset. 
*/ 40350b246afaSJeff Mahoney ret = btrfs_lookup_csums_range(fs_info->csum_root, 403670c8a91cSJosef Bacik em->block_start + csum_offset, 403770c8a91cSJosef Bacik em->block_start + csum_offset + 403870c8a91cSJosef Bacik csum_len - 1, &ordered_sums, 0); 40395dc562c5SJosef Bacik if (ret) 40405dc562c5SJosef Bacik return ret; 404170c8a91cSJosef Bacik 404270c8a91cSJosef Bacik while (!list_empty(&ordered_sums)) { 404370c8a91cSJosef Bacik struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 404470c8a91cSJosef Bacik struct btrfs_ordered_sum, 404570c8a91cSJosef Bacik list); 404670c8a91cSJosef Bacik if (!ret) 404770c8a91cSJosef Bacik ret = btrfs_csum_file_blocks(trans, log, sums); 404870c8a91cSJosef Bacik list_del(&sums->list); 404970c8a91cSJosef Bacik kfree(sums); 40505dc562c5SJosef Bacik } 40515dc562c5SJosef Bacik 405270c8a91cSJosef Bacik return ret; 40535dc562c5SJosef Bacik } 40545dc562c5SJosef Bacik 40558407f553SFilipe Manana static int log_one_extent(struct btrfs_trans_handle *trans, 40568407f553SFilipe Manana struct inode *inode, struct btrfs_root *root, 40578407f553SFilipe Manana const struct extent_map *em, 40588407f553SFilipe Manana struct btrfs_path *path, 40598407f553SFilipe Manana const struct list_head *logged_list, 40608407f553SFilipe Manana struct btrfs_log_ctx *ctx) 40618407f553SFilipe Manana { 40628407f553SFilipe Manana struct btrfs_root *log = root->log_root; 40638407f553SFilipe Manana struct btrfs_file_extent_item *fi; 40648407f553SFilipe Manana struct extent_buffer *leaf; 40658407f553SFilipe Manana struct btrfs_map_token token; 40668407f553SFilipe Manana struct btrfs_key key; 40678407f553SFilipe Manana u64 extent_offset = em->start - em->orig_start; 40688407f553SFilipe Manana u64 block_len; 40698407f553SFilipe Manana int ret; 40708407f553SFilipe Manana int extent_inserted = 0; 40718407f553SFilipe Manana bool ordered_io_err = false; 40728407f553SFilipe Manana 40738407f553SFilipe Manana ret = wait_ordered_extents(trans, inode, root, em, logged_list, 
40748407f553SFilipe Manana &ordered_io_err); 40758407f553SFilipe Manana if (ret) 40768407f553SFilipe Manana return ret; 40778407f553SFilipe Manana 40788407f553SFilipe Manana if (ordered_io_err) { 40798407f553SFilipe Manana ctx->io_err = -EIO; 40808407f553SFilipe Manana return 0; 40818407f553SFilipe Manana } 40828407f553SFilipe Manana 40838407f553SFilipe Manana btrfs_init_map_token(&token); 40848407f553SFilipe Manana 40858407f553SFilipe Manana ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 40868407f553SFilipe Manana em->start + em->len, NULL, 0, 1, 40878407f553SFilipe Manana sizeof(*fi), &extent_inserted); 40888407f553SFilipe Manana if (ret) 40898407f553SFilipe Manana return ret; 40908407f553SFilipe Manana 40918407f553SFilipe Manana if (!extent_inserted) { 40924a0cc7caSNikolay Borisov key.objectid = btrfs_ino(BTRFS_I(inode)); 40938407f553SFilipe Manana key.type = BTRFS_EXTENT_DATA_KEY; 40948407f553SFilipe Manana key.offset = em->start; 40958407f553SFilipe Manana 40968407f553SFilipe Manana ret = btrfs_insert_empty_item(trans, log, path, &key, 40978407f553SFilipe Manana sizeof(*fi)); 40988407f553SFilipe Manana if (ret) 40998407f553SFilipe Manana return ret; 41008407f553SFilipe Manana } 41018407f553SFilipe Manana leaf = path->nodes[0]; 41028407f553SFilipe Manana fi = btrfs_item_ptr(leaf, path->slots[0], 41038407f553SFilipe Manana struct btrfs_file_extent_item); 41048407f553SFilipe Manana 410550d9aa99SJosef Bacik btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, 41068407f553SFilipe Manana &token); 41078407f553SFilipe Manana if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 41088407f553SFilipe Manana btrfs_set_token_file_extent_type(leaf, fi, 41098407f553SFilipe Manana BTRFS_FILE_EXTENT_PREALLOC, 41108407f553SFilipe Manana &token); 41118407f553SFilipe Manana else 41128407f553SFilipe Manana btrfs_set_token_file_extent_type(leaf, fi, 41138407f553SFilipe Manana BTRFS_FILE_EXTENT_REG, 41148407f553SFilipe Manana &token); 41158407f553SFilipe Manana 
41168407f553SFilipe Manana block_len = max(em->block_len, em->orig_block_len); 41178407f553SFilipe Manana if (em->compress_type != BTRFS_COMPRESS_NONE) { 41188407f553SFilipe Manana btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 41198407f553SFilipe Manana em->block_start, 41208407f553SFilipe Manana &token); 41218407f553SFilipe Manana btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 41228407f553SFilipe Manana &token); 41238407f553SFilipe Manana } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 41248407f553SFilipe Manana btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 41258407f553SFilipe Manana em->block_start - 41268407f553SFilipe Manana extent_offset, &token); 41278407f553SFilipe Manana btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 41288407f553SFilipe Manana &token); 41298407f553SFilipe Manana } else { 41308407f553SFilipe Manana btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 41318407f553SFilipe Manana btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 41328407f553SFilipe Manana &token); 41338407f553SFilipe Manana } 41348407f553SFilipe Manana 41358407f553SFilipe Manana btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); 41368407f553SFilipe Manana btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 41378407f553SFilipe Manana btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 41388407f553SFilipe Manana btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 41398407f553SFilipe Manana &token); 41408407f553SFilipe Manana btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 41418407f553SFilipe Manana btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 41428407f553SFilipe Manana btrfs_mark_buffer_dirty(leaf); 41438407f553SFilipe Manana 41448407f553SFilipe Manana btrfs_release_path(path); 41458407f553SFilipe Manana 41468407f553SFilipe Manana return ret; 41478407f553SFilipe Manana } 41488407f553SFilipe Manana 41495dc562c5SJosef 
Bacik static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 41505dc562c5SJosef Bacik struct btrfs_root *root, 41515dc562c5SJosef Bacik struct inode *inode, 4152827463c4SMiao Xie struct btrfs_path *path, 41538407f553SFilipe Manana struct list_head *logged_list, 4154de0ee0edSFilipe Manana struct btrfs_log_ctx *ctx, 4155de0ee0edSFilipe Manana const u64 start, 4156de0ee0edSFilipe Manana const u64 end) 41575dc562c5SJosef Bacik { 41585dc562c5SJosef Bacik struct extent_map *em, *n; 41595dc562c5SJosef Bacik struct list_head extents; 41605dc562c5SJosef Bacik struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 41615dc562c5SJosef Bacik u64 test_gen; 41625dc562c5SJosef Bacik int ret = 0; 41632ab28f32SJosef Bacik int num = 0; 41645dc562c5SJosef Bacik 41655dc562c5SJosef Bacik INIT_LIST_HEAD(&extents); 41665dc562c5SJosef Bacik 41675f9a8a51SFilipe Manana down_write(&BTRFS_I(inode)->dio_sem); 41685dc562c5SJosef Bacik write_lock(&tree->lock); 41695dc562c5SJosef Bacik test_gen = root->fs_info->last_trans_committed; 41705dc562c5SJosef Bacik 41715dc562c5SJosef Bacik list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 41725dc562c5SJosef Bacik list_del_init(&em->list); 41732ab28f32SJosef Bacik 41742ab28f32SJosef Bacik /* 41752ab28f32SJosef Bacik * Just an arbitrary number, this can be really CPU intensive 41762ab28f32SJosef Bacik * once we start getting a lot of extents, and really once we 41772ab28f32SJosef Bacik * have a bunch of extents we just want to commit since it will 41782ab28f32SJosef Bacik * be faster. 
41792ab28f32SJosef Bacik */ 41802ab28f32SJosef Bacik if (++num > 32768) { 41812ab28f32SJosef Bacik list_del_init(&tree->modified_extents); 41822ab28f32SJosef Bacik ret = -EFBIG; 41832ab28f32SJosef Bacik goto process; 41842ab28f32SJosef Bacik } 41852ab28f32SJosef Bacik 41865dc562c5SJosef Bacik if (em->generation <= test_gen) 41875dc562c5SJosef Bacik continue; 4188ff44c6e3SJosef Bacik /* Need a ref to keep it from getting evicted from cache */ 4189ff44c6e3SJosef Bacik atomic_inc(&em->refs); 4190ff44c6e3SJosef Bacik set_bit(EXTENT_FLAG_LOGGING, &em->flags); 41915dc562c5SJosef Bacik list_add_tail(&em->list, &extents); 41922ab28f32SJosef Bacik num++; 41935dc562c5SJosef Bacik } 41945dc562c5SJosef Bacik 41955dc562c5SJosef Bacik list_sort(NULL, &extents, extent_cmp); 4196de0ee0edSFilipe Manana btrfs_get_logged_extents(inode, logged_list, start, end); 41975f9a8a51SFilipe Manana /* 41985f9a8a51SFilipe Manana * Some ordered extents started by fsync might have completed 41995f9a8a51SFilipe Manana * before we could collect them into the list logged_list, which 42005f9a8a51SFilipe Manana * means they're gone, not in our logged_list nor in the inode's 42015f9a8a51SFilipe Manana * ordered tree. We want the application/user space to know an 42025f9a8a51SFilipe Manana * error happened while attempting to persist file data so that 42035f9a8a51SFilipe Manana * it can take proper action. If such error happened, we leave 42045f9a8a51SFilipe Manana * without writing to the log tree and the fsync must report the 42055f9a8a51SFilipe Manana * file data write error and not commit the current transaction. 
42065f9a8a51SFilipe Manana */ 4207f0312210SMiklos Szeredi ret = filemap_check_errors(inode->i_mapping); 42085f9a8a51SFilipe Manana if (ret) 42095f9a8a51SFilipe Manana ctx->io_err = ret; 42102ab28f32SJosef Bacik process: 42115dc562c5SJosef Bacik while (!list_empty(&extents)) { 42125dc562c5SJosef Bacik em = list_entry(extents.next, struct extent_map, list); 42135dc562c5SJosef Bacik 42145dc562c5SJosef Bacik list_del_init(&em->list); 42155dc562c5SJosef Bacik 42165dc562c5SJosef Bacik /* 42175dc562c5SJosef Bacik * If we had an error we just need to delete everybody from our 42185dc562c5SJosef Bacik * private list. 42195dc562c5SJosef Bacik */ 4220ff44c6e3SJosef Bacik if (ret) { 4221201a9038SJosef Bacik clear_em_logging(tree, em); 4222ff44c6e3SJosef Bacik free_extent_map(em); 42235dc562c5SJosef Bacik continue; 4224ff44c6e3SJosef Bacik } 4225ff44c6e3SJosef Bacik 4226ff44c6e3SJosef Bacik write_unlock(&tree->lock); 42275dc562c5SJosef Bacik 42288407f553SFilipe Manana ret = log_one_extent(trans, inode, root, em, path, logged_list, 42298407f553SFilipe Manana ctx); 4230ff44c6e3SJosef Bacik write_lock(&tree->lock); 4231201a9038SJosef Bacik clear_em_logging(tree, em); 4232201a9038SJosef Bacik free_extent_map(em); 42335dc562c5SJosef Bacik } 4234ff44c6e3SJosef Bacik WARN_ON(!list_empty(&extents)); 4235ff44c6e3SJosef Bacik write_unlock(&tree->lock); 42365f9a8a51SFilipe Manana up_write(&BTRFS_I(inode)->dio_sem); 42375dc562c5SJosef Bacik 42385dc562c5SJosef Bacik btrfs_release_path(path); 42395dc562c5SJosef Bacik return ret; 42405dc562c5SJosef Bacik } 42415dc562c5SJosef Bacik 4242481b01c0SNikolay Borisov static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 42431a4bcf47SFilipe Manana struct btrfs_path *path, u64 *size_ret) 42441a4bcf47SFilipe Manana { 42451a4bcf47SFilipe Manana struct btrfs_key key; 42461a4bcf47SFilipe Manana int ret; 42471a4bcf47SFilipe Manana 4248481b01c0SNikolay Borisov key.objectid = btrfs_ino(inode); 42491a4bcf47SFilipe Manana key.type = 
BTRFS_INODE_ITEM_KEY; 42501a4bcf47SFilipe Manana key.offset = 0; 42511a4bcf47SFilipe Manana 42521a4bcf47SFilipe Manana ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 42531a4bcf47SFilipe Manana if (ret < 0) { 42541a4bcf47SFilipe Manana return ret; 42551a4bcf47SFilipe Manana } else if (ret > 0) { 42562f2ff0eeSFilipe Manana *size_ret = 0; 42571a4bcf47SFilipe Manana } else { 42581a4bcf47SFilipe Manana struct btrfs_inode_item *item; 42591a4bcf47SFilipe Manana 42601a4bcf47SFilipe Manana item = btrfs_item_ptr(path->nodes[0], path->slots[0], 42611a4bcf47SFilipe Manana struct btrfs_inode_item); 42621a4bcf47SFilipe Manana *size_ret = btrfs_inode_size(path->nodes[0], item); 42631a4bcf47SFilipe Manana } 42641a4bcf47SFilipe Manana 42651a4bcf47SFilipe Manana btrfs_release_path(path); 42661a4bcf47SFilipe Manana return 0; 42671a4bcf47SFilipe Manana } 42681a4bcf47SFilipe Manana 426936283bf7SFilipe Manana /* 427036283bf7SFilipe Manana * At the moment we always log all xattrs. This is to figure out at log replay 427136283bf7SFilipe Manana * time which xattrs must have their deletion replayed. If a xattr is missing 427236283bf7SFilipe Manana * in the log tree and exists in the fs/subvol tree, we delete it. This is 427336283bf7SFilipe Manana * because if a xattr is deleted, the inode is fsynced and a power failure 427436283bf7SFilipe Manana * happens, causing the log to be replayed the next time the fs is mounted, 427536283bf7SFilipe Manana * we want the xattr to not exist anymore (same behaviour as other filesystems 427636283bf7SFilipe Manana * with a journal, ext3/4, xfs, f2fs, etc). 
427736283bf7SFilipe Manana */ 427836283bf7SFilipe Manana static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 427936283bf7SFilipe Manana struct btrfs_root *root, 4280*1a93c36aSNikolay Borisov struct btrfs_inode *inode, 428136283bf7SFilipe Manana struct btrfs_path *path, 428236283bf7SFilipe Manana struct btrfs_path *dst_path) 428336283bf7SFilipe Manana { 428436283bf7SFilipe Manana int ret; 428536283bf7SFilipe Manana struct btrfs_key key; 4286*1a93c36aSNikolay Borisov const u64 ino = btrfs_ino(inode); 428736283bf7SFilipe Manana int ins_nr = 0; 428836283bf7SFilipe Manana int start_slot = 0; 428936283bf7SFilipe Manana 429036283bf7SFilipe Manana key.objectid = ino; 429136283bf7SFilipe Manana key.type = BTRFS_XATTR_ITEM_KEY; 429236283bf7SFilipe Manana key.offset = 0; 429336283bf7SFilipe Manana 429436283bf7SFilipe Manana ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 429536283bf7SFilipe Manana if (ret < 0) 429636283bf7SFilipe Manana return ret; 429736283bf7SFilipe Manana 429836283bf7SFilipe Manana while (true) { 429936283bf7SFilipe Manana int slot = path->slots[0]; 430036283bf7SFilipe Manana struct extent_buffer *leaf = path->nodes[0]; 430136283bf7SFilipe Manana int nritems = btrfs_header_nritems(leaf); 430236283bf7SFilipe Manana 430336283bf7SFilipe Manana if (slot >= nritems) { 430436283bf7SFilipe Manana if (ins_nr > 0) { 430536283bf7SFilipe Manana u64 last_extent = 0; 430636283bf7SFilipe Manana 4307*1a93c36aSNikolay Borisov ret = copy_items(trans, inode, dst_path, path, 430836283bf7SFilipe Manana &last_extent, start_slot, 430936283bf7SFilipe Manana ins_nr, 1, 0); 431036283bf7SFilipe Manana /* can't be 1, extent items aren't processed */ 431136283bf7SFilipe Manana ASSERT(ret <= 0); 431236283bf7SFilipe Manana if (ret < 0) 431336283bf7SFilipe Manana return ret; 431436283bf7SFilipe Manana ins_nr = 0; 431536283bf7SFilipe Manana } 431636283bf7SFilipe Manana ret = btrfs_next_leaf(root, path); 431736283bf7SFilipe Manana if (ret < 0) 431836283bf7SFilipe 
Manana return ret; 431936283bf7SFilipe Manana else if (ret > 0) 432036283bf7SFilipe Manana break; 432136283bf7SFilipe Manana continue; 432236283bf7SFilipe Manana } 432336283bf7SFilipe Manana 432436283bf7SFilipe Manana btrfs_item_key_to_cpu(leaf, &key, slot); 432536283bf7SFilipe Manana if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 432636283bf7SFilipe Manana break; 432736283bf7SFilipe Manana 432836283bf7SFilipe Manana if (ins_nr == 0) 432936283bf7SFilipe Manana start_slot = slot; 433036283bf7SFilipe Manana ins_nr++; 433136283bf7SFilipe Manana path->slots[0]++; 433236283bf7SFilipe Manana cond_resched(); 433336283bf7SFilipe Manana } 433436283bf7SFilipe Manana if (ins_nr > 0) { 433536283bf7SFilipe Manana u64 last_extent = 0; 433636283bf7SFilipe Manana 4337*1a93c36aSNikolay Borisov ret = copy_items(trans, inode, dst_path, path, 433836283bf7SFilipe Manana &last_extent, start_slot, 433936283bf7SFilipe Manana ins_nr, 1, 0); 434036283bf7SFilipe Manana /* can't be 1, extent items aren't processed */ 434136283bf7SFilipe Manana ASSERT(ret <= 0); 434236283bf7SFilipe Manana if (ret < 0) 434336283bf7SFilipe Manana return ret; 434436283bf7SFilipe Manana } 434536283bf7SFilipe Manana 434636283bf7SFilipe Manana return 0; 434736283bf7SFilipe Manana } 434836283bf7SFilipe Manana 4349a89ca6f2SFilipe Manana /* 4350a89ca6f2SFilipe Manana * If the no holes feature is enabled we need to make sure any hole between the 4351a89ca6f2SFilipe Manana * last extent and the i_size of our inode is explicitly marked in the log. 
This 4352a89ca6f2SFilipe Manana * is to make sure that doing something like: 4353a89ca6f2SFilipe Manana * 4354a89ca6f2SFilipe Manana * 1) create file with 128Kb of data 4355a89ca6f2SFilipe Manana * 2) truncate file to 64Kb 4356a89ca6f2SFilipe Manana * 3) truncate file to 256Kb 4357a89ca6f2SFilipe Manana * 4) fsync file 4358a89ca6f2SFilipe Manana * 5) <crash/power failure> 4359a89ca6f2SFilipe Manana * 6) mount fs and trigger log replay 4360a89ca6f2SFilipe Manana * 4361a89ca6f2SFilipe Manana * Will give us a file with a size of 256Kb, the first 64Kb of data match what 4362a89ca6f2SFilipe Manana * the file had in its first 64Kb of data at step 1 and the last 192Kb of the 4363a89ca6f2SFilipe Manana * file correspond to a hole. The presence of explicit holes in a log tree is 4364a89ca6f2SFilipe Manana * what guarantees that log replay will remove/adjust file extent items in the 4365a89ca6f2SFilipe Manana * fs/subvol tree. 4366a89ca6f2SFilipe Manana * 4367a89ca6f2SFilipe Manana * Here we do not need to care about holes between extents, that is already done 4368a89ca6f2SFilipe Manana * by copy_items(). We also only need to do this in the full sync path, where we 4369a89ca6f2SFilipe Manana * lookup for extents from the fs/subvol tree only. In the fast path case, we 4370a89ca6f2SFilipe Manana * lookup the list of modified extent maps and if any represents a hole, we 4371a89ca6f2SFilipe Manana * insert a corresponding extent representing a hole in the log tree. 
4372a89ca6f2SFilipe Manana */ 4373a89ca6f2SFilipe Manana static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, 4374a89ca6f2SFilipe Manana struct btrfs_root *root, 4375a89ca6f2SFilipe Manana struct inode *inode, 4376a89ca6f2SFilipe Manana struct btrfs_path *path) 4377a89ca6f2SFilipe Manana { 43780b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 4379a89ca6f2SFilipe Manana int ret; 4380a89ca6f2SFilipe Manana struct btrfs_key key; 4381a89ca6f2SFilipe Manana u64 hole_start; 4382a89ca6f2SFilipe Manana u64 hole_size; 4383a89ca6f2SFilipe Manana struct extent_buffer *leaf; 4384a89ca6f2SFilipe Manana struct btrfs_root *log = root->log_root; 43854a0cc7caSNikolay Borisov const u64 ino = btrfs_ino(BTRFS_I(inode)); 4386a89ca6f2SFilipe Manana const u64 i_size = i_size_read(inode); 4387a89ca6f2SFilipe Manana 43880b246afaSJeff Mahoney if (!btrfs_fs_incompat(fs_info, NO_HOLES)) 4389a89ca6f2SFilipe Manana return 0; 4390a89ca6f2SFilipe Manana 4391a89ca6f2SFilipe Manana key.objectid = ino; 4392a89ca6f2SFilipe Manana key.type = BTRFS_EXTENT_DATA_KEY; 4393a89ca6f2SFilipe Manana key.offset = (u64)-1; 4394a89ca6f2SFilipe Manana 4395a89ca6f2SFilipe Manana ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4396a89ca6f2SFilipe Manana ASSERT(ret != 0); 4397a89ca6f2SFilipe Manana if (ret < 0) 4398a89ca6f2SFilipe Manana return ret; 4399a89ca6f2SFilipe Manana 4400a89ca6f2SFilipe Manana ASSERT(path->slots[0] > 0); 4401a89ca6f2SFilipe Manana path->slots[0]--; 4402a89ca6f2SFilipe Manana leaf = path->nodes[0]; 4403a89ca6f2SFilipe Manana btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4404a89ca6f2SFilipe Manana 4405a89ca6f2SFilipe Manana if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 4406a89ca6f2SFilipe Manana /* inode does not have any extents */ 4407a89ca6f2SFilipe Manana hole_start = 0; 4408a89ca6f2SFilipe Manana hole_size = i_size; 4409a89ca6f2SFilipe Manana } else { 4410a89ca6f2SFilipe Manana struct btrfs_file_extent_item *extent; 
4411a89ca6f2SFilipe Manana u64 len; 4412a89ca6f2SFilipe Manana 4413a89ca6f2SFilipe Manana /* 4414a89ca6f2SFilipe Manana * If there's an extent beyond i_size, an explicit hole was 4415a89ca6f2SFilipe Manana * already inserted by copy_items(). 4416a89ca6f2SFilipe Manana */ 4417a89ca6f2SFilipe Manana if (key.offset >= i_size) 4418a89ca6f2SFilipe Manana return 0; 4419a89ca6f2SFilipe Manana 4420a89ca6f2SFilipe Manana extent = btrfs_item_ptr(leaf, path->slots[0], 4421a89ca6f2SFilipe Manana struct btrfs_file_extent_item); 4422a89ca6f2SFilipe Manana 4423a89ca6f2SFilipe Manana if (btrfs_file_extent_type(leaf, extent) == 4424a89ca6f2SFilipe Manana BTRFS_FILE_EXTENT_INLINE) { 4425a89ca6f2SFilipe Manana len = btrfs_file_extent_inline_len(leaf, 4426a89ca6f2SFilipe Manana path->slots[0], 4427a89ca6f2SFilipe Manana extent); 4428a89ca6f2SFilipe Manana ASSERT(len == i_size); 4429a89ca6f2SFilipe Manana return 0; 4430a89ca6f2SFilipe Manana } 4431a89ca6f2SFilipe Manana 4432a89ca6f2SFilipe Manana len = btrfs_file_extent_num_bytes(leaf, extent); 4433a89ca6f2SFilipe Manana /* Last extent goes beyond i_size, no need to log a hole. */ 4434a89ca6f2SFilipe Manana if (key.offset + len > i_size) 4435a89ca6f2SFilipe Manana return 0; 4436a89ca6f2SFilipe Manana hole_start = key.offset + len; 4437a89ca6f2SFilipe Manana hole_size = i_size - hole_start; 4438a89ca6f2SFilipe Manana } 4439a89ca6f2SFilipe Manana btrfs_release_path(path); 4440a89ca6f2SFilipe Manana 4441a89ca6f2SFilipe Manana /* Last extent ends at i_size. 
*/ 4442a89ca6f2SFilipe Manana if (hole_size == 0) 4443a89ca6f2SFilipe Manana return 0; 4444a89ca6f2SFilipe Manana 44450b246afaSJeff Mahoney hole_size = ALIGN(hole_size, fs_info->sectorsize); 4446a89ca6f2SFilipe Manana ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, 4447a89ca6f2SFilipe Manana hole_size, 0, hole_size, 0, 0, 0); 4448a89ca6f2SFilipe Manana return ret; 4449a89ca6f2SFilipe Manana } 4450a89ca6f2SFilipe Manana 445156f23fdbSFilipe Manana /* 445256f23fdbSFilipe Manana * When we are logging a new inode X, check if it doesn't have a reference that 445356f23fdbSFilipe Manana * matches the reference from some other inode Y created in a past transaction 445456f23fdbSFilipe Manana * and that was renamed in the current transaction. If we don't do this, then at 445556f23fdbSFilipe Manana * log replay time we can lose inode Y (and all its files if it's a directory): 445656f23fdbSFilipe Manana * 445756f23fdbSFilipe Manana * mkdir /mnt/x 445856f23fdbSFilipe Manana * echo "hello world" > /mnt/x/foobar 445956f23fdbSFilipe Manana * sync 446056f23fdbSFilipe Manana * mv /mnt/x /mnt/y 446156f23fdbSFilipe Manana * mkdir /mnt/x # or touch /mnt/x 446256f23fdbSFilipe Manana * xfs_io -c fsync /mnt/x 446356f23fdbSFilipe Manana * <power fail> 446456f23fdbSFilipe Manana * mount fs, trigger log replay 446556f23fdbSFilipe Manana * 446656f23fdbSFilipe Manana * After the log replay procedure, we would lose the first directory and all its 446756f23fdbSFilipe Manana * files (file foobar). 
446856f23fdbSFilipe Manana * For the case where inode Y is not a directory we simply end up losing it: 446956f23fdbSFilipe Manana * 447056f23fdbSFilipe Manana * echo "123" > /mnt/foo 447156f23fdbSFilipe Manana * sync 447256f23fdbSFilipe Manana * mv /mnt/foo /mnt/bar 447356f23fdbSFilipe Manana * echo "abc" > /mnt/foo 447456f23fdbSFilipe Manana * xfs_io -c fsync /mnt/foo 447556f23fdbSFilipe Manana * <power fail> 447656f23fdbSFilipe Manana * 447756f23fdbSFilipe Manana * We also need this for cases where a snapshot entry is replaced by some other 447856f23fdbSFilipe Manana * entry (file or directory) otherwise we end up with an unreplayable log due to 447956f23fdbSFilipe Manana * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 448056f23fdbSFilipe Manana * if it were a regular entry: 448156f23fdbSFilipe Manana * 448256f23fdbSFilipe Manana * mkdir /mnt/x 448356f23fdbSFilipe Manana * btrfs subvolume snapshot /mnt /mnt/x/snap 448456f23fdbSFilipe Manana * btrfs subvolume delete /mnt/x/snap 448556f23fdbSFilipe Manana * rmdir /mnt/x 448656f23fdbSFilipe Manana * mkdir /mnt/x 448756f23fdbSFilipe Manana * fsync /mnt/x or fsync some new file inside it 448856f23fdbSFilipe Manana * <power fail> 448956f23fdbSFilipe Manana * 449056f23fdbSFilipe Manana * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 449156f23fdbSFilipe Manana * the same transaction. 
449256f23fdbSFilipe Manana */ 449356f23fdbSFilipe Manana static int btrfs_check_ref_name_override(struct extent_buffer *eb, 449456f23fdbSFilipe Manana const int slot, 449556f23fdbSFilipe Manana const struct btrfs_key *key, 44964791c8f1SNikolay Borisov struct btrfs_inode *inode, 449744f714daSFilipe Manana u64 *other_ino) 449856f23fdbSFilipe Manana { 449956f23fdbSFilipe Manana int ret; 450056f23fdbSFilipe Manana struct btrfs_path *search_path; 450156f23fdbSFilipe Manana char *name = NULL; 450256f23fdbSFilipe Manana u32 name_len = 0; 450356f23fdbSFilipe Manana u32 item_size = btrfs_item_size_nr(eb, slot); 450456f23fdbSFilipe Manana u32 cur_offset = 0; 450556f23fdbSFilipe Manana unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 450656f23fdbSFilipe Manana 450756f23fdbSFilipe Manana search_path = btrfs_alloc_path(); 450856f23fdbSFilipe Manana if (!search_path) 450956f23fdbSFilipe Manana return -ENOMEM; 451056f23fdbSFilipe Manana search_path->search_commit_root = 1; 451156f23fdbSFilipe Manana search_path->skip_locking = 1; 451256f23fdbSFilipe Manana 451356f23fdbSFilipe Manana while (cur_offset < item_size) { 451456f23fdbSFilipe Manana u64 parent; 451556f23fdbSFilipe Manana u32 this_name_len; 451656f23fdbSFilipe Manana u32 this_len; 451756f23fdbSFilipe Manana unsigned long name_ptr; 451856f23fdbSFilipe Manana struct btrfs_dir_item *di; 451956f23fdbSFilipe Manana 452056f23fdbSFilipe Manana if (key->type == BTRFS_INODE_REF_KEY) { 452156f23fdbSFilipe Manana struct btrfs_inode_ref *iref; 452256f23fdbSFilipe Manana 452356f23fdbSFilipe Manana iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 452456f23fdbSFilipe Manana parent = key->offset; 452556f23fdbSFilipe Manana this_name_len = btrfs_inode_ref_name_len(eb, iref); 452656f23fdbSFilipe Manana name_ptr = (unsigned long)(iref + 1); 452756f23fdbSFilipe Manana this_len = sizeof(*iref) + this_name_len; 452856f23fdbSFilipe Manana } else { 452956f23fdbSFilipe Manana struct btrfs_inode_extref *extref; 453056f23fdbSFilipe Manana 
453156f23fdbSFilipe Manana extref = (struct btrfs_inode_extref *)(ptr + 453256f23fdbSFilipe Manana cur_offset); 453356f23fdbSFilipe Manana parent = btrfs_inode_extref_parent(eb, extref); 453456f23fdbSFilipe Manana this_name_len = btrfs_inode_extref_name_len(eb, extref); 453556f23fdbSFilipe Manana name_ptr = (unsigned long)&extref->name; 453656f23fdbSFilipe Manana this_len = sizeof(*extref) + this_name_len; 453756f23fdbSFilipe Manana } 453856f23fdbSFilipe Manana 453956f23fdbSFilipe Manana if (this_name_len > name_len) { 454056f23fdbSFilipe Manana char *new_name; 454156f23fdbSFilipe Manana 454256f23fdbSFilipe Manana new_name = krealloc(name, this_name_len, GFP_NOFS); 454356f23fdbSFilipe Manana if (!new_name) { 454456f23fdbSFilipe Manana ret = -ENOMEM; 454556f23fdbSFilipe Manana goto out; 454656f23fdbSFilipe Manana } 454756f23fdbSFilipe Manana name_len = this_name_len; 454856f23fdbSFilipe Manana name = new_name; 454956f23fdbSFilipe Manana } 455056f23fdbSFilipe Manana 455156f23fdbSFilipe Manana read_extent_buffer(eb, name, name_ptr, this_name_len); 45524791c8f1SNikolay Borisov di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 45534791c8f1SNikolay Borisov parent, name, this_name_len, 0); 455456f23fdbSFilipe Manana if (di && !IS_ERR(di)) { 455544f714daSFilipe Manana struct btrfs_key di_key; 455644f714daSFilipe Manana 455744f714daSFilipe Manana btrfs_dir_item_key_to_cpu(search_path->nodes[0], 455844f714daSFilipe Manana di, &di_key); 455944f714daSFilipe Manana if (di_key.type == BTRFS_INODE_ITEM_KEY) { 456056f23fdbSFilipe Manana ret = 1; 456144f714daSFilipe Manana *other_ino = di_key.objectid; 456244f714daSFilipe Manana } else { 456344f714daSFilipe Manana ret = -EAGAIN; 456444f714daSFilipe Manana } 456556f23fdbSFilipe Manana goto out; 456656f23fdbSFilipe Manana } else if (IS_ERR(di)) { 456756f23fdbSFilipe Manana ret = PTR_ERR(di); 456856f23fdbSFilipe Manana goto out; 456956f23fdbSFilipe Manana } 457056f23fdbSFilipe Manana btrfs_release_path(search_path); 
457156f23fdbSFilipe Manana 457256f23fdbSFilipe Manana cur_offset += this_len; 457356f23fdbSFilipe Manana } 457456f23fdbSFilipe Manana ret = 0; 457556f23fdbSFilipe Manana out: 457656f23fdbSFilipe Manana btrfs_free_path(search_path); 457756f23fdbSFilipe Manana kfree(name); 457856f23fdbSFilipe Manana return ret; 457956f23fdbSFilipe Manana } 458056f23fdbSFilipe Manana 4581e02119d5SChris Mason /* log a single inode in the tree log. 4582e02119d5SChris Mason * At least one parent directory for this inode must exist in the tree 4583e02119d5SChris Mason * or be logged already. 4584e02119d5SChris Mason * 4585e02119d5SChris Mason * Any items from this inode changed by the current transaction are copied 4586e02119d5SChris Mason * to the log tree. An extra reference is taken on any extents in this 4587e02119d5SChris Mason * file, allowing us to avoid a whole pile of corner cases around logging 4588e02119d5SChris Mason * blocks that have been removed from the tree. 4589e02119d5SChris Mason * 4590e02119d5SChris Mason * See LOG_INODE_ALL and related defines for a description of what inode_only 4591e02119d5SChris Mason * does. 4592e02119d5SChris Mason * 4593e02119d5SChris Mason * This handles both files and directories. 
 *
 * Arguments, as used by the code below:
 *   trans      - transaction handle; trans->transid is passed to
 *                btrfs_search_forward() and stored in the inode's logged_trans
 *                once logging succeeded
 *   root       - fs/subvolume root the items are read from; they are written
 *                to root->log_root
 *   inode      - the inode to log
 *   inode_only - LOG_INODE_ALL, LOG_INODE_EXISTS or LOG_OTHER_INODE.
 *                LOG_OTHER_INODE is turned into LOG_INODE_EXISTS after the
 *                inode's log_mutex was taken with mutex_lock_nested()
 *   start, end - range handed to btrfs_log_changed_extents() and used to
 *                decide which modified extent maps may be dropped
 *   ctx        - log context passed through to the helpers and to recursive
 *                calls; the name-conflict handling is skipped when it is NULL
 *
 * Returns 0 on success, otherwise the error reported by the step that failed.
 */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_path *dst_path;
	struct btrfs_key min_key;
	struct btrfs_key max_key;
	struct btrfs_root *log = root->log_root;
	struct extent_buffer *src = NULL;
	LIST_HEAD(logged_list);
	u64 last_extent = 0;
	int err = 0;
	int ret;
	int nritems;
	int ins_start_slot = 0;
	int ins_nr;
	bool fast_search = false;
	u64 ino = btrfs_ino(BTRFS_I(inode));
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 logged_isize = 0;
	bool need_log_inode_item = true;

	/* path walks the fs/subvol tree, dst_path is handed to the copy helpers */
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	dst_path = btrfs_alloc_path();
	if (!dst_path) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	/* [min_key, max_key] is the key range of this inode that gets scanned */
	min_key.objectid = ino;
	min_key.type = BTRFS_INODE_ITEM_KEY;
	min_key.offset = 0;

	max_key.objectid = ino;


	/* today the code can only do partial logging of directories */
	if (S_ISDIR(inode->i_mode) ||
	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
		       &BTRFS_I(inode)->runtime_flags) &&
	     inode_only >= LOG_INODE_EXISTS))
		max_key.type = BTRFS_XATTR_ITEM_KEY;
	else
		max_key.type = (u8)-1;
	max_key.offset = (u64)-1;

	/*
	 * Only run delayed items if we are a dir or a new file.
	 * Otherwise commit the delayed inode only, which is needed in
	 * order for the log replay code to mark inodes for link count
	 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
	 */
	if (S_ISDIR(inode->i_mode) ||
	    BTRFS_I(inode)->generation > fs_info->last_trans_committed)
		ret = btrfs_commit_inode_delayed_items(trans, BTRFS_I(inode));
	else
		ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));

	/* the log_mutex is not held yet, so this failure path frees by hand */
	if (ret) {
		btrfs_free_path(path);
		btrfs_free_path(dst_path);
		return ret;
	}

	/*
	 * LOG_OTHER_INODE is what the recursive call further down passes in.
	 * That call is made while the outer inode's log_mutex is still held,
	 * hence the nested lock annotation here.
	 */
	if (inode_only == LOG_OTHER_INODE) {
		inode_only = LOG_INODE_EXISTS;
		mutex_lock_nested(&BTRFS_I(inode)->log_mutex,
				  SINGLE_DEPTH_NESTING);
	} else {
		mutex_lock(&BTRFS_I(inode)->log_mutex);
	}

	/*
	 * a brute force approach to making sure we get the most uptodate
	 * copies of everything.
	 */
	if (S_ISDIR(inode->i_mode)) {
		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;

		if (inode_only == LOG_INODE_EXISTS)
			max_key_type = BTRFS_XATTR_ITEM_KEY;
		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
	} else {
		if (inode_only == LOG_INODE_EXISTS) {
			/*
			 * Make sure the new inode item we write to the log has
			 * the same isize as the current one (if it exists).
			 * This is necessary to prevent data loss after log
			 * replay, and also to prevent doing a wrong expanding
			 * truncate - for e.g. create file, write 4K into offset
			 * 0, fsync, write 4K into offset 4096, add hard link,
			 * fsync some other file (to sync log), power fail - if
			 * we use the inode's current i_size, after log replay
			 * we get a 8Kb file, with the last 4Kb extent as a hole
			 * (zeroes), as if an expanding truncate happened,
			 * instead of getting a file of 4Kb only.
			 */
			err = logged_inode_size(log, BTRFS_I(inode), path,
						&logged_isize);
			if (err)
				goto out_unlock;
		}
		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			     &BTRFS_I(inode)->runtime_flags)) {
			if (inode_only == LOG_INODE_EXISTS) {
				max_key.type = BTRFS_XATTR_ITEM_KEY;
				ret = drop_objectid_items(trans, log, path, ino,
							  max_key.type);
			} else {
				/*
				 * Full sync of everything: both flags are
				 * cleared and the inode's items in the log
				 * are truncated, retrying while the helper
				 * returns -EAGAIN.
				 */
				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
					  &BTRFS_I(inode)->runtime_flags);
				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
					  &BTRFS_I(inode)->runtime_flags);
				while(1) {
					ret = btrfs_truncate_inode_items(trans,
							 log, inode, 0, 0);
					if (ret != -EAGAIN)
						break;
				}
			}
		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
					      &BTRFS_I(inode)->runtime_flags) ||
			   inode_only == LOG_INODE_EXISTS) {
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			max_key.type = BTRFS_XATTR_ITEM_KEY;
			ret = drop_objectid_items(trans, log, path, ino,
						  max_key.type);
		} else {
			/* nothing to copy from the tree, go straight to the extents */
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			goto log_extents;
		}

	}
	if (ret) {
		err = ret;
		goto out_unlock;
	}

	/*
	 * Walk the items of this inode in the fs/subvol tree. Runs of
	 * consecutive slots are remembered in ins_start_slot/ins_nr and handed
	 * to copy_items() in one call.
	 */
	while (1) {
		ins_nr = 0;
		ret = btrfs_search_forward(root, &min_key,
					   path, trans->transid);
		if (ret < 0) {
			err = ret;
			goto out_unlock;
		}
		if (ret != 0)
			break;
again:
		/* note, ins_nr might be > 0 here, cleanup outside the loop */
		if (min_key.objectid != ino)
			break;
		if (min_key.type > max_key.type)
			break;

		/* the inode item is part of what gets copied, no need to log it again */
		if (min_key.type == BTRFS_INODE_ITEM_KEY)
			need_log_inode_item = false;

		if ((min_key.type == BTRFS_INODE_REF_KEY ||
		     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
		    BTRFS_I(inode)->generation == trans->transid) {
			u64 other_ino = 0;

			ret = btrfs_check_ref_name_override(path->nodes[0],
					path->slots[0],
					&min_key, BTRFS_I(inode),
					&other_ino);
			if (ret < 0) {
				err = ret;
				goto out_unlock;
			} else if (ret > 0 && ctx &&
				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
				struct btrfs_key inode_key;
				struct inode *other_inode;

				/*
				 * Copy the pending batch, including the
				 * current slot, before the path is released
				 * to look up the other inode.
				 */
				if (ins_nr > 0) {
					ins_nr++;
				} else {
					ins_nr = 1;
					ins_start_slot = path->slots[0];
				}
				ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
						 &last_extent, ins_start_slot,
						 ins_nr, inode_only,
						 logged_isize);
				if (ret < 0) {
					err = ret;
					goto out_unlock;
				}
				ins_nr = 0;
				btrfs_release_path(path);
				inode_key.objectid = other_ino;
				inode_key.type = BTRFS_INODE_ITEM_KEY;
				inode_key.offset = 0;
				other_inode = btrfs_iget(fs_info->sb,
							 &inode_key, root,
							 NULL);
				/*
				 * If the other inode that had a conflicting dir
				 * entry was deleted in the current transaction,
				 * we don't need to do more work nor fallback to
				 * a transaction commit.
				 */
				if (IS_ERR(other_inode) &&
				    PTR_ERR(other_inode) == -ENOENT) {
					goto next_key;
				} else if (IS_ERR(other_inode)) {
					err = PTR_ERR(other_inode);
					goto out_unlock;
				}
				/*
				 * We are safe logging the other inode without
				 * acquiring its i_mutex as long as we log with
				 * the LOG_INODE_EXISTS mode. We're safe against
				 * concurrent renames of the other inode as well
				 * because during a rename we pin the log and
				 * update the log with the new name before we
				 * unpin it.
				 */
				err = btrfs_log_inode(trans, root, other_inode,
						      LOG_OTHER_INODE,
						      0, LLONG_MAX, ctx);
				iput(other_inode);
				if (err)
					goto out_unlock;
				else
					goto next_key;
			}
		}

		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
		if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
			if (ins_nr == 0)
				goto next_slot;
			ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
					 &last_extent, ins_start_slot,
					 ins_nr, inode_only, logged_isize);
			if (ret < 0) {
				err = ret;
				goto out_unlock;
			}
			ins_nr = 0;
			/* positive return: drop the path and search again from min_key */
			if (ret) {
				btrfs_release_path(path);
				continue;
			}
			goto next_slot;
		}

		/* NOTE(review): src is assigned here but not read anywhere else in this function */
		src = path->nodes[0];
		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
			/* current slot directly follows the batch: extend it */
			ins_nr++;
			goto next_slot;
		} else if (!ins_nr) {
			/* no batch open: start one at the current slot */
			ins_start_slot = path->slots[0];
			ins_nr = 1;
			goto next_slot;
		}

		/* current slot is not adjacent to the batch: flush it, then start a new one */
		ret = copy_items(trans, BTRFS_I(inode), dst_path, path, &last_extent,
				 ins_start_slot, ins_nr, inode_only,
				 logged_isize);
		if (ret < 0) {
			err = ret;
			goto out_unlock;
		}
		if (ret) {
			ins_nr = 0;
			btrfs_release_path(path);
			continue;
		}
		ins_nr = 1;
		ins_start_slot = path->slots[0];
next_slot:

		/* move to the next slot of this leaf, if there is one */
		nritems = btrfs_header_nritems(path->nodes[0]);
		path->slots[0]++;
		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
					      path->slots[0]);
			goto again;
		}
		/* end of the leaf: flush the batch before the path is released */
		if (ins_nr) {
			ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
					 &last_extent, ins_start_slot,
					 ins_nr, inode_only, logged_isize);
			if (ret < 0) {
				err = ret;
				goto out_unlock;
			}
			ret = 0;
			ins_nr = 0;
		}
		btrfs_release_path(path);
next_key:
		/* advance min_key just past the last key seen, or stop at max_key */
		if (min_key.offset < (u64)-1) {
			min_key.offset++;
		} else if (min_key.type < max_key.type) {
			min_key.type++;
			min_key.offset = 0;
		} else {
			break;
		}
	}
	/* the loop can be left with a batch still pending (see the note at "again") */
	if (ins_nr) {
		ret = copy_items(trans, BTRFS_I(inode), dst_path, path, &last_extent,
				 ins_start_slot, ins_nr, inode_only,
				 logged_isize);
		if (ret < 0) {
			err = ret;
			goto out_unlock;
		}
		ret = 0;
		ins_nr = 0;
	}

	btrfs_release_path(path);
	btrfs_release_path(dst_path);
	err = btrfs_log_all_xattrs(trans, root, BTRFS_I(inode), path, dst_path);
	if (err)
		goto out_unlock;
	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
		btrfs_release_path(path);
		btrfs_release_path(dst_path);
		err = btrfs_log_trailing_hole(trans, root, inode, path);
		if (err)
			goto out_unlock;
	}
log_extents:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);
	/* the scan above did not come across the inode item, so log it explicitly */
	if (need_log_inode_item) {
		err = log_inode_item(trans, log, dst_path, inode);
		if (err)
			goto out_unlock;
	}
	if (fast_search) {
		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
						&logged_list, ctx, start, end);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	} else if (inode_only == LOG_INODE_ALL) {
		struct extent_map *em, *n;

		write_lock(&em_tree->lock);
		/*
		 * We can't just remove every em if we're called for a ranged
		 * fsync - that is, one that doesn't cover the whole possible
		 * file range (0 to LLONG_MAX). This is because we can have
		 * em's that fall outside the range we're logging and therefore
		 * their ordered operations haven't completed yet
		 * (btrfs_finish_ordered_io() not invoked yet). This means we
		 * didn't get their respective file extent item in the fs/subvol
		 * tree yet, and need to let the next fast fsync (one which
		 * consults the list of modified extent maps) find the em so
		 * that it logs a matching file extent item and waits for the
		 * respective ordered operation to complete (if it's still
		 * running).
		 *
		 * Removing every em outside the range we're logging would make
		 * the next fast fsync not log their matching file extent items,
		 * therefore making us lose data after a log replay.
		 */
		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
					 list) {
			const u64 mod_end = em->mod_start + em->mod_len - 1;

			if (em->mod_start >= start && mod_end <= end)
				list_del_init(&em->list);
		}
		write_unlock(&em_tree->lock);
	}

	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
		ret = log_directory_changes(trans, root, inode, path, dst_path,
					    ctx);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	}

	/* success: record in the inode that it was logged in this transaction */
	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->logged_trans = trans->transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
	spin_unlock(&BTRFS_I(inode)->lock);
out_unlock:
	/* extents collected on logged_list are dropped on error, submitted otherwise */
	if (unlikely(err))
		btrfs_put_logged_extents(&logged_list);
	else
		btrfs_submit_logged_extents(&logged_list, log);
	mutex_unlock(&BTRFS_I(inode)->log_mutex);

	btrfs_free_path(path);
	btrfs_free_path(dst_path);
	return err;
}
5003e02119d5SChris Mason 500412fcfd22SChris Mason /* 50052be63d5cSFilipe Manana * Check if we must fallback to a transaction commit when logging an inode. 50062be63d5cSFilipe Manana * This must be called after logging the inode and is used only in the context 50072be63d5cSFilipe Manana * when fsyncing an inode requires the need to log some other inode - in which 50082be63d5cSFilipe Manana * case we can't lock the i_mutex of each other inode we need to log as that 50092be63d5cSFilipe Manana * can lead to deadlocks with concurrent fsync against other inodes (as we can 50102be63d5cSFilipe Manana * log inodes up or down in the hierarchy) or rename operations for example. So 50112be63d5cSFilipe Manana * we take the log_mutex of the inode after we have logged it and then check for 50122be63d5cSFilipe Manana * its last_unlink_trans value - this is safe because any task setting 50132be63d5cSFilipe Manana * last_unlink_trans must take the log_mutex and it must do this before it does 50142be63d5cSFilipe Manana * the actual unlink operation, so if we do this check before a concurrent task 50152be63d5cSFilipe Manana * sets last_unlink_trans it means we've logged a consistent version/state of 50162be63d5cSFilipe Manana * all the inode items, otherwise we are not sure and must do a transaction 501701327610SNicholas D Steeves * commit (the concurrent task might have only updated last_unlink_trans before 50182be63d5cSFilipe Manana * we logged the inode or it might have also done the unlink). 
50192be63d5cSFilipe Manana */ 50202be63d5cSFilipe Manana static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5021ab1717b2SNikolay Borisov struct btrfs_inode *inode) 50222be63d5cSFilipe Manana { 5023ab1717b2SNikolay Borisov struct btrfs_fs_info *fs_info = inode->root->fs_info; 50242be63d5cSFilipe Manana bool ret = false; 50252be63d5cSFilipe Manana 5026ab1717b2SNikolay Borisov mutex_lock(&inode->log_mutex); 5027ab1717b2SNikolay Borisov if (inode->last_unlink_trans > fs_info->last_trans_committed) { 50282be63d5cSFilipe Manana /* 50292be63d5cSFilipe Manana * Make sure any commits to the log are forced to be full 50302be63d5cSFilipe Manana * commits. 50312be63d5cSFilipe Manana */ 50322be63d5cSFilipe Manana btrfs_set_log_full_commit(fs_info, trans); 50332be63d5cSFilipe Manana ret = true; 50342be63d5cSFilipe Manana } 5035ab1717b2SNikolay Borisov mutex_unlock(&inode->log_mutex); 50362be63d5cSFilipe Manana 50372be63d5cSFilipe Manana return ret; 50382be63d5cSFilipe Manana } 50392be63d5cSFilipe Manana 50402be63d5cSFilipe Manana /* 504112fcfd22SChris Mason * follow the dentry parent pointers up the chain and see if any 504212fcfd22SChris Mason * of the directories in it require a full commit before they can 504312fcfd22SChris Mason * be logged. Returns zero if nothing special needs to be done or 1 if 504412fcfd22SChris Mason * a full commit is required. 
504512fcfd22SChris Mason */ 504612fcfd22SChris Mason static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 504712fcfd22SChris Mason struct inode *inode, 504812fcfd22SChris Mason struct dentry *parent, 504912fcfd22SChris Mason struct super_block *sb, 505012fcfd22SChris Mason u64 last_committed) 5051e02119d5SChris Mason { 505212fcfd22SChris Mason int ret = 0; 50536a912213SJosef Bacik struct dentry *old_parent = NULL; 5054de2b530bSJosef Bacik struct inode *orig_inode = inode; 5055e02119d5SChris Mason 5056af4176b4SChris Mason /* 5057af4176b4SChris Mason * for regular files, if its inode is already on disk, we don't 5058af4176b4SChris Mason * have to worry about the parents at all. This is because 5059af4176b4SChris Mason * we can use the last_unlink_trans field to record renames 5060af4176b4SChris Mason * and other fun in this file. 5061af4176b4SChris Mason */ 5062af4176b4SChris Mason if (S_ISREG(inode->i_mode) && 5063af4176b4SChris Mason BTRFS_I(inode)->generation <= last_committed && 5064af4176b4SChris Mason BTRFS_I(inode)->last_unlink_trans <= last_committed) 5065af4176b4SChris Mason goto out; 5066af4176b4SChris Mason 506712fcfd22SChris Mason if (!S_ISDIR(inode->i_mode)) { 5068fc64005cSAl Viro if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 506912fcfd22SChris Mason goto out; 50702b0143b5SDavid Howells inode = d_inode(parent); 507112fcfd22SChris Mason } 507212fcfd22SChris Mason 507312fcfd22SChris Mason while (1) { 5074de2b530bSJosef Bacik /* 5075de2b530bSJosef Bacik * If we are logging a directory then we start with our inode, 507601327610SNicholas D Steeves * not our parent's inode, so we need to skip setting the 5077de2b530bSJosef Bacik * logged_trans so that further down in the log code we don't 5078de2b530bSJosef Bacik * think this inode has already been logged. 
5079de2b530bSJosef Bacik */ 5080de2b530bSJosef Bacik if (inode != orig_inode) 508112fcfd22SChris Mason BTRFS_I(inode)->logged_trans = trans->transid; 508212fcfd22SChris Mason smp_mb(); 508312fcfd22SChris Mason 5084ab1717b2SNikolay Borisov if (btrfs_must_commit_transaction(trans, BTRFS_I(inode))) { 508512fcfd22SChris Mason ret = 1; 508612fcfd22SChris Mason break; 508712fcfd22SChris Mason } 508812fcfd22SChris Mason 5089fc64005cSAl Viro if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 509012fcfd22SChris Mason break; 509112fcfd22SChris Mason 509244f714daSFilipe Manana if (IS_ROOT(parent)) { 509344f714daSFilipe Manana inode = d_inode(parent); 5094ab1717b2SNikolay Borisov if (btrfs_must_commit_transaction(trans, BTRFS_I(inode))) 509544f714daSFilipe Manana ret = 1; 509612fcfd22SChris Mason break; 509744f714daSFilipe Manana } 509812fcfd22SChris Mason 50996a912213SJosef Bacik parent = dget_parent(parent); 51006a912213SJosef Bacik dput(old_parent); 51016a912213SJosef Bacik old_parent = parent; 51022b0143b5SDavid Howells inode = d_inode(parent); 510312fcfd22SChris Mason 510412fcfd22SChris Mason } 51056a912213SJosef Bacik dput(old_parent); 510612fcfd22SChris Mason out: 5107e02119d5SChris Mason return ret; 5108e02119d5SChris Mason } 5109e02119d5SChris Mason 51102f2ff0eeSFilipe Manana struct btrfs_dir_list { 51112f2ff0eeSFilipe Manana u64 ino; 51122f2ff0eeSFilipe Manana struct list_head list; 51132f2ff0eeSFilipe Manana }; 51142f2ff0eeSFilipe Manana 51152f2ff0eeSFilipe Manana /* 51162f2ff0eeSFilipe Manana * Log the inodes of the new dentries of a directory. See log_dir_items() for 51172f2ff0eeSFilipe Manana * details about the why it is needed. 51182f2ff0eeSFilipe Manana * This is a recursive operation - if an existing dentry corresponds to a 51192f2ff0eeSFilipe Manana * directory, that directory's new entries are logged too (same behaviour as 51202f2ff0eeSFilipe Manana * ext3/4, xfs, f2fs, reiserfs, nilfs2). 
Note that when logging the inodes 51212f2ff0eeSFilipe Manana * the dentries point to we do not lock their i_mutex, otherwise lockdep 51222f2ff0eeSFilipe Manana * complains about the following circular lock dependency / possible deadlock: 51232f2ff0eeSFilipe Manana * 51242f2ff0eeSFilipe Manana * CPU0 CPU1 51252f2ff0eeSFilipe Manana * ---- ---- 51262f2ff0eeSFilipe Manana * lock(&type->i_mutex_dir_key#3/2); 51272f2ff0eeSFilipe Manana * lock(sb_internal#2); 51282f2ff0eeSFilipe Manana * lock(&type->i_mutex_dir_key#3/2); 51292f2ff0eeSFilipe Manana * lock(&sb->s_type->i_mutex_key#14); 51302f2ff0eeSFilipe Manana * 51312f2ff0eeSFilipe Manana * Where sb_internal is the lock (a counter that works as a lock) acquired by 51322f2ff0eeSFilipe Manana * sb_start_intwrite() in btrfs_start_transaction(). 51332f2ff0eeSFilipe Manana * Not locking i_mutex of the inodes is still safe because: 51342f2ff0eeSFilipe Manana * 51352f2ff0eeSFilipe Manana * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 51362f2ff0eeSFilipe Manana * that while logging the inode new references (names) are added or removed 51372f2ff0eeSFilipe Manana * from the inode, leaving the logged inode item with a link count that does 51382f2ff0eeSFilipe Manana * not match the number of logged inode reference items. This is fine because 51392f2ff0eeSFilipe Manana * at log replay time we compute the real number of links and correct the 51402f2ff0eeSFilipe Manana * link count in the inode item (see replay_one_buffer() and 51412f2ff0eeSFilipe Manana * link_to_fixup_dir()); 51422f2ff0eeSFilipe Manana * 51432f2ff0eeSFilipe Manana * 2) For directories we log with a mode of LOG_INODE_ALL. 
It's possible that 51442f2ff0eeSFilipe Manana * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 51452f2ff0eeSFilipe Manana * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 51462f2ff0eeSFilipe Manana * has a size that doesn't match the sum of the lengths of all the logged 51472f2ff0eeSFilipe Manana * names. This does not result in a problem because if a dir_item key is 51482f2ff0eeSFilipe Manana * logged but its matching dir_index key is not logged, at log replay time we 51492f2ff0eeSFilipe Manana * don't use it to replay the respective name (see replay_one_name()). On the 51502f2ff0eeSFilipe Manana * other hand if only the dir_index key ends up being logged, the respective 51512f2ff0eeSFilipe Manana * name is added to the fs/subvol tree with both the dir_item and dir_index 51522f2ff0eeSFilipe Manana * keys created (see replay_one_name()). 51532f2ff0eeSFilipe Manana * The directory's inode item with a wrong i_size is not a problem as well, 51542f2ff0eeSFilipe Manana * since we don't use it at log replay time to set the i_size in the inode 51552f2ff0eeSFilipe Manana * item of the fs/subvol tree (see overwrite_item()). 
 *
 * start_inode is the directory whose logged entries are walked first. ctx is
 * dereferenced unconditionally, so it must not be NULL. Returns 0 on success,
 * the error of the step that failed, or 1 when
 * btrfs_must_commit_transaction() reports that one of the logged inodes
 * requires a transaction commit.
 */
static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct inode *start_inode,
				struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *log = root->log_root;
	struct btrfs_path *path;
	LIST_HEAD(dir_list);
	struct btrfs_dir_list *dir_elem;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * dir_list is a FIFO of directory inode numbers still to be visited:
	 * entries are added at the tail and taken from the head.
	 */
	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
	if (!dir_elem) {
		btrfs_free_path(path);
		return -ENOMEM;
	}
	dir_elem->ino = btrfs_ino(BTRFS_I(start_inode));
	list_add_tail(&dir_elem->list, &dir_list);

	while (!list_empty(&dir_list)) {
		struct extent_buffer *leaf;
		struct btrfs_key min_key;
		int nritems;
		int i;

		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
					    list);
		/* once ret is set the loop only unlinks and frees what is queued */
		if (ret)
			goto next_dir_inode;

		min_key.objectid = dir_elem->ino;
		min_key.type = BTRFS_DIR_ITEM_KEY;
		min_key.offset = 0;
again:
		/* the search runs on the log tree, not on the fs/subvol tree */
		btrfs_release_path(path);
		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
		if (ret < 0) {
			goto next_dir_inode;
		} else if (ret > 0) {
			ret = 0;
			goto next_dir_inode;
		}

process_leaf:
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		for (i = path->slots[0]; i < nritems; i++) {
			struct btrfs_dir_item *di;
			struct btrfs_key di_key;
			struct inode *di_inode;
			struct btrfs_dir_list *new_dir_elem;
			int log_mode = LOG_INODE_EXISTS;
			int type;

			/* leaving the dir items of this directory ends its scan */
			btrfs_item_key_to_cpu(leaf, &min_key, i);
			if (min_key.objectid != dir_elem->ino ||
			    min_key.type != BTRFS_DIR_ITEM_KEY)
				goto next_dir_inode;

			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
			type = btrfs_dir_type(leaf, di);
			/* skip entries from older transactions unless they are directories */
			if (btrfs_dir_transid(leaf, di) < trans->transid &&
			    type != BTRFS_FT_DIR)
				continue;
			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
				continue;

			/*
			 * The path is released before the inode lookup. From
			 * here on every branch leaves the for loop (break or
			 * goto), so leaf is not touched again.
			 */
			btrfs_release_path(path);
			di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
			if (IS_ERR(di_inode)) {
				ret = PTR_ERR(di_inode);
				goto next_dir_inode;
			}

			if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
				iput(di_inode);
				break;
			}

			/* cleared first so we can tell whether this call set it */
			ctx->log_new_dentries = false;
			if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
				log_mode = LOG_INODE_ALL;
			ret = btrfs_log_inode(trans, root, di_inode,
					      log_mode, 0, LLONG_MAX, ctx);
			if (!ret &&
			    btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
				ret = 1;
			iput(di_inode);
			if (ret)
				goto next_dir_inode;
			/* logging asked for this inode's dentries too: queue it */
			if (ctx->log_new_dentries) {
				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
						       GFP_NOFS);
				if (!new_dir_elem) {
					ret = -ENOMEM;
					goto next_dir_inode;
				}
				new_dir_elem->ino = di_key.objectid;
				list_add_tail(&new_dir_elem->list, &dir_list);
			}
			break;
		}
		/* every slot of the leaf was skipped: continue with the next leaf */
		if (i == nritems) {
			ret = btrfs_next_leaf(log, path);
			if (ret < 0) {
				goto next_dir_inode;
			} else if (ret > 0) {
				ret = 0;
				goto next_dir_inode;
			}
			goto process_leaf;
		}
		/* an entry was handled and the path dropped: search again just past it */
		if (min_key.offset < (u64)-1) {
			min_key.offset++;
			goto again;
		}
next_dir_inode:
		list_del(&dir_elem->list);
		kfree(dir_elem);
	}

	btrfs_free_path(path);
	return ret;
}

static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
				 struct inode *inode,
				 struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	const u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->skip_locking = 1;
	path->search_commit_root = 1;

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		u32 cur_offset = 0;
		u32 item_size;
		unsigned long ptr;
Manana 531918aa0922SFilipe Manana if (slot >= btrfs_header_nritems(leaf)) { 532018aa0922SFilipe Manana ret = btrfs_next_leaf(root, path); 532118aa0922SFilipe Manana if (ret < 0) 532218aa0922SFilipe Manana goto out; 532318aa0922SFilipe Manana else if (ret > 0) 532418aa0922SFilipe Manana break; 532518aa0922SFilipe Manana continue; 532618aa0922SFilipe Manana } 532718aa0922SFilipe Manana 532818aa0922SFilipe Manana btrfs_item_key_to_cpu(leaf, &key, slot); 532918aa0922SFilipe Manana /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 533018aa0922SFilipe Manana if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 533118aa0922SFilipe Manana break; 533218aa0922SFilipe Manana 533318aa0922SFilipe Manana item_size = btrfs_item_size_nr(leaf, slot); 533418aa0922SFilipe Manana ptr = btrfs_item_ptr_offset(leaf, slot); 533518aa0922SFilipe Manana while (cur_offset < item_size) { 533618aa0922SFilipe Manana struct btrfs_key inode_key; 533718aa0922SFilipe Manana struct inode *dir_inode; 533818aa0922SFilipe Manana 533918aa0922SFilipe Manana inode_key.type = BTRFS_INODE_ITEM_KEY; 534018aa0922SFilipe Manana inode_key.offset = 0; 534118aa0922SFilipe Manana 534218aa0922SFilipe Manana if (key.type == BTRFS_INODE_EXTREF_KEY) { 534318aa0922SFilipe Manana struct btrfs_inode_extref *extref; 534418aa0922SFilipe Manana 534518aa0922SFilipe Manana extref = (struct btrfs_inode_extref *) 534618aa0922SFilipe Manana (ptr + cur_offset); 534718aa0922SFilipe Manana inode_key.objectid = btrfs_inode_extref_parent( 534818aa0922SFilipe Manana leaf, extref); 534918aa0922SFilipe Manana cur_offset += sizeof(*extref); 535018aa0922SFilipe Manana cur_offset += btrfs_inode_extref_name_len(leaf, 535118aa0922SFilipe Manana extref); 535218aa0922SFilipe Manana } else { 535318aa0922SFilipe Manana inode_key.objectid = key.offset; 535418aa0922SFilipe Manana cur_offset = item_size; 535518aa0922SFilipe Manana } 535618aa0922SFilipe Manana 53570b246afaSJeff Mahoney dir_inode = btrfs_iget(fs_info->sb, &inode_key, 
535818aa0922SFilipe Manana root, NULL); 535918aa0922SFilipe Manana /* If parent inode was deleted, skip it. */ 536018aa0922SFilipe Manana if (IS_ERR(dir_inode)) 536118aa0922SFilipe Manana continue; 536218aa0922SFilipe Manana 5363657ed1aaSFilipe Manana if (ctx) 5364657ed1aaSFilipe Manana ctx->log_new_dentries = false; 536518aa0922SFilipe Manana ret = btrfs_log_inode(trans, root, dir_inode, 536618aa0922SFilipe Manana LOG_INODE_ALL, 0, LLONG_MAX, ctx); 53672be63d5cSFilipe Manana if (!ret && 5368ab1717b2SNikolay Borisov btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 53692be63d5cSFilipe Manana ret = 1; 5370657ed1aaSFilipe Manana if (!ret && ctx && ctx->log_new_dentries) 5371657ed1aaSFilipe Manana ret = log_new_dir_dentries(trans, root, 5372657ed1aaSFilipe Manana dir_inode, ctx); 537318aa0922SFilipe Manana iput(dir_inode); 537418aa0922SFilipe Manana if (ret) 537518aa0922SFilipe Manana goto out; 537618aa0922SFilipe Manana } 537718aa0922SFilipe Manana path->slots[0]++; 537818aa0922SFilipe Manana } 537918aa0922SFilipe Manana ret = 0; 538018aa0922SFilipe Manana out: 538118aa0922SFilipe Manana btrfs_free_path(path); 538218aa0922SFilipe Manana return ret; 538318aa0922SFilipe Manana } 538418aa0922SFilipe Manana 5385e02119d5SChris Mason /* 5386e02119d5SChris Mason * helper function around btrfs_log_inode to make sure newly created 5387e02119d5SChris Mason * parent directories also end up in the log. 
A minimal inode and backref 5388e02119d5SChris Mason * only logging is done of any parent directories that are older than 5389e02119d5SChris Mason * the last committed transaction 5390e02119d5SChris Mason */ 539148a3b636SEric Sandeen static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 539212fcfd22SChris Mason struct btrfs_root *root, struct inode *inode, 539349dae1bcSFilipe Manana struct dentry *parent, 539449dae1bcSFilipe Manana const loff_t start, 539549dae1bcSFilipe Manana const loff_t end, 539649dae1bcSFilipe Manana int exists_only, 53978b050d35SMiao Xie struct btrfs_log_ctx *ctx) 5398e02119d5SChris Mason { 53990b246afaSJeff Mahoney struct btrfs_fs_info *fs_info = root->fs_info; 540012fcfd22SChris Mason int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 5401e02119d5SChris Mason struct super_block *sb; 54026a912213SJosef Bacik struct dentry *old_parent = NULL; 540312fcfd22SChris Mason int ret = 0; 54040b246afaSJeff Mahoney u64 last_committed = fs_info->last_trans_committed; 54052f2ff0eeSFilipe Manana bool log_dentries = false; 54062f2ff0eeSFilipe Manana struct inode *orig_inode = inode; 540712fcfd22SChris Mason 540812fcfd22SChris Mason sb = inode->i_sb; 540912fcfd22SChris Mason 54100b246afaSJeff Mahoney if (btrfs_test_opt(fs_info, NOTREELOG)) { 54113a5e1404SSage Weil ret = 1; 54123a5e1404SSage Weil goto end_no_trans; 54133a5e1404SSage Weil } 54143a5e1404SSage Weil 5415995946ddSMiao Xie /* 5416995946ddSMiao Xie * The prev transaction commit doesn't complete, we need do 5417995946ddSMiao Xie * full commit by ourselves. 
5418995946ddSMiao Xie */ 54190b246afaSJeff Mahoney if (fs_info->last_trans_log_full_commit > 54200b246afaSJeff Mahoney fs_info->last_trans_committed) { 542112fcfd22SChris Mason ret = 1; 542212fcfd22SChris Mason goto end_no_trans; 542312fcfd22SChris Mason } 542412fcfd22SChris Mason 542576dda93cSYan, Zheng if (root != BTRFS_I(inode)->root || 542676dda93cSYan, Zheng btrfs_root_refs(&root->root_item) == 0) { 542776dda93cSYan, Zheng ret = 1; 542876dda93cSYan, Zheng goto end_no_trans; 542976dda93cSYan, Zheng } 543076dda93cSYan, Zheng 543112fcfd22SChris Mason ret = check_parent_dirs_for_sync(trans, inode, parent, 543212fcfd22SChris Mason sb, last_committed); 543312fcfd22SChris Mason if (ret) 543412fcfd22SChris Mason goto end_no_trans; 5435e02119d5SChris Mason 54360f8939b8SNikolay Borisov if (btrfs_inode_in_log(BTRFS_I(inode), trans->transid)) { 5437257c62e1SChris Mason ret = BTRFS_NO_LOG_SYNC; 5438257c62e1SChris Mason goto end_no_trans; 5439257c62e1SChris Mason } 5440257c62e1SChris Mason 54418b050d35SMiao Xie ret = start_log_trans(trans, root, ctx); 54424a500fd1SYan, Zheng if (ret) 5443e87ac136SMiao Xie goto end_no_trans; 544412fcfd22SChris Mason 54458407f553SFilipe Manana ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 54464a500fd1SYan, Zheng if (ret) 54474a500fd1SYan, Zheng goto end_trans; 5448e02119d5SChris Mason 5449af4176b4SChris Mason /* 5450af4176b4SChris Mason * for regular files, if its inode is already on disk, we don't 5451af4176b4SChris Mason * have to worry about the parents at all. This is because 5452af4176b4SChris Mason * we can use the last_unlink_trans field to record renames 5453af4176b4SChris Mason * and other fun in this file. 
5454af4176b4SChris Mason */ 5455af4176b4SChris Mason if (S_ISREG(inode->i_mode) && 5456af4176b4SChris Mason BTRFS_I(inode)->generation <= last_committed && 54574a500fd1SYan, Zheng BTRFS_I(inode)->last_unlink_trans <= last_committed) { 54584a500fd1SYan, Zheng ret = 0; 54594a500fd1SYan, Zheng goto end_trans; 54604a500fd1SYan, Zheng } 5461af4176b4SChris Mason 54622f2ff0eeSFilipe Manana if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) 54632f2ff0eeSFilipe Manana log_dentries = true; 54642f2ff0eeSFilipe Manana 546518aa0922SFilipe Manana /* 546601327610SNicholas D Steeves * On unlink we must make sure all our current and old parent directory 546718aa0922SFilipe Manana * inodes are fully logged. This is to prevent leaving dangling 546818aa0922SFilipe Manana * directory index entries in directories that were our parents but are 546918aa0922SFilipe Manana * not anymore. Not doing this results in old parent directory being 547018aa0922SFilipe Manana * impossible to delete after log replay (rmdir will always fail with 547118aa0922SFilipe Manana * error -ENOTEMPTY). 547218aa0922SFilipe Manana * 547318aa0922SFilipe Manana * Example 1: 547418aa0922SFilipe Manana * 547518aa0922SFilipe Manana * mkdir testdir 547618aa0922SFilipe Manana * touch testdir/foo 547718aa0922SFilipe Manana * ln testdir/foo testdir/bar 547818aa0922SFilipe Manana * sync 547918aa0922SFilipe Manana * unlink testdir/bar 548018aa0922SFilipe Manana * xfs_io -c fsync testdir/foo 548118aa0922SFilipe Manana * <power failure> 548218aa0922SFilipe Manana * mount fs, triggers log replay 548318aa0922SFilipe Manana * 548418aa0922SFilipe Manana * If we don't log the parent directory (testdir), after log replay the 548518aa0922SFilipe Manana * directory still has an entry pointing to the file inode using the bar 548618aa0922SFilipe Manana * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 548718aa0922SFilipe Manana * the file inode has a link count of 1. 
548818aa0922SFilipe Manana * 548918aa0922SFilipe Manana * Example 2: 549018aa0922SFilipe Manana * 549118aa0922SFilipe Manana * mkdir testdir 549218aa0922SFilipe Manana * touch foo 549318aa0922SFilipe Manana * ln foo testdir/foo2 549418aa0922SFilipe Manana * ln foo testdir/foo3 549518aa0922SFilipe Manana * sync 549618aa0922SFilipe Manana * unlink testdir/foo3 549718aa0922SFilipe Manana * xfs_io -c fsync foo 549818aa0922SFilipe Manana * <power failure> 549918aa0922SFilipe Manana * mount fs, triggers log replay 550018aa0922SFilipe Manana * 550118aa0922SFilipe Manana * Similar as the first example, after log replay the parent directory 550218aa0922SFilipe Manana * testdir still has an entry pointing to the inode file with name foo3 550318aa0922SFilipe Manana * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 550418aa0922SFilipe Manana * and has a link count of 2. 550518aa0922SFilipe Manana */ 550618aa0922SFilipe Manana if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 550718aa0922SFilipe Manana ret = btrfs_log_all_parents(trans, orig_inode, ctx); 550818aa0922SFilipe Manana if (ret) 550918aa0922SFilipe Manana goto end_trans; 551018aa0922SFilipe Manana } 551118aa0922SFilipe Manana 551212fcfd22SChris Mason while (1) { 5513fc64005cSAl Viro if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5514e02119d5SChris Mason break; 5515e02119d5SChris Mason 55162b0143b5SDavid Howells inode = d_inode(parent); 551776dda93cSYan, Zheng if (root != BTRFS_I(inode)->root) 551876dda93cSYan, Zheng break; 551976dda93cSYan, Zheng 552018aa0922SFilipe Manana if (BTRFS_I(inode)->generation > last_committed) { 552118aa0922SFilipe Manana ret = btrfs_log_inode(trans, root, inode, 552218aa0922SFilipe Manana LOG_INODE_EXISTS, 55238407f553SFilipe Manana 0, LLONG_MAX, ctx); 55244a500fd1SYan, Zheng if (ret) 55254a500fd1SYan, Zheng goto end_trans; 5526e02119d5SChris Mason } 552776dda93cSYan, Zheng if (IS_ROOT(parent)) 552812fcfd22SChris Mason break; 
552912fcfd22SChris Mason 55306a912213SJosef Bacik parent = dget_parent(parent); 55316a912213SJosef Bacik dput(old_parent); 55326a912213SJosef Bacik old_parent = parent; 553312fcfd22SChris Mason } 55342f2ff0eeSFilipe Manana if (log_dentries) 55352f2ff0eeSFilipe Manana ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 55362f2ff0eeSFilipe Manana else 553712fcfd22SChris Mason ret = 0; 55384a500fd1SYan, Zheng end_trans: 55396a912213SJosef Bacik dput(old_parent); 55404a500fd1SYan, Zheng if (ret < 0) { 55410b246afaSJeff Mahoney btrfs_set_log_full_commit(fs_info, trans); 55424a500fd1SYan, Zheng ret = 1; 55434a500fd1SYan, Zheng } 55448b050d35SMiao Xie 55458b050d35SMiao Xie if (ret) 55468b050d35SMiao Xie btrfs_remove_log_ctx(root, ctx); 554712fcfd22SChris Mason btrfs_end_log_trans(root); 554812fcfd22SChris Mason end_no_trans: 554912fcfd22SChris Mason return ret; 5550e02119d5SChris Mason } 5551e02119d5SChris Mason 5552e02119d5SChris Mason /* 5553e02119d5SChris Mason * it is not safe to log dentry if the chunk root has added new 5554e02119d5SChris Mason * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 5555e02119d5SChris Mason * If this returns 1, you must commit the transaction to safely get your 5556e02119d5SChris Mason * data on disk. 
5557e02119d5SChris Mason */ 5558e02119d5SChris Mason int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 55598b050d35SMiao Xie struct btrfs_root *root, struct dentry *dentry, 556049dae1bcSFilipe Manana const loff_t start, 556149dae1bcSFilipe Manana const loff_t end, 55628b050d35SMiao Xie struct btrfs_log_ctx *ctx) 5563e02119d5SChris Mason { 55646a912213SJosef Bacik struct dentry *parent = dget_parent(dentry); 55656a912213SJosef Bacik int ret; 55666a912213SJosef Bacik 55672b0143b5SDavid Howells ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, 556849dae1bcSFilipe Manana start, end, 0, ctx); 55696a912213SJosef Bacik dput(parent); 55706a912213SJosef Bacik 55716a912213SJosef Bacik return ret; 5572e02119d5SChris Mason } 5573e02119d5SChris Mason 5574e02119d5SChris Mason /* 5575e02119d5SChris Mason * should be called during mount to recover any replay any log trees 5576e02119d5SChris Mason * from the FS 5577e02119d5SChris Mason */ 5578e02119d5SChris Mason int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 5579e02119d5SChris Mason { 5580e02119d5SChris Mason int ret; 5581e02119d5SChris Mason struct btrfs_path *path; 5582e02119d5SChris Mason struct btrfs_trans_handle *trans; 5583e02119d5SChris Mason struct btrfs_key key; 5584e02119d5SChris Mason struct btrfs_key found_key; 5585e02119d5SChris Mason struct btrfs_key tmp_key; 5586e02119d5SChris Mason struct btrfs_root *log; 5587e02119d5SChris Mason struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 5588e02119d5SChris Mason struct walk_control wc = { 5589e02119d5SChris Mason .process_func = process_one_buffer, 5590e02119d5SChris Mason .stage = 0, 5591e02119d5SChris Mason }; 5592e02119d5SChris Mason 5593e02119d5SChris Mason path = btrfs_alloc_path(); 5594db5b493aSTsutomu Itoh if (!path) 5595db5b493aSTsutomu Itoh return -ENOMEM; 5596db5b493aSTsutomu Itoh 5597afcdd129SJosef Bacik set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5598e02119d5SChris Mason 55994a500fd1SYan, Zheng trans = 
btrfs_start_transaction(fs_info->tree_root, 0); 560079787eaaSJeff Mahoney if (IS_ERR(trans)) { 560179787eaaSJeff Mahoney ret = PTR_ERR(trans); 560279787eaaSJeff Mahoney goto error; 560379787eaaSJeff Mahoney } 5604e02119d5SChris Mason 5605e02119d5SChris Mason wc.trans = trans; 5606e02119d5SChris Mason wc.pin = 1; 5607e02119d5SChris Mason 5608db5b493aSTsutomu Itoh ret = walk_log_tree(trans, log_root_tree, &wc); 560979787eaaSJeff Mahoney if (ret) { 56105d163e0eSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, 56115d163e0eSJeff Mahoney "Failed to pin buffers while recovering log root tree."); 561279787eaaSJeff Mahoney goto error; 561379787eaaSJeff Mahoney } 5614e02119d5SChris Mason 5615e02119d5SChris Mason again: 5616e02119d5SChris Mason key.objectid = BTRFS_TREE_LOG_OBJECTID; 5617e02119d5SChris Mason key.offset = (u64)-1; 5618962a298fSDavid Sterba key.type = BTRFS_ROOT_ITEM_KEY; 5619e02119d5SChris Mason 5620e02119d5SChris Mason while (1) { 5621e02119d5SChris Mason ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 562279787eaaSJeff Mahoney 562379787eaaSJeff Mahoney if (ret < 0) { 562434d97007SAnand Jain btrfs_handle_fs_error(fs_info, ret, 562579787eaaSJeff Mahoney "Couldn't find tree log root."); 562679787eaaSJeff Mahoney goto error; 562779787eaaSJeff Mahoney } 5628e02119d5SChris Mason if (ret > 0) { 5629e02119d5SChris Mason if (path->slots[0] == 0) 5630e02119d5SChris Mason break; 5631e02119d5SChris Mason path->slots[0]--; 5632e02119d5SChris Mason } 5633e02119d5SChris Mason btrfs_item_key_to_cpu(path->nodes[0], &found_key, 5634e02119d5SChris Mason path->slots[0]); 5635b3b4aa74SDavid Sterba btrfs_release_path(path); 5636e02119d5SChris Mason if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 5637e02119d5SChris Mason break; 5638e02119d5SChris Mason 5639cb517eabSMiao Xie log = btrfs_read_fs_root(log_root_tree, &found_key); 564079787eaaSJeff Mahoney if (IS_ERR(log)) { 564179787eaaSJeff Mahoney ret = PTR_ERR(log); 564234d97007SAnand Jain 
btrfs_handle_fs_error(fs_info, ret, 564379787eaaSJeff Mahoney "Couldn't read tree log root."); 564479787eaaSJeff Mahoney goto error; 564579787eaaSJeff Mahoney } 5646e02119d5SChris Mason 5647e02119d5SChris Mason tmp_key.objectid = found_key.offset; 5648e02119d5SChris Mason tmp_key.type = BTRFS_ROOT_ITEM_KEY; 5649e02119d5SChris Mason tmp_key.offset = (u64)-1; 5650e02119d5SChris Mason 5651e02119d5SChris Mason wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 565279787eaaSJeff Mahoney if (IS_ERR(wc.replay_dest)) { 565379787eaaSJeff Mahoney ret = PTR_ERR(wc.replay_dest); 5654b50c6e25SJosef Bacik free_extent_buffer(log->node); 5655b50c6e25SJosef Bacik free_extent_buffer(log->commit_root); 5656b50c6e25SJosef Bacik kfree(log); 56575d163e0eSJeff Mahoney btrfs_handle_fs_error(fs_info, ret, 56585d163e0eSJeff Mahoney "Couldn't read target root for tree log recovery."); 565979787eaaSJeff Mahoney goto error; 566079787eaaSJeff Mahoney } 5661e02119d5SChris Mason 566207d400a6SYan Zheng wc.replay_dest->log_root = log; 56635d4f98a2SYan Zheng btrfs_record_root_in_trans(trans, wc.replay_dest); 5664e02119d5SChris Mason ret = walk_log_tree(trans, log, &wc); 5665e02119d5SChris Mason 5666b50c6e25SJosef Bacik if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 5667e02119d5SChris Mason ret = fixup_inode_link_counts(trans, wc.replay_dest, 5668e02119d5SChris Mason path); 5669e02119d5SChris Mason } 5670e02119d5SChris Mason 5671e02119d5SChris Mason key.offset = found_key.offset - 1; 567207d400a6SYan Zheng wc.replay_dest->log_root = NULL; 5673e02119d5SChris Mason free_extent_buffer(log->node); 5674b263c2c8SChris Mason free_extent_buffer(log->commit_root); 5675e02119d5SChris Mason kfree(log); 5676e02119d5SChris Mason 5677b50c6e25SJosef Bacik if (ret) 5678b50c6e25SJosef Bacik goto error; 5679b50c6e25SJosef Bacik 5680e02119d5SChris Mason if (found_key.offset == 0) 5681e02119d5SChris Mason break; 5682e02119d5SChris Mason } 5683b3b4aa74SDavid Sterba btrfs_release_path(path); 
5684e02119d5SChris Mason 5685e02119d5SChris Mason /* step one is to pin it all, step two is to replay just inodes */ 5686e02119d5SChris Mason if (wc.pin) { 5687e02119d5SChris Mason wc.pin = 0; 5688e02119d5SChris Mason wc.process_func = replay_one_buffer; 5689e02119d5SChris Mason wc.stage = LOG_WALK_REPLAY_INODES; 5690e02119d5SChris Mason goto again; 5691e02119d5SChris Mason } 5692e02119d5SChris Mason /* step three is to replay everything */ 5693e02119d5SChris Mason if (wc.stage < LOG_WALK_REPLAY_ALL) { 5694e02119d5SChris Mason wc.stage++; 5695e02119d5SChris Mason goto again; 5696e02119d5SChris Mason } 5697e02119d5SChris Mason 5698e02119d5SChris Mason btrfs_free_path(path); 5699e02119d5SChris Mason 5700abefa55aSJosef Bacik /* step 4: commit the transaction, which also unpins the blocks */ 57013a45bb20SJeff Mahoney ret = btrfs_commit_transaction(trans); 5702abefa55aSJosef Bacik if (ret) 5703abefa55aSJosef Bacik return ret; 5704abefa55aSJosef Bacik 5705e02119d5SChris Mason free_extent_buffer(log_root_tree->node); 5706e02119d5SChris Mason log_root_tree->log_root = NULL; 5707afcdd129SJosef Bacik clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 5708e02119d5SChris Mason kfree(log_root_tree); 570979787eaaSJeff Mahoney 5710abefa55aSJosef Bacik return 0; 571179787eaaSJeff Mahoney error: 5712b50c6e25SJosef Bacik if (wc.trans) 57133a45bb20SJeff Mahoney btrfs_end_transaction(wc.trans); 571479787eaaSJeff Mahoney btrfs_free_path(path); 571579787eaaSJeff Mahoney return ret; 5716e02119d5SChris Mason } 571712fcfd22SChris Mason 571812fcfd22SChris Mason /* 571912fcfd22SChris Mason * there are some corner cases where we want to force a full 572012fcfd22SChris Mason * commit instead of allowing a directory to be logged. 
572112fcfd22SChris Mason * 572212fcfd22SChris Mason * They revolve around files there were unlinked from the directory, and 572312fcfd22SChris Mason * this function updates the parent directory so that a full commit is 572412fcfd22SChris Mason * properly done if it is fsync'd later after the unlinks are done. 57252be63d5cSFilipe Manana * 57262be63d5cSFilipe Manana * Must be called before the unlink operations (updates to the subvolume tree, 57272be63d5cSFilipe Manana * inodes, etc) are done. 572812fcfd22SChris Mason */ 572912fcfd22SChris Mason void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 57304176bdbfSNikolay Borisov struct btrfs_inode *dir, struct btrfs_inode *inode, 573112fcfd22SChris Mason int for_rename) 573212fcfd22SChris Mason { 573312fcfd22SChris Mason /* 5734af4176b4SChris Mason * when we're logging a file, if it hasn't been renamed 5735af4176b4SChris Mason * or unlinked, and its inode is fully committed on disk, 5736af4176b4SChris Mason * we don't have to worry about walking up the directory chain 5737af4176b4SChris Mason * to log its parents. 5738af4176b4SChris Mason * 5739af4176b4SChris Mason * So, we use the last_unlink_trans field to put this transid 5740af4176b4SChris Mason * into the file. When the file is logged we check it and 5741af4176b4SChris Mason * don't log the parents if the file is fully on disk. 
5742af4176b4SChris Mason */ 57434176bdbfSNikolay Borisov mutex_lock(&inode->log_mutex); 57444176bdbfSNikolay Borisov inode->last_unlink_trans = trans->transid; 57454176bdbfSNikolay Borisov mutex_unlock(&inode->log_mutex); 5746af4176b4SChris Mason 5747af4176b4SChris Mason /* 574812fcfd22SChris Mason * if this directory was already logged any new 574912fcfd22SChris Mason * names for this file/dir will get recorded 575012fcfd22SChris Mason */ 575112fcfd22SChris Mason smp_mb(); 57524176bdbfSNikolay Borisov if (dir->logged_trans == trans->transid) 575312fcfd22SChris Mason return; 575412fcfd22SChris Mason 575512fcfd22SChris Mason /* 575612fcfd22SChris Mason * if the inode we're about to unlink was logged, 575712fcfd22SChris Mason * the log will be properly updated for any new names 575812fcfd22SChris Mason */ 57594176bdbfSNikolay Borisov if (inode->logged_trans == trans->transid) 576012fcfd22SChris Mason return; 576112fcfd22SChris Mason 576212fcfd22SChris Mason /* 576312fcfd22SChris Mason * when renaming files across directories, if the directory 576412fcfd22SChris Mason * there we're unlinking from gets fsync'd later on, there's 576512fcfd22SChris Mason * no way to find the destination directory later and fsync it 576612fcfd22SChris Mason * properly. So, we have to be conservative and force commits 576712fcfd22SChris Mason * so the new name gets discovered. 
576812fcfd22SChris Mason */ 576912fcfd22SChris Mason if (for_rename) 577012fcfd22SChris Mason goto record; 577112fcfd22SChris Mason 577212fcfd22SChris Mason /* we can safely do the unlink without any special recording */ 577312fcfd22SChris Mason return; 577412fcfd22SChris Mason 577512fcfd22SChris Mason record: 57764176bdbfSNikolay Borisov mutex_lock(&dir->log_mutex); 57774176bdbfSNikolay Borisov dir->last_unlink_trans = trans->transid; 57784176bdbfSNikolay Borisov mutex_unlock(&dir->log_mutex); 577912fcfd22SChris Mason } 578012fcfd22SChris Mason 578112fcfd22SChris Mason /* 57821ec9a1aeSFilipe Manana * Make sure that if someone attempts to fsync the parent directory of a deleted 57831ec9a1aeSFilipe Manana * snapshot, it ends up triggering a transaction commit. This is to guarantee 57841ec9a1aeSFilipe Manana * that after replaying the log tree of the parent directory's root we will not 57851ec9a1aeSFilipe Manana * see the snapshot anymore and at log replay time we will not see any log tree 57861ec9a1aeSFilipe Manana * corresponding to the deleted snapshot's root, which could lead to replaying 57871ec9a1aeSFilipe Manana * it after replaying the log tree of the parent directory (which would replay 57881ec9a1aeSFilipe Manana * the snapshot delete operation). 57892be63d5cSFilipe Manana * 57902be63d5cSFilipe Manana * Must be called before the actual snapshot destroy operation (updates to the 57912be63d5cSFilipe Manana * parent root and tree of tree roots trees, etc) are done. 
57921ec9a1aeSFilipe Manana */ 57931ec9a1aeSFilipe Manana void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 579443663557SNikolay Borisov struct btrfs_inode *dir) 57951ec9a1aeSFilipe Manana { 579643663557SNikolay Borisov mutex_lock(&dir->log_mutex); 579743663557SNikolay Borisov dir->last_unlink_trans = trans->transid; 579843663557SNikolay Borisov mutex_unlock(&dir->log_mutex); 57991ec9a1aeSFilipe Manana } 58001ec9a1aeSFilipe Manana 58011ec9a1aeSFilipe Manana /* 580212fcfd22SChris Mason * Call this after adding a new name for a file and it will properly 580312fcfd22SChris Mason * update the log to reflect the new name. 580412fcfd22SChris Mason * 580512fcfd22SChris Mason * It will return zero if all goes well, and it will return 1 if a 580612fcfd22SChris Mason * full transaction commit is required. 580712fcfd22SChris Mason */ 580812fcfd22SChris Mason int btrfs_log_new_name(struct btrfs_trans_handle *trans, 58099ca5fbfbSNikolay Borisov struct btrfs_inode *inode, struct btrfs_inode *old_dir, 581012fcfd22SChris Mason struct dentry *parent) 581112fcfd22SChris Mason { 58129ca5fbfbSNikolay Borisov struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 58139ca5fbfbSNikolay Borisov struct btrfs_root * root = inode->root; 581412fcfd22SChris Mason 581512fcfd22SChris Mason /* 5816af4176b4SChris Mason * this will force the logging code to walk the dentry chain 5817af4176b4SChris Mason * up for the file 5818af4176b4SChris Mason */ 58199ca5fbfbSNikolay Borisov if (S_ISREG(inode->vfs_inode.i_mode)) 58209ca5fbfbSNikolay Borisov inode->last_unlink_trans = trans->transid; 5821af4176b4SChris Mason 5822af4176b4SChris Mason /* 582312fcfd22SChris Mason * if this inode hasn't been logged and directory we're renaming it 582412fcfd22SChris Mason * from hasn't been logged, we don't need to log it 582512fcfd22SChris Mason */ 58269ca5fbfbSNikolay Borisov if (inode->logged_trans <= fs_info->last_trans_committed && 58279ca5fbfbSNikolay Borisov (!old_dir || 
old_dir->logged_trans <= fs_info->last_trans_committed)) 582812fcfd22SChris Mason return 0; 582912fcfd22SChris Mason 58309ca5fbfbSNikolay Borisov return btrfs_log_inode_parent(trans, root, &inode->vfs_inode, parent, 0, 583149dae1bcSFilipe Manana LLONG_MAX, 1, NULL); 583212fcfd22SChris Mason } 583312fcfd22SChris Mason 5834