xref: /openbmc/linux/fs/btrfs/tree-log.c (revision 6d889a3b9e66d776a6052cac5b6ff52014862936)
1e02119d5SChris Mason /*
2e02119d5SChris Mason  * Copyright (C) 2008 Oracle.  All rights reserved.
3e02119d5SChris Mason  *
4e02119d5SChris Mason  * This program is free software; you can redistribute it and/or
5e02119d5SChris Mason  * modify it under the terms of the GNU General Public
6e02119d5SChris Mason  * License v2 as published by the Free Software Foundation.
7e02119d5SChris Mason  *
8e02119d5SChris Mason  * This program is distributed in the hope that it will be useful,
9e02119d5SChris Mason  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10e02119d5SChris Mason  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11e02119d5SChris Mason  * General Public License for more details.
12e02119d5SChris Mason  *
13e02119d5SChris Mason  * You should have received a copy of the GNU General Public
14e02119d5SChris Mason  * License along with this program; if not, write to the
15e02119d5SChris Mason  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16e02119d5SChris Mason  * Boston, MA 021110-1307, USA.
17e02119d5SChris Mason  */
18e02119d5SChris Mason 
19e02119d5SChris Mason #include <linux/sched.h>
205a0e3ad6STejun Heo #include <linux/slab.h>
21c6adc9ccSMiao Xie #include <linux/blkdev.h>
225dc562c5SJosef Bacik #include <linux/list_sort.h>
23995946ddSMiao Xie #include "tree-log.h"
24e02119d5SChris Mason #include "disk-io.h"
25e02119d5SChris Mason #include "locking.h"
26e02119d5SChris Mason #include "print-tree.h"
27f186373fSMark Fasheh #include "backref.h"
28f186373fSMark Fasheh #include "hash.h"
29ebb8765bSAnand Jain #include "compression.h"
30df2c95f3SQu Wenruo #include "qgroup.h"
31e02119d5SChris Mason 
/*
 * Values accepted by the inode_only argument of btrfs_log_inode():
 *
 * LOG_INODE_ALL    - log everything
 * LOG_INODE_EXISTS - log just enough to recreate the inode during log replay
 * LOG_OTHER_INODE  - NOTE(review): not described by the original comment;
 *                    confirm its meaning against the users of this value
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2
41e02119d5SChris Mason 
42e02119d5SChris Mason /*
4312fcfd22SChris Mason  * directory trouble cases
4412fcfd22SChris Mason  *
4512fcfd22SChris Mason  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
4612fcfd22SChris Mason  * log, we must force a full commit before doing an fsync of the directory
4712fcfd22SChris Mason  * where the unlink was done.
4812fcfd22SChris Mason  * ---> record transid of last unlink/rename per directory
4912fcfd22SChris Mason  *
5012fcfd22SChris Mason  * mkdir foo/some_dir
5112fcfd22SChris Mason  * normal commit
5212fcfd22SChris Mason  * rename foo/some_dir foo2/some_dir
5312fcfd22SChris Mason  * mkdir foo/some_dir
5412fcfd22SChris Mason  * fsync foo/some_dir/some_file
5512fcfd22SChris Mason  *
5612fcfd22SChris Mason  * The fsync above will unlink the original some_dir without recording
5712fcfd22SChris Mason  * it in its new location (foo2).  After a crash, some_dir will be gone
5812fcfd22SChris Mason  * unless the fsync of some_file forces a full commit
5912fcfd22SChris Mason  *
6012fcfd22SChris Mason  * 2) we must log any new names for any file or dir that is in the fsync
6112fcfd22SChris Mason  * log. ---> check inode while renaming/linking.
6212fcfd22SChris Mason  *
6312fcfd22SChris Mason  * 2a) we must log any new names for any file or dir during rename
6412fcfd22SChris Mason  * when the directory they are being removed from was logged.
6512fcfd22SChris Mason  * ---> check inode and old parent dir during rename
6612fcfd22SChris Mason  *
6712fcfd22SChris Mason  *  2a is actually the more important variant.  Without the extra logging
6812fcfd22SChris Mason  *  a crash might unlink the old name without recreating the new one
6912fcfd22SChris Mason  *
7012fcfd22SChris Mason  * 3) after a crash, we must go through any directories with a link count
7112fcfd22SChris Mason  * of zero and redo the rm -rf
7212fcfd22SChris Mason  *
7312fcfd22SChris Mason  * mkdir f1/foo
7412fcfd22SChris Mason  * normal commit
7512fcfd22SChris Mason  * rm -rf f1/foo
7612fcfd22SChris Mason  * fsync(f1)
7712fcfd22SChris Mason  *
7812fcfd22SChris Mason  * The directory f1/foo was fully removed from the FS, but fsync was never
7912fcfd22SChris Mason  * called on f1/foo, only its parent dir.  After a crash the rm -rf must
8012fcfd22SChris Mason  * be replayed.  This must be able to recurse down the entire
8112fcfd22SChris Mason  * directory tree.  The inode link count fixup code takes care of the
8212fcfd22SChris Mason  * ugly details.
8312fcfd22SChris Mason  */
8412fcfd22SChris Mason 
/*
 * Stages of the log tree walk, in the order they are numbered:
 *
 * LOG_WALK_PIN_ONLY (0)      - only pin down the blocks we find
 * LOG_WALK_REPLAY_INODES (1) - make sure that all the inodes we find in the
 *                              log are created in the subvolume
 * LOG_WALK_REPLAY_DIR_INDEX (2),
 * LOG_WALK_REPLAY_ALL (3)    - the remaining work: directories, links,
 *                              extents and all the other fun semantics
 *
 * NOTE(review): the original comment only speaks of "the last stage" and does
 * not say how that work is split between stages 2 and 3; confirm against the
 * replay code before relying on the split.
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3
98e02119d5SChris Mason 
9912fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans,
100e02119d5SChris Mason 			   struct btrfs_root *root, struct inode *inode,
10149dae1bcSFilipe Manana 			   int inode_only,
10249dae1bcSFilipe Manana 			   const loff_t start,
1038407f553SFilipe Manana 			   const loff_t end,
1048407f553SFilipe Manana 			   struct btrfs_log_ctx *ctx);
105ec051c0fSYan Zheng static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
106ec051c0fSYan Zheng 			     struct btrfs_root *root,
107ec051c0fSYan Zheng 			     struct btrfs_path *path, u64 objectid);
10812fcfd22SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
10912fcfd22SChris Mason 				       struct btrfs_root *root,
11012fcfd22SChris Mason 				       struct btrfs_root *log,
11112fcfd22SChris Mason 				       struct btrfs_path *path,
11212fcfd22SChris Mason 				       u64 dirid, int del_all);
113e02119d5SChris Mason 
114e02119d5SChris Mason /*
115e02119d5SChris Mason  * tree logging is a special write ahead log used to make sure that
116e02119d5SChris Mason  * fsyncs and O_SYNCs can happen without doing full tree commits.
117e02119d5SChris Mason  *
118e02119d5SChris Mason  * Full tree commits are expensive because they require commonly
119e02119d5SChris Mason  * modified blocks to be recowed, creating many dirty pages in the
120e02119d5SChris Mason  * extent tree and 4x-6x higher write load than ext3.
121e02119d5SChris Mason  *
122e02119d5SChris Mason  * Instead of doing a tree commit on every fsync, we use the
123e02119d5SChris Mason  * key ranges and transaction ids to find items for a given file or directory
124e02119d5SChris Mason  * that have changed in this transaction.  Those items are copied into
125e02119d5SChris Mason  * a special tree (one per subvolume root), that tree is written to disk
126e02119d5SChris Mason  * and then the fsync is considered complete.
127e02119d5SChris Mason  *
128e02119d5SChris Mason  * After a crash, items are copied out of the log-tree back into the
129e02119d5SChris Mason  * subvolume tree.  Any file data extents found are recorded in the extent
130e02119d5SChris Mason  * allocation tree, and the log-tree freed.
131e02119d5SChris Mason  *
132e02119d5SChris Mason  * The log tree is read three times: once to pin down all the extents it is
133e02119d5SChris Mason  * using in ram, once to create all the inodes logged in the tree,
134e02119d5SChris Mason  * and once to do all the other items.
135e02119d5SChris Mason  */
136e02119d5SChris Mason 
137e02119d5SChris Mason /*
138e02119d5SChris Mason  * start a sub transaction and setup the log tree
139e02119d5SChris Mason  * this increments the log tree writer count to make the people
140e02119d5SChris Mason  * syncing the tree wait for us to finish
141e02119d5SChris Mason  */
142e02119d5SChris Mason static int start_log_trans(struct btrfs_trans_handle *trans,
1438b050d35SMiao Xie 			   struct btrfs_root *root,
1448b050d35SMiao Xie 			   struct btrfs_log_ctx *ctx)
145e02119d5SChris Mason {
1460b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
14734eb2a52SZhaolei 	int ret = 0;
1487237f183SYan Zheng 
1497237f183SYan Zheng 	mutex_lock(&root->log_mutex);
15034eb2a52SZhaolei 
1517237f183SYan Zheng 	if (root->log_root) {
1520b246afaSJeff Mahoney 		if (btrfs_need_log_full_commit(fs_info, trans)) {
15350471a38SMiao Xie 			ret = -EAGAIN;
15450471a38SMiao Xie 			goto out;
15550471a38SMiao Xie 		}
15634eb2a52SZhaolei 
157ff782e0aSJosef Bacik 		if (!root->log_start_pid) {
15827cdeb70SMiao Xie 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
15934eb2a52SZhaolei 			root->log_start_pid = current->pid;
160ff782e0aSJosef Bacik 		} else if (root->log_start_pid != current->pid) {
16127cdeb70SMiao Xie 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
162ff782e0aSJosef Bacik 		}
16334eb2a52SZhaolei 	} else {
1640b246afaSJeff Mahoney 		mutex_lock(&fs_info->tree_log_mutex);
1650b246afaSJeff Mahoney 		if (!fs_info->log_root_tree)
1660b246afaSJeff Mahoney 			ret = btrfs_init_log_root_tree(trans, fs_info);
1670b246afaSJeff Mahoney 		mutex_unlock(&fs_info->tree_log_mutex);
1684a500fd1SYan, Zheng 		if (ret)
169e87ac136SMiao Xie 			goto out;
170e87ac136SMiao Xie 
171e02119d5SChris Mason 		ret = btrfs_add_log_tree(trans, root);
1724a500fd1SYan, Zheng 		if (ret)
173e87ac136SMiao Xie 			goto out;
17434eb2a52SZhaolei 
17527cdeb70SMiao Xie 		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
176e87ac136SMiao Xie 		root->log_start_pid = current->pid;
17734eb2a52SZhaolei 	}
17834eb2a52SZhaolei 
1792ecb7923SMiao Xie 	atomic_inc(&root->log_batch);
1807237f183SYan Zheng 	atomic_inc(&root->log_writers);
1818b050d35SMiao Xie 	if (ctx) {
18234eb2a52SZhaolei 		int index = root->log_transid % 2;
1838b050d35SMiao Xie 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
184d1433debSMiao Xie 		ctx->log_transid = root->log_transid;
1858b050d35SMiao Xie 	}
18634eb2a52SZhaolei 
187e87ac136SMiao Xie out:
1887237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
189e87ac136SMiao Xie 	return ret;
190e02119d5SChris Mason }
191e02119d5SChris Mason 
192e02119d5SChris Mason /*
193e02119d5SChris Mason  * returns 0 if there was a log transaction running and we were able
194e02119d5SChris Mason  * to join, or returns -ENOENT if there were not transactions
195e02119d5SChris Mason  * in progress
196e02119d5SChris Mason  */
197e02119d5SChris Mason static int join_running_log_trans(struct btrfs_root *root)
198e02119d5SChris Mason {
199e02119d5SChris Mason 	int ret = -ENOENT;
200e02119d5SChris Mason 
201e02119d5SChris Mason 	smp_mb();
202e02119d5SChris Mason 	if (!root->log_root)
203e02119d5SChris Mason 		return -ENOENT;
204e02119d5SChris Mason 
2057237f183SYan Zheng 	mutex_lock(&root->log_mutex);
206e02119d5SChris Mason 	if (root->log_root) {
207e02119d5SChris Mason 		ret = 0;
2087237f183SYan Zheng 		atomic_inc(&root->log_writers);
209e02119d5SChris Mason 	}
2107237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
211e02119d5SChris Mason 	return ret;
212e02119d5SChris Mason }
213e02119d5SChris Mason 
214e02119d5SChris Mason /*
21512fcfd22SChris Mason  * This either makes the current running log transaction wait
21612fcfd22SChris Mason  * until you call btrfs_end_log_trans() or it makes any future
21712fcfd22SChris Mason  * log transactions wait until you call btrfs_end_log_trans()
21812fcfd22SChris Mason  */
21912fcfd22SChris Mason int btrfs_pin_log_trans(struct btrfs_root *root)
22012fcfd22SChris Mason {
22112fcfd22SChris Mason 	int ret = -ENOENT;
22212fcfd22SChris Mason 
22312fcfd22SChris Mason 	mutex_lock(&root->log_mutex);
22412fcfd22SChris Mason 	atomic_inc(&root->log_writers);
22512fcfd22SChris Mason 	mutex_unlock(&root->log_mutex);
22612fcfd22SChris Mason 	return ret;
22712fcfd22SChris Mason }
22812fcfd22SChris Mason 
22912fcfd22SChris Mason /*
230e02119d5SChris Mason  * indicate we're done making changes to the log tree
231e02119d5SChris Mason  * and wake up anyone waiting to do a sync
232e02119d5SChris Mason  */
233143bede5SJeff Mahoney void btrfs_end_log_trans(struct btrfs_root *root)
234e02119d5SChris Mason {
2357237f183SYan Zheng 	if (atomic_dec_and_test(&root->log_writers)) {
236779adf0fSDavid Sterba 		/*
237779adf0fSDavid Sterba 		 * Implicit memory barrier after atomic_dec_and_test
238779adf0fSDavid Sterba 		 */
2397237f183SYan Zheng 		if (waitqueue_active(&root->log_writer_wait))
2407237f183SYan Zheng 			wake_up(&root->log_writer_wait);
2417237f183SYan Zheng 	}
242e02119d5SChris Mason }
243e02119d5SChris Mason 
244e02119d5SChris Mason 
245e02119d5SChris Mason /*
246e02119d5SChris Mason  * the walk control struct is used to pass state down the chain when
247e02119d5SChris Mason  * processing the log tree.  The stage field tells us which part
248e02119d5SChris Mason  * of the log tree processing we are currently doing.  The others
249e02119d5SChris Mason  * are state fields used for that specific part
250e02119d5SChris Mason  */
251e02119d5SChris Mason struct walk_control {
252e02119d5SChris Mason 	/* should we free the extent on disk when done?  This is used
253e02119d5SChris Mason 	 * at transaction commit time while freeing a log tree
254e02119d5SChris Mason 	 */
255e02119d5SChris Mason 	int free;
256e02119d5SChris Mason 
257e02119d5SChris Mason 	/* should we write out the extent buffer?  This is used
258e02119d5SChris Mason 	 * while flushing the log tree to disk during a sync
259e02119d5SChris Mason 	 */
260e02119d5SChris Mason 	int write;
261e02119d5SChris Mason 
262e02119d5SChris Mason 	/* should we wait for the extent buffer io to finish?  Also used
263e02119d5SChris Mason 	 * while flushing the log tree to disk for a sync
264e02119d5SChris Mason 	 */
265e02119d5SChris Mason 	int wait;
266e02119d5SChris Mason 
267e02119d5SChris Mason 	/* pin only walk, we record which extents on disk belong to the
268e02119d5SChris Mason 	 * log trees
269e02119d5SChris Mason 	 */
270e02119d5SChris Mason 	int pin;
271e02119d5SChris Mason 
272e02119d5SChris Mason 	/* what stage of the replay code we're currently in */
273e02119d5SChris Mason 	int stage;
274e02119d5SChris Mason 
275e02119d5SChris Mason 	/* the root we are currently replaying */
276e02119d5SChris Mason 	struct btrfs_root *replay_dest;
277e02119d5SChris Mason 
278e02119d5SChris Mason 	/* the trans handle for the current replay */
279e02119d5SChris Mason 	struct btrfs_trans_handle *trans;
280e02119d5SChris Mason 
281e02119d5SChris Mason 	/* the function that gets used to process blocks we find in the
282e02119d5SChris Mason 	 * tree.  Note the extent_buffer might not be up to date when it is
283e02119d5SChris Mason 	 * passed in, and it must be checked or read if you need the data
284e02119d5SChris Mason 	 * inside it
285e02119d5SChris Mason 	 */
286e02119d5SChris Mason 	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
287e02119d5SChris Mason 			    struct walk_control *wc, u64 gen);
288e02119d5SChris Mason };
289e02119d5SChris Mason 
290e02119d5SChris Mason /*
291e02119d5SChris Mason  * process_func used to pin down extents, write them or wait on them
292e02119d5SChris Mason  */
293e02119d5SChris Mason static int process_one_buffer(struct btrfs_root *log,
294e02119d5SChris Mason 			      struct extent_buffer *eb,
295e02119d5SChris Mason 			      struct walk_control *wc, u64 gen)
296e02119d5SChris Mason {
2970b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
298b50c6e25SJosef Bacik 	int ret = 0;
299b50c6e25SJosef Bacik 
3008c2a1a30SJosef Bacik 	/*
3018c2a1a30SJosef Bacik 	 * If this fs is mixed then we need to be able to process the leaves to
3028c2a1a30SJosef Bacik 	 * pin down any logged extents, so we have to read the block.
3038c2a1a30SJosef Bacik 	 */
3040b246afaSJeff Mahoney 	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
3058c2a1a30SJosef Bacik 		ret = btrfs_read_buffer(eb, gen);
3068c2a1a30SJosef Bacik 		if (ret)
3078c2a1a30SJosef Bacik 			return ret;
3088c2a1a30SJosef Bacik 	}
3098c2a1a30SJosef Bacik 
31004018de5SJosef Bacik 	if (wc->pin)
3112ff7e61eSJeff Mahoney 		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
3122ff7e61eSJeff Mahoney 						      eb->len);
313e02119d5SChris Mason 
314b50c6e25SJosef Bacik 	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
3158c2a1a30SJosef Bacik 		if (wc->pin && btrfs_header_level(eb) == 0)
3162ff7e61eSJeff Mahoney 			ret = btrfs_exclude_logged_extents(fs_info, eb);
317e02119d5SChris Mason 		if (wc->write)
318e02119d5SChris Mason 			btrfs_write_tree_block(eb);
319e02119d5SChris Mason 		if (wc->wait)
320e02119d5SChris Mason 			btrfs_wait_tree_block_writeback(eb);
321e02119d5SChris Mason 	}
322b50c6e25SJosef Bacik 	return ret;
323e02119d5SChris Mason }
324e02119d5SChris Mason 
325e02119d5SChris Mason /*
326e02119d5SChris Mason  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
327e02119d5SChris Mason  * to the src data we are copying out.
328e02119d5SChris Mason  *
329e02119d5SChris Mason  * root is the tree we are copying into, and path is a scratch
330e02119d5SChris Mason  * path for use in this function (it should be released on entry and
331e02119d5SChris Mason  * will be released on exit).
332e02119d5SChris Mason  *
333e02119d5SChris Mason  * If the key is already in the destination tree the existing item is
334e02119d5SChris Mason  * overwritten.  If the existing item isn't big enough, it is extended.
335e02119d5SChris Mason  * If it is too large, it is truncated.
336e02119d5SChris Mason  *
337e02119d5SChris Mason  * If the key isn't in the destination yet, a new item is inserted.
338e02119d5SChris Mason  */
339e02119d5SChris Mason static noinline int overwrite_item(struct btrfs_trans_handle *trans,
340e02119d5SChris Mason 				   struct btrfs_root *root,
341e02119d5SChris Mason 				   struct btrfs_path *path,
342e02119d5SChris Mason 				   struct extent_buffer *eb, int slot,
343e02119d5SChris Mason 				   struct btrfs_key *key)
344e02119d5SChris Mason {
3452ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
346e02119d5SChris Mason 	int ret;
347e02119d5SChris Mason 	u32 item_size;
348e02119d5SChris Mason 	u64 saved_i_size = 0;
349e02119d5SChris Mason 	int save_old_i_size = 0;
350e02119d5SChris Mason 	unsigned long src_ptr;
351e02119d5SChris Mason 	unsigned long dst_ptr;
352e02119d5SChris Mason 	int overwrite_root = 0;
3534bc4bee4SJosef Bacik 	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
354e02119d5SChris Mason 
355e02119d5SChris Mason 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
356e02119d5SChris Mason 		overwrite_root = 1;
357e02119d5SChris Mason 
358e02119d5SChris Mason 	item_size = btrfs_item_size_nr(eb, slot);
359e02119d5SChris Mason 	src_ptr = btrfs_item_ptr_offset(eb, slot);
360e02119d5SChris Mason 
361e02119d5SChris Mason 	/* look for the key in the destination tree */
362e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
3634bc4bee4SJosef Bacik 	if (ret < 0)
3644bc4bee4SJosef Bacik 		return ret;
3654bc4bee4SJosef Bacik 
366e02119d5SChris Mason 	if (ret == 0) {
367e02119d5SChris Mason 		char *src_copy;
368e02119d5SChris Mason 		char *dst_copy;
369e02119d5SChris Mason 		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
370e02119d5SChris Mason 						  path->slots[0]);
371e02119d5SChris Mason 		if (dst_size != item_size)
372e02119d5SChris Mason 			goto insert;
373e02119d5SChris Mason 
374e02119d5SChris Mason 		if (item_size == 0) {
375b3b4aa74SDavid Sterba 			btrfs_release_path(path);
376e02119d5SChris Mason 			return 0;
377e02119d5SChris Mason 		}
378e02119d5SChris Mason 		dst_copy = kmalloc(item_size, GFP_NOFS);
379e02119d5SChris Mason 		src_copy = kmalloc(item_size, GFP_NOFS);
3802a29edc6Sliubo 		if (!dst_copy || !src_copy) {
381b3b4aa74SDavid Sterba 			btrfs_release_path(path);
3822a29edc6Sliubo 			kfree(dst_copy);
3832a29edc6Sliubo 			kfree(src_copy);
3842a29edc6Sliubo 			return -ENOMEM;
3852a29edc6Sliubo 		}
386e02119d5SChris Mason 
387e02119d5SChris Mason 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
388e02119d5SChris Mason 
389e02119d5SChris Mason 		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
390e02119d5SChris Mason 		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
391e02119d5SChris Mason 				   item_size);
392e02119d5SChris Mason 		ret = memcmp(dst_copy, src_copy, item_size);
393e02119d5SChris Mason 
394e02119d5SChris Mason 		kfree(dst_copy);
395e02119d5SChris Mason 		kfree(src_copy);
396e02119d5SChris Mason 		/*
397e02119d5SChris Mason 		 * they have the same contents, just return, this saves
398e02119d5SChris Mason 		 * us from cowing blocks in the destination tree and doing
399e02119d5SChris Mason 		 * extra writes that may not have been done by a previous
400e02119d5SChris Mason 		 * sync
401e02119d5SChris Mason 		 */
402e02119d5SChris Mason 		if (ret == 0) {
403b3b4aa74SDavid Sterba 			btrfs_release_path(path);
404e02119d5SChris Mason 			return 0;
405e02119d5SChris Mason 		}
406e02119d5SChris Mason 
4074bc4bee4SJosef Bacik 		/*
4084bc4bee4SJosef Bacik 		 * We need to load the old nbytes into the inode so when we
4094bc4bee4SJosef Bacik 		 * replay the extents we've logged we get the right nbytes.
4104bc4bee4SJosef Bacik 		 */
4114bc4bee4SJosef Bacik 		if (inode_item) {
4124bc4bee4SJosef Bacik 			struct btrfs_inode_item *item;
4134bc4bee4SJosef Bacik 			u64 nbytes;
414d555438bSJosef Bacik 			u32 mode;
4154bc4bee4SJosef Bacik 
4164bc4bee4SJosef Bacik 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4174bc4bee4SJosef Bacik 					      struct btrfs_inode_item);
4184bc4bee4SJosef Bacik 			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
4194bc4bee4SJosef Bacik 			item = btrfs_item_ptr(eb, slot,
4204bc4bee4SJosef Bacik 					      struct btrfs_inode_item);
4214bc4bee4SJosef Bacik 			btrfs_set_inode_nbytes(eb, item, nbytes);
422d555438bSJosef Bacik 
423d555438bSJosef Bacik 			/*
424d555438bSJosef Bacik 			 * If this is a directory we need to reset the i_size to
425d555438bSJosef Bacik 			 * 0 so that we can set it up properly when replaying
426d555438bSJosef Bacik 			 * the rest of the items in this log.
427d555438bSJosef Bacik 			 */
428d555438bSJosef Bacik 			mode = btrfs_inode_mode(eb, item);
429d555438bSJosef Bacik 			if (S_ISDIR(mode))
430d555438bSJosef Bacik 				btrfs_set_inode_size(eb, item, 0);
4314bc4bee4SJosef Bacik 		}
4324bc4bee4SJosef Bacik 	} else if (inode_item) {
4334bc4bee4SJosef Bacik 		struct btrfs_inode_item *item;
434d555438bSJosef Bacik 		u32 mode;
4354bc4bee4SJosef Bacik 
4364bc4bee4SJosef Bacik 		/*
4374bc4bee4SJosef Bacik 		 * New inode, set nbytes to 0 so that the nbytes comes out
4384bc4bee4SJosef Bacik 		 * properly when we replay the extents.
4394bc4bee4SJosef Bacik 		 */
4404bc4bee4SJosef Bacik 		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
4414bc4bee4SJosef Bacik 		btrfs_set_inode_nbytes(eb, item, 0);
442d555438bSJosef Bacik 
443d555438bSJosef Bacik 		/*
444d555438bSJosef Bacik 		 * If this is a directory we need to reset the i_size to 0 so
445d555438bSJosef Bacik 		 * that we can set it up properly when replaying the rest of
446d555438bSJosef Bacik 		 * the items in this log.
447d555438bSJosef Bacik 		 */
448d555438bSJosef Bacik 		mode = btrfs_inode_mode(eb, item);
449d555438bSJosef Bacik 		if (S_ISDIR(mode))
450d555438bSJosef Bacik 			btrfs_set_inode_size(eb, item, 0);
451e02119d5SChris Mason 	}
452e02119d5SChris Mason insert:
453b3b4aa74SDavid Sterba 	btrfs_release_path(path);
454e02119d5SChris Mason 	/* try to insert the key into the destination tree */
455df8d116fSFilipe Manana 	path->skip_release_on_error = 1;
456e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path,
457e02119d5SChris Mason 				      key, item_size);
458df8d116fSFilipe Manana 	path->skip_release_on_error = 0;
459e02119d5SChris Mason 
460e02119d5SChris Mason 	/* make sure any existing item is the correct size */
461df8d116fSFilipe Manana 	if (ret == -EEXIST || ret == -EOVERFLOW) {
462e02119d5SChris Mason 		u32 found_size;
463e02119d5SChris Mason 		found_size = btrfs_item_size_nr(path->nodes[0],
464e02119d5SChris Mason 						path->slots[0]);
465143bede5SJeff Mahoney 		if (found_size > item_size)
4662ff7e61eSJeff Mahoney 			btrfs_truncate_item(fs_info, path, item_size, 1);
467143bede5SJeff Mahoney 		else if (found_size < item_size)
4682ff7e61eSJeff Mahoney 			btrfs_extend_item(fs_info, path,
46987b29b20SYan Zheng 					  item_size - found_size);
470e02119d5SChris Mason 	} else if (ret) {
4714a500fd1SYan, Zheng 		return ret;
472e02119d5SChris Mason 	}
473e02119d5SChris Mason 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
474e02119d5SChris Mason 					path->slots[0]);
475e02119d5SChris Mason 
476e02119d5SChris Mason 	/* don't overwrite an existing inode if the generation number
477e02119d5SChris Mason 	 * was logged as zero.  This is done when the tree logging code
478e02119d5SChris Mason 	 * is just logging an inode to make sure it exists after recovery.
479e02119d5SChris Mason 	 *
480e02119d5SChris Mason 	 * Also, don't overwrite i_size on directories during replay.
481e02119d5SChris Mason 	 * log replay inserts and removes directory items based on the
482e02119d5SChris Mason 	 * state of the tree found in the subvolume, and i_size is modified
483e02119d5SChris Mason 	 * as it goes
484e02119d5SChris Mason 	 */
485e02119d5SChris Mason 	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
486e02119d5SChris Mason 		struct btrfs_inode_item *src_item;
487e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
488e02119d5SChris Mason 
489e02119d5SChris Mason 		src_item = (struct btrfs_inode_item *)src_ptr;
490e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
491e02119d5SChris Mason 
4921a4bcf47SFilipe Manana 		if (btrfs_inode_generation(eb, src_item) == 0) {
4931a4bcf47SFilipe Manana 			struct extent_buffer *dst_eb = path->nodes[0];
4942f2ff0eeSFilipe Manana 			const u64 ino_size = btrfs_inode_size(eb, src_item);
4951a4bcf47SFilipe Manana 
4962f2ff0eeSFilipe Manana 			/*
4972f2ff0eeSFilipe Manana 			 * For regular files an ino_size == 0 is used only when
4982f2ff0eeSFilipe Manana 			 * logging that an inode exists, as part of a directory
4992f2ff0eeSFilipe Manana 			 * fsync, and the inode wasn't fsynced before. In this
5002f2ff0eeSFilipe Manana 			 * case don't set the size of the inode in the fs/subvol
5012f2ff0eeSFilipe Manana 			 * tree, otherwise we would be throwing valid data away.
5022f2ff0eeSFilipe Manana 			 */
5031a4bcf47SFilipe Manana 			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
5042f2ff0eeSFilipe Manana 			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
5052f2ff0eeSFilipe Manana 			    ino_size != 0) {
5061a4bcf47SFilipe Manana 				struct btrfs_map_token token;
5071a4bcf47SFilipe Manana 
5081a4bcf47SFilipe Manana 				btrfs_init_map_token(&token);
5091a4bcf47SFilipe Manana 				btrfs_set_token_inode_size(dst_eb, dst_item,
5101a4bcf47SFilipe Manana 							   ino_size, &token);
5111a4bcf47SFilipe Manana 			}
512e02119d5SChris Mason 			goto no_copy;
5131a4bcf47SFilipe Manana 		}
514e02119d5SChris Mason 
515e02119d5SChris Mason 		if (overwrite_root &&
516e02119d5SChris Mason 		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
517e02119d5SChris Mason 		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
518e02119d5SChris Mason 			save_old_i_size = 1;
519e02119d5SChris Mason 			saved_i_size = btrfs_inode_size(path->nodes[0],
520e02119d5SChris Mason 							dst_item);
521e02119d5SChris Mason 		}
522e02119d5SChris Mason 	}
523e02119d5SChris Mason 
524e02119d5SChris Mason 	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
525e02119d5SChris Mason 			   src_ptr, item_size);
526e02119d5SChris Mason 
527e02119d5SChris Mason 	if (save_old_i_size) {
528e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
529e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
530e02119d5SChris Mason 		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
531e02119d5SChris Mason 	}
532e02119d5SChris Mason 
533e02119d5SChris Mason 	/* make sure the generation is filled in */
534e02119d5SChris Mason 	if (key->type == BTRFS_INODE_ITEM_KEY) {
535e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
536e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
537e02119d5SChris Mason 		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
538e02119d5SChris Mason 			btrfs_set_inode_generation(path->nodes[0], dst_item,
539e02119d5SChris Mason 						   trans->transid);
540e02119d5SChris Mason 		}
541e02119d5SChris Mason 	}
542e02119d5SChris Mason no_copy:
543e02119d5SChris Mason 	btrfs_mark_buffer_dirty(path->nodes[0]);
544b3b4aa74SDavid Sterba 	btrfs_release_path(path);
545e02119d5SChris Mason 	return 0;
546e02119d5SChris Mason }
547e02119d5SChris Mason 
548e02119d5SChris Mason /*
549e02119d5SChris Mason  * simple helper to read an inode off the disk from a given root
550e02119d5SChris Mason  * This can only be called for subvolume roots and not for the log
551e02119d5SChris Mason  */
552e02119d5SChris Mason static noinline struct inode *read_one_inode(struct btrfs_root *root,
553e02119d5SChris Mason 					     u64 objectid)
554e02119d5SChris Mason {
5555d4f98a2SYan Zheng 	struct btrfs_key key;
556e02119d5SChris Mason 	struct inode *inode;
557e02119d5SChris Mason 
5585d4f98a2SYan Zheng 	key.objectid = objectid;
5595d4f98a2SYan Zheng 	key.type = BTRFS_INODE_ITEM_KEY;
5605d4f98a2SYan Zheng 	key.offset = 0;
56173f73415SJosef Bacik 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
5625d4f98a2SYan Zheng 	if (IS_ERR(inode)) {
5635d4f98a2SYan Zheng 		inode = NULL;
5645d4f98a2SYan Zheng 	} else if (is_bad_inode(inode)) {
565e02119d5SChris Mason 		iput(inode);
566e02119d5SChris Mason 		inode = NULL;
567e02119d5SChris Mason 	}
568e02119d5SChris Mason 	return inode;
569e02119d5SChris Mason }
570e02119d5SChris Mason 
571e02119d5SChris Mason /* replays a single extent in 'eb' at 'slot' with 'key' into the
572e02119d5SChris Mason  * subvolume 'root'.  path is released on entry and should be released
573e02119d5SChris Mason  * on exit.
574e02119d5SChris Mason  *
575e02119d5SChris Mason  * extents in the log tree have not been allocated out of the extent
576e02119d5SChris Mason  * tree yet.  So, this completes the allocation, taking a reference
577e02119d5SChris Mason  * as required if the extent already exists or creating a new extent
578e02119d5SChris Mason  * if it isn't in the extent allocation tree yet.
579e02119d5SChris Mason  *
580e02119d5SChris Mason  * The extent is inserted into the file, dropping any existing extents
581e02119d5SChris Mason  * from the file that overlap the new one.
582e02119d5SChris Mason  */
583e02119d5SChris Mason static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
584e02119d5SChris Mason 				      struct btrfs_root *root,
585e02119d5SChris Mason 				      struct btrfs_path *path,
586e02119d5SChris Mason 				      struct extent_buffer *eb, int slot,
587e02119d5SChris Mason 				      struct btrfs_key *key)
588e02119d5SChris Mason {
5890b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
590e02119d5SChris Mason 	int found_type;
591e02119d5SChris Mason 	u64 extent_end;
592e02119d5SChris Mason 	u64 start = key->offset;
5934bc4bee4SJosef Bacik 	u64 nbytes = 0;
594e02119d5SChris Mason 	struct btrfs_file_extent_item *item;
595e02119d5SChris Mason 	struct inode *inode = NULL;
596e02119d5SChris Mason 	unsigned long size;
597e02119d5SChris Mason 	int ret = 0;
598e02119d5SChris Mason 
599e02119d5SChris Mason 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
600e02119d5SChris Mason 	found_type = btrfs_file_extent_type(eb, item);
601e02119d5SChris Mason 
602d899e052SYan Zheng 	if (found_type == BTRFS_FILE_EXTENT_REG ||
6034bc4bee4SJosef Bacik 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6044bc4bee4SJosef Bacik 		nbytes = btrfs_file_extent_num_bytes(eb, item);
6054bc4bee4SJosef Bacik 		extent_end = start + nbytes;
6064bc4bee4SJosef Bacik 
6074bc4bee4SJosef Bacik 		/*
6084bc4bee4SJosef Bacik 		 * We don't add to the inodes nbytes if we are prealloc or a
6094bc4bee4SJosef Bacik 		 * hole.
6104bc4bee4SJosef Bacik 		 */
6114bc4bee4SJosef Bacik 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6124bc4bee4SJosef Bacik 			nbytes = 0;
6134bc4bee4SJosef Bacik 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
614514ac8adSChris Mason 		size = btrfs_file_extent_inline_len(eb, slot, item);
6154bc4bee4SJosef Bacik 		nbytes = btrfs_file_extent_ram_bytes(eb, item);
616da17066cSJeff Mahoney 		extent_end = ALIGN(start + size,
6170b246afaSJeff Mahoney 				   fs_info->sectorsize);
618e02119d5SChris Mason 	} else {
619e02119d5SChris Mason 		ret = 0;
620e02119d5SChris Mason 		goto out;
621e02119d5SChris Mason 	}
622e02119d5SChris Mason 
623e02119d5SChris Mason 	inode = read_one_inode(root, key->objectid);
624e02119d5SChris Mason 	if (!inode) {
625e02119d5SChris Mason 		ret = -EIO;
626e02119d5SChris Mason 		goto out;
627e02119d5SChris Mason 	}
628e02119d5SChris Mason 
629e02119d5SChris Mason 	/*
630e02119d5SChris Mason 	 * first check to see if we already have this extent in the
631e02119d5SChris Mason 	 * file.  This must be done before the btrfs_drop_extents run
632e02119d5SChris Mason 	 * so we don't try to drop this extent.
633e02119d5SChris Mason 	 */
6344a0cc7caSNikolay Borisov 	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(BTRFS_I(inode)),
635e02119d5SChris Mason 				       start, 0);
636e02119d5SChris Mason 
637d899e052SYan Zheng 	if (ret == 0 &&
638d899e052SYan Zheng 	    (found_type == BTRFS_FILE_EXTENT_REG ||
639d899e052SYan Zheng 	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
640e02119d5SChris Mason 		struct btrfs_file_extent_item cmp1;
641e02119d5SChris Mason 		struct btrfs_file_extent_item cmp2;
642e02119d5SChris Mason 		struct btrfs_file_extent_item *existing;
643e02119d5SChris Mason 		struct extent_buffer *leaf;
644e02119d5SChris Mason 
645e02119d5SChris Mason 		leaf = path->nodes[0];
646e02119d5SChris Mason 		existing = btrfs_item_ptr(leaf, path->slots[0],
647e02119d5SChris Mason 					  struct btrfs_file_extent_item);
648e02119d5SChris Mason 
649e02119d5SChris Mason 		read_extent_buffer(eb, &cmp1, (unsigned long)item,
650e02119d5SChris Mason 				   sizeof(cmp1));
651e02119d5SChris Mason 		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
652e02119d5SChris Mason 				   sizeof(cmp2));
653e02119d5SChris Mason 
654e02119d5SChris Mason 		/*
655e02119d5SChris Mason 		 * we already have a pointer to this exact extent,
656e02119d5SChris Mason 		 * we don't have to do anything
657e02119d5SChris Mason 		 */
658e02119d5SChris Mason 		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
659b3b4aa74SDavid Sterba 			btrfs_release_path(path);
660e02119d5SChris Mason 			goto out;
661e02119d5SChris Mason 		}
662e02119d5SChris Mason 	}
663b3b4aa74SDavid Sterba 	btrfs_release_path(path);
664e02119d5SChris Mason 
665e02119d5SChris Mason 	/* drop any overlapping extents */
6662671485dSJosef Bacik 	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
6673650860bSJosef Bacik 	if (ret)
6683650860bSJosef Bacik 		goto out;
669e02119d5SChris Mason 
67007d400a6SYan Zheng 	if (found_type == BTRFS_FILE_EXTENT_REG ||
67107d400a6SYan Zheng 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6725d4f98a2SYan Zheng 		u64 offset;
67307d400a6SYan Zheng 		unsigned long dest_offset;
67407d400a6SYan Zheng 		struct btrfs_key ins;
67507d400a6SYan Zheng 
67607d400a6SYan Zheng 		ret = btrfs_insert_empty_item(trans, root, path, key,
67707d400a6SYan Zheng 					      sizeof(*item));
6783650860bSJosef Bacik 		if (ret)
6793650860bSJosef Bacik 			goto out;
68007d400a6SYan Zheng 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
68107d400a6SYan Zheng 						    path->slots[0]);
68207d400a6SYan Zheng 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
68307d400a6SYan Zheng 				(unsigned long)item,  sizeof(*item));
68407d400a6SYan Zheng 
68507d400a6SYan Zheng 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
68607d400a6SYan Zheng 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
68707d400a6SYan Zheng 		ins.type = BTRFS_EXTENT_ITEM_KEY;
6885d4f98a2SYan Zheng 		offset = key->offset - btrfs_file_extent_offset(eb, item);
68907d400a6SYan Zheng 
690df2c95f3SQu Wenruo 		/*
691df2c95f3SQu Wenruo 		 * Manually record dirty extent, as here we did a shallow
692df2c95f3SQu Wenruo 		 * file extent item copy and skip normal backref update,
693df2c95f3SQu Wenruo 		 * but modifying extent tree all by ourselves.
694df2c95f3SQu Wenruo 		 * So need to manually record dirty extent for qgroup,
695df2c95f3SQu Wenruo 		 * as the owner of the file extent changed from log tree
696df2c95f3SQu Wenruo 		 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
697df2c95f3SQu Wenruo 		 */
6980b246afaSJeff Mahoney 		ret = btrfs_qgroup_trace_extent(trans, fs_info,
699df2c95f3SQu Wenruo 				btrfs_file_extent_disk_bytenr(eb, item),
700df2c95f3SQu Wenruo 				btrfs_file_extent_disk_num_bytes(eb, item),
701df2c95f3SQu Wenruo 				GFP_NOFS);
702df2c95f3SQu Wenruo 		if (ret < 0)
703df2c95f3SQu Wenruo 			goto out;
704df2c95f3SQu Wenruo 
70507d400a6SYan Zheng 		if (ins.objectid > 0) {
70607d400a6SYan Zheng 			u64 csum_start;
70707d400a6SYan Zheng 			u64 csum_end;
70807d400a6SYan Zheng 			LIST_HEAD(ordered_sums);
70907d400a6SYan Zheng 			/*
71007d400a6SYan Zheng 			 * is this extent already allocated in the extent
71107d400a6SYan Zheng 			 * allocation tree?  If so, just add a reference
71207d400a6SYan Zheng 			 */
7132ff7e61eSJeff Mahoney 			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
71407d400a6SYan Zheng 						ins.offset);
71507d400a6SYan Zheng 			if (ret == 0) {
7162ff7e61eSJeff Mahoney 				ret = btrfs_inc_extent_ref(trans, fs_info,
71707d400a6SYan Zheng 						ins.objectid, ins.offset,
7185d4f98a2SYan Zheng 						0, root->root_key.objectid,
719b06c4bf5SFilipe Manana 						key->objectid, offset);
720b50c6e25SJosef Bacik 				if (ret)
721b50c6e25SJosef Bacik 					goto out;
72207d400a6SYan Zheng 			} else {
72307d400a6SYan Zheng 				/*
72407d400a6SYan Zheng 				 * insert the extent pointer in the extent
72507d400a6SYan Zheng 				 * allocation tree
72607d400a6SYan Zheng 				 */
7275d4f98a2SYan Zheng 				ret = btrfs_alloc_logged_file_extent(trans,
7282ff7e61eSJeff Mahoney 						fs_info,
7292ff7e61eSJeff Mahoney 						root->root_key.objectid,
7305d4f98a2SYan Zheng 						key->objectid, offset, &ins);
731b50c6e25SJosef Bacik 				if (ret)
732b50c6e25SJosef Bacik 					goto out;
73307d400a6SYan Zheng 			}
734b3b4aa74SDavid Sterba 			btrfs_release_path(path);
73507d400a6SYan Zheng 
73607d400a6SYan Zheng 			if (btrfs_file_extent_compression(eb, item)) {
73707d400a6SYan Zheng 				csum_start = ins.objectid;
73807d400a6SYan Zheng 				csum_end = csum_start + ins.offset;
73907d400a6SYan Zheng 			} else {
74007d400a6SYan Zheng 				csum_start = ins.objectid +
74107d400a6SYan Zheng 					btrfs_file_extent_offset(eb, item);
74207d400a6SYan Zheng 				csum_end = csum_start +
74307d400a6SYan Zheng 					btrfs_file_extent_num_bytes(eb, item);
74407d400a6SYan Zheng 			}
74507d400a6SYan Zheng 
74607d400a6SYan Zheng 			ret = btrfs_lookup_csums_range(root->log_root,
74707d400a6SYan Zheng 						csum_start, csum_end - 1,
748a2de733cSArne Jansen 						&ordered_sums, 0);
7493650860bSJosef Bacik 			if (ret)
7503650860bSJosef Bacik 				goto out;
751b84b8390SFilipe Manana 			/*
752b84b8390SFilipe Manana 			 * Now delete all existing cums in the csum root that
753b84b8390SFilipe Manana 			 * cover our range. We do this because we can have an
754b84b8390SFilipe Manana 			 * extent that is completely referenced by one file
755b84b8390SFilipe Manana 			 * extent item and partially referenced by another
756b84b8390SFilipe Manana 			 * file extent item (like after using the clone or
757b84b8390SFilipe Manana 			 * extent_same ioctls). In this case if we end up doing
758b84b8390SFilipe Manana 			 * the replay of the one that partially references the
759b84b8390SFilipe Manana 			 * extent first, and we do not do the csum deletion
760b84b8390SFilipe Manana 			 * below, we can get 2 csum items in the csum tree that
761b84b8390SFilipe Manana 			 * overlap each other. For example, imagine our log has
762b84b8390SFilipe Manana 			 * the two following file extent items:
763b84b8390SFilipe Manana 			 *
764b84b8390SFilipe Manana 			 * key (257 EXTENT_DATA 409600)
765b84b8390SFilipe Manana 			 *     extent data disk byte 12845056 nr 102400
766b84b8390SFilipe Manana 			 *     extent data offset 20480 nr 20480 ram 102400
767b84b8390SFilipe Manana 			 *
768b84b8390SFilipe Manana 			 * key (257 EXTENT_DATA 819200)
769b84b8390SFilipe Manana 			 *     extent data disk byte 12845056 nr 102400
770b84b8390SFilipe Manana 			 *     extent data offset 0 nr 102400 ram 102400
771b84b8390SFilipe Manana 			 *
772b84b8390SFilipe Manana 			 * Where the second one fully references the 100K extent
773b84b8390SFilipe Manana 			 * that starts at disk byte 12845056, and the log tree
774b84b8390SFilipe Manana 			 * has a single csum item that covers the entire range
775b84b8390SFilipe Manana 			 * of the extent:
776b84b8390SFilipe Manana 			 *
777b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
778b84b8390SFilipe Manana 			 *
779b84b8390SFilipe Manana 			 * After the first file extent item is replayed, the
780b84b8390SFilipe Manana 			 * csum tree gets the following csum item:
781b84b8390SFilipe Manana 			 *
782b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
783b84b8390SFilipe Manana 			 *
784b84b8390SFilipe Manana 			 * Which covers the 20K sub-range starting at offset 20K
785b84b8390SFilipe Manana 			 * of our extent. Now when we replay the second file
786b84b8390SFilipe Manana 			 * extent item, if we do not delete existing csum items
787b84b8390SFilipe Manana 			 * that cover any of its blocks, we end up getting two
788b84b8390SFilipe Manana 			 * csum items in our csum tree that overlap each other:
789b84b8390SFilipe Manana 			 *
790b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
791b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
792b84b8390SFilipe Manana 			 *
793b84b8390SFilipe Manana 			 * Which is a problem, because after this anyone trying
794b84b8390SFilipe Manana 			 * to lookup up for the checksum of any block of our
795b84b8390SFilipe Manana 			 * extent starting at an offset of 40K or higher, will
796b84b8390SFilipe Manana 			 * end up looking at the second csum item only, which
797b84b8390SFilipe Manana 			 * does not contain the checksum for any block starting
798b84b8390SFilipe Manana 			 * at offset 40K or higher of our extent.
799b84b8390SFilipe Manana 			 */
80007d400a6SYan Zheng 			while (!list_empty(&ordered_sums)) {
80107d400a6SYan Zheng 				struct btrfs_ordered_sum *sums;
80207d400a6SYan Zheng 				sums = list_entry(ordered_sums.next,
80307d400a6SYan Zheng 						struct btrfs_ordered_sum,
80407d400a6SYan Zheng 						list);
8053650860bSJosef Bacik 				if (!ret)
8060b246afaSJeff Mahoney 					ret = btrfs_del_csums(trans, fs_info,
807b84b8390SFilipe Manana 							      sums->bytenr,
808b84b8390SFilipe Manana 							      sums->len);
809b84b8390SFilipe Manana 				if (!ret)
81007d400a6SYan Zheng 					ret = btrfs_csum_file_blocks(trans,
8110b246afaSJeff Mahoney 						fs_info->csum_root, sums);
81207d400a6SYan Zheng 				list_del(&sums->list);
81307d400a6SYan Zheng 				kfree(sums);
81407d400a6SYan Zheng 			}
8153650860bSJosef Bacik 			if (ret)
8163650860bSJosef Bacik 				goto out;
81707d400a6SYan Zheng 		} else {
818b3b4aa74SDavid Sterba 			btrfs_release_path(path);
81907d400a6SYan Zheng 		}
82007d400a6SYan Zheng 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
82107d400a6SYan Zheng 		/* inline extents are easy, we just overwrite them */
822e02119d5SChris Mason 		ret = overwrite_item(trans, root, path, eb, slot, key);
8233650860bSJosef Bacik 		if (ret)
8243650860bSJosef Bacik 			goto out;
82507d400a6SYan Zheng 	}
826e02119d5SChris Mason 
8274bc4bee4SJosef Bacik 	inode_add_bytes(inode, nbytes);
828b9959295STsutomu Itoh 	ret = btrfs_update_inode(trans, root, inode);
829e02119d5SChris Mason out:
830e02119d5SChris Mason 	if (inode)
831e02119d5SChris Mason 		iput(inode);
832e02119d5SChris Mason 	return ret;
833e02119d5SChris Mason }
834e02119d5SChris Mason 
835e02119d5SChris Mason /*
836e02119d5SChris Mason  * when cleaning up conflicts between the directory names in the
837e02119d5SChris Mason  * subvolume, directory names in the log and directory names in the
838e02119d5SChris Mason  * inode back references, we may have to unlink inodes from directories.
839e02119d5SChris Mason  *
840e02119d5SChris Mason  * This is a helper function to do the unlink of a specific directory
841e02119d5SChris Mason  * item
842e02119d5SChris Mason  */
843e02119d5SChris Mason static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
844e02119d5SChris Mason 				      struct btrfs_root *root,
845e02119d5SChris Mason 				      struct btrfs_path *path,
846207e7d92SNikolay Borisov 				      struct btrfs_inode *dir,
847e02119d5SChris Mason 				      struct btrfs_dir_item *di)
848e02119d5SChris Mason {
8492ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
850e02119d5SChris Mason 	struct inode *inode;
851e02119d5SChris Mason 	char *name;
852e02119d5SChris Mason 	int name_len;
853e02119d5SChris Mason 	struct extent_buffer *leaf;
854e02119d5SChris Mason 	struct btrfs_key location;
855e02119d5SChris Mason 	int ret;
856e02119d5SChris Mason 
857e02119d5SChris Mason 	leaf = path->nodes[0];
858e02119d5SChris Mason 
859e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
860e02119d5SChris Mason 	name_len = btrfs_dir_name_len(leaf, di);
861e02119d5SChris Mason 	name = kmalloc(name_len, GFP_NOFS);
8622a29edc6Sliubo 	if (!name)
8632a29edc6Sliubo 		return -ENOMEM;
8642a29edc6Sliubo 
865e02119d5SChris Mason 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
866b3b4aa74SDavid Sterba 	btrfs_release_path(path);
867e02119d5SChris Mason 
868e02119d5SChris Mason 	inode = read_one_inode(root, location.objectid);
869c00e9493STsutomu Itoh 	if (!inode) {
8703650860bSJosef Bacik 		ret = -EIO;
8713650860bSJosef Bacik 		goto out;
872c00e9493STsutomu Itoh 	}
873e02119d5SChris Mason 
874ec051c0fSYan Zheng 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
8753650860bSJosef Bacik 	if (ret)
8763650860bSJosef Bacik 		goto out;
87712fcfd22SChris Mason 
878207e7d92SNikolay Borisov 	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
879207e7d92SNikolay Borisov 			name_len);
8803650860bSJosef Bacik 	if (ret)
8813650860bSJosef Bacik 		goto out;
882ada9af21SFilipe David Borba Manana 	else
8832ff7e61eSJeff Mahoney 		ret = btrfs_run_delayed_items(trans, fs_info);
8843650860bSJosef Bacik out:
8853650860bSJosef Bacik 	kfree(name);
8863650860bSJosef Bacik 	iput(inode);
887e02119d5SChris Mason 	return ret;
888e02119d5SChris Mason }
889e02119d5SChris Mason 
890e02119d5SChris Mason /*
891e02119d5SChris Mason  * helper function to see if a given name and sequence number found
892e02119d5SChris Mason  * in an inode back reference are already in a directory and correctly
893e02119d5SChris Mason  * point to this inode
894e02119d5SChris Mason  */
895e02119d5SChris Mason static noinline int inode_in_dir(struct btrfs_root *root,
896e02119d5SChris Mason 				 struct btrfs_path *path,
897e02119d5SChris Mason 				 u64 dirid, u64 objectid, u64 index,
898e02119d5SChris Mason 				 const char *name, int name_len)
899e02119d5SChris Mason {
900e02119d5SChris Mason 	struct btrfs_dir_item *di;
901e02119d5SChris Mason 	struct btrfs_key location;
902e02119d5SChris Mason 	int match = 0;
903e02119d5SChris Mason 
904e02119d5SChris Mason 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
905e02119d5SChris Mason 					 index, name, name_len, 0);
906e02119d5SChris Mason 	if (di && !IS_ERR(di)) {
907e02119d5SChris Mason 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
908e02119d5SChris Mason 		if (location.objectid != objectid)
909e02119d5SChris Mason 			goto out;
910e02119d5SChris Mason 	} else
911e02119d5SChris Mason 		goto out;
912b3b4aa74SDavid Sterba 	btrfs_release_path(path);
913e02119d5SChris Mason 
914e02119d5SChris Mason 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
915e02119d5SChris Mason 	if (di && !IS_ERR(di)) {
916e02119d5SChris Mason 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
917e02119d5SChris Mason 		if (location.objectid != objectid)
918e02119d5SChris Mason 			goto out;
919e02119d5SChris Mason 	} else
920e02119d5SChris Mason 		goto out;
921e02119d5SChris Mason 	match = 1;
922e02119d5SChris Mason out:
923b3b4aa74SDavid Sterba 	btrfs_release_path(path);
924e02119d5SChris Mason 	return match;
925e02119d5SChris Mason }
926e02119d5SChris Mason 
927e02119d5SChris Mason /*
928e02119d5SChris Mason  * helper function to check a log tree for a named back reference in
929e02119d5SChris Mason  * an inode.  This is used to decide if a back reference that is
930e02119d5SChris Mason  * found in the subvolume conflicts with what we find in the log.
931e02119d5SChris Mason  *
932e02119d5SChris Mason  * inode backreferences may have multiple refs in a single item,
933e02119d5SChris Mason  * during replay we process one reference at a time, and we don't
934e02119d5SChris Mason  * want to delete valid links to a file from the subvolume if that
935e02119d5SChris Mason  * link is also in the log.
936e02119d5SChris Mason  */
937e02119d5SChris Mason static noinline int backref_in_log(struct btrfs_root *log,
938e02119d5SChris Mason 				   struct btrfs_key *key,
939f186373fSMark Fasheh 				   u64 ref_objectid,
940df8d116fSFilipe Manana 				   const char *name, int namelen)
941e02119d5SChris Mason {
942e02119d5SChris Mason 	struct btrfs_path *path;
943e02119d5SChris Mason 	struct btrfs_inode_ref *ref;
944e02119d5SChris Mason 	unsigned long ptr;
945e02119d5SChris Mason 	unsigned long ptr_end;
946e02119d5SChris Mason 	unsigned long name_ptr;
947e02119d5SChris Mason 	int found_name_len;
948e02119d5SChris Mason 	int item_size;
949e02119d5SChris Mason 	int ret;
950e02119d5SChris Mason 	int match = 0;
951e02119d5SChris Mason 
952e02119d5SChris Mason 	path = btrfs_alloc_path();
9532a29edc6Sliubo 	if (!path)
9542a29edc6Sliubo 		return -ENOMEM;
9552a29edc6Sliubo 
956e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
957e02119d5SChris Mason 	if (ret != 0)
958e02119d5SChris Mason 		goto out;
959e02119d5SChris Mason 
960e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
961f186373fSMark Fasheh 
962f186373fSMark Fasheh 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
963f186373fSMark Fasheh 		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
964f186373fSMark Fasheh 						   name, namelen, NULL))
965f186373fSMark Fasheh 			match = 1;
966f186373fSMark Fasheh 
967f186373fSMark Fasheh 		goto out;
968f186373fSMark Fasheh 	}
969f186373fSMark Fasheh 
970f186373fSMark Fasheh 	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
971e02119d5SChris Mason 	ptr_end = ptr + item_size;
972e02119d5SChris Mason 	while (ptr < ptr_end) {
973e02119d5SChris Mason 		ref = (struct btrfs_inode_ref *)ptr;
974e02119d5SChris Mason 		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
975e02119d5SChris Mason 		if (found_name_len == namelen) {
976e02119d5SChris Mason 			name_ptr = (unsigned long)(ref + 1);
977e02119d5SChris Mason 			ret = memcmp_extent_buffer(path->nodes[0], name,
978e02119d5SChris Mason 						   name_ptr, namelen);
979e02119d5SChris Mason 			if (ret == 0) {
980e02119d5SChris Mason 				match = 1;
981e02119d5SChris Mason 				goto out;
982e02119d5SChris Mason 			}
983e02119d5SChris Mason 		}
984e02119d5SChris Mason 		ptr = (unsigned long)(ref + 1) + found_name_len;
985e02119d5SChris Mason 	}
986e02119d5SChris Mason out:
987e02119d5SChris Mason 	btrfs_free_path(path);
988e02119d5SChris Mason 	return match;
989e02119d5SChris Mason }
990e02119d5SChris Mason 
9915a1d7843SJan Schmidt static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
9925a1d7843SJan Schmidt 				  struct btrfs_root *root,
9935a1d7843SJan Schmidt 				  struct btrfs_path *path,
9945a1d7843SJan Schmidt 				  struct btrfs_root *log_root,
99594c91a1fSNikolay Borisov 				  struct btrfs_inode *dir,
99694c91a1fSNikolay Borisov 				  struct btrfs_inode *inode,
9975a1d7843SJan Schmidt 				  struct extent_buffer *eb,
998f186373fSMark Fasheh 				  u64 inode_objectid, u64 parent_objectid,
999f186373fSMark Fasheh 				  u64 ref_index, char *name, int namelen,
1000f186373fSMark Fasheh 				  int *search_done)
10015a1d7843SJan Schmidt {
10022ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
10035a1d7843SJan Schmidt 	int ret;
10045a1d7843SJan Schmidt 	char *victim_name;
10055a1d7843SJan Schmidt 	int victim_name_len;
1006f186373fSMark Fasheh 	struct extent_buffer *leaf;
1007f186373fSMark Fasheh 	struct btrfs_dir_item *di;
1008f186373fSMark Fasheh 	struct btrfs_key search_key;
1009f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1010f186373fSMark Fasheh 
1011f186373fSMark Fasheh again:
1012f186373fSMark Fasheh 	/* Search old style refs */
1013f186373fSMark Fasheh 	search_key.objectid = inode_objectid;
1014f186373fSMark Fasheh 	search_key.type = BTRFS_INODE_REF_KEY;
1015f186373fSMark Fasheh 	search_key.offset = parent_objectid;
1016f186373fSMark Fasheh 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1017f186373fSMark Fasheh 	if (ret == 0) {
10185a1d7843SJan Schmidt 		struct btrfs_inode_ref *victim_ref;
10195a1d7843SJan Schmidt 		unsigned long ptr;
10205a1d7843SJan Schmidt 		unsigned long ptr_end;
1021f186373fSMark Fasheh 
1022f186373fSMark Fasheh 		leaf = path->nodes[0];
10235a1d7843SJan Schmidt 
10245a1d7843SJan Schmidt 		/* are we trying to overwrite a back ref for the root directory
10255a1d7843SJan Schmidt 		 * if so, just jump out, we're done
10265a1d7843SJan Schmidt 		 */
1027f186373fSMark Fasheh 		if (search_key.objectid == search_key.offset)
10285a1d7843SJan Schmidt 			return 1;
10295a1d7843SJan Schmidt 
10305a1d7843SJan Schmidt 		/* check all the names in this back reference to see
10315a1d7843SJan Schmidt 		 * if they are in the log.  if so, we allow them to stay
10325a1d7843SJan Schmidt 		 * otherwise they must be unlinked as a conflict
10335a1d7843SJan Schmidt 		 */
10345a1d7843SJan Schmidt 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
10355a1d7843SJan Schmidt 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
10365a1d7843SJan Schmidt 		while (ptr < ptr_end) {
10375a1d7843SJan Schmidt 			victim_ref = (struct btrfs_inode_ref *)ptr;
10385a1d7843SJan Schmidt 			victim_name_len = btrfs_inode_ref_name_len(leaf,
10395a1d7843SJan Schmidt 								   victim_ref);
10405a1d7843SJan Schmidt 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
10413650860bSJosef Bacik 			if (!victim_name)
10423650860bSJosef Bacik 				return -ENOMEM;
10435a1d7843SJan Schmidt 
10445a1d7843SJan Schmidt 			read_extent_buffer(leaf, victim_name,
10455a1d7843SJan Schmidt 					   (unsigned long)(victim_ref + 1),
10465a1d7843SJan Schmidt 					   victim_name_len);
10475a1d7843SJan Schmidt 
1048f186373fSMark Fasheh 			if (!backref_in_log(log_root, &search_key,
1049f186373fSMark Fasheh 					    parent_objectid,
1050f186373fSMark Fasheh 					    victim_name,
10515a1d7843SJan Schmidt 					    victim_name_len)) {
105294c91a1fSNikolay Borisov 				inc_nlink(&inode->vfs_inode);
10535a1d7843SJan Schmidt 				btrfs_release_path(path);
10545a1d7843SJan Schmidt 
105594c91a1fSNikolay Borisov 				ret = btrfs_unlink_inode(trans, root, dir, inode,
10564ec5934eSNikolay Borisov 						victim_name, victim_name_len);
1057f186373fSMark Fasheh 				kfree(victim_name);
10583650860bSJosef Bacik 				if (ret)
10593650860bSJosef Bacik 					return ret;
10602ff7e61eSJeff Mahoney 				ret = btrfs_run_delayed_items(trans, fs_info);
1061ada9af21SFilipe David Borba Manana 				if (ret)
1062ada9af21SFilipe David Borba Manana 					return ret;
1063f186373fSMark Fasheh 				*search_done = 1;
1064f186373fSMark Fasheh 				goto again;
10655a1d7843SJan Schmidt 			}
10665a1d7843SJan Schmidt 			kfree(victim_name);
1067f186373fSMark Fasheh 
10685a1d7843SJan Schmidt 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
10695a1d7843SJan Schmidt 		}
10705a1d7843SJan Schmidt 
10715a1d7843SJan Schmidt 		/*
10725a1d7843SJan Schmidt 		 * NOTE: we have searched root tree and checked the
1073bb7ab3b9SAdam Buchbinder 		 * corresponding ref, it does not need to check again.
10745a1d7843SJan Schmidt 		 */
10755a1d7843SJan Schmidt 		*search_done = 1;
10765a1d7843SJan Schmidt 	}
10775a1d7843SJan Schmidt 	btrfs_release_path(path);
10785a1d7843SJan Schmidt 
1079f186373fSMark Fasheh 	/* Same search but for extended refs */
1080f186373fSMark Fasheh 	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1081f186373fSMark Fasheh 					   inode_objectid, parent_objectid, 0,
1082f186373fSMark Fasheh 					   0);
1083f186373fSMark Fasheh 	if (!IS_ERR_OR_NULL(extref)) {
1084f186373fSMark Fasheh 		u32 item_size;
1085f186373fSMark Fasheh 		u32 cur_offset = 0;
1086f186373fSMark Fasheh 		unsigned long base;
1087f186373fSMark Fasheh 		struct inode *victim_parent;
1088f186373fSMark Fasheh 
1089f186373fSMark Fasheh 		leaf = path->nodes[0];
1090f186373fSMark Fasheh 
1091f186373fSMark Fasheh 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1092f186373fSMark Fasheh 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1093f186373fSMark Fasheh 
1094f186373fSMark Fasheh 		while (cur_offset < item_size) {
1095dd9ef135SQuentin Casasnovas 			extref = (struct btrfs_inode_extref *)(base + cur_offset);
1096f186373fSMark Fasheh 
1097f186373fSMark Fasheh 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1098f186373fSMark Fasheh 
1099f186373fSMark Fasheh 			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1100f186373fSMark Fasheh 				goto next;
1101f186373fSMark Fasheh 
1102f186373fSMark Fasheh 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
11033650860bSJosef Bacik 			if (!victim_name)
11043650860bSJosef Bacik 				return -ENOMEM;
1105f186373fSMark Fasheh 			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1106f186373fSMark Fasheh 					   victim_name_len);
1107f186373fSMark Fasheh 
1108f186373fSMark Fasheh 			search_key.objectid = inode_objectid;
1109f186373fSMark Fasheh 			search_key.type = BTRFS_INODE_EXTREF_KEY;
1110f186373fSMark Fasheh 			search_key.offset = btrfs_extref_hash(parent_objectid,
1111f186373fSMark Fasheh 							      victim_name,
1112f186373fSMark Fasheh 							      victim_name_len);
1113f186373fSMark Fasheh 			ret = 0;
1114f186373fSMark Fasheh 			if (!backref_in_log(log_root, &search_key,
1115f186373fSMark Fasheh 					    parent_objectid, victim_name,
1116f186373fSMark Fasheh 					    victim_name_len)) {
1117f186373fSMark Fasheh 				ret = -ENOENT;
1118f186373fSMark Fasheh 				victim_parent = read_one_inode(root,
1119f186373fSMark Fasheh 						parent_objectid);
1120f186373fSMark Fasheh 				if (victim_parent) {
112194c91a1fSNikolay Borisov 					inc_nlink(&inode->vfs_inode);
1122f186373fSMark Fasheh 					btrfs_release_path(path);
1123f186373fSMark Fasheh 
1124f186373fSMark Fasheh 					ret = btrfs_unlink_inode(trans, root,
11254ec5934eSNikolay Borisov 							BTRFS_I(victim_parent),
112694c91a1fSNikolay Borisov 							inode,
1127f186373fSMark Fasheh 							victim_name,
1128f186373fSMark Fasheh 							victim_name_len);
1129ada9af21SFilipe David Borba Manana 					if (!ret)
1130ada9af21SFilipe David Borba Manana 						ret = btrfs_run_delayed_items(
11312ff7e61eSJeff Mahoney 								  trans,
11322ff7e61eSJeff Mahoney 								  fs_info);
1133f186373fSMark Fasheh 				}
1134f186373fSMark Fasheh 				iput(victim_parent);
1135f186373fSMark Fasheh 				kfree(victim_name);
11363650860bSJosef Bacik 				if (ret)
11373650860bSJosef Bacik 					return ret;
1138f186373fSMark Fasheh 				*search_done = 1;
1139f186373fSMark Fasheh 				goto again;
1140f186373fSMark Fasheh 			}
1141f186373fSMark Fasheh 			kfree(victim_name);
11423650860bSJosef Bacik 			if (ret)
11433650860bSJosef Bacik 				return ret;
1144f186373fSMark Fasheh next:
1145f186373fSMark Fasheh 			cur_offset += victim_name_len + sizeof(*extref);
1146f186373fSMark Fasheh 		}
1147f186373fSMark Fasheh 		*search_done = 1;
1148f186373fSMark Fasheh 	}
1149f186373fSMark Fasheh 	btrfs_release_path(path);
1150f186373fSMark Fasheh 
11515a1d7843SJan Schmidt 	/* look for a conflicting sequence number */
115294c91a1fSNikolay Borisov 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1153f186373fSMark Fasheh 					 ref_index, name, namelen, 0);
11545a1d7843SJan Schmidt 	if (di && !IS_ERR(di)) {
115594c91a1fSNikolay Borisov 		ret = drop_one_dir_item(trans, root, path, dir, di);
11563650860bSJosef Bacik 		if (ret)
11573650860bSJosef Bacik 			return ret;
11585a1d7843SJan Schmidt 	}
11595a1d7843SJan Schmidt 	btrfs_release_path(path);
11605a1d7843SJan Schmidt 
11615a1d7843SJan Schmidt 	/* look for a conflicing name */
116294c91a1fSNikolay Borisov 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
11635a1d7843SJan Schmidt 				   name, namelen, 0);
11645a1d7843SJan Schmidt 	if (di && !IS_ERR(di)) {
116594c91a1fSNikolay Borisov 		ret = drop_one_dir_item(trans, root, path, dir, di);
11663650860bSJosef Bacik 		if (ret)
11673650860bSJosef Bacik 			return ret;
11685a1d7843SJan Schmidt 	}
11695a1d7843SJan Schmidt 	btrfs_release_path(path);
11705a1d7843SJan Schmidt 
11715a1d7843SJan Schmidt 	return 0;
11725a1d7843SJan Schmidt }
1173e02119d5SChris Mason 
1174f186373fSMark Fasheh static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1175f186373fSMark Fasheh 			     u32 *namelen, char **name, u64 *index,
1176f186373fSMark Fasheh 			     u64 *parent_objectid)
1177f186373fSMark Fasheh {
1178f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1179f186373fSMark Fasheh 
1180f186373fSMark Fasheh 	extref = (struct btrfs_inode_extref *)ref_ptr;
1181f186373fSMark Fasheh 
1182f186373fSMark Fasheh 	*namelen = btrfs_inode_extref_name_len(eb, extref);
1183f186373fSMark Fasheh 	*name = kmalloc(*namelen, GFP_NOFS);
1184f186373fSMark Fasheh 	if (*name == NULL)
1185f186373fSMark Fasheh 		return -ENOMEM;
1186f186373fSMark Fasheh 
1187f186373fSMark Fasheh 	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1188f186373fSMark Fasheh 			   *namelen);
1189f186373fSMark Fasheh 
1190f186373fSMark Fasheh 	*index = btrfs_inode_extref_index(eb, extref);
1191f186373fSMark Fasheh 	if (parent_objectid)
1192f186373fSMark Fasheh 		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1193f186373fSMark Fasheh 
1194f186373fSMark Fasheh 	return 0;
1195f186373fSMark Fasheh }
1196f186373fSMark Fasheh 
1197f186373fSMark Fasheh static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1198f186373fSMark Fasheh 			  u32 *namelen, char **name, u64 *index)
1199f186373fSMark Fasheh {
1200f186373fSMark Fasheh 	struct btrfs_inode_ref *ref;
1201f186373fSMark Fasheh 
1202f186373fSMark Fasheh 	ref = (struct btrfs_inode_ref *)ref_ptr;
1203f186373fSMark Fasheh 
1204f186373fSMark Fasheh 	*namelen = btrfs_inode_ref_name_len(eb, ref);
1205f186373fSMark Fasheh 	*name = kmalloc(*namelen, GFP_NOFS);
1206f186373fSMark Fasheh 	if (*name == NULL)
1207f186373fSMark Fasheh 		return -ENOMEM;
1208f186373fSMark Fasheh 
1209f186373fSMark Fasheh 	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1210f186373fSMark Fasheh 
1211f186373fSMark Fasheh 	*index = btrfs_inode_ref_index(eb, ref);
1212f186373fSMark Fasheh 
1213f186373fSMark Fasheh 	return 0;
1214f186373fSMark Fasheh }
1215f186373fSMark Fasheh 
1216e02119d5SChris Mason /*
1217e02119d5SChris Mason  * replay one inode back reference item found in the log tree.
1218e02119d5SChris Mason  * eb, slot and key refer to the buffer and key found in the log tree.
1219e02119d5SChris Mason  * root is the destination we are replaying into, and path is for temp
1220e02119d5SChris Mason  * use by this function.  (it should be released on return).
1221e02119d5SChris Mason  */
1222e02119d5SChris Mason static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1223e02119d5SChris Mason 				  struct btrfs_root *root,
1224e02119d5SChris Mason 				  struct btrfs_root *log,
1225e02119d5SChris Mason 				  struct btrfs_path *path,
1226e02119d5SChris Mason 				  struct extent_buffer *eb, int slot,
1227e02119d5SChris Mason 				  struct btrfs_key *key)
1228e02119d5SChris Mason {
122903b2f08bSGeyslan G. Bem 	struct inode *dir = NULL;
123003b2f08bSGeyslan G. Bem 	struct inode *inode = NULL;
1231e02119d5SChris Mason 	unsigned long ref_ptr;
1232e02119d5SChris Mason 	unsigned long ref_end;
123303b2f08bSGeyslan G. Bem 	char *name = NULL;
123434f3e4f2Sliubo 	int namelen;
123534f3e4f2Sliubo 	int ret;
1236c622ae60Sliubo 	int search_done = 0;
1237f186373fSMark Fasheh 	int log_ref_ver = 0;
1238f186373fSMark Fasheh 	u64 parent_objectid;
1239f186373fSMark Fasheh 	u64 inode_objectid;
1240f46dbe3dSChris Mason 	u64 ref_index = 0;
1241f186373fSMark Fasheh 	int ref_struct_size;
1242f186373fSMark Fasheh 
1243f186373fSMark Fasheh 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1244f186373fSMark Fasheh 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1245f186373fSMark Fasheh 
1246f186373fSMark Fasheh 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1247f186373fSMark Fasheh 		struct btrfs_inode_extref *r;
1248f186373fSMark Fasheh 
1249f186373fSMark Fasheh 		ref_struct_size = sizeof(struct btrfs_inode_extref);
1250f186373fSMark Fasheh 		log_ref_ver = 1;
1251f186373fSMark Fasheh 		r = (struct btrfs_inode_extref *)ref_ptr;
1252f186373fSMark Fasheh 		parent_objectid = btrfs_inode_extref_parent(eb, r);
1253f186373fSMark Fasheh 	} else {
1254f186373fSMark Fasheh 		ref_struct_size = sizeof(struct btrfs_inode_ref);
1255f186373fSMark Fasheh 		parent_objectid = key->offset;
1256f186373fSMark Fasheh 	}
1257f186373fSMark Fasheh 	inode_objectid = key->objectid;
1258e02119d5SChris Mason 
1259e02119d5SChris Mason 	/*
1260e02119d5SChris Mason 	 * it is possible that we didn't log all the parent directories
1261e02119d5SChris Mason 	 * for a given inode.  If we don't find the dir, just don't
1262e02119d5SChris Mason 	 * copy the back ref in.  The link count fixup code will take
1263e02119d5SChris Mason 	 * care of the rest
1264e02119d5SChris Mason 	 */
1265f186373fSMark Fasheh 	dir = read_one_inode(root, parent_objectid);
126603b2f08bSGeyslan G. Bem 	if (!dir) {
126703b2f08bSGeyslan G. Bem 		ret = -ENOENT;
126803b2f08bSGeyslan G. Bem 		goto out;
126903b2f08bSGeyslan G. Bem 	}
1270e02119d5SChris Mason 
1271f186373fSMark Fasheh 	inode = read_one_inode(root, inode_objectid);
1272c00e9493STsutomu Itoh 	if (!inode) {
127303b2f08bSGeyslan G. Bem 		ret = -EIO;
127403b2f08bSGeyslan G. Bem 		goto out;
1275c00e9493STsutomu Itoh 	}
1276e02119d5SChris Mason 
12775a1d7843SJan Schmidt 	while (ref_ptr < ref_end) {
1278f186373fSMark Fasheh 		if (log_ref_ver) {
1279f186373fSMark Fasheh 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1280f186373fSMark Fasheh 						&ref_index, &parent_objectid);
1281f186373fSMark Fasheh 			/*
1282f186373fSMark Fasheh 			 * parent object can change from one array
1283f186373fSMark Fasheh 			 * item to another.
1284f186373fSMark Fasheh 			 */
1285f186373fSMark Fasheh 			if (!dir)
1286f186373fSMark Fasheh 				dir = read_one_inode(root, parent_objectid);
128703b2f08bSGeyslan G. Bem 			if (!dir) {
128803b2f08bSGeyslan G. Bem 				ret = -ENOENT;
128903b2f08bSGeyslan G. Bem 				goto out;
129003b2f08bSGeyslan G. Bem 			}
1291f186373fSMark Fasheh 		} else {
1292f186373fSMark Fasheh 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1293f186373fSMark Fasheh 					     &ref_index);
1294f186373fSMark Fasheh 		}
1295f186373fSMark Fasheh 		if (ret)
129603b2f08bSGeyslan G. Bem 			goto out;
1297e02119d5SChris Mason 
1298e02119d5SChris Mason 		/* if we already have a perfect match, we're done */
12994a0cc7caSNikolay Borisov 		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), btrfs_ino(BTRFS_I(inode)),
1300f186373fSMark Fasheh 				  ref_index, name, namelen)) {
13015a1d7843SJan Schmidt 			/*
13025a1d7843SJan Schmidt 			 * look for a conflicting back reference in the
13035a1d7843SJan Schmidt 			 * metadata. if we find one we have to unlink that name
13045a1d7843SJan Schmidt 			 * of the file before we add our new link.  Later on, we
13055a1d7843SJan Schmidt 			 * overwrite any existing back reference, and we don't
13065a1d7843SJan Schmidt 			 * want to create dangling pointers in the directory.
13075a1d7843SJan Schmidt 			 */
13085a1d7843SJan Schmidt 
13095a1d7843SJan Schmidt 			if (!search_done) {
13105a1d7843SJan Schmidt 				ret = __add_inode_ref(trans, root, path, log,
131194c91a1fSNikolay Borisov 						      BTRFS_I(dir),
131294c91a1fSNikolay Borisov 						      BTRFS_I(inode), eb,
1313f186373fSMark Fasheh 						      inode_objectid,
1314f186373fSMark Fasheh 						      parent_objectid,
1315f186373fSMark Fasheh 						      ref_index, name, namelen,
13165a1d7843SJan Schmidt 						      &search_done);
131703b2f08bSGeyslan G. Bem 				if (ret) {
131803b2f08bSGeyslan G. Bem 					if (ret == 1)
13193650860bSJosef Bacik 						ret = 0;
1320e02119d5SChris Mason 					goto out;
13213650860bSJosef Bacik 				}
132234f3e4f2Sliubo 			}
132334f3e4f2Sliubo 
1324e02119d5SChris Mason 			/* insert our name */
13255a1d7843SJan Schmidt 			ret = btrfs_add_link(trans, dir, inode, name, namelen,
1326f186373fSMark Fasheh 					     0, ref_index);
13273650860bSJosef Bacik 			if (ret)
13283650860bSJosef Bacik 				goto out;
1329e02119d5SChris Mason 
1330e02119d5SChris Mason 			btrfs_update_inode(trans, root, inode);
13315a1d7843SJan Schmidt 		}
1332e02119d5SChris Mason 
1333f186373fSMark Fasheh 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1334e02119d5SChris Mason 		kfree(name);
133503b2f08bSGeyslan G. Bem 		name = NULL;
1336f186373fSMark Fasheh 		if (log_ref_ver) {
1337f186373fSMark Fasheh 			iput(dir);
1338f186373fSMark Fasheh 			dir = NULL;
1339f186373fSMark Fasheh 		}
13405a1d7843SJan Schmidt 	}
1341e02119d5SChris Mason 
1342e02119d5SChris Mason 	/* finally write the back reference in the inode */
1343e02119d5SChris Mason 	ret = overwrite_item(trans, root, path, eb, slot, key);
13445a1d7843SJan Schmidt out:
1345b3b4aa74SDavid Sterba 	btrfs_release_path(path);
134603b2f08bSGeyslan G. Bem 	kfree(name);
1347e02119d5SChris Mason 	iput(dir);
1348e02119d5SChris Mason 	iput(inode);
13493650860bSJosef Bacik 	return ret;
1350e02119d5SChris Mason }
1351e02119d5SChris Mason 
1352c71bf099SYan, Zheng static int insert_orphan_item(struct btrfs_trans_handle *trans,
13539c4f61f0SDavid Sterba 			      struct btrfs_root *root, u64 ino)
1354c71bf099SYan, Zheng {
1355c71bf099SYan, Zheng 	int ret;
1356381cf658SDavid Sterba 
13579c4f61f0SDavid Sterba 	ret = btrfs_insert_orphan_item(trans, root, ino);
13589c4f61f0SDavid Sterba 	if (ret == -EEXIST)
13599c4f61f0SDavid Sterba 		ret = 0;
1360381cf658SDavid Sterba 
1361c71bf099SYan, Zheng 	return ret;
1362c71bf099SYan, Zheng }
1363c71bf099SYan, Zheng 
1364f186373fSMark Fasheh static int count_inode_extrefs(struct btrfs_root *root,
1365f186373fSMark Fasheh 			       struct inode *inode, struct btrfs_path *path)
1366e02119d5SChris Mason {
1367f186373fSMark Fasheh 	int ret = 0;
1368f186373fSMark Fasheh 	int name_len;
1369f186373fSMark Fasheh 	unsigned int nlink = 0;
1370f186373fSMark Fasheh 	u32 item_size;
1371f186373fSMark Fasheh 	u32 cur_offset = 0;
13724a0cc7caSNikolay Borisov 	u64 inode_objectid = btrfs_ino(BTRFS_I(inode));
1373f186373fSMark Fasheh 	u64 offset = 0;
1374f186373fSMark Fasheh 	unsigned long ptr;
1375f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1376f186373fSMark Fasheh 	struct extent_buffer *leaf;
1377f186373fSMark Fasheh 
1378f186373fSMark Fasheh 	while (1) {
1379f186373fSMark Fasheh 		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1380f186373fSMark Fasheh 					    &extref, &offset);
1381f186373fSMark Fasheh 		if (ret)
1382f186373fSMark Fasheh 			break;
1383f186373fSMark Fasheh 
1384f186373fSMark Fasheh 		leaf = path->nodes[0];
1385f186373fSMark Fasheh 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1386f186373fSMark Fasheh 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
13872c2c452bSFilipe Manana 		cur_offset = 0;
1388f186373fSMark Fasheh 
1389f186373fSMark Fasheh 		while (cur_offset < item_size) {
1390f186373fSMark Fasheh 			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1391f186373fSMark Fasheh 			name_len = btrfs_inode_extref_name_len(leaf, extref);
1392f186373fSMark Fasheh 
1393f186373fSMark Fasheh 			nlink++;
1394f186373fSMark Fasheh 
1395f186373fSMark Fasheh 			cur_offset += name_len + sizeof(*extref);
1396f186373fSMark Fasheh 		}
1397f186373fSMark Fasheh 
1398f186373fSMark Fasheh 		offset++;
1399f186373fSMark Fasheh 		btrfs_release_path(path);
1400f186373fSMark Fasheh 	}
1401f186373fSMark Fasheh 	btrfs_release_path(path);
1402f186373fSMark Fasheh 
14032c2c452bSFilipe Manana 	if (ret < 0 && ret != -ENOENT)
1404f186373fSMark Fasheh 		return ret;
1405f186373fSMark Fasheh 	return nlink;
1406f186373fSMark Fasheh }
1407f186373fSMark Fasheh 
1408f186373fSMark Fasheh static int count_inode_refs(struct btrfs_root *root,
1409f186373fSMark Fasheh 			       struct inode *inode, struct btrfs_path *path)
1410f186373fSMark Fasheh {
1411e02119d5SChris Mason 	int ret;
1412e02119d5SChris Mason 	struct btrfs_key key;
1413f186373fSMark Fasheh 	unsigned int nlink = 0;
1414e02119d5SChris Mason 	unsigned long ptr;
1415e02119d5SChris Mason 	unsigned long ptr_end;
1416e02119d5SChris Mason 	int name_len;
14174a0cc7caSNikolay Borisov 	u64 ino = btrfs_ino(BTRFS_I(inode));
1418e02119d5SChris Mason 
141933345d01SLi Zefan 	key.objectid = ino;
1420e02119d5SChris Mason 	key.type = BTRFS_INODE_REF_KEY;
1421e02119d5SChris Mason 	key.offset = (u64)-1;
1422e02119d5SChris Mason 
1423e02119d5SChris Mason 	while (1) {
1424e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1425e02119d5SChris Mason 		if (ret < 0)
1426e02119d5SChris Mason 			break;
1427e02119d5SChris Mason 		if (ret > 0) {
1428e02119d5SChris Mason 			if (path->slots[0] == 0)
1429e02119d5SChris Mason 				break;
1430e02119d5SChris Mason 			path->slots[0]--;
1431e02119d5SChris Mason 		}
1432e93ae26fSFilipe David Borba Manana process_slot:
1433e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key,
1434e02119d5SChris Mason 				      path->slots[0]);
143533345d01SLi Zefan 		if (key.objectid != ino ||
1436e02119d5SChris Mason 		    key.type != BTRFS_INODE_REF_KEY)
1437e02119d5SChris Mason 			break;
1438e02119d5SChris Mason 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1439e02119d5SChris Mason 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1440e02119d5SChris Mason 						   path->slots[0]);
1441e02119d5SChris Mason 		while (ptr < ptr_end) {
1442e02119d5SChris Mason 			struct btrfs_inode_ref *ref;
1443e02119d5SChris Mason 
1444e02119d5SChris Mason 			ref = (struct btrfs_inode_ref *)ptr;
1445e02119d5SChris Mason 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1446e02119d5SChris Mason 							    ref);
1447e02119d5SChris Mason 			ptr = (unsigned long)(ref + 1) + name_len;
1448e02119d5SChris Mason 			nlink++;
1449e02119d5SChris Mason 		}
1450e02119d5SChris Mason 
1451e02119d5SChris Mason 		if (key.offset == 0)
1452e02119d5SChris Mason 			break;
1453e93ae26fSFilipe David Borba Manana 		if (path->slots[0] > 0) {
1454e93ae26fSFilipe David Borba Manana 			path->slots[0]--;
1455e93ae26fSFilipe David Borba Manana 			goto process_slot;
1456e93ae26fSFilipe David Borba Manana 		}
1457e02119d5SChris Mason 		key.offset--;
1458b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1459e02119d5SChris Mason 	}
1460b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1461f186373fSMark Fasheh 
1462f186373fSMark Fasheh 	return nlink;
1463f186373fSMark Fasheh }
1464f186373fSMark Fasheh 
1465f186373fSMark Fasheh /*
1466f186373fSMark Fasheh  * There are a few corners where the link count of the file can't
1467f186373fSMark Fasheh  * be properly maintained during replay.  So, instead of adding
1468f186373fSMark Fasheh  * lots of complexity to the log code, we just scan the backrefs
1469f186373fSMark Fasheh  * for any file that has been through replay.
1470f186373fSMark Fasheh  *
1471f186373fSMark Fasheh  * The scan will update the link count on the inode to reflect the
1472f186373fSMark Fasheh  * number of back refs found.  If it goes down to zero, the iput
1473f186373fSMark Fasheh  * will free the inode.
1474f186373fSMark Fasheh  */
1475f186373fSMark Fasheh static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1476f186373fSMark Fasheh 					   struct btrfs_root *root,
1477f186373fSMark Fasheh 					   struct inode *inode)
1478f186373fSMark Fasheh {
1479f186373fSMark Fasheh 	struct btrfs_path *path;
1480f186373fSMark Fasheh 	int ret;
1481f186373fSMark Fasheh 	u64 nlink = 0;
14824a0cc7caSNikolay Borisov 	u64 ino = btrfs_ino(BTRFS_I(inode));
1483f186373fSMark Fasheh 
1484f186373fSMark Fasheh 	path = btrfs_alloc_path();
1485f186373fSMark Fasheh 	if (!path)
1486f186373fSMark Fasheh 		return -ENOMEM;
1487f186373fSMark Fasheh 
1488f186373fSMark Fasheh 	ret = count_inode_refs(root, inode, path);
1489f186373fSMark Fasheh 	if (ret < 0)
1490f186373fSMark Fasheh 		goto out;
1491f186373fSMark Fasheh 
1492f186373fSMark Fasheh 	nlink = ret;
1493f186373fSMark Fasheh 
1494f186373fSMark Fasheh 	ret = count_inode_extrefs(root, inode, path);
1495f186373fSMark Fasheh 	if (ret < 0)
1496f186373fSMark Fasheh 		goto out;
1497f186373fSMark Fasheh 
1498f186373fSMark Fasheh 	nlink += ret;
1499f186373fSMark Fasheh 
1500f186373fSMark Fasheh 	ret = 0;
1501f186373fSMark Fasheh 
1502e02119d5SChris Mason 	if (nlink != inode->i_nlink) {
1503bfe86848SMiklos Szeredi 		set_nlink(inode, nlink);
1504e02119d5SChris Mason 		btrfs_update_inode(trans, root, inode);
1505e02119d5SChris Mason 	}
15068d5bf1cbSChris Mason 	BTRFS_I(inode)->index_cnt = (u64)-1;
1507e02119d5SChris Mason 
1508c71bf099SYan, Zheng 	if (inode->i_nlink == 0) {
1509c71bf099SYan, Zheng 		if (S_ISDIR(inode->i_mode)) {
151012fcfd22SChris Mason 			ret = replay_dir_deletes(trans, root, NULL, path,
151133345d01SLi Zefan 						 ino, 1);
15123650860bSJosef Bacik 			if (ret)
15133650860bSJosef Bacik 				goto out;
151412fcfd22SChris Mason 		}
151533345d01SLi Zefan 		ret = insert_orphan_item(trans, root, ino);
1516c71bf099SYan, Zheng 	}
151712fcfd22SChris Mason 
1518f186373fSMark Fasheh out:
1519f186373fSMark Fasheh 	btrfs_free_path(path);
1520f186373fSMark Fasheh 	return ret;
1521e02119d5SChris Mason }
1522e02119d5SChris Mason 
1523e02119d5SChris Mason static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1524e02119d5SChris Mason 					    struct btrfs_root *root,
1525e02119d5SChris Mason 					    struct btrfs_path *path)
1526e02119d5SChris Mason {
1527e02119d5SChris Mason 	int ret;
1528e02119d5SChris Mason 	struct btrfs_key key;
1529e02119d5SChris Mason 	struct inode *inode;
1530e02119d5SChris Mason 
1531e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1532e02119d5SChris Mason 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1533e02119d5SChris Mason 	key.offset = (u64)-1;
1534e02119d5SChris Mason 	while (1) {
1535e02119d5SChris Mason 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1536e02119d5SChris Mason 		if (ret < 0)
1537e02119d5SChris Mason 			break;
1538e02119d5SChris Mason 
1539e02119d5SChris Mason 		if (ret == 1) {
1540e02119d5SChris Mason 			if (path->slots[0] == 0)
1541e02119d5SChris Mason 				break;
1542e02119d5SChris Mason 			path->slots[0]--;
1543e02119d5SChris Mason 		}
1544e02119d5SChris Mason 
1545e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1546e02119d5SChris Mason 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1547e02119d5SChris Mason 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1548e02119d5SChris Mason 			break;
1549e02119d5SChris Mason 
1550e02119d5SChris Mason 		ret = btrfs_del_item(trans, root, path);
155165a246c5STsutomu Itoh 		if (ret)
155265a246c5STsutomu Itoh 			goto out;
1553e02119d5SChris Mason 
1554b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1555e02119d5SChris Mason 		inode = read_one_inode(root, key.offset);
1556c00e9493STsutomu Itoh 		if (!inode)
1557c00e9493STsutomu Itoh 			return -EIO;
1558e02119d5SChris Mason 
1559e02119d5SChris Mason 		ret = fixup_inode_link_count(trans, root, inode);
1560e02119d5SChris Mason 		iput(inode);
15613650860bSJosef Bacik 		if (ret)
15623650860bSJosef Bacik 			goto out;
1563e02119d5SChris Mason 
156412fcfd22SChris Mason 		/*
156512fcfd22SChris Mason 		 * fixup on a directory may create new entries,
156612fcfd22SChris Mason 		 * make sure we always look for the highset possible
156712fcfd22SChris Mason 		 * offset
156812fcfd22SChris Mason 		 */
156912fcfd22SChris Mason 		key.offset = (u64)-1;
1570e02119d5SChris Mason 	}
157165a246c5STsutomu Itoh 	ret = 0;
157265a246c5STsutomu Itoh out:
1573b3b4aa74SDavid Sterba 	btrfs_release_path(path);
157465a246c5STsutomu Itoh 	return ret;
1575e02119d5SChris Mason }
1576e02119d5SChris Mason 
1577e02119d5SChris Mason 
1578e02119d5SChris Mason /*
1579e02119d5SChris Mason  * record a given inode in the fixup dir so we can check its link
1580e02119d5SChris Mason  * count when replay is done.  The link count is incremented here
1581e02119d5SChris Mason  * so the inode won't go away until we check it
1582e02119d5SChris Mason  */
1583e02119d5SChris Mason static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1584e02119d5SChris Mason 				      struct btrfs_root *root,
1585e02119d5SChris Mason 				      struct btrfs_path *path,
1586e02119d5SChris Mason 				      u64 objectid)
1587e02119d5SChris Mason {
1588e02119d5SChris Mason 	struct btrfs_key key;
1589e02119d5SChris Mason 	int ret = 0;
1590e02119d5SChris Mason 	struct inode *inode;
1591e02119d5SChris Mason 
1592e02119d5SChris Mason 	inode = read_one_inode(root, objectid);
1593c00e9493STsutomu Itoh 	if (!inode)
1594c00e9493STsutomu Itoh 		return -EIO;
1595e02119d5SChris Mason 
1596e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1597962a298fSDavid Sterba 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1598e02119d5SChris Mason 	key.offset = objectid;
1599e02119d5SChris Mason 
1600e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1601e02119d5SChris Mason 
1602b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1603e02119d5SChris Mason 	if (ret == 0) {
16049bf7a489SJosef Bacik 		if (!inode->i_nlink)
16059bf7a489SJosef Bacik 			set_nlink(inode, 1);
16069bf7a489SJosef Bacik 		else
16078b558c5fSZach Brown 			inc_nlink(inode);
1608b9959295STsutomu Itoh 		ret = btrfs_update_inode(trans, root, inode);
1609e02119d5SChris Mason 	} else if (ret == -EEXIST) {
1610e02119d5SChris Mason 		ret = 0;
1611e02119d5SChris Mason 	} else {
16123650860bSJosef Bacik 		BUG(); /* Logic Error */
1613e02119d5SChris Mason 	}
1614e02119d5SChris Mason 	iput(inode);
1615e02119d5SChris Mason 
1616e02119d5SChris Mason 	return ret;
1617e02119d5SChris Mason }
1618e02119d5SChris Mason 
1619e02119d5SChris Mason /*
1620e02119d5SChris Mason  * when replaying the log for a directory, we only insert names
1621e02119d5SChris Mason  * for inodes that actually exist.  This means an fsync on a directory
1622e02119d5SChris Mason  * does not implicitly fsync all the new files in it
1623e02119d5SChris Mason  */
1624e02119d5SChris Mason static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1625e02119d5SChris Mason 				    struct btrfs_root *root,
1626e02119d5SChris Mason 				    u64 dirid, u64 index,
162760d53eb3SZhaolei 				    char *name, int name_len,
1628e02119d5SChris Mason 				    struct btrfs_key *location)
1629e02119d5SChris Mason {
1630e02119d5SChris Mason 	struct inode *inode;
1631e02119d5SChris Mason 	struct inode *dir;
1632e02119d5SChris Mason 	int ret;
1633e02119d5SChris Mason 
1634e02119d5SChris Mason 	inode = read_one_inode(root, location->objectid);
1635e02119d5SChris Mason 	if (!inode)
1636e02119d5SChris Mason 		return -ENOENT;
1637e02119d5SChris Mason 
1638e02119d5SChris Mason 	dir = read_one_inode(root, dirid);
1639e02119d5SChris Mason 	if (!dir) {
1640e02119d5SChris Mason 		iput(inode);
1641e02119d5SChris Mason 		return -EIO;
1642e02119d5SChris Mason 	}
1643d555438bSJosef Bacik 
1644e02119d5SChris Mason 	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1645e02119d5SChris Mason 
1646e02119d5SChris Mason 	/* FIXME, put inode into FIXUP list */
1647e02119d5SChris Mason 
1648e02119d5SChris Mason 	iput(inode);
1649e02119d5SChris Mason 	iput(dir);
1650e02119d5SChris Mason 	return ret;
1651e02119d5SChris Mason }
1652e02119d5SChris Mason 
1653e02119d5SChris Mason /*
1654df8d116fSFilipe Manana  * Return true if an inode reference exists in the log for the given name,
1655df8d116fSFilipe Manana  * inode and parent inode.
1656df8d116fSFilipe Manana  */
1657df8d116fSFilipe Manana static bool name_in_log_ref(struct btrfs_root *log_root,
1658df8d116fSFilipe Manana 			    const char *name, const int name_len,
1659df8d116fSFilipe Manana 			    const u64 dirid, const u64 ino)
1660df8d116fSFilipe Manana {
1661df8d116fSFilipe Manana 	struct btrfs_key search_key;
1662df8d116fSFilipe Manana 
1663df8d116fSFilipe Manana 	search_key.objectid = ino;
1664df8d116fSFilipe Manana 	search_key.type = BTRFS_INODE_REF_KEY;
1665df8d116fSFilipe Manana 	search_key.offset = dirid;
1666df8d116fSFilipe Manana 	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1667df8d116fSFilipe Manana 		return true;
1668df8d116fSFilipe Manana 
1669df8d116fSFilipe Manana 	search_key.type = BTRFS_INODE_EXTREF_KEY;
1670df8d116fSFilipe Manana 	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1671df8d116fSFilipe Manana 	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1672df8d116fSFilipe Manana 		return true;
1673df8d116fSFilipe Manana 
1674df8d116fSFilipe Manana 	return false;
1675df8d116fSFilipe Manana }
1676df8d116fSFilipe Manana 
1677df8d116fSFilipe Manana /*
1678e02119d5SChris Mason  * take a single entry in a log directory item and replay it into
1679e02119d5SChris Mason  * the subvolume.
1680e02119d5SChris Mason  *
1681e02119d5SChris Mason  * if a conflicting item exists in the subdirectory already,
1682e02119d5SChris Mason  * the inode it points to is unlinked and put into the link count
1683e02119d5SChris Mason  * fix up tree.
1684e02119d5SChris Mason  *
1685e02119d5SChris Mason  * If a name from the log points to a file or directory that does
1686e02119d5SChris Mason  * not exist in the FS, it is skipped.  fsyncs on directories
1687e02119d5SChris Mason  * do not force down inodes inside that directory, just changes to the
1688e02119d5SChris Mason  * names or unlinks in a directory.
1689bb53eda9SFilipe Manana  *
1690bb53eda9SFilipe Manana  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1691bb53eda9SFilipe Manana  * non-existing inode) and 1 if the name was replayed.
1692e02119d5SChris Mason  */
1693e02119d5SChris Mason static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1694e02119d5SChris Mason 				    struct btrfs_root *root,
1695e02119d5SChris Mason 				    struct btrfs_path *path,
1696e02119d5SChris Mason 				    struct extent_buffer *eb,
1697e02119d5SChris Mason 				    struct btrfs_dir_item *di,
1698e02119d5SChris Mason 				    struct btrfs_key *key)
1699e02119d5SChris Mason {
1700e02119d5SChris Mason 	char *name;
1701e02119d5SChris Mason 	int name_len;
1702e02119d5SChris Mason 	struct btrfs_dir_item *dst_di;
1703e02119d5SChris Mason 	struct btrfs_key found_key;
1704e02119d5SChris Mason 	struct btrfs_key log_key;
1705e02119d5SChris Mason 	struct inode *dir;
1706e02119d5SChris Mason 	u8 log_type;
17074bef0848SChris Mason 	int exists;
17083650860bSJosef Bacik 	int ret = 0;
1709d555438bSJosef Bacik 	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1710bb53eda9SFilipe Manana 	bool name_added = false;
1711e02119d5SChris Mason 
1712e02119d5SChris Mason 	dir = read_one_inode(root, key->objectid);
1713c00e9493STsutomu Itoh 	if (!dir)
1714c00e9493STsutomu Itoh 		return -EIO;
1715e02119d5SChris Mason 
1716e02119d5SChris Mason 	name_len = btrfs_dir_name_len(eb, di);
1717e02119d5SChris Mason 	name = kmalloc(name_len, GFP_NOFS);
17182bac325eSFilipe David Borba Manana 	if (!name) {
17192bac325eSFilipe David Borba Manana 		ret = -ENOMEM;
17202bac325eSFilipe David Borba Manana 		goto out;
17212bac325eSFilipe David Borba Manana 	}
17222a29edc6Sliubo 
1723e02119d5SChris Mason 	log_type = btrfs_dir_type(eb, di);
1724e02119d5SChris Mason 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1725e02119d5SChris Mason 		   name_len);
1726e02119d5SChris Mason 
1727e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
17284bef0848SChris Mason 	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
17294bef0848SChris Mason 	if (exists == 0)
17304bef0848SChris Mason 		exists = 1;
17314bef0848SChris Mason 	else
17324bef0848SChris Mason 		exists = 0;
1733b3b4aa74SDavid Sterba 	btrfs_release_path(path);
17344bef0848SChris Mason 
1735e02119d5SChris Mason 	if (key->type == BTRFS_DIR_ITEM_KEY) {
1736e02119d5SChris Mason 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1737e02119d5SChris Mason 				       name, name_len, 1);
1738d397712bSChris Mason 	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1739e02119d5SChris Mason 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1740e02119d5SChris Mason 						     key->objectid,
1741e02119d5SChris Mason 						     key->offset, name,
1742e02119d5SChris Mason 						     name_len, 1);
1743e02119d5SChris Mason 	} else {
17443650860bSJosef Bacik 		/* Corruption */
17453650860bSJosef Bacik 		ret = -EINVAL;
17463650860bSJosef Bacik 		goto out;
1747e02119d5SChris Mason 	}
1748c704005dSDavid Sterba 	if (IS_ERR_OR_NULL(dst_di)) {
1749e02119d5SChris Mason 		/* we need a sequence number to insert, so we only
1750e02119d5SChris Mason 		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1751e02119d5SChris Mason 		 */
1752e02119d5SChris Mason 		if (key->type != BTRFS_DIR_INDEX_KEY)
1753e02119d5SChris Mason 			goto out;
1754e02119d5SChris Mason 		goto insert;
1755e02119d5SChris Mason 	}
1756e02119d5SChris Mason 
1757e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1758e02119d5SChris Mason 	/* the existing item matches the logged item */
1759e02119d5SChris Mason 	if (found_key.objectid == log_key.objectid &&
1760e02119d5SChris Mason 	    found_key.type == log_key.type &&
1761e02119d5SChris Mason 	    found_key.offset == log_key.offset &&
1762e02119d5SChris Mason 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1763a2cc11dbSFilipe Manana 		update_size = false;
1764e02119d5SChris Mason 		goto out;
1765e02119d5SChris Mason 	}
1766e02119d5SChris Mason 
1767e02119d5SChris Mason 	/*
1768e02119d5SChris Mason 	 * don't drop the conflicting directory entry if the inode
1769e02119d5SChris Mason 	 * for the new entry doesn't exist
1770e02119d5SChris Mason 	 */
17714bef0848SChris Mason 	if (!exists)
1772e02119d5SChris Mason 		goto out;
1773e02119d5SChris Mason 
1774207e7d92SNikolay Borisov 	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
17753650860bSJosef Bacik 	if (ret)
17763650860bSJosef Bacik 		goto out;
1777e02119d5SChris Mason 
1778e02119d5SChris Mason 	if (key->type == BTRFS_DIR_INDEX_KEY)
1779e02119d5SChris Mason 		goto insert;
1780e02119d5SChris Mason out:
1781b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1782d555438bSJosef Bacik 	if (!ret && update_size) {
1783d555438bSJosef Bacik 		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
1784d555438bSJosef Bacik 		ret = btrfs_update_inode(trans, root, dir);
1785d555438bSJosef Bacik 	}
1786e02119d5SChris Mason 	kfree(name);
1787e02119d5SChris Mason 	iput(dir);
1788bb53eda9SFilipe Manana 	if (!ret && name_added)
1789bb53eda9SFilipe Manana 		ret = 1;
17903650860bSJosef Bacik 	return ret;
1791e02119d5SChris Mason 
1792e02119d5SChris Mason insert:
1793df8d116fSFilipe Manana 	if (name_in_log_ref(root->log_root, name, name_len,
1794df8d116fSFilipe Manana 			    key->objectid, log_key.objectid)) {
1795df8d116fSFilipe Manana 		/* The dentry will be added later. */
1796df8d116fSFilipe Manana 		ret = 0;
1797df8d116fSFilipe Manana 		update_size = false;
1798df8d116fSFilipe Manana 		goto out;
1799df8d116fSFilipe Manana 	}
1800b3b4aa74SDavid Sterba 	btrfs_release_path(path);
180160d53eb3SZhaolei 	ret = insert_one_name(trans, root, key->objectid, key->offset,
180260d53eb3SZhaolei 			      name, name_len, &log_key);
1803df8d116fSFilipe Manana 	if (ret && ret != -ENOENT && ret != -EEXIST)
18043650860bSJosef Bacik 		goto out;
1805bb53eda9SFilipe Manana 	if (!ret)
1806bb53eda9SFilipe Manana 		name_added = true;
1807d555438bSJosef Bacik 	update_size = false;
18083650860bSJosef Bacik 	ret = 0;
1809e02119d5SChris Mason 	goto out;
1810e02119d5SChris Mason }
1811e02119d5SChris Mason 
1812e02119d5SChris Mason /*
1813e02119d5SChris Mason  * find all the names in a directory item and reconcile them into
1814e02119d5SChris Mason  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1815e02119d5SChris Mason  * one name in a directory item, but the same code gets used for
1816e02119d5SChris Mason  * both directory index types
1817e02119d5SChris Mason  */
1818e02119d5SChris Mason static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1819e02119d5SChris Mason 					struct btrfs_root *root,
1820e02119d5SChris Mason 					struct btrfs_path *path,
1821e02119d5SChris Mason 					struct extent_buffer *eb, int slot,
1822e02119d5SChris Mason 					struct btrfs_key *key)
1823e02119d5SChris Mason {
18242ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
1825bb53eda9SFilipe Manana 	int ret = 0;
1826e02119d5SChris Mason 	u32 item_size = btrfs_item_size_nr(eb, slot);
1827e02119d5SChris Mason 	struct btrfs_dir_item *di;
1828e02119d5SChris Mason 	int name_len;
1829e02119d5SChris Mason 	unsigned long ptr;
1830e02119d5SChris Mason 	unsigned long ptr_end;
1831bb53eda9SFilipe Manana 	struct btrfs_path *fixup_path = NULL;
1832e02119d5SChris Mason 
1833e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(eb, slot);
1834e02119d5SChris Mason 	ptr_end = ptr + item_size;
1835e02119d5SChris Mason 	while (ptr < ptr_end) {
1836e02119d5SChris Mason 		di = (struct btrfs_dir_item *)ptr;
18372ff7e61eSJeff Mahoney 		if (verify_dir_item(fs_info, eb, di))
183822a94d44SJosef Bacik 			return -EIO;
1839e02119d5SChris Mason 		name_len = btrfs_dir_name_len(eb, di);
1840e02119d5SChris Mason 		ret = replay_one_name(trans, root, path, eb, di, key);
1841bb53eda9SFilipe Manana 		if (ret < 0)
1842bb53eda9SFilipe Manana 			break;
1843e02119d5SChris Mason 		ptr = (unsigned long)(di + 1);
1844e02119d5SChris Mason 		ptr += name_len;
1845bb53eda9SFilipe Manana 
1846bb53eda9SFilipe Manana 		/*
1847bb53eda9SFilipe Manana 		 * If this entry refers to a non-directory (directories can not
1848bb53eda9SFilipe Manana 		 * have a link count > 1) and it was added in the transaction
1849bb53eda9SFilipe Manana 		 * that was not committed, make sure we fixup the link count of
1850bb53eda9SFilipe Manana 		 * the inode it the entry points to. Otherwise something like
1851bb53eda9SFilipe Manana 		 * the following would result in a directory pointing to an
1852bb53eda9SFilipe Manana 		 * inode with a wrong link that does not account for this dir
1853bb53eda9SFilipe Manana 		 * entry:
1854bb53eda9SFilipe Manana 		 *
1855bb53eda9SFilipe Manana 		 * mkdir testdir
1856bb53eda9SFilipe Manana 		 * touch testdir/foo
1857bb53eda9SFilipe Manana 		 * touch testdir/bar
1858bb53eda9SFilipe Manana 		 * sync
1859bb53eda9SFilipe Manana 		 *
1860bb53eda9SFilipe Manana 		 * ln testdir/bar testdir/bar_link
1861bb53eda9SFilipe Manana 		 * ln testdir/foo testdir/foo_link
1862bb53eda9SFilipe Manana 		 * xfs_io -c "fsync" testdir/bar
1863bb53eda9SFilipe Manana 		 *
1864bb53eda9SFilipe Manana 		 * <power failure>
1865bb53eda9SFilipe Manana 		 *
1866bb53eda9SFilipe Manana 		 * mount fs, log replay happens
1867bb53eda9SFilipe Manana 		 *
1868bb53eda9SFilipe Manana 		 * File foo would remain with a link count of 1 when it has two
1869bb53eda9SFilipe Manana 		 * entries pointing to it in the directory testdir. This would
1870bb53eda9SFilipe Manana 		 * make it impossible to ever delete the parent directory has
1871bb53eda9SFilipe Manana 		 * it would result in stale dentries that can never be deleted.
1872bb53eda9SFilipe Manana 		 */
1873bb53eda9SFilipe Manana 		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1874bb53eda9SFilipe Manana 			struct btrfs_key di_key;
1875bb53eda9SFilipe Manana 
1876bb53eda9SFilipe Manana 			if (!fixup_path) {
1877bb53eda9SFilipe Manana 				fixup_path = btrfs_alloc_path();
1878bb53eda9SFilipe Manana 				if (!fixup_path) {
1879bb53eda9SFilipe Manana 					ret = -ENOMEM;
1880bb53eda9SFilipe Manana 					break;
1881e02119d5SChris Mason 				}
1882bb53eda9SFilipe Manana 			}
1883bb53eda9SFilipe Manana 
1884bb53eda9SFilipe Manana 			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1885bb53eda9SFilipe Manana 			ret = link_to_fixup_dir(trans, root, fixup_path,
1886bb53eda9SFilipe Manana 						di_key.objectid);
1887bb53eda9SFilipe Manana 			if (ret)
1888bb53eda9SFilipe Manana 				break;
1889bb53eda9SFilipe Manana 		}
1890bb53eda9SFilipe Manana 		ret = 0;
1891bb53eda9SFilipe Manana 	}
1892bb53eda9SFilipe Manana 	btrfs_free_path(fixup_path);
1893bb53eda9SFilipe Manana 	return ret;
1894e02119d5SChris Mason }
1895e02119d5SChris Mason 
1896e02119d5SChris Mason /*
1897e02119d5SChris Mason  * directory replay has two parts.  There are the standard directory
1898e02119d5SChris Mason  * items in the log copied from the subvolume, and range items
1899e02119d5SChris Mason  * created in the log while the subvolume was logged.
1900e02119d5SChris Mason  *
1901e02119d5SChris Mason  * The range items tell us which parts of the key space the log
1902e02119d5SChris Mason  * is authoritative for.  During replay, if a key in the subvolume
1903e02119d5SChris Mason  * directory is in a logged range item, but not actually in the log
1904e02119d5SChris Mason  * that means it was deleted from the directory before the fsync
1905e02119d5SChris Mason  * and should be removed.
1906e02119d5SChris Mason  */
1907e02119d5SChris Mason static noinline int find_dir_range(struct btrfs_root *root,
1908e02119d5SChris Mason 				   struct btrfs_path *path,
1909e02119d5SChris Mason 				   u64 dirid, int key_type,
1910e02119d5SChris Mason 				   u64 *start_ret, u64 *end_ret)
1911e02119d5SChris Mason {
1912e02119d5SChris Mason 	struct btrfs_key key;
1913e02119d5SChris Mason 	u64 found_end;
1914e02119d5SChris Mason 	struct btrfs_dir_log_item *item;
1915e02119d5SChris Mason 	int ret;
1916e02119d5SChris Mason 	int nritems;
1917e02119d5SChris Mason 
1918e02119d5SChris Mason 	if (*start_ret == (u64)-1)
1919e02119d5SChris Mason 		return 1;
1920e02119d5SChris Mason 
1921e02119d5SChris Mason 	key.objectid = dirid;
1922e02119d5SChris Mason 	key.type = key_type;
1923e02119d5SChris Mason 	key.offset = *start_ret;
1924e02119d5SChris Mason 
1925e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1926e02119d5SChris Mason 	if (ret < 0)
1927e02119d5SChris Mason 		goto out;
1928e02119d5SChris Mason 	if (ret > 0) {
1929e02119d5SChris Mason 		if (path->slots[0] == 0)
1930e02119d5SChris Mason 			goto out;
1931e02119d5SChris Mason 		path->slots[0]--;
1932e02119d5SChris Mason 	}
1933e02119d5SChris Mason 	if (ret != 0)
1934e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1935e02119d5SChris Mason 
1936e02119d5SChris Mason 	if (key.type != key_type || key.objectid != dirid) {
1937e02119d5SChris Mason 		ret = 1;
1938e02119d5SChris Mason 		goto next;
1939e02119d5SChris Mason 	}
1940e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1941e02119d5SChris Mason 			      struct btrfs_dir_log_item);
1942e02119d5SChris Mason 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1943e02119d5SChris Mason 
1944e02119d5SChris Mason 	if (*start_ret >= key.offset && *start_ret <= found_end) {
1945e02119d5SChris Mason 		ret = 0;
1946e02119d5SChris Mason 		*start_ret = key.offset;
1947e02119d5SChris Mason 		*end_ret = found_end;
1948e02119d5SChris Mason 		goto out;
1949e02119d5SChris Mason 	}
1950e02119d5SChris Mason 	ret = 1;
1951e02119d5SChris Mason next:
1952e02119d5SChris Mason 	/* check the next slot in the tree to see if it is a valid item */
1953e02119d5SChris Mason 	nritems = btrfs_header_nritems(path->nodes[0]);
19542a7bf53fSRobbie Ko 	path->slots[0]++;
1955e02119d5SChris Mason 	if (path->slots[0] >= nritems) {
1956e02119d5SChris Mason 		ret = btrfs_next_leaf(root, path);
1957e02119d5SChris Mason 		if (ret)
1958e02119d5SChris Mason 			goto out;
1959e02119d5SChris Mason 	}
1960e02119d5SChris Mason 
1961e02119d5SChris Mason 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1962e02119d5SChris Mason 
1963e02119d5SChris Mason 	if (key.type != key_type || key.objectid != dirid) {
1964e02119d5SChris Mason 		ret = 1;
1965e02119d5SChris Mason 		goto out;
1966e02119d5SChris Mason 	}
1967e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1968e02119d5SChris Mason 			      struct btrfs_dir_log_item);
1969e02119d5SChris Mason 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1970e02119d5SChris Mason 	*start_ret = key.offset;
1971e02119d5SChris Mason 	*end_ret = found_end;
1972e02119d5SChris Mason 	ret = 0;
1973e02119d5SChris Mason out:
1974b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1975e02119d5SChris Mason 	return ret;
1976e02119d5SChris Mason }
1977e02119d5SChris Mason 
1978e02119d5SChris Mason /*
1979e02119d5SChris Mason  * this looks for a given directory item in the log.  If the directory
1980e02119d5SChris Mason  * item is not in the log, the item is removed and the inode it points
1981e02119d5SChris Mason  * to is unlinked
1982e02119d5SChris Mason  */
1983e02119d5SChris Mason static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1984e02119d5SChris Mason 				      struct btrfs_root *root,
1985e02119d5SChris Mason 				      struct btrfs_root *log,
1986e02119d5SChris Mason 				      struct btrfs_path *path,
1987e02119d5SChris Mason 				      struct btrfs_path *log_path,
1988e02119d5SChris Mason 				      struct inode *dir,
1989e02119d5SChris Mason 				      struct btrfs_key *dir_key)
1990e02119d5SChris Mason {
19912ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
1992e02119d5SChris Mason 	int ret;
1993e02119d5SChris Mason 	struct extent_buffer *eb;
1994e02119d5SChris Mason 	int slot;
1995e02119d5SChris Mason 	u32 item_size;
1996e02119d5SChris Mason 	struct btrfs_dir_item *di;
1997e02119d5SChris Mason 	struct btrfs_dir_item *log_di;
1998e02119d5SChris Mason 	int name_len;
1999e02119d5SChris Mason 	unsigned long ptr;
2000e02119d5SChris Mason 	unsigned long ptr_end;
2001e02119d5SChris Mason 	char *name;
2002e02119d5SChris Mason 	struct inode *inode;
2003e02119d5SChris Mason 	struct btrfs_key location;
2004e02119d5SChris Mason 
2005e02119d5SChris Mason again:
2006e02119d5SChris Mason 	eb = path->nodes[0];
2007e02119d5SChris Mason 	slot = path->slots[0];
2008e02119d5SChris Mason 	item_size = btrfs_item_size_nr(eb, slot);
2009e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(eb, slot);
2010e02119d5SChris Mason 	ptr_end = ptr + item_size;
2011e02119d5SChris Mason 	while (ptr < ptr_end) {
2012e02119d5SChris Mason 		di = (struct btrfs_dir_item *)ptr;
20132ff7e61eSJeff Mahoney 		if (verify_dir_item(fs_info, eb, di)) {
201422a94d44SJosef Bacik 			ret = -EIO;
201522a94d44SJosef Bacik 			goto out;
201622a94d44SJosef Bacik 		}
201722a94d44SJosef Bacik 
2018e02119d5SChris Mason 		name_len = btrfs_dir_name_len(eb, di);
2019e02119d5SChris Mason 		name = kmalloc(name_len, GFP_NOFS);
2020e02119d5SChris Mason 		if (!name) {
2021e02119d5SChris Mason 			ret = -ENOMEM;
2022e02119d5SChris Mason 			goto out;
2023e02119d5SChris Mason 		}
2024e02119d5SChris Mason 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
2025e02119d5SChris Mason 				  name_len);
2026e02119d5SChris Mason 		log_di = NULL;
202712fcfd22SChris Mason 		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2028e02119d5SChris Mason 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
2029e02119d5SChris Mason 						       dir_key->objectid,
2030e02119d5SChris Mason 						       name, name_len, 0);
203112fcfd22SChris Mason 		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2032e02119d5SChris Mason 			log_di = btrfs_lookup_dir_index_item(trans, log,
2033e02119d5SChris Mason 						     log_path,
2034e02119d5SChris Mason 						     dir_key->objectid,
2035e02119d5SChris Mason 						     dir_key->offset,
2036e02119d5SChris Mason 						     name, name_len, 0);
2037e02119d5SChris Mason 		}
2038269d040fSFilipe David Borba Manana 		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
2039e02119d5SChris Mason 			btrfs_dir_item_key_to_cpu(eb, di, &location);
2040b3b4aa74SDavid Sterba 			btrfs_release_path(path);
2041b3b4aa74SDavid Sterba 			btrfs_release_path(log_path);
2042e02119d5SChris Mason 			inode = read_one_inode(root, location.objectid);
2043c00e9493STsutomu Itoh 			if (!inode) {
2044c00e9493STsutomu Itoh 				kfree(name);
2045c00e9493STsutomu Itoh 				return -EIO;
2046c00e9493STsutomu Itoh 			}
2047e02119d5SChris Mason 
2048e02119d5SChris Mason 			ret = link_to_fixup_dir(trans, root,
2049e02119d5SChris Mason 						path, location.objectid);
20503650860bSJosef Bacik 			if (ret) {
20513650860bSJosef Bacik 				kfree(name);
20523650860bSJosef Bacik 				iput(inode);
20533650860bSJosef Bacik 				goto out;
20543650860bSJosef Bacik 			}
20553650860bSJosef Bacik 
20568b558c5fSZach Brown 			inc_nlink(inode);
20574ec5934eSNikolay Borisov 			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
20584ec5934eSNikolay Borisov 					BTRFS_I(inode), name, name_len);
20593650860bSJosef Bacik 			if (!ret)
20602ff7e61eSJeff Mahoney 				ret = btrfs_run_delayed_items(trans, fs_info);
2061e02119d5SChris Mason 			kfree(name);
2062e02119d5SChris Mason 			iput(inode);
20633650860bSJosef Bacik 			if (ret)
20643650860bSJosef Bacik 				goto out;
2065e02119d5SChris Mason 
2066e02119d5SChris Mason 			/* there might still be more names under this key
2067e02119d5SChris Mason 			 * check and repeat if required
2068e02119d5SChris Mason 			 */
2069e02119d5SChris Mason 			ret = btrfs_search_slot(NULL, root, dir_key, path,
2070e02119d5SChris Mason 						0, 0);
2071e02119d5SChris Mason 			if (ret == 0)
2072e02119d5SChris Mason 				goto again;
2073e02119d5SChris Mason 			ret = 0;
2074e02119d5SChris Mason 			goto out;
2075269d040fSFilipe David Borba Manana 		} else if (IS_ERR(log_di)) {
2076269d040fSFilipe David Borba Manana 			kfree(name);
2077269d040fSFilipe David Borba Manana 			return PTR_ERR(log_di);
2078e02119d5SChris Mason 		}
2079b3b4aa74SDavid Sterba 		btrfs_release_path(log_path);
2080e02119d5SChris Mason 		kfree(name);
2081e02119d5SChris Mason 
2082e02119d5SChris Mason 		ptr = (unsigned long)(di + 1);
2083e02119d5SChris Mason 		ptr += name_len;
2084e02119d5SChris Mason 	}
2085e02119d5SChris Mason 	ret = 0;
2086e02119d5SChris Mason out:
2087b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2088b3b4aa74SDavid Sterba 	btrfs_release_path(log_path);
2089e02119d5SChris Mason 	return ret;
2090e02119d5SChris Mason }
2091e02119d5SChris Mason 
20924f764e51SFilipe Manana static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
20934f764e51SFilipe Manana 			      struct btrfs_root *root,
20944f764e51SFilipe Manana 			      struct btrfs_root *log,
20954f764e51SFilipe Manana 			      struct btrfs_path *path,
20964f764e51SFilipe Manana 			      const u64 ino)
20974f764e51SFilipe Manana {
20984f764e51SFilipe Manana 	struct btrfs_key search_key;
20994f764e51SFilipe Manana 	struct btrfs_path *log_path;
21004f764e51SFilipe Manana 	int i;
21014f764e51SFilipe Manana 	int nritems;
21024f764e51SFilipe Manana 	int ret;
21034f764e51SFilipe Manana 
21044f764e51SFilipe Manana 	log_path = btrfs_alloc_path();
21054f764e51SFilipe Manana 	if (!log_path)
21064f764e51SFilipe Manana 		return -ENOMEM;
21074f764e51SFilipe Manana 
21084f764e51SFilipe Manana 	search_key.objectid = ino;
21094f764e51SFilipe Manana 	search_key.type = BTRFS_XATTR_ITEM_KEY;
21104f764e51SFilipe Manana 	search_key.offset = 0;
21114f764e51SFilipe Manana again:
21124f764e51SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
21134f764e51SFilipe Manana 	if (ret < 0)
21144f764e51SFilipe Manana 		goto out;
21154f764e51SFilipe Manana process_leaf:
21164f764e51SFilipe Manana 	nritems = btrfs_header_nritems(path->nodes[0]);
21174f764e51SFilipe Manana 	for (i = path->slots[0]; i < nritems; i++) {
21184f764e51SFilipe Manana 		struct btrfs_key key;
21194f764e51SFilipe Manana 		struct btrfs_dir_item *di;
21204f764e51SFilipe Manana 		struct btrfs_dir_item *log_di;
21214f764e51SFilipe Manana 		u32 total_size;
21224f764e51SFilipe Manana 		u32 cur;
21234f764e51SFilipe Manana 
21244f764e51SFilipe Manana 		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
21254f764e51SFilipe Manana 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
21264f764e51SFilipe Manana 			ret = 0;
21274f764e51SFilipe Manana 			goto out;
21284f764e51SFilipe Manana 		}
21294f764e51SFilipe Manana 
21304f764e51SFilipe Manana 		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
21314f764e51SFilipe Manana 		total_size = btrfs_item_size_nr(path->nodes[0], i);
21324f764e51SFilipe Manana 		cur = 0;
21334f764e51SFilipe Manana 		while (cur < total_size) {
21344f764e51SFilipe Manana 			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
21354f764e51SFilipe Manana 			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
21364f764e51SFilipe Manana 			u32 this_len = sizeof(*di) + name_len + data_len;
21374f764e51SFilipe Manana 			char *name;
21384f764e51SFilipe Manana 
21394f764e51SFilipe Manana 			name = kmalloc(name_len, GFP_NOFS);
21404f764e51SFilipe Manana 			if (!name) {
21414f764e51SFilipe Manana 				ret = -ENOMEM;
21424f764e51SFilipe Manana 				goto out;
21434f764e51SFilipe Manana 			}
21444f764e51SFilipe Manana 			read_extent_buffer(path->nodes[0], name,
21454f764e51SFilipe Manana 					   (unsigned long)(di + 1), name_len);
21464f764e51SFilipe Manana 
21474f764e51SFilipe Manana 			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
21484f764e51SFilipe Manana 						    name, name_len, 0);
21494f764e51SFilipe Manana 			btrfs_release_path(log_path);
21504f764e51SFilipe Manana 			if (!log_di) {
21514f764e51SFilipe Manana 				/* Doesn't exist in log tree, so delete it. */
21524f764e51SFilipe Manana 				btrfs_release_path(path);
21534f764e51SFilipe Manana 				di = btrfs_lookup_xattr(trans, root, path, ino,
21544f764e51SFilipe Manana 							name, name_len, -1);
21554f764e51SFilipe Manana 				kfree(name);
21564f764e51SFilipe Manana 				if (IS_ERR(di)) {
21574f764e51SFilipe Manana 					ret = PTR_ERR(di);
21584f764e51SFilipe Manana 					goto out;
21594f764e51SFilipe Manana 				}
21604f764e51SFilipe Manana 				ASSERT(di);
21614f764e51SFilipe Manana 				ret = btrfs_delete_one_dir_name(trans, root,
21624f764e51SFilipe Manana 								path, di);
21634f764e51SFilipe Manana 				if (ret)
21644f764e51SFilipe Manana 					goto out;
21654f764e51SFilipe Manana 				btrfs_release_path(path);
21664f764e51SFilipe Manana 				search_key = key;
21674f764e51SFilipe Manana 				goto again;
21684f764e51SFilipe Manana 			}
21694f764e51SFilipe Manana 			kfree(name);
21704f764e51SFilipe Manana 			if (IS_ERR(log_di)) {
21714f764e51SFilipe Manana 				ret = PTR_ERR(log_di);
21724f764e51SFilipe Manana 				goto out;
21734f764e51SFilipe Manana 			}
21744f764e51SFilipe Manana 			cur += this_len;
21754f764e51SFilipe Manana 			di = (struct btrfs_dir_item *)((char *)di + this_len);
21764f764e51SFilipe Manana 		}
21774f764e51SFilipe Manana 	}
21784f764e51SFilipe Manana 	ret = btrfs_next_leaf(root, path);
21794f764e51SFilipe Manana 	if (ret > 0)
21804f764e51SFilipe Manana 		ret = 0;
21814f764e51SFilipe Manana 	else if (ret == 0)
21824f764e51SFilipe Manana 		goto process_leaf;
21834f764e51SFilipe Manana out:
21844f764e51SFilipe Manana 	btrfs_free_path(log_path);
21854f764e51SFilipe Manana 	btrfs_release_path(path);
21864f764e51SFilipe Manana 	return ret;
21874f764e51SFilipe Manana }
21884f764e51SFilipe Manana 
21894f764e51SFilipe Manana 
2190e02119d5SChris Mason /*
2191e02119d5SChris Mason  * deletion replay happens before we copy any new directory items
2192e02119d5SChris Mason  * out of the log or out of backreferences from inodes.  It
2193e02119d5SChris Mason  * scans the log to find ranges of keys that log is authoritative for,
2194e02119d5SChris Mason  * and then scans the directory to find items in those ranges that are
2195e02119d5SChris Mason  * not present in the log.
2196e02119d5SChris Mason  *
2197e02119d5SChris Mason  * Anything we don't find in the log is unlinked and removed from the
2198e02119d5SChris Mason  * directory.
2199e02119d5SChris Mason  */
2200e02119d5SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2201e02119d5SChris Mason 				       struct btrfs_root *root,
2202e02119d5SChris Mason 				       struct btrfs_root *log,
2203e02119d5SChris Mason 				       struct btrfs_path *path,
220412fcfd22SChris Mason 				       u64 dirid, int del_all)
2205e02119d5SChris Mason {
2206e02119d5SChris Mason 	u64 range_start;
2207e02119d5SChris Mason 	u64 range_end;
2208e02119d5SChris Mason 	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2209e02119d5SChris Mason 	int ret = 0;
2210e02119d5SChris Mason 	struct btrfs_key dir_key;
2211e02119d5SChris Mason 	struct btrfs_key found_key;
2212e02119d5SChris Mason 	struct btrfs_path *log_path;
2213e02119d5SChris Mason 	struct inode *dir;
2214e02119d5SChris Mason 
2215e02119d5SChris Mason 	dir_key.objectid = dirid;
2216e02119d5SChris Mason 	dir_key.type = BTRFS_DIR_ITEM_KEY;
2217e02119d5SChris Mason 	log_path = btrfs_alloc_path();
2218e02119d5SChris Mason 	if (!log_path)
2219e02119d5SChris Mason 		return -ENOMEM;
2220e02119d5SChris Mason 
2221e02119d5SChris Mason 	dir = read_one_inode(root, dirid);
2222e02119d5SChris Mason 	/* it isn't an error if the inode isn't there, that can happen
2223e02119d5SChris Mason 	 * because we replay the deletes before we copy in the inode item
2224e02119d5SChris Mason 	 * from the log
2225e02119d5SChris Mason 	 */
2226e02119d5SChris Mason 	if (!dir) {
2227e02119d5SChris Mason 		btrfs_free_path(log_path);
2228e02119d5SChris Mason 		return 0;
2229e02119d5SChris Mason 	}
2230e02119d5SChris Mason again:
2231e02119d5SChris Mason 	range_start = 0;
2232e02119d5SChris Mason 	range_end = 0;
2233e02119d5SChris Mason 	while (1) {
223412fcfd22SChris Mason 		if (del_all)
223512fcfd22SChris Mason 			range_end = (u64)-1;
223612fcfd22SChris Mason 		else {
2237e02119d5SChris Mason 			ret = find_dir_range(log, path, dirid, key_type,
2238e02119d5SChris Mason 					     &range_start, &range_end);
2239e02119d5SChris Mason 			if (ret != 0)
2240e02119d5SChris Mason 				break;
224112fcfd22SChris Mason 		}
2242e02119d5SChris Mason 
2243e02119d5SChris Mason 		dir_key.offset = range_start;
2244e02119d5SChris Mason 		while (1) {
2245e02119d5SChris Mason 			int nritems;
2246e02119d5SChris Mason 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2247e02119d5SChris Mason 						0, 0);
2248e02119d5SChris Mason 			if (ret < 0)
2249e02119d5SChris Mason 				goto out;
2250e02119d5SChris Mason 
2251e02119d5SChris Mason 			nritems = btrfs_header_nritems(path->nodes[0]);
2252e02119d5SChris Mason 			if (path->slots[0] >= nritems) {
2253e02119d5SChris Mason 				ret = btrfs_next_leaf(root, path);
2254e02119d5SChris Mason 				if (ret)
2255e02119d5SChris Mason 					break;
2256e02119d5SChris Mason 			}
2257e02119d5SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2258e02119d5SChris Mason 					      path->slots[0]);
2259e02119d5SChris Mason 			if (found_key.objectid != dirid ||
2260e02119d5SChris Mason 			    found_key.type != dir_key.type)
2261e02119d5SChris Mason 				goto next_type;
2262e02119d5SChris Mason 
2263e02119d5SChris Mason 			if (found_key.offset > range_end)
2264e02119d5SChris Mason 				break;
2265e02119d5SChris Mason 
2266e02119d5SChris Mason 			ret = check_item_in_log(trans, root, log, path,
226712fcfd22SChris Mason 						log_path, dir,
226812fcfd22SChris Mason 						&found_key);
22693650860bSJosef Bacik 			if (ret)
22703650860bSJosef Bacik 				goto out;
2271e02119d5SChris Mason 			if (found_key.offset == (u64)-1)
2272e02119d5SChris Mason 				break;
2273e02119d5SChris Mason 			dir_key.offset = found_key.offset + 1;
2274e02119d5SChris Mason 		}
2275b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2276e02119d5SChris Mason 		if (range_end == (u64)-1)
2277e02119d5SChris Mason 			break;
2278e02119d5SChris Mason 		range_start = range_end + 1;
2279e02119d5SChris Mason 	}
2280e02119d5SChris Mason 
2281e02119d5SChris Mason next_type:
2282e02119d5SChris Mason 	ret = 0;
2283e02119d5SChris Mason 	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2284e02119d5SChris Mason 		key_type = BTRFS_DIR_LOG_INDEX_KEY;
2285e02119d5SChris Mason 		dir_key.type = BTRFS_DIR_INDEX_KEY;
2286b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2287e02119d5SChris Mason 		goto again;
2288e02119d5SChris Mason 	}
2289e02119d5SChris Mason out:
2290b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2291e02119d5SChris Mason 	btrfs_free_path(log_path);
2292e02119d5SChris Mason 	iput(dir);
2293e02119d5SChris Mason 	return ret;
2294e02119d5SChris Mason }
2295e02119d5SChris Mason 
2296e02119d5SChris Mason /*
2297e02119d5SChris Mason  * the process_func used to replay items from the log tree.  This
2298e02119d5SChris Mason  * gets called in two different stages.  The first stage just looks
2299e02119d5SChris Mason  * for inodes and makes sure they are all copied into the subvolume.
2300e02119d5SChris Mason  *
2301e02119d5SChris Mason  * The second stage copies all the other item types from the log into
2302e02119d5SChris Mason  * the subvolume.  The two stage approach is slower, but gets rid of
2303e02119d5SChris Mason  * lots of complexity around inodes referencing other inodes that exist
2304e02119d5SChris Mason  * only in the log (references come from either directory items or inode
2305e02119d5SChris Mason  * back refs).
2306e02119d5SChris Mason  */
2307e02119d5SChris Mason static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2308e02119d5SChris Mason 			     struct walk_control *wc, u64 gen)
2309e02119d5SChris Mason {
2310e02119d5SChris Mason 	int nritems;
2311e02119d5SChris Mason 	struct btrfs_path *path;
2312e02119d5SChris Mason 	struct btrfs_root *root = wc->replay_dest;
2313e02119d5SChris Mason 	struct btrfs_key key;
2314e02119d5SChris Mason 	int level;
2315e02119d5SChris Mason 	int i;
2316e02119d5SChris Mason 	int ret;
2317e02119d5SChris Mason 
2318018642a1STsutomu Itoh 	ret = btrfs_read_buffer(eb, gen);
2319018642a1STsutomu Itoh 	if (ret)
2320018642a1STsutomu Itoh 		return ret;
2321e02119d5SChris Mason 
2322e02119d5SChris Mason 	level = btrfs_header_level(eb);
2323e02119d5SChris Mason 
2324e02119d5SChris Mason 	if (level != 0)
2325e02119d5SChris Mason 		return 0;
2326e02119d5SChris Mason 
2327e02119d5SChris Mason 	path = btrfs_alloc_path();
23281e5063d0SMark Fasheh 	if (!path)
23291e5063d0SMark Fasheh 		return -ENOMEM;
2330e02119d5SChris Mason 
2331e02119d5SChris Mason 	nritems = btrfs_header_nritems(eb);
2332e02119d5SChris Mason 	for (i = 0; i < nritems; i++) {
2333e02119d5SChris Mason 		btrfs_item_key_to_cpu(eb, &key, i);
2334e02119d5SChris Mason 
2335e02119d5SChris Mason 		/* inode keys are done during the first stage */
2336e02119d5SChris Mason 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2337e02119d5SChris Mason 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2338e02119d5SChris Mason 			struct btrfs_inode_item *inode_item;
2339e02119d5SChris Mason 			u32 mode;
2340e02119d5SChris Mason 
2341e02119d5SChris Mason 			inode_item = btrfs_item_ptr(eb, i,
2342e02119d5SChris Mason 					    struct btrfs_inode_item);
23434f764e51SFilipe Manana 			ret = replay_xattr_deletes(wc->trans, root, log,
23444f764e51SFilipe Manana 						   path, key.objectid);
23454f764e51SFilipe Manana 			if (ret)
23464f764e51SFilipe Manana 				break;
2347e02119d5SChris Mason 			mode = btrfs_inode_mode(eb, inode_item);
2348e02119d5SChris Mason 			if (S_ISDIR(mode)) {
2349e02119d5SChris Mason 				ret = replay_dir_deletes(wc->trans,
235012fcfd22SChris Mason 					 root, log, path, key.objectid, 0);
2351b50c6e25SJosef Bacik 				if (ret)
2352b50c6e25SJosef Bacik 					break;
2353e02119d5SChris Mason 			}
2354e02119d5SChris Mason 			ret = overwrite_item(wc->trans, root, path,
2355e02119d5SChris Mason 					     eb, i, &key);
2356b50c6e25SJosef Bacik 			if (ret)
2357b50c6e25SJosef Bacik 				break;
2358e02119d5SChris Mason 
2359c71bf099SYan, Zheng 			/* for regular files, make sure corresponding
236001327610SNicholas D Steeves 			 * orphan item exist. extents past the new EOF
2361c71bf099SYan, Zheng 			 * will be truncated later by orphan cleanup.
2362e02119d5SChris Mason 			 */
2363e02119d5SChris Mason 			if (S_ISREG(mode)) {
2364c71bf099SYan, Zheng 				ret = insert_orphan_item(wc->trans, root,
2365e02119d5SChris Mason 							 key.objectid);
2366b50c6e25SJosef Bacik 				if (ret)
2367b50c6e25SJosef Bacik 					break;
2368c71bf099SYan, Zheng 			}
2369a74ac322SChris Mason 
2370e02119d5SChris Mason 			ret = link_to_fixup_dir(wc->trans, root,
2371e02119d5SChris Mason 						path, key.objectid);
2372b50c6e25SJosef Bacik 			if (ret)
2373b50c6e25SJosef Bacik 				break;
2374e02119d5SChris Mason 		}
2375dd8e7217SJosef Bacik 
2376dd8e7217SJosef Bacik 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2377dd8e7217SJosef Bacik 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2378dd8e7217SJosef Bacik 			ret = replay_one_dir_item(wc->trans, root, path,
2379dd8e7217SJosef Bacik 						  eb, i, &key);
2380dd8e7217SJosef Bacik 			if (ret)
2381dd8e7217SJosef Bacik 				break;
2382dd8e7217SJosef Bacik 		}
2383dd8e7217SJosef Bacik 
2384e02119d5SChris Mason 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2385e02119d5SChris Mason 			continue;
2386e02119d5SChris Mason 
2387e02119d5SChris Mason 		/* these keys are simply copied */
2388e02119d5SChris Mason 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2389e02119d5SChris Mason 			ret = overwrite_item(wc->trans, root, path,
2390e02119d5SChris Mason 					     eb, i, &key);
2391b50c6e25SJosef Bacik 			if (ret)
2392b50c6e25SJosef Bacik 				break;
23932da1c669SLiu Bo 		} else if (key.type == BTRFS_INODE_REF_KEY ||
23942da1c669SLiu Bo 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2395f186373fSMark Fasheh 			ret = add_inode_ref(wc->trans, root, log, path,
2396f186373fSMark Fasheh 					    eb, i, &key);
2397b50c6e25SJosef Bacik 			if (ret && ret != -ENOENT)
2398b50c6e25SJosef Bacik 				break;
2399b50c6e25SJosef Bacik 			ret = 0;
2400e02119d5SChris Mason 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2401e02119d5SChris Mason 			ret = replay_one_extent(wc->trans, root, path,
2402e02119d5SChris Mason 						eb, i, &key);
2403b50c6e25SJosef Bacik 			if (ret)
2404b50c6e25SJosef Bacik 				break;
2405dd8e7217SJosef Bacik 		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2406e02119d5SChris Mason 			ret = replay_one_dir_item(wc->trans, root, path,
2407e02119d5SChris Mason 						  eb, i, &key);
2408b50c6e25SJosef Bacik 			if (ret)
2409b50c6e25SJosef Bacik 				break;
2410e02119d5SChris Mason 		}
2411e02119d5SChris Mason 	}
2412e02119d5SChris Mason 	btrfs_free_path(path);
2413b50c6e25SJosef Bacik 	return ret;
2414e02119d5SChris Mason }
2415e02119d5SChris Mason 
2416d397712bSChris Mason static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2417e02119d5SChris Mason 				   struct btrfs_root *root,
2418e02119d5SChris Mason 				   struct btrfs_path *path, int *level,
2419e02119d5SChris Mason 				   struct walk_control *wc)
2420e02119d5SChris Mason {
24210b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2422e02119d5SChris Mason 	u64 root_owner;
2423e02119d5SChris Mason 	u64 bytenr;
2424e02119d5SChris Mason 	u64 ptr_gen;
2425e02119d5SChris Mason 	struct extent_buffer *next;
2426e02119d5SChris Mason 	struct extent_buffer *cur;
2427e02119d5SChris Mason 	struct extent_buffer *parent;
2428e02119d5SChris Mason 	u32 blocksize;
2429e02119d5SChris Mason 	int ret = 0;
2430e02119d5SChris Mason 
2431e02119d5SChris Mason 	WARN_ON(*level < 0);
2432e02119d5SChris Mason 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2433e02119d5SChris Mason 
2434e02119d5SChris Mason 	while (*level > 0) {
2435e02119d5SChris Mason 		WARN_ON(*level < 0);
2436e02119d5SChris Mason 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
2437e02119d5SChris Mason 		cur = path->nodes[*level];
2438e02119d5SChris Mason 
2439fae7f21cSDulshani Gunawardhana 		WARN_ON(btrfs_header_level(cur) != *level);
2440e02119d5SChris Mason 
2441e02119d5SChris Mason 		if (path->slots[*level] >=
2442e02119d5SChris Mason 		    btrfs_header_nritems(cur))
2443e02119d5SChris Mason 			break;
2444e02119d5SChris Mason 
2445e02119d5SChris Mason 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2446e02119d5SChris Mason 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
24470b246afaSJeff Mahoney 		blocksize = fs_info->nodesize;
2448e02119d5SChris Mason 
2449e02119d5SChris Mason 		parent = path->nodes[*level];
2450e02119d5SChris Mason 		root_owner = btrfs_header_owner(parent);
2451e02119d5SChris Mason 
24522ff7e61eSJeff Mahoney 		next = btrfs_find_create_tree_block(fs_info, bytenr);
2453c871b0f2SLiu Bo 		if (IS_ERR(next))
2454c871b0f2SLiu Bo 			return PTR_ERR(next);
2455e02119d5SChris Mason 
24564a500fd1SYan, Zheng 		if (*level == 1) {
24571e5063d0SMark Fasheh 			ret = wc->process_func(root, next, wc, ptr_gen);
2458b50c6e25SJosef Bacik 			if (ret) {
2459b50c6e25SJosef Bacik 				free_extent_buffer(next);
24601e5063d0SMark Fasheh 				return ret;
2461b50c6e25SJosef Bacik 			}
2462e02119d5SChris Mason 
2463e02119d5SChris Mason 			path->slots[*level]++;
2464e02119d5SChris Mason 			if (wc->free) {
2465018642a1STsutomu Itoh 				ret = btrfs_read_buffer(next, ptr_gen);
2466018642a1STsutomu Itoh 				if (ret) {
2467018642a1STsutomu Itoh 					free_extent_buffer(next);
2468018642a1STsutomu Itoh 					return ret;
2469018642a1STsutomu Itoh 				}
2470e02119d5SChris Mason 
2471681ae509SJosef Bacik 				if (trans) {
2472e02119d5SChris Mason 					btrfs_tree_lock(next);
2473b4ce94deSChris Mason 					btrfs_set_lock_blocking(next);
24740b246afaSJeff Mahoney 					clean_tree_block(trans, fs_info, next);
2475e02119d5SChris Mason 					btrfs_wait_tree_block_writeback(next);
2476e02119d5SChris Mason 					btrfs_tree_unlock(next);
2477681ae509SJosef Bacik 				}
2478e02119d5SChris Mason 
2479e02119d5SChris Mason 				WARN_ON(root_owner !=
2480e02119d5SChris Mason 					BTRFS_TREE_LOG_OBJECTID);
24812ff7e61eSJeff Mahoney 				ret = btrfs_free_and_pin_reserved_extent(
24822ff7e61eSJeff Mahoney 							fs_info, bytenr,
24832ff7e61eSJeff Mahoney 							blocksize);
24843650860bSJosef Bacik 				if (ret) {
24853650860bSJosef Bacik 					free_extent_buffer(next);
24863650860bSJosef Bacik 					return ret;
24873650860bSJosef Bacik 				}
2488e02119d5SChris Mason 			}
2489e02119d5SChris Mason 			free_extent_buffer(next);
2490e02119d5SChris Mason 			continue;
2491e02119d5SChris Mason 		}
2492018642a1STsutomu Itoh 		ret = btrfs_read_buffer(next, ptr_gen);
2493018642a1STsutomu Itoh 		if (ret) {
2494018642a1STsutomu Itoh 			free_extent_buffer(next);
2495018642a1STsutomu Itoh 			return ret;
2496018642a1STsutomu Itoh 		}
2497e02119d5SChris Mason 
2498e02119d5SChris Mason 		WARN_ON(*level <= 0);
2499e02119d5SChris Mason 		if (path->nodes[*level-1])
2500e02119d5SChris Mason 			free_extent_buffer(path->nodes[*level-1]);
2501e02119d5SChris Mason 		path->nodes[*level-1] = next;
2502e02119d5SChris Mason 		*level = btrfs_header_level(next);
2503e02119d5SChris Mason 		path->slots[*level] = 0;
2504e02119d5SChris Mason 		cond_resched();
2505e02119d5SChris Mason 	}
2506e02119d5SChris Mason 	WARN_ON(*level < 0);
2507e02119d5SChris Mason 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2508e02119d5SChris Mason 
25094a500fd1SYan, Zheng 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2510e02119d5SChris Mason 
2511e02119d5SChris Mason 	cond_resched();
2512e02119d5SChris Mason 	return 0;
2513e02119d5SChris Mason }
2514e02119d5SChris Mason 
2515d397712bSChris Mason static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2516e02119d5SChris Mason 				 struct btrfs_root *root,
2517e02119d5SChris Mason 				 struct btrfs_path *path, int *level,
2518e02119d5SChris Mason 				 struct walk_control *wc)
2519e02119d5SChris Mason {
25200b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2521e02119d5SChris Mason 	u64 root_owner;
2522e02119d5SChris Mason 	int i;
2523e02119d5SChris Mason 	int slot;
2524e02119d5SChris Mason 	int ret;
2525e02119d5SChris Mason 
2526e02119d5SChris Mason 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2527e02119d5SChris Mason 		slot = path->slots[i];
25284a500fd1SYan, Zheng 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2529e02119d5SChris Mason 			path->slots[i]++;
2530e02119d5SChris Mason 			*level = i;
2531e02119d5SChris Mason 			WARN_ON(*level == 0);
2532e02119d5SChris Mason 			return 0;
2533e02119d5SChris Mason 		} else {
253431840ae1SZheng Yan 			struct extent_buffer *parent;
253531840ae1SZheng Yan 			if (path->nodes[*level] == root->node)
253631840ae1SZheng Yan 				parent = path->nodes[*level];
253731840ae1SZheng Yan 			else
253831840ae1SZheng Yan 				parent = path->nodes[*level + 1];
253931840ae1SZheng Yan 
254031840ae1SZheng Yan 			root_owner = btrfs_header_owner(parent);
25411e5063d0SMark Fasheh 			ret = wc->process_func(root, path->nodes[*level], wc,
2542e02119d5SChris Mason 				 btrfs_header_generation(path->nodes[*level]));
25431e5063d0SMark Fasheh 			if (ret)
25441e5063d0SMark Fasheh 				return ret;
25451e5063d0SMark Fasheh 
2546e02119d5SChris Mason 			if (wc->free) {
2547e02119d5SChris Mason 				struct extent_buffer *next;
2548e02119d5SChris Mason 
2549e02119d5SChris Mason 				next = path->nodes[*level];
2550e02119d5SChris Mason 
2551681ae509SJosef Bacik 				if (trans) {
2552e02119d5SChris Mason 					btrfs_tree_lock(next);
2553b4ce94deSChris Mason 					btrfs_set_lock_blocking(next);
25540b246afaSJeff Mahoney 					clean_tree_block(trans, fs_info, next);
2555e02119d5SChris Mason 					btrfs_wait_tree_block_writeback(next);
2556e02119d5SChris Mason 					btrfs_tree_unlock(next);
2557681ae509SJosef Bacik 				}
2558e02119d5SChris Mason 
2559e02119d5SChris Mason 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
25602ff7e61eSJeff Mahoney 				ret = btrfs_free_and_pin_reserved_extent(
25612ff7e61eSJeff Mahoney 						fs_info,
2562e02119d5SChris Mason 						path->nodes[*level]->start,
2563d00aff00SChris Mason 						path->nodes[*level]->len);
25643650860bSJosef Bacik 				if (ret)
25653650860bSJosef Bacik 					return ret;
2566e02119d5SChris Mason 			}
2567e02119d5SChris Mason 			free_extent_buffer(path->nodes[*level]);
2568e02119d5SChris Mason 			path->nodes[*level] = NULL;
2569e02119d5SChris Mason 			*level = i + 1;
2570e02119d5SChris Mason 		}
2571e02119d5SChris Mason 	}
2572e02119d5SChris Mason 	return 1;
2573e02119d5SChris Mason }
2574e02119d5SChris Mason 
2575e02119d5SChris Mason /*
2576e02119d5SChris Mason  * drop the reference count on the tree rooted at 'snap'.  This traverses
2577e02119d5SChris Mason  * the tree freeing any blocks that have a ref count of zero after being
2578e02119d5SChris Mason  * decremented.
2579e02119d5SChris Mason  */
2580e02119d5SChris Mason static int walk_log_tree(struct btrfs_trans_handle *trans,
2581e02119d5SChris Mason 			 struct btrfs_root *log, struct walk_control *wc)
2582e02119d5SChris Mason {
25832ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
2584e02119d5SChris Mason 	int ret = 0;
2585e02119d5SChris Mason 	int wret;
2586e02119d5SChris Mason 	int level;
2587e02119d5SChris Mason 	struct btrfs_path *path;
2588e02119d5SChris Mason 	int orig_level;
2589e02119d5SChris Mason 
2590e02119d5SChris Mason 	path = btrfs_alloc_path();
2591db5b493aSTsutomu Itoh 	if (!path)
2592db5b493aSTsutomu Itoh 		return -ENOMEM;
2593e02119d5SChris Mason 
2594e02119d5SChris Mason 	level = btrfs_header_level(log->node);
2595e02119d5SChris Mason 	orig_level = level;
2596e02119d5SChris Mason 	path->nodes[level] = log->node;
2597e02119d5SChris Mason 	extent_buffer_get(log->node);
2598e02119d5SChris Mason 	path->slots[level] = 0;
2599e02119d5SChris Mason 
2600e02119d5SChris Mason 	while (1) {
2601e02119d5SChris Mason 		wret = walk_down_log_tree(trans, log, path, &level, wc);
2602e02119d5SChris Mason 		if (wret > 0)
2603e02119d5SChris Mason 			break;
260479787eaaSJeff Mahoney 		if (wret < 0) {
2605e02119d5SChris Mason 			ret = wret;
260679787eaaSJeff Mahoney 			goto out;
260779787eaaSJeff Mahoney 		}
2608e02119d5SChris Mason 
2609e02119d5SChris Mason 		wret = walk_up_log_tree(trans, log, path, &level, wc);
2610e02119d5SChris Mason 		if (wret > 0)
2611e02119d5SChris Mason 			break;
261279787eaaSJeff Mahoney 		if (wret < 0) {
2613e02119d5SChris Mason 			ret = wret;
261479787eaaSJeff Mahoney 			goto out;
261579787eaaSJeff Mahoney 		}
2616e02119d5SChris Mason 	}
2617e02119d5SChris Mason 
2618e02119d5SChris Mason 	/* was the root node processed? if not, catch it here */
2619e02119d5SChris Mason 	if (path->nodes[orig_level]) {
262079787eaaSJeff Mahoney 		ret = wc->process_func(log, path->nodes[orig_level], wc,
2621e02119d5SChris Mason 			 btrfs_header_generation(path->nodes[orig_level]));
262279787eaaSJeff Mahoney 		if (ret)
262379787eaaSJeff Mahoney 			goto out;
2624e02119d5SChris Mason 		if (wc->free) {
2625e02119d5SChris Mason 			struct extent_buffer *next;
2626e02119d5SChris Mason 
2627e02119d5SChris Mason 			next = path->nodes[orig_level];
2628e02119d5SChris Mason 
2629681ae509SJosef Bacik 			if (trans) {
2630e02119d5SChris Mason 				btrfs_tree_lock(next);
2631b4ce94deSChris Mason 				btrfs_set_lock_blocking(next);
26322ff7e61eSJeff Mahoney 				clean_tree_block(trans, fs_info, next);
2633e02119d5SChris Mason 				btrfs_wait_tree_block_writeback(next);
2634e02119d5SChris Mason 				btrfs_tree_unlock(next);
2635681ae509SJosef Bacik 			}
2636e02119d5SChris Mason 
2637e02119d5SChris Mason 			WARN_ON(log->root_key.objectid !=
2638e02119d5SChris Mason 				BTRFS_TREE_LOG_OBJECTID);
26392ff7e61eSJeff Mahoney 			ret = btrfs_free_and_pin_reserved_extent(fs_info,
26402ff7e61eSJeff Mahoney 							next->start, next->len);
26413650860bSJosef Bacik 			if (ret)
26423650860bSJosef Bacik 				goto out;
2643e02119d5SChris Mason 		}
2644e02119d5SChris Mason 	}
2645e02119d5SChris Mason 
264679787eaaSJeff Mahoney out:
2647e02119d5SChris Mason 	btrfs_free_path(path);
2648e02119d5SChris Mason 	return ret;
2649e02119d5SChris Mason }
2650e02119d5SChris Mason 
26517237f183SYan Zheng /*
26527237f183SYan Zheng  * helper function to update the item for a given subvolumes log root
26537237f183SYan Zheng  * in the tree of log roots
26547237f183SYan Zheng  */
26557237f183SYan Zheng static int update_log_root(struct btrfs_trans_handle *trans,
26567237f183SYan Zheng 			   struct btrfs_root *log)
26577237f183SYan Zheng {
26580b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
26597237f183SYan Zheng 	int ret;
26607237f183SYan Zheng 
26617237f183SYan Zheng 	if (log->log_transid == 1) {
26627237f183SYan Zheng 		/* insert root item on the first sync */
26630b246afaSJeff Mahoney 		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
26647237f183SYan Zheng 				&log->root_key, &log->root_item);
26657237f183SYan Zheng 	} else {
26660b246afaSJeff Mahoney 		ret = btrfs_update_root(trans, fs_info->log_root_tree,
26677237f183SYan Zheng 				&log->root_key, &log->root_item);
26687237f183SYan Zheng 	}
26697237f183SYan Zheng 	return ret;
26707237f183SYan Zheng }
26717237f183SYan Zheng 
267260d53eb3SZhaolei static void wait_log_commit(struct btrfs_root *root, int transid)
2673e02119d5SChris Mason {
2674e02119d5SChris Mason 	DEFINE_WAIT(wait);
26757237f183SYan Zheng 	int index = transid % 2;
2676e02119d5SChris Mason 
26777237f183SYan Zheng 	/*
26787237f183SYan Zheng 	 * we only allow two pending log transactions at a time,
26797237f183SYan Zheng 	 * so we know that if ours is more than 2 older than the
26807237f183SYan Zheng 	 * current transaction, we're done
26817237f183SYan Zheng 	 */
2682e02119d5SChris Mason 	do {
26837237f183SYan Zheng 		prepare_to_wait(&root->log_commit_wait[index],
26847237f183SYan Zheng 				&wait, TASK_UNINTERRUPTIBLE);
26857237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
268612fcfd22SChris Mason 
2687d1433debSMiao Xie 		if (root->log_transid_committed < transid &&
26887237f183SYan Zheng 		    atomic_read(&root->log_commit[index]))
2689e02119d5SChris Mason 			schedule();
269012fcfd22SChris Mason 
26917237f183SYan Zheng 		finish_wait(&root->log_commit_wait[index], &wait);
26927237f183SYan Zheng 		mutex_lock(&root->log_mutex);
2693d1433debSMiao Xie 	} while (root->log_transid_committed < transid &&
26947237f183SYan Zheng 		 atomic_read(&root->log_commit[index]));
26957237f183SYan Zheng }
26967237f183SYan Zheng 
269760d53eb3SZhaolei static void wait_for_writer(struct btrfs_root *root)
26987237f183SYan Zheng {
26997237f183SYan Zheng 	DEFINE_WAIT(wait);
27008b050d35SMiao Xie 
27018b050d35SMiao Xie 	while (atomic_read(&root->log_writers)) {
27027237f183SYan Zheng 		prepare_to_wait(&root->log_writer_wait,
27037237f183SYan Zheng 				&wait, TASK_UNINTERRUPTIBLE);
27047237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
27058b050d35SMiao Xie 		if (atomic_read(&root->log_writers))
27067237f183SYan Zheng 			schedule();
27077237f183SYan Zheng 		finish_wait(&root->log_writer_wait, &wait);
2708575849ecSFilipe Manana 		mutex_lock(&root->log_mutex);
27097237f183SYan Zheng 	}
2710e02119d5SChris Mason }
2711e02119d5SChris Mason 
27128b050d35SMiao Xie static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
27138b050d35SMiao Xie 					struct btrfs_log_ctx *ctx)
27148b050d35SMiao Xie {
27158b050d35SMiao Xie 	if (!ctx)
27168b050d35SMiao Xie 		return;
27178b050d35SMiao Xie 
27188b050d35SMiao Xie 	mutex_lock(&root->log_mutex);
27198b050d35SMiao Xie 	list_del_init(&ctx->list);
27208b050d35SMiao Xie 	mutex_unlock(&root->log_mutex);
27218b050d35SMiao Xie }
27228b050d35SMiao Xie 
27238b050d35SMiao Xie /*
27248b050d35SMiao Xie  * Invoked in log mutex context, or be sure there is no other task which
27258b050d35SMiao Xie  * can access the list.
27268b050d35SMiao Xie  */
27278b050d35SMiao Xie static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
27288b050d35SMiao Xie 					     int index, int error)
27298b050d35SMiao Xie {
27308b050d35SMiao Xie 	struct btrfs_log_ctx *ctx;
2731570dd450SChris Mason 	struct btrfs_log_ctx *safe;
27328b050d35SMiao Xie 
2733570dd450SChris Mason 	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2734570dd450SChris Mason 		list_del_init(&ctx->list);
27358b050d35SMiao Xie 		ctx->log_ret = error;
2736570dd450SChris Mason 	}
27378b050d35SMiao Xie 
27388b050d35SMiao Xie 	INIT_LIST_HEAD(&root->log_ctxs[index]);
27398b050d35SMiao Xie }
27408b050d35SMiao Xie 
2741e02119d5SChris Mason /*
2742e02119d5SChris Mason  * btrfs_sync_log sends a given tree log down to the disk and
2743e02119d5SChris Mason  * updates the super blocks to record it.  When this call is done,
274412fcfd22SChris Mason  * you know that any inodes previously logged are safely on disk only
274512fcfd22SChris Mason  * if it returns 0.
274612fcfd22SChris Mason  *
274712fcfd22SChris Mason  * Any other return value means you need to call btrfs_commit_transaction.
274812fcfd22SChris Mason  * Some of the edge cases for fsyncing directories that have had unlinks
274912fcfd22SChris Mason  * or renames done in the past mean that sometimes the only safe
275012fcfd22SChris Mason  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
275112fcfd22SChris Mason  * that has happened.
 *
 * @ctx supplies the log transaction id to sync (ctx->log_transid).  If that
 * log transaction was already committed, or is being committed by another
 * task, the function returns ctx->log_ret instead of doing the work itself.
2752e02119d5SChris Mason  */
2753e02119d5SChris Mason int btrfs_sync_log(struct btrfs_trans_handle *trans,
27548b050d35SMiao Xie 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2755e02119d5SChris Mason {
27567237f183SYan Zheng 	int index1;
27577237f183SYan Zheng 	int index2;
27588cef4e16SYan, Zheng 	int mark;
2759e02119d5SChris Mason 	int ret;
27600b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2761e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
27620b246afaSJeff Mahoney 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2763bb14a59bSMiao Xie 	int log_transid = 0;
27648b050d35SMiao Xie 	struct btrfs_log_ctx root_log_ctx;
2765c6adc9ccSMiao Xie 	struct blk_plug plug;
2766e02119d5SChris Mason 
27677237f183SYan Zheng 	mutex_lock(&root->log_mutex);
2768d1433debSMiao Xie 	log_transid = ctx->log_transid;
	/* our log transaction was already committed: report its result */
2769d1433debSMiao Xie 	if (root->log_transid_committed >= log_transid) {
27707237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
27718b050d35SMiao Xie 		return ctx->log_ret;
2772e02119d5SChris Mason 	}
2773d1433debSMiao Xie 
2774d1433debSMiao Xie 	index1 = log_transid % 2;
	/* another task is committing it: wait, then report its result */
2775d1433debSMiao Xie 	if (atomic_read(&root->log_commit[index1])) {
277660d53eb3SZhaolei 		wait_log_commit(root, log_transid);
2777d1433debSMiao Xie 		mutex_unlock(&root->log_mutex);
2778d1433debSMiao Xie 		return ctx->log_ret;
2779d1433debSMiao Xie 	}
2780d1433debSMiao Xie 	ASSERT(log_transid == root->log_transid);
	/* from here on this task is the committer for slot index1 */
27817237f183SYan Zheng 	atomic_set(&root->log_commit[index1], 1);
27827237f183SYan Zheng 
27837237f183SYan Zheng 	/* wait for previous tree log sync to complete */
27847237f183SYan Zheng 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
278560d53eb3SZhaolei 		wait_log_commit(root, log_transid - 1);
278648cab2e0SMiao Xie 
	/*
	 * wait for the current log writers, and repeat for as long as
	 * log_batch changed while we were waiting
	 */
278786df7eb9SYan, Zheng 	while (1) {
27882ecb7923SMiao Xie 		int batch = atomic_read(&root->log_batch);
2789cd354ad6SChris Mason 		/* when we're on an ssd, just kick the log commit out */
27900b246afaSJeff Mahoney 		if (!btrfs_test_opt(fs_info, SSD) &&
279127cdeb70SMiao Xie 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
27927237f183SYan Zheng 			mutex_unlock(&root->log_mutex);
2793e02119d5SChris Mason 			schedule_timeout_uninterruptible(1);
27947237f183SYan Zheng 			mutex_lock(&root->log_mutex);
279586df7eb9SYan, Zheng 		}
279660d53eb3SZhaolei 		wait_for_writer(root);
27972ecb7923SMiao Xie 		if (batch == atomic_read(&root->log_batch))
2798e02119d5SChris Mason 			break;
2799e02119d5SChris Mason 	}
2800d0c803c4SChris Mason 
280112fcfd22SChris Mason 	/* bail out if we need to do a full commit */
28020b246afaSJeff Mahoney 	if (btrfs_need_log_full_commit(fs_info, trans)) {
280312fcfd22SChris Mason 		ret = -EAGAIN;
28042ab28f32SJosef Bacik 		btrfs_free_logged_extents(log, log_transid);
280512fcfd22SChris Mason 		mutex_unlock(&root->log_mutex);
280612fcfd22SChris Mason 		goto out;
280712fcfd22SChris Mason 	}
280812fcfd22SChris Mason 
	/* the extent bit used for this log transaction depends on its parity */
28098cef4e16SYan, Zheng 	if (log_transid % 2 == 0)
28108cef4e16SYan, Zheng 		mark = EXTENT_DIRTY;
28118cef4e16SYan, Zheng 	else
28128cef4e16SYan, Zheng 		mark = EXTENT_NEW;
28138cef4e16SYan, Zheng 
2814690587d1SChris Mason 	/* we start IO on  all the marked extents here, but we don't actually
2815690587d1SChris Mason 	 * wait for them until later.
2816690587d1SChris Mason 	 */
2817c6adc9ccSMiao Xie 	blk_start_plug(&plug);
28182ff7e61eSJeff Mahoney 	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
281979787eaaSJeff Mahoney 	if (ret) {
2820c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
282166642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
28222ab28f32SJosef Bacik 		btrfs_free_logged_extents(log, log_transid);
28230b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
282479787eaaSJeff Mahoney 		mutex_unlock(&root->log_mutex);
282579787eaaSJeff Mahoney 		goto out;
282679787eaaSJeff Mahoney 	}
28277237f183SYan Zheng 
28285d4f98a2SYan Zheng 	btrfs_set_root_node(&log->root_item, log->node);
28297237f183SYan Zheng 
28307237f183SYan Zheng 	root->log_transid++;
28317237f183SYan Zheng 	log->log_transid = root->log_transid;
2832ff782e0aSJosef Bacik 	root->log_start_pid = 0;
28337237f183SYan Zheng 	/*
28348cef4e16SYan, Zheng 	 * IO has been started, blocks of the log tree have WRITTEN flag set
28358cef4e16SYan, Zheng 	 * in their headers. new modifications of the log will be written to
28368cef4e16SYan, Zheng 	 * new positions. so it's safe to allow log writers to go in.
28377237f183SYan Zheng 	 */
28387237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
28397237f183SYan Zheng 
284028a23593SFilipe Manana 	btrfs_init_log_ctx(&root_log_ctx, NULL);
2841d1433debSMiao Xie 
	/*
	 * register as a writer of the tree of log roots and queue our own
	 * context on its current log transaction before updating the root item
	 */
28427237f183SYan Zheng 	mutex_lock(&log_root_tree->log_mutex);
28432ecb7923SMiao Xie 	atomic_inc(&log_root_tree->log_batch);
28447237f183SYan Zheng 	atomic_inc(&log_root_tree->log_writers);
2845d1433debSMiao Xie 
2846d1433debSMiao Xie 	index2 = log_root_tree->log_transid % 2;
2847d1433debSMiao Xie 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2848d1433debSMiao Xie 	root_log_ctx.log_transid = log_root_tree->log_transid;
2849d1433debSMiao Xie 
28507237f183SYan Zheng 	mutex_unlock(&log_root_tree->log_mutex);
28517237f183SYan Zheng 
28527237f183SYan Zheng 	ret = update_log_root(trans, log);
28537237f183SYan Zheng 
28547237f183SYan Zheng 	mutex_lock(&log_root_tree->log_mutex);
28557237f183SYan Zheng 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2856779adf0fSDavid Sterba 		/*
2857779adf0fSDavid Sterba 		 * Implicit memory barrier after atomic_dec_and_test
2858779adf0fSDavid Sterba 		 */
28597237f183SYan Zheng 		if (waitqueue_active(&log_root_tree->log_writer_wait))
28607237f183SYan Zheng 			wake_up(&log_root_tree->log_writer_wait);
28617237f183SYan Zheng 	}
28627237f183SYan Zheng 
	/*
	 * updating the root item failed: force a full commit; anything other
	 * than -ENOSPC also aborts the transaction
	 */
28634a500fd1SYan, Zheng 	if (ret) {
2864d1433debSMiao Xie 		if (!list_empty(&root_log_ctx.list))
2865d1433debSMiao Xie 			list_del_init(&root_log_ctx.list);
2866d1433debSMiao Xie 
2867c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
28680b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
2869995946ddSMiao Xie 
287079787eaaSJeff Mahoney 		if (ret != -ENOSPC) {
287166642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
287279787eaaSJeff Mahoney 			mutex_unlock(&log_root_tree->log_mutex);
287379787eaaSJeff Mahoney 			goto out;
287479787eaaSJeff Mahoney 		}
2875bf89d38fSJeff Mahoney 		btrfs_wait_tree_log_extents(log, mark);
28762ab28f32SJosef Bacik 		btrfs_free_logged_extents(log, log_transid);
28774a500fd1SYan, Zheng 		mutex_unlock(&log_root_tree->log_mutex);
28784a500fd1SYan, Zheng 		ret = -EAGAIN;
28794a500fd1SYan, Zheng 		goto out;
28804a500fd1SYan, Zheng 	}
28814a500fd1SYan, Zheng 
	/* the log root tree transaction we joined was already committed */
2882d1433debSMiao Xie 	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
28833da5ab56SForrest Liu 		blk_finish_plug(&plug);
2884cbd60aa7SChris Mason 		list_del_init(&root_log_ctx.list);
2885d1433debSMiao Xie 		mutex_unlock(&log_root_tree->log_mutex);
2886d1433debSMiao Xie 		ret = root_log_ctx.log_ret;
2887d1433debSMiao Xie 		goto out;
2888d1433debSMiao Xie 	}
28898b050d35SMiao Xie 
2890d1433debSMiao Xie 	index2 = root_log_ctx.log_transid % 2;
	/*
	 * another task is committing the log root tree: wait for our own log
	 * extents and for that commit, then use the result left in our context
	 */
28917237f183SYan Zheng 	if (atomic_read(&log_root_tree->log_commit[index2])) {
2892c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
2893bf89d38fSJeff Mahoney 		ret = btrfs_wait_tree_log_extents(log, mark);
289450d9aa99SJosef Bacik 		btrfs_wait_logged_extents(trans, log, log_transid);
289560d53eb3SZhaolei 		wait_log_commit(log_root_tree,
2896d1433debSMiao Xie 				root_log_ctx.log_transid);
28977237f183SYan Zheng 		mutex_unlock(&log_root_tree->log_mutex);
28985ab5e44aSFilipe Manana 		if (!ret)
28998b050d35SMiao Xie 			ret = root_log_ctx.log_ret;
29007237f183SYan Zheng 		goto out;
29017237f183SYan Zheng 	}
2902d1433debSMiao Xie 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
	/* this task is now the committer of the log root tree as well */
29037237f183SYan Zheng 	atomic_set(&log_root_tree->log_commit[index2], 1);
29047237f183SYan Zheng 
290512fcfd22SChris Mason 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
290660d53eb3SZhaolei 		wait_log_commit(log_root_tree,
2907d1433debSMiao Xie 				root_log_ctx.log_transid - 1);
290812fcfd22SChris Mason 	}
29097237f183SYan Zheng 
291060d53eb3SZhaolei 	wait_for_writer(log_root_tree);
291112fcfd22SChris Mason 
291212fcfd22SChris Mason 	/*
291312fcfd22SChris Mason 	 * now that we've moved on to the tree of log tree roots,
291412fcfd22SChris Mason 	 * check the full commit flag again
291512fcfd22SChris Mason 	 */
29160b246afaSJeff Mahoney 	if (btrfs_need_log_full_commit(fs_info, trans)) {
2917c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
2918bf89d38fSJeff Mahoney 		btrfs_wait_tree_log_extents(log, mark);
29192ab28f32SJosef Bacik 		btrfs_free_logged_extents(log, log_transid);
292012fcfd22SChris Mason 		mutex_unlock(&log_root_tree->log_mutex);
292112fcfd22SChris Mason 		ret = -EAGAIN;
292212fcfd22SChris Mason 		goto out_wake_log_root;
292312fcfd22SChris Mason 	}
29247237f183SYan Zheng 
29252ff7e61eSJeff Mahoney 	ret = btrfs_write_marked_extents(fs_info,
29268cef4e16SYan, Zheng 					 &log_root_tree->dirty_log_pages,
29278cef4e16SYan, Zheng 					 EXTENT_DIRTY | EXTENT_NEW);
2928c6adc9ccSMiao Xie 	blk_finish_plug(&plug);
292979787eaaSJeff Mahoney 	if (ret) {
29300b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
293166642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
29322ab28f32SJosef Bacik 		btrfs_free_logged_extents(log, log_transid);
293379787eaaSJeff Mahoney 		mutex_unlock(&log_root_tree->log_mutex);
293479787eaaSJeff Mahoney 		goto out_wake_log_root;
293579787eaaSJeff Mahoney 	}
	/* wait for the IO started above on both the log and the log root tree */
2936bf89d38fSJeff Mahoney 	ret = btrfs_wait_tree_log_extents(log, mark);
29375ab5e44aSFilipe Manana 	if (!ret)
2938bf89d38fSJeff Mahoney 		ret = btrfs_wait_tree_log_extents(log_root_tree,
2939c6adc9ccSMiao Xie 						  EXTENT_NEW | EXTENT_DIRTY);
29405ab5e44aSFilipe Manana 	if (ret) {
29410b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
29425ab5e44aSFilipe Manana 		btrfs_free_logged_extents(log, log_transid);
29435ab5e44aSFilipe Manana 		mutex_unlock(&log_root_tree->log_mutex);
29445ab5e44aSFilipe Manana 		goto out_wake_log_root;
29455ab5e44aSFilipe Manana 	}
294650d9aa99SJosef Bacik 	btrfs_wait_logged_extents(trans, log, log_transid);
2947e02119d5SChris Mason 
	/* record the log root tree's location and level in the super block copy */
29480b246afaSJeff Mahoney 	btrfs_set_super_log_root(fs_info->super_for_commit,
29497237f183SYan Zheng 				 log_root_tree->node->start);
29500b246afaSJeff Mahoney 	btrfs_set_super_log_root_level(fs_info->super_for_commit,
29517237f183SYan Zheng 				       btrfs_header_level(log_root_tree->node));
2952e02119d5SChris Mason 
29537237f183SYan Zheng 	log_root_tree->log_transid++;
29547237f183SYan Zheng 	mutex_unlock(&log_root_tree->log_mutex);
29557237f183SYan Zheng 
29567237f183SYan Zheng 	/*
29577237f183SYan Zheng 	 * nobody else is going to jump in and write the ctree
29587237f183SYan Zheng 	 * super here because the log_commit atomic below is protecting
29597237f183SYan Zheng 	 * us.  We must be called with a transaction handle pinning
29607237f183SYan Zheng 	 * the running transaction open, so a full commit can't hop
29617237f183SYan Zheng 	 * in and cause problems either.
29627237f183SYan Zheng 	 */
29632ff7e61eSJeff Mahoney 	ret = write_ctree_super(trans, fs_info, 1);
29645af3e8ccSStefan Behrens 	if (ret) {
29650b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
296666642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
29675af3e8ccSStefan Behrens 		goto out_wake_log_root;
29685af3e8ccSStefan Behrens 	}
29697237f183SYan Zheng 
2970257c62e1SChris Mason 	mutex_lock(&root->log_mutex);
2971257c62e1SChris Mason 	if (root->last_log_commit < log_transid)
2972257c62e1SChris Mason 		root->last_log_commit = log_transid;
2973257c62e1SChris Mason 	mutex_unlock(&root->log_mutex);
2974257c62e1SChris Mason 
	/*
	 * hand 'ret' to every context still queued on the log root tree for
	 * this slot, mark the log root tree transaction committed and wake
	 * the tasks waiting on it
	 */
297512fcfd22SChris Mason out_wake_log_root:
2976570dd450SChris Mason 	mutex_lock(&log_root_tree->log_mutex);
29778b050d35SMiao Xie 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
29788b050d35SMiao Xie 
2979d1433debSMiao Xie 	log_root_tree->log_transid_committed++;
29807237f183SYan Zheng 	atomic_set(&log_root_tree->log_commit[index2], 0);
2981d1433debSMiao Xie 	mutex_unlock(&log_root_tree->log_mutex);
2982d1433debSMiao Xie 
298333a9eca7SDavid Sterba 	/*
298433a9eca7SDavid Sterba 	 * The barrier before waitqueue_active is implied by mutex_unlock
298533a9eca7SDavid Sterba 	 */
29867237f183SYan Zheng 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
29877237f183SYan Zheng 		wake_up(&log_root_tree->log_commit_wait[index2]);
	/* same hand-off for the subvolume's own log: publish 'ret' and wake waiters */
2988e02119d5SChris Mason out:
2989d1433debSMiao Xie 	mutex_lock(&root->log_mutex);
2990570dd450SChris Mason 	btrfs_remove_all_log_ctxs(root, index1, ret);
2991d1433debSMiao Xie 	root->log_transid_committed++;
29927237f183SYan Zheng 	atomic_set(&root->log_commit[index1], 0);
2993d1433debSMiao Xie 	mutex_unlock(&root->log_mutex);
29948b050d35SMiao Xie 
299533a9eca7SDavid Sterba 	/*
299633a9eca7SDavid Sterba 	 * The barrier before waitqueue_active is implied by mutex_unlock
299733a9eca7SDavid Sterba 	 */
29987237f183SYan Zheng 	if (waitqueue_active(&root->log_commit_wait[index1]))
29997237f183SYan Zheng 		wake_up(&root->log_commit_wait[index1]);
3000b31eabd8SChris Mason 	return ret;
3001e02119d5SChris Mason }
3002e02119d5SChris Mason 
30034a500fd1SYan, Zheng static void free_log_tree(struct btrfs_trans_handle *trans,
30044a500fd1SYan, Zheng 			  struct btrfs_root *log)
3005e02119d5SChris Mason {
3006e02119d5SChris Mason 	int ret;
3007d0c803c4SChris Mason 	u64 start;
3008d0c803c4SChris Mason 	u64 end;
3009e02119d5SChris Mason 	struct walk_control wc = {
3010e02119d5SChris Mason 		.free = 1,
3011e02119d5SChris Mason 		.process_func = process_one_buffer
3012e02119d5SChris Mason 	};
3013e02119d5SChris Mason 
3014e02119d5SChris Mason 	ret = walk_log_tree(trans, log, &wc);
30153650860bSJosef Bacik 	/* I don't think this can happen but just in case */
30163650860bSJosef Bacik 	if (ret)
301766642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
3018e02119d5SChris Mason 
3019d0c803c4SChris Mason 	while (1) {
3020d0c803c4SChris Mason 		ret = find_first_extent_bit(&log->dirty_log_pages,
3021e6138876SJosef Bacik 				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
3022e6138876SJosef Bacik 				NULL);
3023d0c803c4SChris Mason 		if (ret)
3024d0c803c4SChris Mason 			break;
3025d0c803c4SChris Mason 
30268cef4e16SYan, Zheng 		clear_extent_bits(&log->dirty_log_pages, start, end,
302791166212SDavid Sterba 				  EXTENT_DIRTY | EXTENT_NEW);
3028d0c803c4SChris Mason 	}
3029d0c803c4SChris Mason 
30302ab28f32SJosef Bacik 	/*
30312ab28f32SJosef Bacik 	 * We may have short-circuited the log tree with the full commit logic
30322ab28f32SJosef Bacik 	 * and left ordered extents on our list, so clear these out to keep us
30332ab28f32SJosef Bacik 	 * from leaking inodes and memory.
30342ab28f32SJosef Bacik 	 */
30352ab28f32SJosef Bacik 	btrfs_free_logged_extents(log, 0);
30362ab28f32SJosef Bacik 	btrfs_free_logged_extents(log, 1);
30372ab28f32SJosef Bacik 
30387237f183SYan Zheng 	free_extent_buffer(log->node);
30397237f183SYan Zheng 	kfree(log);
30404a500fd1SYan, Zheng }
30414a500fd1SYan, Zheng 
30424a500fd1SYan, Zheng /*
30434a500fd1SYan, Zheng  * free all the extents used by the tree log.  This should be called
30444a500fd1SYan, Zheng  * at commit time of the full transaction
30454a500fd1SYan, Zheng  */
30464a500fd1SYan, Zheng int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
30474a500fd1SYan, Zheng {
30484a500fd1SYan, Zheng 	if (root->log_root) {
30494a500fd1SYan, Zheng 		free_log_tree(trans, root->log_root);
30504a500fd1SYan, Zheng 		root->log_root = NULL;
30514a500fd1SYan, Zheng 	}
30524a500fd1SYan, Zheng 	return 0;
30534a500fd1SYan, Zheng }
30544a500fd1SYan, Zheng 
30554a500fd1SYan, Zheng int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
30564a500fd1SYan, Zheng 			     struct btrfs_fs_info *fs_info)
30574a500fd1SYan, Zheng {
30584a500fd1SYan, Zheng 	if (fs_info->log_root_tree) {
30594a500fd1SYan, Zheng 		free_log_tree(trans, fs_info->log_root_tree);
30604a500fd1SYan, Zheng 		fs_info->log_root_tree = NULL;
30614a500fd1SYan, Zheng 	}
3062e02119d5SChris Mason 	return 0;
3063e02119d5SChris Mason }
3064e02119d5SChris Mason 
3065e02119d5SChris Mason /*
3066e02119d5SChris Mason  * If both a file and directory are logged, and unlinks or renames are
3067e02119d5SChris Mason  * mixed in, we have a few interesting corners:
3068e02119d5SChris Mason  *
3069e02119d5SChris Mason  * create file X in dir Y
3070e02119d5SChris Mason  * link file X to X.link in dir Y
3071e02119d5SChris Mason  * fsync file X
3072e02119d5SChris Mason  * unlink file X but leave X.link
3073e02119d5SChris Mason  * fsync dir Y
3074e02119d5SChris Mason  *
3075e02119d5SChris Mason  * After a crash we would expect only X.link to exist.  But file X
3076e02119d5SChris Mason  * didn't get fsync'd again so the log has back refs for X and X.link.
3077e02119d5SChris Mason  *
3078e02119d5SChris Mason  * We solve this by removing directory entries and inode backrefs from the
3079e02119d5SChris Mason  * log when a file that was logged in the current transaction is
3080e02119d5SChris Mason  * unlinked.  Any later fsync will include the updated log entries, and
3081e02119d5SChris Mason  * we'll be able to reconstruct the proper directory items from backrefs.
3082e02119d5SChris Mason  *
3083e02119d5SChris Mason  * This optimization allows us to avoid relogging the entire inode
3084e02119d5SChris Mason  * or the entire directory.
3085e02119d5SChris Mason  */
3086e02119d5SChris Mason int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3087e02119d5SChris Mason 				 struct btrfs_root *root,
3088e02119d5SChris Mason 				 const char *name, int name_len,
308949f34d1fSNikolay Borisov 				 struct btrfs_inode *dir, u64 index)
3090e02119d5SChris Mason {
3091e02119d5SChris Mason 	struct btrfs_root *log;
3092e02119d5SChris Mason 	struct btrfs_dir_item *di;
3093e02119d5SChris Mason 	struct btrfs_path *path;
3094e02119d5SChris Mason 	int ret;
30954a500fd1SYan, Zheng 	int err = 0;
3096e02119d5SChris Mason 	int bytes_del = 0;
309749f34d1fSNikolay Borisov 	u64 dir_ino = btrfs_ino(dir);
3098e02119d5SChris Mason 
309949f34d1fSNikolay Borisov 	if (dir->logged_trans < trans->transid)
31003a5f1d45SChris Mason 		return 0;
31013a5f1d45SChris Mason 
3102e02119d5SChris Mason 	ret = join_running_log_trans(root);
3103e02119d5SChris Mason 	if (ret)
3104e02119d5SChris Mason 		return 0;
3105e02119d5SChris Mason 
310649f34d1fSNikolay Borisov 	mutex_lock(&dir->log_mutex);
3107e02119d5SChris Mason 
3108e02119d5SChris Mason 	log = root->log_root;
3109e02119d5SChris Mason 	path = btrfs_alloc_path();
3110a62f44a5STsutomu Itoh 	if (!path) {
3111a62f44a5STsutomu Itoh 		err = -ENOMEM;
3112a62f44a5STsutomu Itoh 		goto out_unlock;
3113a62f44a5STsutomu Itoh 	}
31142a29edc6Sliubo 
311533345d01SLi Zefan 	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3116e02119d5SChris Mason 				   name, name_len, -1);
31174a500fd1SYan, Zheng 	if (IS_ERR(di)) {
31184a500fd1SYan, Zheng 		err = PTR_ERR(di);
31194a500fd1SYan, Zheng 		goto fail;
31204a500fd1SYan, Zheng 	}
31214a500fd1SYan, Zheng 	if (di) {
3122e02119d5SChris Mason 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3123e02119d5SChris Mason 		bytes_del += name_len;
31243650860bSJosef Bacik 		if (ret) {
31253650860bSJosef Bacik 			err = ret;
31263650860bSJosef Bacik 			goto fail;
31273650860bSJosef Bacik 		}
3128e02119d5SChris Mason 	}
3129b3b4aa74SDavid Sterba 	btrfs_release_path(path);
313033345d01SLi Zefan 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3131e02119d5SChris Mason 					 index, name, name_len, -1);
31324a500fd1SYan, Zheng 	if (IS_ERR(di)) {
31334a500fd1SYan, Zheng 		err = PTR_ERR(di);
31344a500fd1SYan, Zheng 		goto fail;
31354a500fd1SYan, Zheng 	}
31364a500fd1SYan, Zheng 	if (di) {
3137e02119d5SChris Mason 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3138e02119d5SChris Mason 		bytes_del += name_len;
31393650860bSJosef Bacik 		if (ret) {
31403650860bSJosef Bacik 			err = ret;
31413650860bSJosef Bacik 			goto fail;
31423650860bSJosef Bacik 		}
3143e02119d5SChris Mason 	}
3144e02119d5SChris Mason 
3145e02119d5SChris Mason 	/* update the directory size in the log to reflect the names
3146e02119d5SChris Mason 	 * we have removed
3147e02119d5SChris Mason 	 */
3148e02119d5SChris Mason 	if (bytes_del) {
3149e02119d5SChris Mason 		struct btrfs_key key;
3150e02119d5SChris Mason 
315133345d01SLi Zefan 		key.objectid = dir_ino;
3152e02119d5SChris Mason 		key.offset = 0;
3153e02119d5SChris Mason 		key.type = BTRFS_INODE_ITEM_KEY;
3154b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3155e02119d5SChris Mason 
3156e02119d5SChris Mason 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
31574a500fd1SYan, Zheng 		if (ret < 0) {
31584a500fd1SYan, Zheng 			err = ret;
31594a500fd1SYan, Zheng 			goto fail;
31604a500fd1SYan, Zheng 		}
3161e02119d5SChris Mason 		if (ret == 0) {
3162e02119d5SChris Mason 			struct btrfs_inode_item *item;
3163e02119d5SChris Mason 			u64 i_size;
3164e02119d5SChris Mason 
3165e02119d5SChris Mason 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3166e02119d5SChris Mason 					      struct btrfs_inode_item);
3167e02119d5SChris Mason 			i_size = btrfs_inode_size(path->nodes[0], item);
3168e02119d5SChris Mason 			if (i_size > bytes_del)
3169e02119d5SChris Mason 				i_size -= bytes_del;
3170e02119d5SChris Mason 			else
3171e02119d5SChris Mason 				i_size = 0;
3172e02119d5SChris Mason 			btrfs_set_inode_size(path->nodes[0], item, i_size);
3173e02119d5SChris Mason 			btrfs_mark_buffer_dirty(path->nodes[0]);
3174e02119d5SChris Mason 		} else
3175e02119d5SChris Mason 			ret = 0;
3176b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3177e02119d5SChris Mason 	}
31784a500fd1SYan, Zheng fail:
3179e02119d5SChris Mason 	btrfs_free_path(path);
3180a62f44a5STsutomu Itoh out_unlock:
318149f34d1fSNikolay Borisov 	mutex_unlock(&dir->log_mutex);
31824a500fd1SYan, Zheng 	if (ret == -ENOSPC) {
3183995946ddSMiao Xie 		btrfs_set_log_full_commit(root->fs_info, trans);
31844a500fd1SYan, Zheng 		ret = 0;
318579787eaaSJeff Mahoney 	} else if (ret < 0)
318666642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
318779787eaaSJeff Mahoney 
318812fcfd22SChris Mason 	btrfs_end_log_trans(root);
3189e02119d5SChris Mason 
3190411fc6bcSAndi Kleen 	return err;
3191e02119d5SChris Mason }
3192e02119d5SChris Mason 
3193e02119d5SChris Mason /* see comments for btrfs_del_dir_entries_in_log */
3194e02119d5SChris Mason int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3195e02119d5SChris Mason 			       struct btrfs_root *root,
3196e02119d5SChris Mason 			       const char *name, int name_len,
3197a491abb2SNikolay Borisov 			       struct btrfs_inode *inode, u64 dirid)
3198e02119d5SChris Mason {
31990b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
3200e02119d5SChris Mason 	struct btrfs_root *log;
3201e02119d5SChris Mason 	u64 index;
3202e02119d5SChris Mason 	int ret;
3203e02119d5SChris Mason 
3204a491abb2SNikolay Borisov 	if (inode->logged_trans < trans->transid)
32053a5f1d45SChris Mason 		return 0;
32063a5f1d45SChris Mason 
3207e02119d5SChris Mason 	ret = join_running_log_trans(root);
3208e02119d5SChris Mason 	if (ret)
3209e02119d5SChris Mason 		return 0;
3210e02119d5SChris Mason 	log = root->log_root;
3211a491abb2SNikolay Borisov 	mutex_lock(&inode->log_mutex);
3212e02119d5SChris Mason 
3213a491abb2SNikolay Borisov 	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3214e02119d5SChris Mason 				  dirid, &index);
3215a491abb2SNikolay Borisov 	mutex_unlock(&inode->log_mutex);
32164a500fd1SYan, Zheng 	if (ret == -ENOSPC) {
32170b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
32184a500fd1SYan, Zheng 		ret = 0;
321979787eaaSJeff Mahoney 	} else if (ret < 0 && ret != -ENOENT)
322066642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
322112fcfd22SChris Mason 	btrfs_end_log_trans(root);
3222e02119d5SChris Mason 
3223e02119d5SChris Mason 	return ret;
3224e02119d5SChris Mason }
3225e02119d5SChris Mason 
3226e02119d5SChris Mason /*
3227e02119d5SChris Mason  * creates a range item in the log for 'dirid'.  first_offset and
3228e02119d5SChris Mason  * last_offset tell us which parts of the key space the log should
3229e02119d5SChris Mason  * be considered authoritative for.
3230e02119d5SChris Mason  */
3231e02119d5SChris Mason static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3232e02119d5SChris Mason 				       struct btrfs_root *log,
3233e02119d5SChris Mason 				       struct btrfs_path *path,
3234e02119d5SChris Mason 				       int key_type, u64 dirid,
3235e02119d5SChris Mason 				       u64 first_offset, u64 last_offset)
3236e02119d5SChris Mason {
3237e02119d5SChris Mason 	int ret;
3238e02119d5SChris Mason 	struct btrfs_key key;
3239e02119d5SChris Mason 	struct btrfs_dir_log_item *item;
3240e02119d5SChris Mason 
3241e02119d5SChris Mason 	key.objectid = dirid;
3242e02119d5SChris Mason 	key.offset = first_offset;
3243e02119d5SChris Mason 	if (key_type == BTRFS_DIR_ITEM_KEY)
3244e02119d5SChris Mason 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
3245e02119d5SChris Mason 	else
3246e02119d5SChris Mason 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
3247e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
32484a500fd1SYan, Zheng 	if (ret)
32494a500fd1SYan, Zheng 		return ret;
3250e02119d5SChris Mason 
3251e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3252e02119d5SChris Mason 			      struct btrfs_dir_log_item);
3253e02119d5SChris Mason 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3254e02119d5SChris Mason 	btrfs_mark_buffer_dirty(path->nodes[0]);
3255b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3256e02119d5SChris Mason 	return 0;
3257e02119d5SChris Mason }
3258e02119d5SChris Mason 
3259e02119d5SChris Mason /*
3260e02119d5SChris Mason  * log all the items included in the current transaction for a given
3261e02119d5SChris Mason  * directory.  This also creates the range items in the log tree required
3262e02119d5SChris Mason  * to replay anything deleted before the fsync
3263e02119d5SChris Mason  */
3264e02119d5SChris Mason static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3265684a5773SNikolay Borisov 			  struct btrfs_root *root, struct btrfs_inode *inode,
3266e02119d5SChris Mason 			  struct btrfs_path *path,
3267e02119d5SChris Mason 			  struct btrfs_path *dst_path, int key_type,
32682f2ff0eeSFilipe Manana 			  struct btrfs_log_ctx *ctx,
3269e02119d5SChris Mason 			  u64 min_offset, u64 *last_offset_ret)
3270e02119d5SChris Mason {
3271e02119d5SChris Mason 	struct btrfs_key min_key;
3272e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
3273e02119d5SChris Mason 	struct extent_buffer *src;
32744a500fd1SYan, Zheng 	int err = 0;
3275e02119d5SChris Mason 	int ret;
3276e02119d5SChris Mason 	int i;
3277e02119d5SChris Mason 	int nritems;
3278e02119d5SChris Mason 	u64 first_offset = min_offset;
3279e02119d5SChris Mason 	u64 last_offset = (u64)-1;
3280684a5773SNikolay Borisov 	u64 ino = btrfs_ino(inode);
3281e02119d5SChris Mason 
3282e02119d5SChris Mason 	log = root->log_root;
3283e02119d5SChris Mason 
328433345d01SLi Zefan 	min_key.objectid = ino;
3285e02119d5SChris Mason 	min_key.type = key_type;
3286e02119d5SChris Mason 	min_key.offset = min_offset;
3287e02119d5SChris Mason 
32886174d3cbSFilipe David Borba Manana 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3289e02119d5SChris Mason 
3290e02119d5SChris Mason 	/*
3291e02119d5SChris Mason 	 * we didn't find anything from this transaction, see if there
3292e02119d5SChris Mason 	 * is anything at all
3293e02119d5SChris Mason 	 */
329433345d01SLi Zefan 	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
329533345d01SLi Zefan 		min_key.objectid = ino;
3296e02119d5SChris Mason 		min_key.type = key_type;
3297e02119d5SChris Mason 		min_key.offset = (u64)-1;
3298b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3299e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3300e02119d5SChris Mason 		if (ret < 0) {
3301b3b4aa74SDavid Sterba 			btrfs_release_path(path);
3302e02119d5SChris Mason 			return ret;
3303e02119d5SChris Mason 		}
330433345d01SLi Zefan 		ret = btrfs_previous_item(root, path, ino, key_type);
3305e02119d5SChris Mason 
3306e02119d5SChris Mason 		/* if ret == 0 there are items for this type,
3307e02119d5SChris Mason 		 * create a range to tell us the last key of this type.
3308e02119d5SChris Mason 		 * otherwise, there are no items in this directory after
3309e02119d5SChris Mason 		 * *min_offset, and we create a range to indicate that.
3310e02119d5SChris Mason 		 */
3311e02119d5SChris Mason 		if (ret == 0) {
3312e02119d5SChris Mason 			struct btrfs_key tmp;
3313e02119d5SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3314e02119d5SChris Mason 					      path->slots[0]);
3315d397712bSChris Mason 			if (key_type == tmp.type)
3316e02119d5SChris Mason 				first_offset = max(min_offset, tmp.offset) + 1;
3317e02119d5SChris Mason 		}
3318e02119d5SChris Mason 		goto done;
3319e02119d5SChris Mason 	}
3320e02119d5SChris Mason 
3321e02119d5SChris Mason 	/* go backward to find any previous key */
332233345d01SLi Zefan 	ret = btrfs_previous_item(root, path, ino, key_type);
3323e02119d5SChris Mason 	if (ret == 0) {
3324e02119d5SChris Mason 		struct btrfs_key tmp;
3325e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3326e02119d5SChris Mason 		if (key_type == tmp.type) {
3327e02119d5SChris Mason 			first_offset = tmp.offset;
3328e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path,
3329e02119d5SChris Mason 					     path->nodes[0], path->slots[0],
3330e02119d5SChris Mason 					     &tmp);
33314a500fd1SYan, Zheng 			if (ret) {
33324a500fd1SYan, Zheng 				err = ret;
33334a500fd1SYan, Zheng 				goto done;
33344a500fd1SYan, Zheng 			}
3335e02119d5SChris Mason 		}
3336e02119d5SChris Mason 	}
3337b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3338e02119d5SChris Mason 
3339e02119d5SChris Mason 	/* find the first key from this transaction again */
3340e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3341fae7f21cSDulshani Gunawardhana 	if (WARN_ON(ret != 0))
3342e02119d5SChris Mason 		goto done;
3343e02119d5SChris Mason 
3344e02119d5SChris Mason 	/*
3345e02119d5SChris Mason 	 * we have a block from this transaction, log every item in it
3346e02119d5SChris Mason 	 * from our directory
3347e02119d5SChris Mason 	 */
3348e02119d5SChris Mason 	while (1) {
3349e02119d5SChris Mason 		struct btrfs_key tmp;
3350e02119d5SChris Mason 		src = path->nodes[0];
3351e02119d5SChris Mason 		nritems = btrfs_header_nritems(src);
3352e02119d5SChris Mason 		for (i = path->slots[0]; i < nritems; i++) {
33532f2ff0eeSFilipe Manana 			struct btrfs_dir_item *di;
33542f2ff0eeSFilipe Manana 
3355e02119d5SChris Mason 			btrfs_item_key_to_cpu(src, &min_key, i);
3356e02119d5SChris Mason 
335733345d01SLi Zefan 			if (min_key.objectid != ino || min_key.type != key_type)
3358e02119d5SChris Mason 				goto done;
3359e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path, src, i,
3360e02119d5SChris Mason 					     &min_key);
33614a500fd1SYan, Zheng 			if (ret) {
33624a500fd1SYan, Zheng 				err = ret;
33634a500fd1SYan, Zheng 				goto done;
33644a500fd1SYan, Zheng 			}
33652f2ff0eeSFilipe Manana 
33662f2ff0eeSFilipe Manana 			/*
33672f2ff0eeSFilipe Manana 			 * We must make sure that when we log a directory entry,
33682f2ff0eeSFilipe Manana 			 * the corresponding inode, after log replay, has a
33692f2ff0eeSFilipe Manana 			 * matching link count. For example:
33702f2ff0eeSFilipe Manana 			 *
33712f2ff0eeSFilipe Manana 			 * touch foo
33722f2ff0eeSFilipe Manana 			 * mkdir mydir
33732f2ff0eeSFilipe Manana 			 * sync
33742f2ff0eeSFilipe Manana 			 * ln foo mydir/bar
33752f2ff0eeSFilipe Manana 			 * xfs_io -c "fsync" mydir
33762f2ff0eeSFilipe Manana 			 * <crash>
33772f2ff0eeSFilipe Manana 			 * <mount fs and log replay>
33782f2ff0eeSFilipe Manana 			 *
33792f2ff0eeSFilipe Manana 			 * Would result in a fsync log that when replayed, our
33802f2ff0eeSFilipe Manana 			 * file inode would have a link count of 1, but we get
33812f2ff0eeSFilipe Manana 			 * two directory entries pointing to the same inode.
33822f2ff0eeSFilipe Manana 			 * After removing one of the names, it would not be
33832f2ff0eeSFilipe Manana 			 * possible to remove the other name, which resulted
33842f2ff0eeSFilipe Manana 			 * always in stale file handle errors, and would not
33852f2ff0eeSFilipe Manana 			 * be possible to rmdir the parent directory, since
33862f2ff0eeSFilipe Manana 			 * its i_size could never decrement to the value
33872f2ff0eeSFilipe Manana 			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
33882f2ff0eeSFilipe Manana 			 */
33892f2ff0eeSFilipe Manana 			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
33902f2ff0eeSFilipe Manana 			btrfs_dir_item_key_to_cpu(src, di, &tmp);
33912f2ff0eeSFilipe Manana 			if (ctx &&
33922f2ff0eeSFilipe Manana 			    (btrfs_dir_transid(src, di) == trans->transid ||
33932f2ff0eeSFilipe Manana 			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
33942f2ff0eeSFilipe Manana 			    tmp.type != BTRFS_ROOT_ITEM_KEY)
33952f2ff0eeSFilipe Manana 				ctx->log_new_dentries = true;
3396e02119d5SChris Mason 		}
3397e02119d5SChris Mason 		path->slots[0] = nritems;
3398e02119d5SChris Mason 
3399e02119d5SChris Mason 		/*
3400e02119d5SChris Mason 		 * look ahead to the next item and see if it is also
3401e02119d5SChris Mason 		 * from this directory and from this transaction
3402e02119d5SChris Mason 		 */
3403e02119d5SChris Mason 		ret = btrfs_next_leaf(root, path);
3404e02119d5SChris Mason 		if (ret == 1) {
3405e02119d5SChris Mason 			last_offset = (u64)-1;
3406e02119d5SChris Mason 			goto done;
3407e02119d5SChris Mason 		}
3408e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
340933345d01SLi Zefan 		if (tmp.objectid != ino || tmp.type != key_type) {
3410e02119d5SChris Mason 			last_offset = (u64)-1;
3411e02119d5SChris Mason 			goto done;
3412e02119d5SChris Mason 		}
3413e02119d5SChris Mason 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3414e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path,
3415e02119d5SChris Mason 					     path->nodes[0], path->slots[0],
3416e02119d5SChris Mason 					     &tmp);
34174a500fd1SYan, Zheng 			if (ret)
34184a500fd1SYan, Zheng 				err = ret;
34194a500fd1SYan, Zheng 			else
3420e02119d5SChris Mason 				last_offset = tmp.offset;
3421e02119d5SChris Mason 			goto done;
3422e02119d5SChris Mason 		}
3423e02119d5SChris Mason 	}
3424e02119d5SChris Mason done:
3425b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3426b3b4aa74SDavid Sterba 	btrfs_release_path(dst_path);
3427e02119d5SChris Mason 
34284a500fd1SYan, Zheng 	if (err == 0) {
34294a500fd1SYan, Zheng 		*last_offset_ret = last_offset;
34304a500fd1SYan, Zheng 		/*
34314a500fd1SYan, Zheng 		 * insert the log range keys to indicate where the log
34324a500fd1SYan, Zheng 		 * is valid
34334a500fd1SYan, Zheng 		 */
34344a500fd1SYan, Zheng 		ret = insert_dir_log_key(trans, log, path, key_type,
343533345d01SLi Zefan 					 ino, first_offset, last_offset);
34364a500fd1SYan, Zheng 		if (ret)
34374a500fd1SYan, Zheng 			err = ret;
34384a500fd1SYan, Zheng 	}
34394a500fd1SYan, Zheng 	return err;
3440e02119d5SChris Mason }
3441e02119d5SChris Mason 
3442e02119d5SChris Mason /*
3443e02119d5SChris Mason  * logging directories is very similar to logging inodes, We find all the items
3444e02119d5SChris Mason  * from the current transaction and write them to the log.
3445e02119d5SChris Mason  *
3446e02119d5SChris Mason  * The recovery code scans the directory in the subvolume, and if it finds a
3447e02119d5SChris Mason  * key in the range logged that is not present in the log tree, then it means
3448e02119d5SChris Mason  * that dir entry was unlinked during the transaction.
3449e02119d5SChris Mason  *
3450e02119d5SChris Mason  * In order for that scan to work, we must include one key smaller than
3451e02119d5SChris Mason  * the smallest logged by this transaction and one key larger than the largest
3452e02119d5SChris Mason  * key logged by this transaction.
3453e02119d5SChris Mason  */
3454e02119d5SChris Mason static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3455dbf39ea4SNikolay Borisov 			  struct btrfs_root *root, struct btrfs_inode *inode,
3456e02119d5SChris Mason 			  struct btrfs_path *path,
34572f2ff0eeSFilipe Manana 			  struct btrfs_path *dst_path,
34582f2ff0eeSFilipe Manana 			  struct btrfs_log_ctx *ctx)
3459e02119d5SChris Mason {
3460e02119d5SChris Mason 	u64 min_key;
3461e02119d5SChris Mason 	u64 max_key;
3462e02119d5SChris Mason 	int ret;
3463e02119d5SChris Mason 	int key_type = BTRFS_DIR_ITEM_KEY;
3464e02119d5SChris Mason 
3465e02119d5SChris Mason again:
3466e02119d5SChris Mason 	min_key = 0;
3467e02119d5SChris Mason 	max_key = 0;
3468e02119d5SChris Mason 	while (1) {
3469dbf39ea4SNikolay Borisov 		ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3470dbf39ea4SNikolay Borisov 				ctx, min_key, &max_key);
34714a500fd1SYan, Zheng 		if (ret)
34724a500fd1SYan, Zheng 			return ret;
3473e02119d5SChris Mason 		if (max_key == (u64)-1)
3474e02119d5SChris Mason 			break;
3475e02119d5SChris Mason 		min_key = max_key + 1;
3476e02119d5SChris Mason 	}
3477e02119d5SChris Mason 
3478e02119d5SChris Mason 	if (key_type == BTRFS_DIR_ITEM_KEY) {
3479e02119d5SChris Mason 		key_type = BTRFS_DIR_INDEX_KEY;
3480e02119d5SChris Mason 		goto again;
3481e02119d5SChris Mason 	}
3482e02119d5SChris Mason 	return 0;
3483e02119d5SChris Mason }
3484e02119d5SChris Mason 
3485e02119d5SChris Mason /*
3486e02119d5SChris Mason  * a helper function to drop items from the log before we relog an
3487e02119d5SChris Mason  * inode.  max_key_type indicates the highest item type to remove.
3488e02119d5SChris Mason  * This cannot be run for file data extents because it does not
3489e02119d5SChris Mason  * free the extents they point to.
3490e02119d5SChris Mason  */
3491e02119d5SChris Mason static int drop_objectid_items(struct btrfs_trans_handle *trans,
3492e02119d5SChris Mason 				  struct btrfs_root *log,
3493e02119d5SChris Mason 				  struct btrfs_path *path,
3494e02119d5SChris Mason 				  u64 objectid, int max_key_type)
3495e02119d5SChris Mason {
3496e02119d5SChris Mason 	int ret;
3497e02119d5SChris Mason 	struct btrfs_key key;
3498e02119d5SChris Mason 	struct btrfs_key found_key;
349918ec90d6SJosef Bacik 	int start_slot;
3500e02119d5SChris Mason 
3501e02119d5SChris Mason 	key.objectid = objectid;
3502e02119d5SChris Mason 	key.type = max_key_type;
3503e02119d5SChris Mason 	key.offset = (u64)-1;
3504e02119d5SChris Mason 
3505e02119d5SChris Mason 	while (1) {
3506e02119d5SChris Mason 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
35073650860bSJosef Bacik 		BUG_ON(ret == 0); /* Logic error */
35084a500fd1SYan, Zheng 		if (ret < 0)
3509e02119d5SChris Mason 			break;
3510e02119d5SChris Mason 
3511e02119d5SChris Mason 		if (path->slots[0] == 0)
3512e02119d5SChris Mason 			break;
3513e02119d5SChris Mason 
3514e02119d5SChris Mason 		path->slots[0]--;
3515e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3516e02119d5SChris Mason 				      path->slots[0]);
3517e02119d5SChris Mason 
3518e02119d5SChris Mason 		if (found_key.objectid != objectid)
3519e02119d5SChris Mason 			break;
3520e02119d5SChris Mason 
352118ec90d6SJosef Bacik 		found_key.offset = 0;
352218ec90d6SJosef Bacik 		found_key.type = 0;
352318ec90d6SJosef Bacik 		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
352418ec90d6SJosef Bacik 				       &start_slot);
352518ec90d6SJosef Bacik 
352618ec90d6SJosef Bacik 		ret = btrfs_del_items(trans, log, path, start_slot,
352718ec90d6SJosef Bacik 				      path->slots[0] - start_slot + 1);
352818ec90d6SJosef Bacik 		/*
352918ec90d6SJosef Bacik 		 * If start slot isn't 0 then we don't need to re-search, we've
353018ec90d6SJosef Bacik 		 * found the last guy with the objectid in this tree.
353118ec90d6SJosef Bacik 		 */
353218ec90d6SJosef Bacik 		if (ret || start_slot != 0)
353365a246c5STsutomu Itoh 			break;
3534b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3535e02119d5SChris Mason 	}
3536b3b4aa74SDavid Sterba 	btrfs_release_path(path);
35375bdbeb21SJosef Bacik 	if (ret > 0)
35385bdbeb21SJosef Bacik 		ret = 0;
35394a500fd1SYan, Zheng 	return ret;
3540e02119d5SChris Mason }
3541e02119d5SChris Mason 
354294edf4aeSJosef Bacik static void fill_inode_item(struct btrfs_trans_handle *trans,
354394edf4aeSJosef Bacik 			    struct extent_buffer *leaf,
354494edf4aeSJosef Bacik 			    struct btrfs_inode_item *item,
35451a4bcf47SFilipe Manana 			    struct inode *inode, int log_inode_only,
35461a4bcf47SFilipe Manana 			    u64 logged_isize)
354794edf4aeSJosef Bacik {
35480b1c6ccaSJosef Bacik 	struct btrfs_map_token token;
354994edf4aeSJosef Bacik 
35500b1c6ccaSJosef Bacik 	btrfs_init_map_token(&token);
355194edf4aeSJosef Bacik 
355294edf4aeSJosef Bacik 	if (log_inode_only) {
355394edf4aeSJosef Bacik 		/* set the generation to zero so the recover code
355494edf4aeSJosef Bacik 		 * can tell the difference between an logging
355594edf4aeSJosef Bacik 		 * just to say 'this inode exists' and a logging
355694edf4aeSJosef Bacik 		 * to say 'update this inode with these values'
355794edf4aeSJosef Bacik 		 */
35580b1c6ccaSJosef Bacik 		btrfs_set_token_inode_generation(leaf, item, 0, &token);
35591a4bcf47SFilipe Manana 		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
356094edf4aeSJosef Bacik 	} else {
35610b1c6ccaSJosef Bacik 		btrfs_set_token_inode_generation(leaf, item,
35620b1c6ccaSJosef Bacik 						 BTRFS_I(inode)->generation,
35630b1c6ccaSJosef Bacik 						 &token);
35640b1c6ccaSJosef Bacik 		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
356594edf4aeSJosef Bacik 	}
356694edf4aeSJosef Bacik 
35670b1c6ccaSJosef Bacik 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
35680b1c6ccaSJosef Bacik 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
35690b1c6ccaSJosef Bacik 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
35700b1c6ccaSJosef Bacik 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
35710b1c6ccaSJosef Bacik 
3572a937b979SDavid Sterba 	btrfs_set_token_timespec_sec(leaf, &item->atime,
35730b1c6ccaSJosef Bacik 				     inode->i_atime.tv_sec, &token);
3574a937b979SDavid Sterba 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
35750b1c6ccaSJosef Bacik 				      inode->i_atime.tv_nsec, &token);
35760b1c6ccaSJosef Bacik 
3577a937b979SDavid Sterba 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
35780b1c6ccaSJosef Bacik 				     inode->i_mtime.tv_sec, &token);
3579a937b979SDavid Sterba 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
35800b1c6ccaSJosef Bacik 				      inode->i_mtime.tv_nsec, &token);
35810b1c6ccaSJosef Bacik 
3582a937b979SDavid Sterba 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
35830b1c6ccaSJosef Bacik 				     inode->i_ctime.tv_sec, &token);
3584a937b979SDavid Sterba 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
35850b1c6ccaSJosef Bacik 				      inode->i_ctime.tv_nsec, &token);
35860b1c6ccaSJosef Bacik 
35870b1c6ccaSJosef Bacik 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
35880b1c6ccaSJosef Bacik 				     &token);
35890b1c6ccaSJosef Bacik 
35900b1c6ccaSJosef Bacik 	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
35910b1c6ccaSJosef Bacik 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
35920b1c6ccaSJosef Bacik 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
35930b1c6ccaSJosef Bacik 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
35940b1c6ccaSJosef Bacik 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
359594edf4aeSJosef Bacik }
359694edf4aeSJosef Bacik 
3597a95249b3SJosef Bacik static int log_inode_item(struct btrfs_trans_handle *trans,
3598a95249b3SJosef Bacik 			  struct btrfs_root *log, struct btrfs_path *path,
3599*6d889a3bSNikolay Borisov 			  struct btrfs_inode *inode)
3600a95249b3SJosef Bacik {
3601a95249b3SJosef Bacik 	struct btrfs_inode_item *inode_item;
3602a95249b3SJosef Bacik 	int ret;
3603a95249b3SJosef Bacik 
3604efd0c405SFilipe David Borba Manana 	ret = btrfs_insert_empty_item(trans, log, path,
3605*6d889a3bSNikolay Borisov 				      &inode->location, sizeof(*inode_item));
3606a95249b3SJosef Bacik 	if (ret && ret != -EEXIST)
3607a95249b3SJosef Bacik 		return ret;
3608a95249b3SJosef Bacik 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3609a95249b3SJosef Bacik 				    struct btrfs_inode_item);
3610*6d889a3bSNikolay Borisov 	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3611*6d889a3bSNikolay Borisov 			0, 0);
3612a95249b3SJosef Bacik 	btrfs_release_path(path);
3613a95249b3SJosef Bacik 	return 0;
3614a95249b3SJosef Bacik }
3615a95249b3SJosef Bacik 
361631ff1cd2SChris Mason static noinline int copy_items(struct btrfs_trans_handle *trans,
361744d70e19SNikolay Borisov 			       struct btrfs_inode *inode,
361831ff1cd2SChris Mason 			       struct btrfs_path *dst_path,
361916e7549fSJosef Bacik 			       struct btrfs_path *src_path, u64 *last_extent,
36201a4bcf47SFilipe Manana 			       int start_slot, int nr, int inode_only,
36211a4bcf47SFilipe Manana 			       u64 logged_isize)
362231ff1cd2SChris Mason {
362344d70e19SNikolay Borisov 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
362431ff1cd2SChris Mason 	unsigned long src_offset;
362531ff1cd2SChris Mason 	unsigned long dst_offset;
362644d70e19SNikolay Borisov 	struct btrfs_root *log = inode->root->log_root;
362731ff1cd2SChris Mason 	struct btrfs_file_extent_item *extent;
362831ff1cd2SChris Mason 	struct btrfs_inode_item *inode_item;
362916e7549fSJosef Bacik 	struct extent_buffer *src = src_path->nodes[0];
363016e7549fSJosef Bacik 	struct btrfs_key first_key, last_key, key;
363131ff1cd2SChris Mason 	int ret;
363231ff1cd2SChris Mason 	struct btrfs_key *ins_keys;
363331ff1cd2SChris Mason 	u32 *ins_sizes;
363431ff1cd2SChris Mason 	char *ins_data;
363531ff1cd2SChris Mason 	int i;
3636d20f7043SChris Mason 	struct list_head ordered_sums;
363744d70e19SNikolay Borisov 	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
363816e7549fSJosef Bacik 	bool has_extents = false;
363974121f7cSFilipe Manana 	bool need_find_last_extent = true;
364016e7549fSJosef Bacik 	bool done = false;
3641d20f7043SChris Mason 
3642d20f7043SChris Mason 	INIT_LIST_HEAD(&ordered_sums);
364331ff1cd2SChris Mason 
364431ff1cd2SChris Mason 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
364531ff1cd2SChris Mason 			   nr * sizeof(u32), GFP_NOFS);
36462a29edc6Sliubo 	if (!ins_data)
36472a29edc6Sliubo 		return -ENOMEM;
36482a29edc6Sliubo 
364916e7549fSJosef Bacik 	first_key.objectid = (u64)-1;
365016e7549fSJosef Bacik 
365131ff1cd2SChris Mason 	ins_sizes = (u32 *)ins_data;
365231ff1cd2SChris Mason 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
365331ff1cd2SChris Mason 
365431ff1cd2SChris Mason 	for (i = 0; i < nr; i++) {
365531ff1cd2SChris Mason 		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
365631ff1cd2SChris Mason 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
365731ff1cd2SChris Mason 	}
365831ff1cd2SChris Mason 	ret = btrfs_insert_empty_items(trans, log, dst_path,
365931ff1cd2SChris Mason 				       ins_keys, ins_sizes, nr);
36604a500fd1SYan, Zheng 	if (ret) {
36614a500fd1SYan, Zheng 		kfree(ins_data);
36624a500fd1SYan, Zheng 		return ret;
36634a500fd1SYan, Zheng 	}
366431ff1cd2SChris Mason 
36655d4f98a2SYan Zheng 	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
366631ff1cd2SChris Mason 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
366731ff1cd2SChris Mason 						   dst_path->slots[0]);
366831ff1cd2SChris Mason 
366931ff1cd2SChris Mason 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
367031ff1cd2SChris Mason 
367116e7549fSJosef Bacik 		if ((i == (nr - 1)))
367216e7549fSJosef Bacik 			last_key = ins_keys[i];
367316e7549fSJosef Bacik 
367494edf4aeSJosef Bacik 		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
367531ff1cd2SChris Mason 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
367631ff1cd2SChris Mason 						    dst_path->slots[0],
367731ff1cd2SChris Mason 						    struct btrfs_inode_item);
367894edf4aeSJosef Bacik 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
367944d70e19SNikolay Borisov 					&inode->vfs_inode, inode_only == LOG_INODE_EXISTS,
36801a4bcf47SFilipe Manana 					logged_isize);
368194edf4aeSJosef Bacik 		} else {
368294edf4aeSJosef Bacik 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
368394edf4aeSJosef Bacik 					   src_offset, ins_sizes[i]);
368431ff1cd2SChris Mason 		}
368594edf4aeSJosef Bacik 
368616e7549fSJosef Bacik 		/*
368716e7549fSJosef Bacik 		 * We set need_find_last_extent here in case we know we were
368816e7549fSJosef Bacik 		 * processing other items and then walk into the first extent in
368916e7549fSJosef Bacik 		 * the inode.  If we don't hit an extent then nothing changes,
369016e7549fSJosef Bacik 		 * we'll do the last search the next time around.
369116e7549fSJosef Bacik 		 */
369216e7549fSJosef Bacik 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
369316e7549fSJosef Bacik 			has_extents = true;
369474121f7cSFilipe Manana 			if (first_key.objectid == (u64)-1)
369516e7549fSJosef Bacik 				first_key = ins_keys[i];
369616e7549fSJosef Bacik 		} else {
369716e7549fSJosef Bacik 			need_find_last_extent = false;
369816e7549fSJosef Bacik 		}
369916e7549fSJosef Bacik 
370031ff1cd2SChris Mason 		/* take a reference on file data extents so that truncates
370131ff1cd2SChris Mason 		 * or deletes of this inode don't have to relog the inode
370231ff1cd2SChris Mason 		 * again
370331ff1cd2SChris Mason 		 */
3704962a298fSDavid Sterba 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3705d2794405SLiu Bo 		    !skip_csum) {
370631ff1cd2SChris Mason 			int found_type;
370731ff1cd2SChris Mason 			extent = btrfs_item_ptr(src, start_slot + i,
370831ff1cd2SChris Mason 						struct btrfs_file_extent_item);
370931ff1cd2SChris Mason 
37108e531cdfSliubo 			if (btrfs_file_extent_generation(src, extent) < trans->transid)
37118e531cdfSliubo 				continue;
37128e531cdfSliubo 
371331ff1cd2SChris Mason 			found_type = btrfs_file_extent_type(src, extent);
37146f1fed77SJosef Bacik 			if (found_type == BTRFS_FILE_EXTENT_REG) {
37155d4f98a2SYan Zheng 				u64 ds, dl, cs, cl;
37165d4f98a2SYan Zheng 				ds = btrfs_file_extent_disk_bytenr(src,
371731ff1cd2SChris Mason 								extent);
37185d4f98a2SYan Zheng 				/* ds == 0 is a hole */
37195d4f98a2SYan Zheng 				if (ds == 0)
37205d4f98a2SYan Zheng 					continue;
37215d4f98a2SYan Zheng 
37225d4f98a2SYan Zheng 				dl = btrfs_file_extent_disk_num_bytes(src,
372331ff1cd2SChris Mason 								extent);
37245d4f98a2SYan Zheng 				cs = btrfs_file_extent_offset(src, extent);
37255d4f98a2SYan Zheng 				cl = btrfs_file_extent_num_bytes(src,
3726a419aef8SJoe Perches 								extent);
3727580afd76SChris Mason 				if (btrfs_file_extent_compression(src,
3728580afd76SChris Mason 								  extent)) {
3729580afd76SChris Mason 					cs = 0;
3730580afd76SChris Mason 					cl = dl;
3731580afd76SChris Mason 				}
37325d4f98a2SYan Zheng 
373307d400a6SYan Zheng 				ret = btrfs_lookup_csums_range(
37340b246afaSJeff Mahoney 						fs_info->csum_root,
373507d400a6SYan Zheng 						ds + cs, ds + cs + cl - 1,
3736a2de733cSArne Jansen 						&ordered_sums, 0);
37373650860bSJosef Bacik 				if (ret) {
37383650860bSJosef Bacik 					btrfs_release_path(dst_path);
37393650860bSJosef Bacik 					kfree(ins_data);
37403650860bSJosef Bacik 					return ret;
37413650860bSJosef Bacik 				}
374231ff1cd2SChris Mason 			}
374331ff1cd2SChris Mason 		}
374431ff1cd2SChris Mason 	}
374531ff1cd2SChris Mason 
374631ff1cd2SChris Mason 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3747b3b4aa74SDavid Sterba 	btrfs_release_path(dst_path);
374831ff1cd2SChris Mason 	kfree(ins_data);
3749d20f7043SChris Mason 
3750d20f7043SChris Mason 	/*
3751d20f7043SChris Mason 	 * we have to do this after the loop above to avoid changing the
3752d20f7043SChris Mason 	 * log tree while trying to change the log tree.
3753d20f7043SChris Mason 	 */
37544a500fd1SYan, Zheng 	ret = 0;
3755d20f7043SChris Mason 	while (!list_empty(&ordered_sums)) {
3756d20f7043SChris Mason 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3757d20f7043SChris Mason 						   struct btrfs_ordered_sum,
3758d20f7043SChris Mason 						   list);
37594a500fd1SYan, Zheng 		if (!ret)
3760d20f7043SChris Mason 			ret = btrfs_csum_file_blocks(trans, log, sums);
3761d20f7043SChris Mason 		list_del(&sums->list);
3762d20f7043SChris Mason 		kfree(sums);
3763d20f7043SChris Mason 	}
376416e7549fSJosef Bacik 
376516e7549fSJosef Bacik 	if (!has_extents)
376616e7549fSJosef Bacik 		return ret;
376716e7549fSJosef Bacik 
376874121f7cSFilipe Manana 	if (need_find_last_extent && *last_extent == first_key.offset) {
376974121f7cSFilipe Manana 		/*
377074121f7cSFilipe Manana 		 * We don't have any leafs between our current one and the one
377174121f7cSFilipe Manana 		 * we processed before that can have file extent items for our
377274121f7cSFilipe Manana 		 * inode (and have a generation number smaller than our current
377374121f7cSFilipe Manana 		 * transaction id).
377474121f7cSFilipe Manana 		 */
377574121f7cSFilipe Manana 		need_find_last_extent = false;
377674121f7cSFilipe Manana 	}
377774121f7cSFilipe Manana 
377816e7549fSJosef Bacik 	/*
377916e7549fSJosef Bacik 	 * Because we use btrfs_search_forward we could skip leaves that were
378016e7549fSJosef Bacik 	 * not modified and then assume *last_extent is valid when it really
378116e7549fSJosef Bacik 	 * isn't.  So back up to the previous leaf and read the end of the last
378216e7549fSJosef Bacik 	 * extent before we go and fill in holes.
378316e7549fSJosef Bacik 	 */
378416e7549fSJosef Bacik 	if (need_find_last_extent) {
378516e7549fSJosef Bacik 		u64 len;
378616e7549fSJosef Bacik 
378744d70e19SNikolay Borisov 		ret = btrfs_prev_leaf(inode->root, src_path);
378816e7549fSJosef Bacik 		if (ret < 0)
378916e7549fSJosef Bacik 			return ret;
379016e7549fSJosef Bacik 		if (ret)
379116e7549fSJosef Bacik 			goto fill_holes;
379216e7549fSJosef Bacik 		if (src_path->slots[0])
379316e7549fSJosef Bacik 			src_path->slots[0]--;
379416e7549fSJosef Bacik 		src = src_path->nodes[0];
379516e7549fSJosef Bacik 		btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
379644d70e19SNikolay Borisov 		if (key.objectid != btrfs_ino(inode) ||
379716e7549fSJosef Bacik 		    key.type != BTRFS_EXTENT_DATA_KEY)
379816e7549fSJosef Bacik 			goto fill_holes;
379916e7549fSJosef Bacik 		extent = btrfs_item_ptr(src, src_path->slots[0],
380016e7549fSJosef Bacik 					struct btrfs_file_extent_item);
380116e7549fSJosef Bacik 		if (btrfs_file_extent_type(src, extent) ==
380216e7549fSJosef Bacik 		    BTRFS_FILE_EXTENT_INLINE) {
3803514ac8adSChris Mason 			len = btrfs_file_extent_inline_len(src,
3804514ac8adSChris Mason 							   src_path->slots[0],
3805514ac8adSChris Mason 							   extent);
380616e7549fSJosef Bacik 			*last_extent = ALIGN(key.offset + len,
38070b246afaSJeff Mahoney 					     fs_info->sectorsize);
380816e7549fSJosef Bacik 		} else {
380916e7549fSJosef Bacik 			len = btrfs_file_extent_num_bytes(src, extent);
381016e7549fSJosef Bacik 			*last_extent = key.offset + len;
381116e7549fSJosef Bacik 		}
381216e7549fSJosef Bacik 	}
381316e7549fSJosef Bacik fill_holes:
381416e7549fSJosef Bacik 	/* So we did prev_leaf, now we need to move to the next leaf, but a few
381516e7549fSJosef Bacik 	 * things could have happened
381616e7549fSJosef Bacik 	 *
381716e7549fSJosef Bacik 	 * 1) A merge could have happened, so we could currently be on a leaf
381816e7549fSJosef Bacik 	 * that holds what we were copying in the first place.
381916e7549fSJosef Bacik 	 * 2) A split could have happened, and now not all of the items we want
382016e7549fSJosef Bacik 	 * are on the same leaf.
382116e7549fSJosef Bacik 	 *
382216e7549fSJosef Bacik 	 * So we need to adjust how we search for holes, we need to drop the
382316e7549fSJosef Bacik 	 * path and re-search for the first extent key we found, and then walk
382416e7549fSJosef Bacik 	 * forward until we hit the last one we copied.
382516e7549fSJosef Bacik 	 */
382616e7549fSJosef Bacik 	if (need_find_last_extent) {
382716e7549fSJosef Bacik 		/* btrfs_prev_leaf could return 1 without releasing the path */
382816e7549fSJosef Bacik 		btrfs_release_path(src_path);
382944d70e19SNikolay Borisov 		ret = btrfs_search_slot(NULL, inode->root, &first_key, src_path, 0, 0);
383016e7549fSJosef Bacik 		if (ret < 0)
383116e7549fSJosef Bacik 			return ret;
383216e7549fSJosef Bacik 		ASSERT(ret == 0);
383316e7549fSJosef Bacik 		src = src_path->nodes[0];
383416e7549fSJosef Bacik 		i = src_path->slots[0];
383516e7549fSJosef Bacik 	} else {
383616e7549fSJosef Bacik 		i = start_slot;
383716e7549fSJosef Bacik 	}
383816e7549fSJosef Bacik 
383916e7549fSJosef Bacik 	/*
384016e7549fSJosef Bacik 	 * Ok so here we need to go through and fill in any holes we may have
384116e7549fSJosef Bacik 	 * to make sure that holes are punched for those areas in case they had
384216e7549fSJosef Bacik 	 * extents previously.
384316e7549fSJosef Bacik 	 */
384416e7549fSJosef Bacik 	while (!done) {
384516e7549fSJosef Bacik 		u64 offset, len;
384616e7549fSJosef Bacik 		u64 extent_end;
384716e7549fSJosef Bacik 
384816e7549fSJosef Bacik 		if (i >= btrfs_header_nritems(src_path->nodes[0])) {
384944d70e19SNikolay Borisov 			ret = btrfs_next_leaf(inode->root, src_path);
385016e7549fSJosef Bacik 			if (ret < 0)
385116e7549fSJosef Bacik 				return ret;
385216e7549fSJosef Bacik 			ASSERT(ret == 0);
385316e7549fSJosef Bacik 			src = src_path->nodes[0];
385416e7549fSJosef Bacik 			i = 0;
385516e7549fSJosef Bacik 		}
385616e7549fSJosef Bacik 
385716e7549fSJosef Bacik 		btrfs_item_key_to_cpu(src, &key, i);
385816e7549fSJosef Bacik 		if (!btrfs_comp_cpu_keys(&key, &last_key))
385916e7549fSJosef Bacik 			done = true;
386044d70e19SNikolay Borisov 		if (key.objectid != btrfs_ino(inode) ||
386116e7549fSJosef Bacik 		    key.type != BTRFS_EXTENT_DATA_KEY) {
386216e7549fSJosef Bacik 			i++;
386316e7549fSJosef Bacik 			continue;
386416e7549fSJosef Bacik 		}
386516e7549fSJosef Bacik 		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
386616e7549fSJosef Bacik 		if (btrfs_file_extent_type(src, extent) ==
386716e7549fSJosef Bacik 		    BTRFS_FILE_EXTENT_INLINE) {
3868514ac8adSChris Mason 			len = btrfs_file_extent_inline_len(src, i, extent);
3869da17066cSJeff Mahoney 			extent_end = ALIGN(key.offset + len,
38700b246afaSJeff Mahoney 					   fs_info->sectorsize);
387116e7549fSJosef Bacik 		} else {
387216e7549fSJosef Bacik 			len = btrfs_file_extent_num_bytes(src, extent);
387316e7549fSJosef Bacik 			extent_end = key.offset + len;
387416e7549fSJosef Bacik 		}
387516e7549fSJosef Bacik 		i++;
387616e7549fSJosef Bacik 
387716e7549fSJosef Bacik 		if (*last_extent == key.offset) {
387816e7549fSJosef Bacik 			*last_extent = extent_end;
387916e7549fSJosef Bacik 			continue;
388016e7549fSJosef Bacik 		}
388116e7549fSJosef Bacik 		offset = *last_extent;
388216e7549fSJosef Bacik 		len = key.offset - *last_extent;
388344d70e19SNikolay Borisov 		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
388444d70e19SNikolay Borisov 					       offset, 0, 0, len, 0, len, 0, 0, 0);
388516e7549fSJosef Bacik 		if (ret)
388616e7549fSJosef Bacik 			break;
388774121f7cSFilipe Manana 		*last_extent = extent_end;
388816e7549fSJosef Bacik 	}
388916e7549fSJosef Bacik 	/*
389016e7549fSJosef Bacik 	 * Need to let the callers know we dropped the path so they should
389116e7549fSJosef Bacik 	 * re-search.
389216e7549fSJosef Bacik 	 */
389316e7549fSJosef Bacik 	if (!ret && need_find_last_extent)
389416e7549fSJosef Bacik 		ret = 1;
38954a500fd1SYan, Zheng 	return ret;
389631ff1cd2SChris Mason }
389731ff1cd2SChris Mason 
38985dc562c5SJosef Bacik static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
38995dc562c5SJosef Bacik {
39005dc562c5SJosef Bacik 	struct extent_map *em1, *em2;
39015dc562c5SJosef Bacik 
39025dc562c5SJosef Bacik 	em1 = list_entry(a, struct extent_map, list);
39035dc562c5SJosef Bacik 	em2 = list_entry(b, struct extent_map, list);
39045dc562c5SJosef Bacik 
39055dc562c5SJosef Bacik 	if (em1->start < em2->start)
39065dc562c5SJosef Bacik 		return -1;
39075dc562c5SJosef Bacik 	else if (em1->start > em2->start)
39085dc562c5SJosef Bacik 		return 1;
39095dc562c5SJosef Bacik 	return 0;
39105dc562c5SJosef Bacik }
39115dc562c5SJosef Bacik 
39128407f553SFilipe Manana static int wait_ordered_extents(struct btrfs_trans_handle *trans,
39138407f553SFilipe Manana 				struct inode *inode,
39148407f553SFilipe Manana 				struct btrfs_root *root,
39158407f553SFilipe Manana 				const struct extent_map *em,
39168407f553SFilipe Manana 				const struct list_head *logged_list,
39178407f553SFilipe Manana 				bool *ordered_io_error)
39185dc562c5SJosef Bacik {
39190b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
39202ab28f32SJosef Bacik 	struct btrfs_ordered_extent *ordered;
39218407f553SFilipe Manana 	struct btrfs_root *log = root->log_root;
39222ab28f32SJosef Bacik 	u64 mod_start = em->mod_start;
39232ab28f32SJosef Bacik 	u64 mod_len = em->mod_len;
39248407f553SFilipe Manana 	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
39252ab28f32SJosef Bacik 	u64 csum_offset;
39262ab28f32SJosef Bacik 	u64 csum_len;
39278407f553SFilipe Manana 	LIST_HEAD(ordered_sums);
39288407f553SFilipe Manana 	int ret = 0;
392909a2a8f9SJosef Bacik 
39308407f553SFilipe Manana 	*ordered_io_error = false;
39311acae57bSFilipe David Borba Manana 
39328407f553SFilipe Manana 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
39338407f553SFilipe Manana 	    em->block_start == EXTENT_MAP_HOLE)
393470c8a91cSJosef Bacik 		return 0;
393570c8a91cSJosef Bacik 
39362ab28f32SJosef Bacik 	/*
39378407f553SFilipe Manana 	 * Wait far any ordered extent that covers our extent map. If it
39388407f553SFilipe Manana 	 * finishes without an error, first check and see if our csums are on
39398407f553SFilipe Manana 	 * our outstanding ordered extents.
39402ab28f32SJosef Bacik 	 */
3941827463c4SMiao Xie 	list_for_each_entry(ordered, logged_list, log_list) {
39422ab28f32SJosef Bacik 		struct btrfs_ordered_sum *sum;
39432ab28f32SJosef Bacik 
39442ab28f32SJosef Bacik 		if (!mod_len)
39452ab28f32SJosef Bacik 			break;
39462ab28f32SJosef Bacik 
39472ab28f32SJosef Bacik 		if (ordered->file_offset + ordered->len <= mod_start ||
39482ab28f32SJosef Bacik 		    mod_start + mod_len <= ordered->file_offset)
39492ab28f32SJosef Bacik 			continue;
39502ab28f32SJosef Bacik 
39518407f553SFilipe Manana 		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
39528407f553SFilipe Manana 		    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
39538407f553SFilipe Manana 		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
39548407f553SFilipe Manana 			const u64 start = ordered->file_offset;
39558407f553SFilipe Manana 			const u64 end = ordered->file_offset + ordered->len - 1;
39568407f553SFilipe Manana 
39578407f553SFilipe Manana 			WARN_ON(ordered->inode != inode);
39588407f553SFilipe Manana 			filemap_fdatawrite_range(inode->i_mapping, start, end);
39598407f553SFilipe Manana 		}
39608407f553SFilipe Manana 
39618407f553SFilipe Manana 		wait_event(ordered->wait,
39628407f553SFilipe Manana 			   (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
39638407f553SFilipe Manana 			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
39648407f553SFilipe Manana 
39658407f553SFilipe Manana 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3966b38ef71cSFilipe Manana 			/*
3967b38ef71cSFilipe Manana 			 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3968b38ef71cSFilipe Manana 			 * i_mapping flags, so that the next fsync won't get
3969b38ef71cSFilipe Manana 			 * an outdated io error too.
3970b38ef71cSFilipe Manana 			 */
3971f0312210SMiklos Szeredi 			filemap_check_errors(inode->i_mapping);
39728407f553SFilipe Manana 			*ordered_io_error = true;
39738407f553SFilipe Manana 			break;
39748407f553SFilipe Manana 		}
39752ab28f32SJosef Bacik 		/*
39762ab28f32SJosef Bacik 		 * We are going to copy all the csums on this ordered extent, so
39772ab28f32SJosef Bacik 		 * go ahead and adjust mod_start and mod_len in case this
39782ab28f32SJosef Bacik 		 * ordered extent has already been logged.
39792ab28f32SJosef Bacik 		 */
39802ab28f32SJosef Bacik 		if (ordered->file_offset > mod_start) {
39812ab28f32SJosef Bacik 			if (ordered->file_offset + ordered->len >=
39822ab28f32SJosef Bacik 			    mod_start + mod_len)
39832ab28f32SJosef Bacik 				mod_len = ordered->file_offset - mod_start;
39842ab28f32SJosef Bacik 			/*
39852ab28f32SJosef Bacik 			 * If we have this case
39862ab28f32SJosef Bacik 			 *
39872ab28f32SJosef Bacik 			 * |--------- logged extent ---------|
39882ab28f32SJosef Bacik 			 *       |----- ordered extent ----|
39892ab28f32SJosef Bacik 			 *
39902ab28f32SJosef Bacik 			 * Just don't mess with mod_start and mod_len, we'll
39912ab28f32SJosef Bacik 			 * just end up logging more csums than we need and it
39922ab28f32SJosef Bacik 			 * will be ok.
39932ab28f32SJosef Bacik 			 */
39942ab28f32SJosef Bacik 		} else {
39952ab28f32SJosef Bacik 			if (ordered->file_offset + ordered->len <
39962ab28f32SJosef Bacik 			    mod_start + mod_len) {
39972ab28f32SJosef Bacik 				mod_len = (mod_start + mod_len) -
39982ab28f32SJosef Bacik 					(ordered->file_offset + ordered->len);
39992ab28f32SJosef Bacik 				mod_start = ordered->file_offset +
40002ab28f32SJosef Bacik 					ordered->len;
40012ab28f32SJosef Bacik 			} else {
40022ab28f32SJosef Bacik 				mod_len = 0;
40032ab28f32SJosef Bacik 			}
40042ab28f32SJosef Bacik 		}
40052ab28f32SJosef Bacik 
40068407f553SFilipe Manana 		if (skip_csum)
40078407f553SFilipe Manana 			continue;
40088407f553SFilipe Manana 
40092ab28f32SJosef Bacik 		/*
40102ab28f32SJosef Bacik 		 * To keep us from looping for the above case of an ordered
40112ab28f32SJosef Bacik 		 * extent that falls inside of the logged extent.
40122ab28f32SJosef Bacik 		 */
40132ab28f32SJosef Bacik 		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
40142ab28f32SJosef Bacik 				     &ordered->flags))
40152ab28f32SJosef Bacik 			continue;
40162ab28f32SJosef Bacik 
40172ab28f32SJosef Bacik 		list_for_each_entry(sum, &ordered->list, list) {
40182ab28f32SJosef Bacik 			ret = btrfs_csum_file_blocks(trans, log, sum);
4019827463c4SMiao Xie 			if (ret)
40208407f553SFilipe Manana 				break;
40218407f553SFilipe Manana 		}
40222ab28f32SJosef Bacik 	}
40232ab28f32SJosef Bacik 
40248407f553SFilipe Manana 	if (*ordered_io_error || !mod_len || ret || skip_csum)
40252ab28f32SJosef Bacik 		return ret;
40262ab28f32SJosef Bacik 
4027488111aaSFilipe David Borba Manana 	if (em->compress_type) {
4028488111aaSFilipe David Borba Manana 		csum_offset = 0;
40298407f553SFilipe Manana 		csum_len = max(em->block_len, em->orig_block_len);
4030488111aaSFilipe David Borba Manana 	} else {
40312ab28f32SJosef Bacik 		csum_offset = mod_start - em->start;
40322ab28f32SJosef Bacik 		csum_len = mod_len;
4033488111aaSFilipe David Borba Manana 	}
40342ab28f32SJosef Bacik 
403570c8a91cSJosef Bacik 	/* block start is already adjusted for the file extent offset. */
40360b246afaSJeff Mahoney 	ret = btrfs_lookup_csums_range(fs_info->csum_root,
403770c8a91cSJosef Bacik 				       em->block_start + csum_offset,
403870c8a91cSJosef Bacik 				       em->block_start + csum_offset +
403970c8a91cSJosef Bacik 				       csum_len - 1, &ordered_sums, 0);
40405dc562c5SJosef Bacik 	if (ret)
40415dc562c5SJosef Bacik 		return ret;
404270c8a91cSJosef Bacik 
404370c8a91cSJosef Bacik 	while (!list_empty(&ordered_sums)) {
404470c8a91cSJosef Bacik 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
404570c8a91cSJosef Bacik 						   struct btrfs_ordered_sum,
404670c8a91cSJosef Bacik 						   list);
404770c8a91cSJosef Bacik 		if (!ret)
404870c8a91cSJosef Bacik 			ret = btrfs_csum_file_blocks(trans, log, sums);
404970c8a91cSJosef Bacik 		list_del(&sums->list);
405070c8a91cSJosef Bacik 		kfree(sums);
40515dc562c5SJosef Bacik 	}
40525dc562c5SJosef Bacik 
405370c8a91cSJosef Bacik 	return ret;
40545dc562c5SJosef Bacik }
40555dc562c5SJosef Bacik 
40568407f553SFilipe Manana static int log_one_extent(struct btrfs_trans_handle *trans,
40579d122629SNikolay Borisov 			  struct btrfs_inode *inode, struct btrfs_root *root,
40588407f553SFilipe Manana 			  const struct extent_map *em,
40598407f553SFilipe Manana 			  struct btrfs_path *path,
40608407f553SFilipe Manana 			  const struct list_head *logged_list,
40618407f553SFilipe Manana 			  struct btrfs_log_ctx *ctx)
40628407f553SFilipe Manana {
40638407f553SFilipe Manana 	struct btrfs_root *log = root->log_root;
40648407f553SFilipe Manana 	struct btrfs_file_extent_item *fi;
40658407f553SFilipe Manana 	struct extent_buffer *leaf;
40668407f553SFilipe Manana 	struct btrfs_map_token token;
40678407f553SFilipe Manana 	struct btrfs_key key;
40688407f553SFilipe Manana 	u64 extent_offset = em->start - em->orig_start;
40698407f553SFilipe Manana 	u64 block_len;
40708407f553SFilipe Manana 	int ret;
40718407f553SFilipe Manana 	int extent_inserted = 0;
40728407f553SFilipe Manana 	bool ordered_io_err = false;
40738407f553SFilipe Manana 
40749d122629SNikolay Borisov 	ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, logged_list,
40758407f553SFilipe Manana 				   &ordered_io_err);
40768407f553SFilipe Manana 	if (ret)
40778407f553SFilipe Manana 		return ret;
40788407f553SFilipe Manana 
40798407f553SFilipe Manana 	if (ordered_io_err) {
40808407f553SFilipe Manana 		ctx->io_err = -EIO;
40818407f553SFilipe Manana 		return 0;
40828407f553SFilipe Manana 	}
40838407f553SFilipe Manana 
40848407f553SFilipe Manana 	btrfs_init_map_token(&token);
40858407f553SFilipe Manana 
40869d122629SNikolay Borisov 	ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
40878407f553SFilipe Manana 				   em->start + em->len, NULL, 0, 1,
40888407f553SFilipe Manana 				   sizeof(*fi), &extent_inserted);
40898407f553SFilipe Manana 	if (ret)
40908407f553SFilipe Manana 		return ret;
40918407f553SFilipe Manana 
40928407f553SFilipe Manana 	if (!extent_inserted) {
40939d122629SNikolay Borisov 		key.objectid = btrfs_ino(inode);
40948407f553SFilipe Manana 		key.type = BTRFS_EXTENT_DATA_KEY;
40958407f553SFilipe Manana 		key.offset = em->start;
40968407f553SFilipe Manana 
40978407f553SFilipe Manana 		ret = btrfs_insert_empty_item(trans, log, path, &key,
40988407f553SFilipe Manana 					      sizeof(*fi));
40998407f553SFilipe Manana 		if (ret)
41008407f553SFilipe Manana 			return ret;
41018407f553SFilipe Manana 	}
41028407f553SFilipe Manana 	leaf = path->nodes[0];
41038407f553SFilipe Manana 	fi = btrfs_item_ptr(leaf, path->slots[0],
41048407f553SFilipe Manana 			    struct btrfs_file_extent_item);
41058407f553SFilipe Manana 
410650d9aa99SJosef Bacik 	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
41078407f553SFilipe Manana 					       &token);
41088407f553SFilipe Manana 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
41098407f553SFilipe Manana 		btrfs_set_token_file_extent_type(leaf, fi,
41108407f553SFilipe Manana 						 BTRFS_FILE_EXTENT_PREALLOC,
41118407f553SFilipe Manana 						 &token);
41128407f553SFilipe Manana 	else
41138407f553SFilipe Manana 		btrfs_set_token_file_extent_type(leaf, fi,
41148407f553SFilipe Manana 						 BTRFS_FILE_EXTENT_REG,
41158407f553SFilipe Manana 						 &token);
41168407f553SFilipe Manana 
41178407f553SFilipe Manana 	block_len = max(em->block_len, em->orig_block_len);
41188407f553SFilipe Manana 	if (em->compress_type != BTRFS_COMPRESS_NONE) {
41198407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
41208407f553SFilipe Manana 							em->block_start,
41218407f553SFilipe Manana 							&token);
41228407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
41238407f553SFilipe Manana 							   &token);
41248407f553SFilipe Manana 	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
41258407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
41268407f553SFilipe Manana 							em->block_start -
41278407f553SFilipe Manana 							extent_offset, &token);
41288407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
41298407f553SFilipe Manana 							   &token);
41308407f553SFilipe Manana 	} else {
41318407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
41328407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
41338407f553SFilipe Manana 							   &token);
41348407f553SFilipe Manana 	}
41358407f553SFilipe Manana 
41368407f553SFilipe Manana 	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
41378407f553SFilipe Manana 	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
41388407f553SFilipe Manana 	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
41398407f553SFilipe Manana 	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
41408407f553SFilipe Manana 						&token);
41418407f553SFilipe Manana 	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
41428407f553SFilipe Manana 	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
41438407f553SFilipe Manana 	btrfs_mark_buffer_dirty(leaf);
41448407f553SFilipe Manana 
41458407f553SFilipe Manana 	btrfs_release_path(path);
41468407f553SFilipe Manana 
41478407f553SFilipe Manana 	return ret;
41488407f553SFilipe Manana }
41498407f553SFilipe Manana 
41505dc562c5SJosef Bacik static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
41515dc562c5SJosef Bacik 				     struct btrfs_root *root,
41529d122629SNikolay Borisov 				     struct btrfs_inode *inode,
4153827463c4SMiao Xie 				     struct btrfs_path *path,
41548407f553SFilipe Manana 				     struct list_head *logged_list,
4155de0ee0edSFilipe Manana 				     struct btrfs_log_ctx *ctx,
4156de0ee0edSFilipe Manana 				     const u64 start,
4157de0ee0edSFilipe Manana 				     const u64 end)
41585dc562c5SJosef Bacik {
41595dc562c5SJosef Bacik 	struct extent_map *em, *n;
41605dc562c5SJosef Bacik 	struct list_head extents;
41619d122629SNikolay Borisov 	struct extent_map_tree *tree = &inode->extent_tree;
41625dc562c5SJosef Bacik 	u64 test_gen;
41635dc562c5SJosef Bacik 	int ret = 0;
41642ab28f32SJosef Bacik 	int num = 0;
41655dc562c5SJosef Bacik 
41665dc562c5SJosef Bacik 	INIT_LIST_HEAD(&extents);
41675dc562c5SJosef Bacik 
41689d122629SNikolay Borisov 	down_write(&inode->dio_sem);
41695dc562c5SJosef Bacik 	write_lock(&tree->lock);
41705dc562c5SJosef Bacik 	test_gen = root->fs_info->last_trans_committed;
41715dc562c5SJosef Bacik 
41725dc562c5SJosef Bacik 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
41735dc562c5SJosef Bacik 		list_del_init(&em->list);
41742ab28f32SJosef Bacik 
41752ab28f32SJosef Bacik 		/*
41762ab28f32SJosef Bacik 		 * Just an arbitrary number, this can be really CPU intensive
41772ab28f32SJosef Bacik 		 * once we start getting a lot of extents, and really once we
41782ab28f32SJosef Bacik 		 * have a bunch of extents we just want to commit since it will
41792ab28f32SJosef Bacik 		 * be faster.
41802ab28f32SJosef Bacik 		 */
41812ab28f32SJosef Bacik 		if (++num > 32768) {
41822ab28f32SJosef Bacik 			list_del_init(&tree->modified_extents);
41832ab28f32SJosef Bacik 			ret = -EFBIG;
41842ab28f32SJosef Bacik 			goto process;
41852ab28f32SJosef Bacik 		}
41862ab28f32SJosef Bacik 
41875dc562c5SJosef Bacik 		if (em->generation <= test_gen)
41885dc562c5SJosef Bacik 			continue;
4189ff44c6e3SJosef Bacik 		/* Need a ref to keep it from getting evicted from cache */
4190ff44c6e3SJosef Bacik 		atomic_inc(&em->refs);
4191ff44c6e3SJosef Bacik 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
41925dc562c5SJosef Bacik 		list_add_tail(&em->list, &extents);
41932ab28f32SJosef Bacik 		num++;
41945dc562c5SJosef Bacik 	}
41955dc562c5SJosef Bacik 
41965dc562c5SJosef Bacik 	list_sort(NULL, &extents, extent_cmp);
41979d122629SNikolay Borisov 	btrfs_get_logged_extents(inode, logged_list, start, end);
41985f9a8a51SFilipe Manana 	/*
41995f9a8a51SFilipe Manana 	 * Some ordered extents started by fsync might have completed
42005f9a8a51SFilipe Manana 	 * before we could collect them into the list logged_list, which
42015f9a8a51SFilipe Manana 	 * means they're gone, not in our logged_list nor in the inode's
42025f9a8a51SFilipe Manana 	 * ordered tree. We want the application/user space to know an
42035f9a8a51SFilipe Manana 	 * error happened while attempting to persist file data so that
42045f9a8a51SFilipe Manana 	 * it can take proper action. If such error happened, we leave
42055f9a8a51SFilipe Manana 	 * without writing to the log tree and the fsync must report the
42065f9a8a51SFilipe Manana 	 * file data write error and not commit the current transaction.
42075f9a8a51SFilipe Manana 	 */
42089d122629SNikolay Borisov 	ret = filemap_check_errors(inode->vfs_inode.i_mapping);
42095f9a8a51SFilipe Manana 	if (ret)
42105f9a8a51SFilipe Manana 		ctx->io_err = ret;
42112ab28f32SJosef Bacik process:
42125dc562c5SJosef Bacik 	while (!list_empty(&extents)) {
42135dc562c5SJosef Bacik 		em = list_entry(extents.next, struct extent_map, list);
42145dc562c5SJosef Bacik 
42155dc562c5SJosef Bacik 		list_del_init(&em->list);
42165dc562c5SJosef Bacik 
42175dc562c5SJosef Bacik 		/*
42185dc562c5SJosef Bacik 		 * If we had an error we just need to delete everybody from our
42195dc562c5SJosef Bacik 		 * private list.
42205dc562c5SJosef Bacik 		 */
4221ff44c6e3SJosef Bacik 		if (ret) {
4222201a9038SJosef Bacik 			clear_em_logging(tree, em);
4223ff44c6e3SJosef Bacik 			free_extent_map(em);
42245dc562c5SJosef Bacik 			continue;
4225ff44c6e3SJosef Bacik 		}
4226ff44c6e3SJosef Bacik 
4227ff44c6e3SJosef Bacik 		write_unlock(&tree->lock);
42285dc562c5SJosef Bacik 
42298407f553SFilipe Manana 		ret = log_one_extent(trans, inode, root, em, path, logged_list,
42308407f553SFilipe Manana 				     ctx);
4231ff44c6e3SJosef Bacik 		write_lock(&tree->lock);
4232201a9038SJosef Bacik 		clear_em_logging(tree, em);
4233201a9038SJosef Bacik 		free_extent_map(em);
42345dc562c5SJosef Bacik 	}
4235ff44c6e3SJosef Bacik 	WARN_ON(!list_empty(&extents));
4236ff44c6e3SJosef Bacik 	write_unlock(&tree->lock);
42379d122629SNikolay Borisov 	up_write(&inode->dio_sem);
42385dc562c5SJosef Bacik 
42395dc562c5SJosef Bacik 	btrfs_release_path(path);
42405dc562c5SJosef Bacik 	return ret;
42415dc562c5SJosef Bacik }
42425dc562c5SJosef Bacik 
4243481b01c0SNikolay Borisov static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
42441a4bcf47SFilipe Manana 			     struct btrfs_path *path, u64 *size_ret)
42451a4bcf47SFilipe Manana {
42461a4bcf47SFilipe Manana 	struct btrfs_key key;
42471a4bcf47SFilipe Manana 	int ret;
42481a4bcf47SFilipe Manana 
4249481b01c0SNikolay Borisov 	key.objectid = btrfs_ino(inode);
42501a4bcf47SFilipe Manana 	key.type = BTRFS_INODE_ITEM_KEY;
42511a4bcf47SFilipe Manana 	key.offset = 0;
42521a4bcf47SFilipe Manana 
42531a4bcf47SFilipe Manana 	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
42541a4bcf47SFilipe Manana 	if (ret < 0) {
42551a4bcf47SFilipe Manana 		return ret;
42561a4bcf47SFilipe Manana 	} else if (ret > 0) {
42572f2ff0eeSFilipe Manana 		*size_ret = 0;
42581a4bcf47SFilipe Manana 	} else {
42591a4bcf47SFilipe Manana 		struct btrfs_inode_item *item;
42601a4bcf47SFilipe Manana 
42611a4bcf47SFilipe Manana 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
42621a4bcf47SFilipe Manana 				      struct btrfs_inode_item);
42631a4bcf47SFilipe Manana 		*size_ret = btrfs_inode_size(path->nodes[0], item);
42641a4bcf47SFilipe Manana 	}
42651a4bcf47SFilipe Manana 
42661a4bcf47SFilipe Manana 	btrfs_release_path(path);
42671a4bcf47SFilipe Manana 	return 0;
42681a4bcf47SFilipe Manana }
42691a4bcf47SFilipe Manana 
427036283bf7SFilipe Manana /*
427136283bf7SFilipe Manana  * At the moment we always log all xattrs. This is to figure out at log replay
427236283bf7SFilipe Manana  * time which xattrs must have their deletion replayed. If a xattr is missing
427336283bf7SFilipe Manana  * in the log tree and exists in the fs/subvol tree, we delete it. This is
427436283bf7SFilipe Manana  * because if a xattr is deleted, the inode is fsynced and a power failure
427536283bf7SFilipe Manana  * happens, causing the log to be replayed the next time the fs is mounted,
427636283bf7SFilipe Manana  * we want the xattr to not exist anymore (same behaviour as other filesystems
427736283bf7SFilipe Manana  * with a journal, ext3/4, xfs, f2fs, etc).
427836283bf7SFilipe Manana  */
427936283bf7SFilipe Manana static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
428036283bf7SFilipe Manana 				struct btrfs_root *root,
42811a93c36aSNikolay Borisov 				struct btrfs_inode *inode,
428236283bf7SFilipe Manana 				struct btrfs_path *path,
428336283bf7SFilipe Manana 				struct btrfs_path *dst_path)
428436283bf7SFilipe Manana {
428536283bf7SFilipe Manana 	int ret;
428636283bf7SFilipe Manana 	struct btrfs_key key;
42871a93c36aSNikolay Borisov 	const u64 ino = btrfs_ino(inode);
428836283bf7SFilipe Manana 	int ins_nr = 0;
428936283bf7SFilipe Manana 	int start_slot = 0;
429036283bf7SFilipe Manana 
429136283bf7SFilipe Manana 	key.objectid = ino;
429236283bf7SFilipe Manana 	key.type = BTRFS_XATTR_ITEM_KEY;
429336283bf7SFilipe Manana 	key.offset = 0;
429436283bf7SFilipe Manana 
429536283bf7SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
429636283bf7SFilipe Manana 	if (ret < 0)
429736283bf7SFilipe Manana 		return ret;
429836283bf7SFilipe Manana 
429936283bf7SFilipe Manana 	while (true) {
430036283bf7SFilipe Manana 		int slot = path->slots[0];
430136283bf7SFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
430236283bf7SFilipe Manana 		int nritems = btrfs_header_nritems(leaf);
430336283bf7SFilipe Manana 
430436283bf7SFilipe Manana 		if (slot >= nritems) {
430536283bf7SFilipe Manana 			if (ins_nr > 0) {
430636283bf7SFilipe Manana 				u64 last_extent = 0;
430736283bf7SFilipe Manana 
43081a93c36aSNikolay Borisov 				ret = copy_items(trans, inode, dst_path, path,
430936283bf7SFilipe Manana 						 &last_extent, start_slot,
431036283bf7SFilipe Manana 						 ins_nr, 1, 0);
431136283bf7SFilipe Manana 				/* can't be 1, extent items aren't processed */
431236283bf7SFilipe Manana 				ASSERT(ret <= 0);
431336283bf7SFilipe Manana 				if (ret < 0)
431436283bf7SFilipe Manana 					return ret;
431536283bf7SFilipe Manana 				ins_nr = 0;
431636283bf7SFilipe Manana 			}
431736283bf7SFilipe Manana 			ret = btrfs_next_leaf(root, path);
431836283bf7SFilipe Manana 			if (ret < 0)
431936283bf7SFilipe Manana 				return ret;
432036283bf7SFilipe Manana 			else if (ret > 0)
432136283bf7SFilipe Manana 				break;
432236283bf7SFilipe Manana 			continue;
432336283bf7SFilipe Manana 		}
432436283bf7SFilipe Manana 
432536283bf7SFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, slot);
432636283bf7SFilipe Manana 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
432736283bf7SFilipe Manana 			break;
432836283bf7SFilipe Manana 
432936283bf7SFilipe Manana 		if (ins_nr == 0)
433036283bf7SFilipe Manana 			start_slot = slot;
433136283bf7SFilipe Manana 		ins_nr++;
433236283bf7SFilipe Manana 		path->slots[0]++;
433336283bf7SFilipe Manana 		cond_resched();
433436283bf7SFilipe Manana 	}
433536283bf7SFilipe Manana 	if (ins_nr > 0) {
433636283bf7SFilipe Manana 		u64 last_extent = 0;
433736283bf7SFilipe Manana 
43381a93c36aSNikolay Borisov 		ret = copy_items(trans, inode, dst_path, path,
433936283bf7SFilipe Manana 				 &last_extent, start_slot,
434036283bf7SFilipe Manana 				 ins_nr, 1, 0);
434136283bf7SFilipe Manana 		/* can't be 1, extent items aren't processed */
434236283bf7SFilipe Manana 		ASSERT(ret <= 0);
434336283bf7SFilipe Manana 		if (ret < 0)
434436283bf7SFilipe Manana 			return ret;
434536283bf7SFilipe Manana 	}
434636283bf7SFilipe Manana 
434736283bf7SFilipe Manana 	return 0;
434836283bf7SFilipe Manana }
434936283bf7SFilipe Manana 
4350a89ca6f2SFilipe Manana /*
4351a89ca6f2SFilipe Manana  * If the no holes feature is enabled we need to make sure any hole between the
4352a89ca6f2SFilipe Manana  * last extent and the i_size of our inode is explicitly marked in the log. This
4353a89ca6f2SFilipe Manana  * is to make sure that doing something like:
4354a89ca6f2SFilipe Manana  *
4355a89ca6f2SFilipe Manana  *      1) create file with 128Kb of data
4356a89ca6f2SFilipe Manana  *      2) truncate file to 64Kb
4357a89ca6f2SFilipe Manana  *      3) truncate file to 256Kb
4358a89ca6f2SFilipe Manana  *      4) fsync file
4359a89ca6f2SFilipe Manana  *      5) <crash/power failure>
4360a89ca6f2SFilipe Manana  *      6) mount fs and trigger log replay
4361a89ca6f2SFilipe Manana  *
4362a89ca6f2SFilipe Manana  * Will give us a file with a size of 256Kb, the first 64Kb of data match what
4363a89ca6f2SFilipe Manana  * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4364a89ca6f2SFilipe Manana  * file correspond to a hole. The presence of explicit holes in a log tree is
4365a89ca6f2SFilipe Manana  * what guarantees that log replay will remove/adjust file extent items in the
4366a89ca6f2SFilipe Manana  * fs/subvol tree.
4367a89ca6f2SFilipe Manana  *
4368a89ca6f2SFilipe Manana  * Here we do not need to care about holes between extents, that is already done
4369a89ca6f2SFilipe Manana  * by copy_items(). We also only need to do this in the full sync path, where we
4370a89ca6f2SFilipe Manana  * lookup for extents from the fs/subvol tree only. In the fast path case, we
4371a89ca6f2SFilipe Manana  * lookup the list of modified extent maps and if any represents a hole, we
4372a89ca6f2SFilipe Manana  * insert a corresponding extent representing a hole in the log tree.
4373a89ca6f2SFilipe Manana  */
4374a89ca6f2SFilipe Manana static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4375a89ca6f2SFilipe Manana 				   struct btrfs_root *root,
4376a0308dd7SNikolay Borisov 				   struct btrfs_inode *inode,
4377a89ca6f2SFilipe Manana 				   struct btrfs_path *path)
4378a89ca6f2SFilipe Manana {
43790b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
4380a89ca6f2SFilipe Manana 	int ret;
4381a89ca6f2SFilipe Manana 	struct btrfs_key key;
4382a89ca6f2SFilipe Manana 	u64 hole_start;
4383a89ca6f2SFilipe Manana 	u64 hole_size;
4384a89ca6f2SFilipe Manana 	struct extent_buffer *leaf;
4385a89ca6f2SFilipe Manana 	struct btrfs_root *log = root->log_root;
4386a0308dd7SNikolay Borisov 	const u64 ino = btrfs_ino(inode);
4387a0308dd7SNikolay Borisov 	const u64 i_size = i_size_read(&inode->vfs_inode);
4388a89ca6f2SFilipe Manana 
43890b246afaSJeff Mahoney 	if (!btrfs_fs_incompat(fs_info, NO_HOLES))
4390a89ca6f2SFilipe Manana 		return 0;
4391a89ca6f2SFilipe Manana 
4392a89ca6f2SFilipe Manana 	key.objectid = ino;
4393a89ca6f2SFilipe Manana 	key.type = BTRFS_EXTENT_DATA_KEY;
4394a89ca6f2SFilipe Manana 	key.offset = (u64)-1;
4395a89ca6f2SFilipe Manana 
4396a89ca6f2SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4397a89ca6f2SFilipe Manana 	ASSERT(ret != 0);
4398a89ca6f2SFilipe Manana 	if (ret < 0)
4399a89ca6f2SFilipe Manana 		return ret;
4400a89ca6f2SFilipe Manana 
4401a89ca6f2SFilipe Manana 	ASSERT(path->slots[0] > 0);
4402a89ca6f2SFilipe Manana 	path->slots[0]--;
4403a89ca6f2SFilipe Manana 	leaf = path->nodes[0];
4404a89ca6f2SFilipe Manana 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4405a89ca6f2SFilipe Manana 
4406a89ca6f2SFilipe Manana 	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4407a89ca6f2SFilipe Manana 		/* inode does not have any extents */
4408a89ca6f2SFilipe Manana 		hole_start = 0;
4409a89ca6f2SFilipe Manana 		hole_size = i_size;
4410a89ca6f2SFilipe Manana 	} else {
4411a89ca6f2SFilipe Manana 		struct btrfs_file_extent_item *extent;
4412a89ca6f2SFilipe Manana 		u64 len;
4413a89ca6f2SFilipe Manana 
4414a89ca6f2SFilipe Manana 		/*
4415a89ca6f2SFilipe Manana 		 * If there's an extent beyond i_size, an explicit hole was
4416a89ca6f2SFilipe Manana 		 * already inserted by copy_items().
4417a89ca6f2SFilipe Manana 		 */
4418a89ca6f2SFilipe Manana 		if (key.offset >= i_size)
4419a89ca6f2SFilipe Manana 			return 0;
4420a89ca6f2SFilipe Manana 
4421a89ca6f2SFilipe Manana 		extent = btrfs_item_ptr(leaf, path->slots[0],
4422a89ca6f2SFilipe Manana 					struct btrfs_file_extent_item);
4423a89ca6f2SFilipe Manana 
4424a89ca6f2SFilipe Manana 		if (btrfs_file_extent_type(leaf, extent) ==
4425a89ca6f2SFilipe Manana 		    BTRFS_FILE_EXTENT_INLINE) {
4426a89ca6f2SFilipe Manana 			len = btrfs_file_extent_inline_len(leaf,
4427a89ca6f2SFilipe Manana 							   path->slots[0],
4428a89ca6f2SFilipe Manana 							   extent);
4429a89ca6f2SFilipe Manana 			ASSERT(len == i_size);
4430a89ca6f2SFilipe Manana 			return 0;
4431a89ca6f2SFilipe Manana 		}
4432a89ca6f2SFilipe Manana 
4433a89ca6f2SFilipe Manana 		len = btrfs_file_extent_num_bytes(leaf, extent);
4434a89ca6f2SFilipe Manana 		/* Last extent goes beyond i_size, no need to log a hole. */
4435a89ca6f2SFilipe Manana 		if (key.offset + len > i_size)
4436a89ca6f2SFilipe Manana 			return 0;
4437a89ca6f2SFilipe Manana 		hole_start = key.offset + len;
4438a89ca6f2SFilipe Manana 		hole_size = i_size - hole_start;
4439a89ca6f2SFilipe Manana 	}
4440a89ca6f2SFilipe Manana 	btrfs_release_path(path);
4441a89ca6f2SFilipe Manana 
4442a89ca6f2SFilipe Manana 	/* Last extent ends at i_size. */
4443a89ca6f2SFilipe Manana 	if (hole_size == 0)
4444a89ca6f2SFilipe Manana 		return 0;
4445a89ca6f2SFilipe Manana 
44460b246afaSJeff Mahoney 	hole_size = ALIGN(hole_size, fs_info->sectorsize);
4447a89ca6f2SFilipe Manana 	ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4448a89ca6f2SFilipe Manana 				       hole_size, 0, hole_size, 0, 0, 0);
4449a89ca6f2SFilipe Manana 	return ret;
4450a89ca6f2SFilipe Manana }
4451a89ca6f2SFilipe Manana 
445256f23fdbSFilipe Manana /*
445356f23fdbSFilipe Manana  * When we are logging a new inode X, check if it doesn't have a reference that
445456f23fdbSFilipe Manana  * matches the reference from some other inode Y created in a past transaction
445556f23fdbSFilipe Manana  * and that was renamed in the current transaction. If we don't do this, then at
445656f23fdbSFilipe Manana  * log replay time we can lose inode Y (and all its files if it's a directory):
445756f23fdbSFilipe Manana  *
445856f23fdbSFilipe Manana  * mkdir /mnt/x
445956f23fdbSFilipe Manana  * echo "hello world" > /mnt/x/foobar
446056f23fdbSFilipe Manana  * sync
446156f23fdbSFilipe Manana  * mv /mnt/x /mnt/y
446256f23fdbSFilipe Manana  * mkdir /mnt/x                 # or touch /mnt/x
446356f23fdbSFilipe Manana  * xfs_io -c fsync /mnt/x
446456f23fdbSFilipe Manana  * <power fail>
446556f23fdbSFilipe Manana  * mount fs, trigger log replay
446656f23fdbSFilipe Manana  *
446756f23fdbSFilipe Manana  * After the log replay procedure, we would lose the first directory and all its
446856f23fdbSFilipe Manana  * files (file foobar).
446956f23fdbSFilipe Manana  * For the case where inode Y is not a directory we simply end up losing it:
447056f23fdbSFilipe Manana  *
447156f23fdbSFilipe Manana  * echo "123" > /mnt/foo
447256f23fdbSFilipe Manana  * sync
447356f23fdbSFilipe Manana  * mv /mnt/foo /mnt/bar
447456f23fdbSFilipe Manana  * echo "abc" > /mnt/foo
447556f23fdbSFilipe Manana  * xfs_io -c fsync /mnt/foo
447656f23fdbSFilipe Manana  * <power fail>
447756f23fdbSFilipe Manana  *
447856f23fdbSFilipe Manana  * We also need this for cases where a snapshot entry is replaced by some other
447956f23fdbSFilipe Manana  * entry (file or directory) otherwise we end up with an unreplayable log due to
448056f23fdbSFilipe Manana  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
448156f23fdbSFilipe Manana  * if it were a regular entry:
448256f23fdbSFilipe Manana  *
448356f23fdbSFilipe Manana  * mkdir /mnt/x
448456f23fdbSFilipe Manana  * btrfs subvolume snapshot /mnt /mnt/x/snap
448556f23fdbSFilipe Manana  * btrfs subvolume delete /mnt/x/snap
448656f23fdbSFilipe Manana  * rmdir /mnt/x
448756f23fdbSFilipe Manana  * mkdir /mnt/x
448856f23fdbSFilipe Manana  * fsync /mnt/x or fsync some new file inside it
448956f23fdbSFilipe Manana  * <power fail>
449056f23fdbSFilipe Manana  *
449156f23fdbSFilipe Manana  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
449256f23fdbSFilipe Manana  * the same transaction.
449356f23fdbSFilipe Manana  */
449456f23fdbSFilipe Manana static int btrfs_check_ref_name_override(struct extent_buffer *eb,
449556f23fdbSFilipe Manana 					 const int slot,
449656f23fdbSFilipe Manana 					 const struct btrfs_key *key,
44974791c8f1SNikolay Borisov 					 struct btrfs_inode *inode,
449844f714daSFilipe Manana 					 u64 *other_ino)
449956f23fdbSFilipe Manana {
450056f23fdbSFilipe Manana 	int ret;
450156f23fdbSFilipe Manana 	struct btrfs_path *search_path;
450256f23fdbSFilipe Manana 	char *name = NULL;
450356f23fdbSFilipe Manana 	u32 name_len = 0;
450456f23fdbSFilipe Manana 	u32 item_size = btrfs_item_size_nr(eb, slot);
450556f23fdbSFilipe Manana 	u32 cur_offset = 0;
450656f23fdbSFilipe Manana 	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
450756f23fdbSFilipe Manana 
450856f23fdbSFilipe Manana 	search_path = btrfs_alloc_path();
450956f23fdbSFilipe Manana 	if (!search_path)
451056f23fdbSFilipe Manana 		return -ENOMEM;
451156f23fdbSFilipe Manana 	search_path->search_commit_root = 1;
451256f23fdbSFilipe Manana 	search_path->skip_locking = 1;
451356f23fdbSFilipe Manana 
451456f23fdbSFilipe Manana 	while (cur_offset < item_size) {
451556f23fdbSFilipe Manana 		u64 parent;
451656f23fdbSFilipe Manana 		u32 this_name_len;
451756f23fdbSFilipe Manana 		u32 this_len;
451856f23fdbSFilipe Manana 		unsigned long name_ptr;
451956f23fdbSFilipe Manana 		struct btrfs_dir_item *di;
452056f23fdbSFilipe Manana 
452156f23fdbSFilipe Manana 		if (key->type == BTRFS_INODE_REF_KEY) {
452256f23fdbSFilipe Manana 			struct btrfs_inode_ref *iref;
452356f23fdbSFilipe Manana 
452456f23fdbSFilipe Manana 			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
452556f23fdbSFilipe Manana 			parent = key->offset;
452656f23fdbSFilipe Manana 			this_name_len = btrfs_inode_ref_name_len(eb, iref);
452756f23fdbSFilipe Manana 			name_ptr = (unsigned long)(iref + 1);
452856f23fdbSFilipe Manana 			this_len = sizeof(*iref) + this_name_len;
452956f23fdbSFilipe Manana 		} else {
453056f23fdbSFilipe Manana 			struct btrfs_inode_extref *extref;
453156f23fdbSFilipe Manana 
453256f23fdbSFilipe Manana 			extref = (struct btrfs_inode_extref *)(ptr +
453356f23fdbSFilipe Manana 							       cur_offset);
453456f23fdbSFilipe Manana 			parent = btrfs_inode_extref_parent(eb, extref);
453556f23fdbSFilipe Manana 			this_name_len = btrfs_inode_extref_name_len(eb, extref);
453656f23fdbSFilipe Manana 			name_ptr = (unsigned long)&extref->name;
453756f23fdbSFilipe Manana 			this_len = sizeof(*extref) + this_name_len;
453856f23fdbSFilipe Manana 		}
453956f23fdbSFilipe Manana 
454056f23fdbSFilipe Manana 		if (this_name_len > name_len) {
454156f23fdbSFilipe Manana 			char *new_name;
454256f23fdbSFilipe Manana 
454356f23fdbSFilipe Manana 			new_name = krealloc(name, this_name_len, GFP_NOFS);
454456f23fdbSFilipe Manana 			if (!new_name) {
454556f23fdbSFilipe Manana 				ret = -ENOMEM;
454656f23fdbSFilipe Manana 				goto out;
454756f23fdbSFilipe Manana 			}
454856f23fdbSFilipe Manana 			name_len = this_name_len;
454956f23fdbSFilipe Manana 			name = new_name;
455056f23fdbSFilipe Manana 		}
455156f23fdbSFilipe Manana 
455256f23fdbSFilipe Manana 		read_extent_buffer(eb, name, name_ptr, this_name_len);
45534791c8f1SNikolay Borisov 		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
45544791c8f1SNikolay Borisov 				parent, name, this_name_len, 0);
455556f23fdbSFilipe Manana 		if (di && !IS_ERR(di)) {
455644f714daSFilipe Manana 			struct btrfs_key di_key;
455744f714daSFilipe Manana 
455844f714daSFilipe Manana 			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
455944f714daSFilipe Manana 						  di, &di_key);
456044f714daSFilipe Manana 			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
456156f23fdbSFilipe Manana 				ret = 1;
456244f714daSFilipe Manana 				*other_ino = di_key.objectid;
456344f714daSFilipe Manana 			} else {
456444f714daSFilipe Manana 				ret = -EAGAIN;
456544f714daSFilipe Manana 			}
456656f23fdbSFilipe Manana 			goto out;
456756f23fdbSFilipe Manana 		} else if (IS_ERR(di)) {
456856f23fdbSFilipe Manana 			ret = PTR_ERR(di);
456956f23fdbSFilipe Manana 			goto out;
457056f23fdbSFilipe Manana 		}
457156f23fdbSFilipe Manana 		btrfs_release_path(search_path);
457256f23fdbSFilipe Manana 
457356f23fdbSFilipe Manana 		cur_offset += this_len;
457456f23fdbSFilipe Manana 	}
457556f23fdbSFilipe Manana 	ret = 0;
457656f23fdbSFilipe Manana out:
457756f23fdbSFilipe Manana 	btrfs_free_path(search_path);
457856f23fdbSFilipe Manana 	kfree(name);
457956f23fdbSFilipe Manana 	return ret;
458056f23fdbSFilipe Manana }
458156f23fdbSFilipe Manana 
4582e02119d5SChris Mason /* log a single inode in the tree log.
4583e02119d5SChris Mason  * At least one parent directory for this inode must exist in the tree
4584e02119d5SChris Mason  * or be logged already.
4585e02119d5SChris Mason  *
4586e02119d5SChris Mason  * Any items from this inode changed by the current transaction are copied
4587e02119d5SChris Mason  * to the log tree.  An extra reference is taken on any extents in this
4588e02119d5SChris Mason  * file, allowing us to avoid a whole pile of corner cases around logging
4589e02119d5SChris Mason  * blocks that have been removed from the tree.
4590e02119d5SChris Mason  *
4591e02119d5SChris Mason  * See LOG_INODE_ALL and related defines for a description of what inode_only
4592e02119d5SChris Mason  * does.
4593e02119d5SChris Mason  *
4594e02119d5SChris Mason  * This handles both files and directories.
4595e02119d5SChris Mason  */
459612fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4597e02119d5SChris Mason 			   struct btrfs_root *root, struct inode *inode,
459849dae1bcSFilipe Manana 			   int inode_only,
459949dae1bcSFilipe Manana 			   const loff_t start,
46008407f553SFilipe Manana 			   const loff_t end,
46018407f553SFilipe Manana 			   struct btrfs_log_ctx *ctx)
4602e02119d5SChris Mason {
46030b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
4604e02119d5SChris Mason 	struct btrfs_path *path;
4605e02119d5SChris Mason 	struct btrfs_path *dst_path;
4606e02119d5SChris Mason 	struct btrfs_key min_key;
4607e02119d5SChris Mason 	struct btrfs_key max_key;
4608e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
460931ff1cd2SChris Mason 	struct extent_buffer *src = NULL;
4610827463c4SMiao Xie 	LIST_HEAD(logged_list);
461116e7549fSJosef Bacik 	u64 last_extent = 0;
46124a500fd1SYan, Zheng 	int err = 0;
4613e02119d5SChris Mason 	int ret;
46143a5f1d45SChris Mason 	int nritems;
461531ff1cd2SChris Mason 	int ins_start_slot = 0;
461631ff1cd2SChris Mason 	int ins_nr;
46175dc562c5SJosef Bacik 	bool fast_search = false;
46184a0cc7caSNikolay Borisov 	u64 ino = btrfs_ino(BTRFS_I(inode));
461949dae1bcSFilipe Manana 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
46201a4bcf47SFilipe Manana 	u64 logged_isize = 0;
4621e4545de5SFilipe Manana 	bool need_log_inode_item = true;
4622e02119d5SChris Mason 
4623e02119d5SChris Mason 	path = btrfs_alloc_path();
46245df67083STsutomu Itoh 	if (!path)
46255df67083STsutomu Itoh 		return -ENOMEM;
4626e02119d5SChris Mason 	dst_path = btrfs_alloc_path();
46275df67083STsutomu Itoh 	if (!dst_path) {
46285df67083STsutomu Itoh 		btrfs_free_path(path);
46295df67083STsutomu Itoh 		return -ENOMEM;
46305df67083STsutomu Itoh 	}
4631e02119d5SChris Mason 
463233345d01SLi Zefan 	min_key.objectid = ino;
4633e02119d5SChris Mason 	min_key.type = BTRFS_INODE_ITEM_KEY;
4634e02119d5SChris Mason 	min_key.offset = 0;
4635e02119d5SChris Mason 
463633345d01SLi Zefan 	max_key.objectid = ino;
463712fcfd22SChris Mason 
463812fcfd22SChris Mason 
46395dc562c5SJosef Bacik 	/* today the code can only do partial logging of directories */
46405269b67eSMiao Xie 	if (S_ISDIR(inode->i_mode) ||
46415269b67eSMiao Xie 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
46425269b67eSMiao Xie 		       &BTRFS_I(inode)->runtime_flags) &&
4643781feef7SLiu Bo 	     inode_only >= LOG_INODE_EXISTS))
4644e02119d5SChris Mason 		max_key.type = BTRFS_XATTR_ITEM_KEY;
4645e02119d5SChris Mason 	else
4646e02119d5SChris Mason 		max_key.type = (u8)-1;
4647e02119d5SChris Mason 	max_key.offset = (u64)-1;
4648e02119d5SChris Mason 
46492c2c452bSFilipe Manana 	/*
46502c2c452bSFilipe Manana 	 * Only run delayed items if we are a dir or a new file.
46512c2c452bSFilipe Manana 	 * Otherwise commit the delayed inode only, which is needed in
46522c2c452bSFilipe Manana 	 * order for the log replay code to mark inodes for link count
46532c2c452bSFilipe Manana 	 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
46542c2c452bSFilipe Manana 	 */
465594edf4aeSJosef Bacik 	if (S_ISDIR(inode->i_mode) ||
46560b246afaSJeff Mahoney 	    BTRFS_I(inode)->generation > fs_info->last_trans_committed)
46575f4b32e9SNikolay Borisov 		ret = btrfs_commit_inode_delayed_items(trans, BTRFS_I(inode));
46582c2c452bSFilipe Manana 	else
4659aa79021fSNikolay Borisov 		ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
46602c2c452bSFilipe Manana 
466116cdcec7SMiao Xie 	if (ret) {
466216cdcec7SMiao Xie 		btrfs_free_path(path);
466316cdcec7SMiao Xie 		btrfs_free_path(dst_path);
466416cdcec7SMiao Xie 		return ret;
466516cdcec7SMiao Xie 	}
466616cdcec7SMiao Xie 
4667781feef7SLiu Bo 	if (inode_only == LOG_OTHER_INODE) {
4668781feef7SLiu Bo 		inode_only = LOG_INODE_EXISTS;
4669781feef7SLiu Bo 		mutex_lock_nested(&BTRFS_I(inode)->log_mutex,
4670781feef7SLiu Bo 				  SINGLE_DEPTH_NESTING);
4671781feef7SLiu Bo 	} else {
4672e02119d5SChris Mason 		mutex_lock(&BTRFS_I(inode)->log_mutex);
4673781feef7SLiu Bo 	}
4674e02119d5SChris Mason 
46755e33a2bdSFilipe Manana 	/*
4676e02119d5SChris Mason 	 * a brute force approach to making sure we get the most uptodate
4677e02119d5SChris Mason 	 * copies of everything.
4678e02119d5SChris Mason 	 */
4679e02119d5SChris Mason 	if (S_ISDIR(inode->i_mode)) {
4680e02119d5SChris Mason 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
4681e02119d5SChris Mason 
46824f764e51SFilipe Manana 		if (inode_only == LOG_INODE_EXISTS)
46834f764e51SFilipe Manana 			max_key_type = BTRFS_XATTR_ITEM_KEY;
468433345d01SLi Zefan 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4685e02119d5SChris Mason 	} else {
46861a4bcf47SFilipe Manana 		if (inode_only == LOG_INODE_EXISTS) {
46871a4bcf47SFilipe Manana 			/*
46881a4bcf47SFilipe Manana 			 * Make sure the new inode item we write to the log has
46891a4bcf47SFilipe Manana 			 * the same isize as the current one (if it exists).
46901a4bcf47SFilipe Manana 			 * This is necessary to prevent data loss after log
46911a4bcf47SFilipe Manana 			 * replay, and also to prevent doing a wrong expanding
46921a4bcf47SFilipe Manana 			 * truncate - for e.g. create file, write 4K into offset
46931a4bcf47SFilipe Manana 			 * 0, fsync, write 4K into offset 4096, add hard link,
46941a4bcf47SFilipe Manana 			 * fsync some other file (to sync log), power fail - if
46951a4bcf47SFilipe Manana 			 * we use the inode's current i_size, after log replay
46961a4bcf47SFilipe Manana 			 * we get a 8Kb file, with the last 4Kb extent as a hole
46971a4bcf47SFilipe Manana 			 * (zeroes), as if an expanding truncate happened,
46981a4bcf47SFilipe Manana 			 * instead of getting a file of 4Kb only.
46991a4bcf47SFilipe Manana 			 */
4700481b01c0SNikolay Borisov 			err = logged_inode_size(log, BTRFS_I(inode), path,
47011a4bcf47SFilipe Manana 						&logged_isize);
47021a4bcf47SFilipe Manana 			if (err)
47031a4bcf47SFilipe Manana 				goto out_unlock;
47041a4bcf47SFilipe Manana 		}
4705a742994aSFilipe Manana 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
47065dc562c5SJosef Bacik 			     &BTRFS_I(inode)->runtime_flags)) {
4707a742994aSFilipe Manana 			if (inode_only == LOG_INODE_EXISTS) {
47084f764e51SFilipe Manana 				max_key.type = BTRFS_XATTR_ITEM_KEY;
4709a742994aSFilipe Manana 				ret = drop_objectid_items(trans, log, path, ino,
4710a742994aSFilipe Manana 							  max_key.type);
4711a742994aSFilipe Manana 			} else {
4712a742994aSFilipe Manana 				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4713a742994aSFilipe Manana 					  &BTRFS_I(inode)->runtime_flags);
4714e9976151SJosef Bacik 				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4715e9976151SJosef Bacik 					  &BTRFS_I(inode)->runtime_flags);
471628ed1345SChris Mason 				while(1) {
471728ed1345SChris Mason 					ret = btrfs_truncate_inode_items(trans,
471828ed1345SChris Mason 							 log, inode, 0, 0);
471928ed1345SChris Mason 					if (ret != -EAGAIN)
472028ed1345SChris Mason 						break;
472128ed1345SChris Mason 				}
4722a742994aSFilipe Manana 			}
47234f764e51SFilipe Manana 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
47246cfab851SJosef Bacik 					      &BTRFS_I(inode)->runtime_flags) ||
47256cfab851SJosef Bacik 			   inode_only == LOG_INODE_EXISTS) {
47264f764e51SFilipe Manana 			if (inode_only == LOG_INODE_ALL)
4727a95249b3SJosef Bacik 				fast_search = true;
4728a95249b3SJosef Bacik 			max_key.type = BTRFS_XATTR_ITEM_KEY;
4729a95249b3SJosef Bacik 			ret = drop_objectid_items(trans, log, path, ino,
4730a95249b3SJosef Bacik 						  max_key.type);
47315dc562c5SJosef Bacik 		} else {
4732183f37faSLiu Bo 			if (inode_only == LOG_INODE_ALL)
47335dc562c5SJosef Bacik 				fast_search = true;
4734a95249b3SJosef Bacik 			goto log_extents;
4735a95249b3SJosef Bacik 		}
4736a95249b3SJosef Bacik 
4737e02119d5SChris Mason 	}
47384a500fd1SYan, Zheng 	if (ret) {
47394a500fd1SYan, Zheng 		err = ret;
47404a500fd1SYan, Zheng 		goto out_unlock;
47414a500fd1SYan, Zheng 	}
4742e02119d5SChris Mason 
4743e02119d5SChris Mason 	while (1) {
474431ff1cd2SChris Mason 		ins_nr = 0;
47456174d3cbSFilipe David Borba Manana 		ret = btrfs_search_forward(root, &min_key,
4746de78b51aSEric Sandeen 					   path, trans->transid);
4747fb770ae4SLiu Bo 		if (ret < 0) {
4748fb770ae4SLiu Bo 			err = ret;
4749fb770ae4SLiu Bo 			goto out_unlock;
4750fb770ae4SLiu Bo 		}
4751e02119d5SChris Mason 		if (ret != 0)
4752e02119d5SChris Mason 			break;
47533a5f1d45SChris Mason again:
475431ff1cd2SChris Mason 		/* note, ins_nr might be > 0 here, cleanup outside the loop */
475533345d01SLi Zefan 		if (min_key.objectid != ino)
4756e02119d5SChris Mason 			break;
4757e02119d5SChris Mason 		if (min_key.type > max_key.type)
4758e02119d5SChris Mason 			break;
475931ff1cd2SChris Mason 
4760e4545de5SFilipe Manana 		if (min_key.type == BTRFS_INODE_ITEM_KEY)
4761e4545de5SFilipe Manana 			need_log_inode_item = false;
4762e4545de5SFilipe Manana 
476356f23fdbSFilipe Manana 		if ((min_key.type == BTRFS_INODE_REF_KEY ||
476456f23fdbSFilipe Manana 		     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
476556f23fdbSFilipe Manana 		    BTRFS_I(inode)->generation == trans->transid) {
476644f714daSFilipe Manana 			u64 other_ino = 0;
476744f714daSFilipe Manana 
476856f23fdbSFilipe Manana 			ret = btrfs_check_ref_name_override(path->nodes[0],
476956f23fdbSFilipe Manana 							    path->slots[0],
47704791c8f1SNikolay Borisov 							    &min_key, BTRFS_I(inode),
477144f714daSFilipe Manana 							    &other_ino);
477256f23fdbSFilipe Manana 			if (ret < 0) {
477356f23fdbSFilipe Manana 				err = ret;
477456f23fdbSFilipe Manana 				goto out_unlock;
477528a23593SFilipe Manana 			} else if (ret > 0 && ctx &&
47764a0cc7caSNikolay Borisov 				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
477744f714daSFilipe Manana 				struct btrfs_key inode_key;
477844f714daSFilipe Manana 				struct inode *other_inode;
477944f714daSFilipe Manana 
478044f714daSFilipe Manana 				if (ins_nr > 0) {
478144f714daSFilipe Manana 					ins_nr++;
478244f714daSFilipe Manana 				} else {
478344f714daSFilipe Manana 					ins_nr = 1;
478444f714daSFilipe Manana 					ins_start_slot = path->slots[0];
478544f714daSFilipe Manana 				}
478644d70e19SNikolay Borisov 				ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
478744f714daSFilipe Manana 						 &last_extent, ins_start_slot,
478844f714daSFilipe Manana 						 ins_nr, inode_only,
478944f714daSFilipe Manana 						 logged_isize);
479044f714daSFilipe Manana 				if (ret < 0) {
479144f714daSFilipe Manana 					err = ret;
479256f23fdbSFilipe Manana 					goto out_unlock;
479356f23fdbSFilipe Manana 				}
479444f714daSFilipe Manana 				ins_nr = 0;
479544f714daSFilipe Manana 				btrfs_release_path(path);
479644f714daSFilipe Manana 				inode_key.objectid = other_ino;
479744f714daSFilipe Manana 				inode_key.type = BTRFS_INODE_ITEM_KEY;
479844f714daSFilipe Manana 				inode_key.offset = 0;
47990b246afaSJeff Mahoney 				other_inode = btrfs_iget(fs_info->sb,
480044f714daSFilipe Manana 							 &inode_key, root,
480144f714daSFilipe Manana 							 NULL);
480244f714daSFilipe Manana 				/*
480344f714daSFilipe Manana 				 * If the other inode that had a conflicting dir
480444f714daSFilipe Manana 				 * entry was deleted in the current transaction,
480544f714daSFilipe Manana 				 * we don't need to do more work nor fallback to
480644f714daSFilipe Manana 				 * a transaction commit.
480744f714daSFilipe Manana 				 */
480844f714daSFilipe Manana 				if (IS_ERR(other_inode) &&
480944f714daSFilipe Manana 				    PTR_ERR(other_inode) == -ENOENT) {
481044f714daSFilipe Manana 					goto next_key;
481144f714daSFilipe Manana 				} else if (IS_ERR(other_inode)) {
481244f714daSFilipe Manana 					err = PTR_ERR(other_inode);
481344f714daSFilipe Manana 					goto out_unlock;
481444f714daSFilipe Manana 				}
481544f714daSFilipe Manana 				/*
481644f714daSFilipe Manana 				 * We are safe logging the other inode without
481744f714daSFilipe Manana 				 * acquiring its i_mutex as long as we log with
481844f714daSFilipe Manana 				 * the LOG_INODE_EXISTS mode. We're safe against
481944f714daSFilipe Manana 				 * concurrent renames of the other inode as well
482044f714daSFilipe Manana 				 * because during a rename we pin the log and
482144f714daSFilipe Manana 				 * update the log with the new name before we
482244f714daSFilipe Manana 				 * unpin it.
482344f714daSFilipe Manana 				 */
482444f714daSFilipe Manana 				err = btrfs_log_inode(trans, root, other_inode,
4825781feef7SLiu Bo 						      LOG_OTHER_INODE,
482644f714daSFilipe Manana 						      0, LLONG_MAX, ctx);
482744f714daSFilipe Manana 				iput(other_inode);
482844f714daSFilipe Manana 				if (err)
482944f714daSFilipe Manana 					goto out_unlock;
483044f714daSFilipe Manana 				else
483144f714daSFilipe Manana 					goto next_key;
483244f714daSFilipe Manana 			}
483356f23fdbSFilipe Manana 		}
483456f23fdbSFilipe Manana 
483536283bf7SFilipe Manana 		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
483636283bf7SFilipe Manana 		if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
483736283bf7SFilipe Manana 			if (ins_nr == 0)
483836283bf7SFilipe Manana 				goto next_slot;
483944d70e19SNikolay Borisov 			ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
484036283bf7SFilipe Manana 					 &last_extent, ins_start_slot,
484136283bf7SFilipe Manana 					 ins_nr, inode_only, logged_isize);
484236283bf7SFilipe Manana 			if (ret < 0) {
484336283bf7SFilipe Manana 				err = ret;
484436283bf7SFilipe Manana 				goto out_unlock;
484536283bf7SFilipe Manana 			}
484636283bf7SFilipe Manana 			ins_nr = 0;
484736283bf7SFilipe Manana 			if (ret) {
484836283bf7SFilipe Manana 				btrfs_release_path(path);
484936283bf7SFilipe Manana 				continue;
485036283bf7SFilipe Manana 			}
485136283bf7SFilipe Manana 			goto next_slot;
485236283bf7SFilipe Manana 		}
485336283bf7SFilipe Manana 
4854e02119d5SChris Mason 		src = path->nodes[0];
485531ff1cd2SChris Mason 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
485631ff1cd2SChris Mason 			ins_nr++;
485731ff1cd2SChris Mason 			goto next_slot;
485831ff1cd2SChris Mason 		} else if (!ins_nr) {
485931ff1cd2SChris Mason 			ins_start_slot = path->slots[0];
486031ff1cd2SChris Mason 			ins_nr = 1;
486131ff1cd2SChris Mason 			goto next_slot;
4862e02119d5SChris Mason 		}
4863e02119d5SChris Mason 
486444d70e19SNikolay Borisov 		ret = copy_items(trans, BTRFS_I(inode), dst_path, path, &last_extent,
48651a4bcf47SFilipe Manana 				 ins_start_slot, ins_nr, inode_only,
48661a4bcf47SFilipe Manana 				 logged_isize);
486716e7549fSJosef Bacik 		if (ret < 0) {
48684a500fd1SYan, Zheng 			err = ret;
48694a500fd1SYan, Zheng 			goto out_unlock;
4870a71db86eSRasmus Villemoes 		}
4871a71db86eSRasmus Villemoes 		if (ret) {
487216e7549fSJosef Bacik 			ins_nr = 0;
487316e7549fSJosef Bacik 			btrfs_release_path(path);
487416e7549fSJosef Bacik 			continue;
48754a500fd1SYan, Zheng 		}
487631ff1cd2SChris Mason 		ins_nr = 1;
487731ff1cd2SChris Mason 		ins_start_slot = path->slots[0];
487831ff1cd2SChris Mason next_slot:
4879e02119d5SChris Mason 
48803a5f1d45SChris Mason 		nritems = btrfs_header_nritems(path->nodes[0]);
48813a5f1d45SChris Mason 		path->slots[0]++;
48823a5f1d45SChris Mason 		if (path->slots[0] < nritems) {
48833a5f1d45SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
48843a5f1d45SChris Mason 					      path->slots[0]);
48853a5f1d45SChris Mason 			goto again;
48863a5f1d45SChris Mason 		}
488731ff1cd2SChris Mason 		if (ins_nr) {
488844d70e19SNikolay Borisov 			ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
488916e7549fSJosef Bacik 					 &last_extent, ins_start_slot,
48901a4bcf47SFilipe Manana 					 ins_nr, inode_only, logged_isize);
489116e7549fSJosef Bacik 			if (ret < 0) {
48924a500fd1SYan, Zheng 				err = ret;
48934a500fd1SYan, Zheng 				goto out_unlock;
48944a500fd1SYan, Zheng 			}
489516e7549fSJosef Bacik 			ret = 0;
489631ff1cd2SChris Mason 			ins_nr = 0;
489731ff1cd2SChris Mason 		}
4898b3b4aa74SDavid Sterba 		btrfs_release_path(path);
489944f714daSFilipe Manana next_key:
49003d41d702SFilipe David Borba Manana 		if (min_key.offset < (u64)-1) {
4901e02119d5SChris Mason 			min_key.offset++;
49023d41d702SFilipe David Borba Manana 		} else if (min_key.type < max_key.type) {
4903e02119d5SChris Mason 			min_key.type++;
49043d41d702SFilipe David Borba Manana 			min_key.offset = 0;
49053d41d702SFilipe David Borba Manana 		} else {
4906e02119d5SChris Mason 			break;
4907e02119d5SChris Mason 		}
49083d41d702SFilipe David Borba Manana 	}
490931ff1cd2SChris Mason 	if (ins_nr) {
491044d70e19SNikolay Borisov 		ret = copy_items(trans, BTRFS_I(inode), dst_path, path, &last_extent,
49111a4bcf47SFilipe Manana 				 ins_start_slot, ins_nr, inode_only,
49121a4bcf47SFilipe Manana 				 logged_isize);
491316e7549fSJosef Bacik 		if (ret < 0) {
49144a500fd1SYan, Zheng 			err = ret;
49154a500fd1SYan, Zheng 			goto out_unlock;
49164a500fd1SYan, Zheng 		}
491716e7549fSJosef Bacik 		ret = 0;
491831ff1cd2SChris Mason 		ins_nr = 0;
491931ff1cd2SChris Mason 	}
49205dc562c5SJosef Bacik 
492136283bf7SFilipe Manana 	btrfs_release_path(path);
492236283bf7SFilipe Manana 	btrfs_release_path(dst_path);
49231a93c36aSNikolay Borisov 	err = btrfs_log_all_xattrs(trans, root, BTRFS_I(inode), path, dst_path);
492436283bf7SFilipe Manana 	if (err)
492536283bf7SFilipe Manana 		goto out_unlock;
4926a89ca6f2SFilipe Manana 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
4927a89ca6f2SFilipe Manana 		btrfs_release_path(path);
4928a89ca6f2SFilipe Manana 		btrfs_release_path(dst_path);
4929a0308dd7SNikolay Borisov 		err = btrfs_log_trailing_hole(trans, root, BTRFS_I(inode), path);
4930a89ca6f2SFilipe Manana 		if (err)
4931a89ca6f2SFilipe Manana 			goto out_unlock;
4932a89ca6f2SFilipe Manana 	}
4933a95249b3SJosef Bacik log_extents:
4934f3b15ccdSJosef Bacik 	btrfs_release_path(path);
49355dc562c5SJosef Bacik 	btrfs_release_path(dst_path);
4936e4545de5SFilipe Manana 	if (need_log_inode_item) {
4937*6d889a3bSNikolay Borisov 		err = log_inode_item(trans, log, dst_path, BTRFS_I(inode));
4938e4545de5SFilipe Manana 		if (err)
4939e4545de5SFilipe Manana 			goto out_unlock;
4940e4545de5SFilipe Manana 	}
4941f3b15ccdSJosef Bacik 	if (fast_search) {
49429d122629SNikolay Borisov 		ret = btrfs_log_changed_extents(trans, root, BTRFS_I(inode), dst_path,
4943de0ee0edSFilipe Manana 						&logged_list, ctx, start, end);
49445dc562c5SJosef Bacik 		if (ret) {
49455dc562c5SJosef Bacik 			err = ret;
49465dc562c5SJosef Bacik 			goto out_unlock;
49475dc562c5SJosef Bacik 		}
4948d006a048SJosef Bacik 	} else if (inode_only == LOG_INODE_ALL) {
494906d3d22bSLiu Bo 		struct extent_map *em, *n;
495006d3d22bSLiu Bo 
495149dae1bcSFilipe Manana 		write_lock(&em_tree->lock);
495249dae1bcSFilipe Manana 		/*
495349dae1bcSFilipe Manana 		 * We can't just remove every em if we're called for a ranged
495449dae1bcSFilipe Manana 		 * fsync - that is, one that doesn't cover the whole possible
495549dae1bcSFilipe Manana 		 * file range (0 to LLONG_MAX). This is because we can have
495649dae1bcSFilipe Manana 		 * em's that fall outside the range we're logging and therefore
495749dae1bcSFilipe Manana 		 * their ordered operations haven't completed yet
495849dae1bcSFilipe Manana 		 * (btrfs_finish_ordered_io() not invoked yet). This means we
495949dae1bcSFilipe Manana 		 * didn't get their respective file extent item in the fs/subvol
496049dae1bcSFilipe Manana 		 * tree yet, and need to let the next fast fsync (one which
496149dae1bcSFilipe Manana 		 * consults the list of modified extent maps) find the em so
496249dae1bcSFilipe Manana 		 * that it logs a matching file extent item and waits for the
496349dae1bcSFilipe Manana 		 * respective ordered operation to complete (if it's still
496449dae1bcSFilipe Manana 		 * running).
496549dae1bcSFilipe Manana 		 *
496649dae1bcSFilipe Manana 		 * Removing every em outside the range we're logging would make
496749dae1bcSFilipe Manana 		 * the next fast fsync not log their matching file extent items,
496849dae1bcSFilipe Manana 		 * therefore making us lose data after a log replay.
496949dae1bcSFilipe Manana 		 */
497049dae1bcSFilipe Manana 		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
497149dae1bcSFilipe Manana 					 list) {
497249dae1bcSFilipe Manana 			const u64 mod_end = em->mod_start + em->mod_len - 1;
497349dae1bcSFilipe Manana 
497449dae1bcSFilipe Manana 			if (em->mod_start >= start && mod_end <= end)
497506d3d22bSLiu Bo 				list_del_init(&em->list);
497649dae1bcSFilipe Manana 		}
497749dae1bcSFilipe Manana 		write_unlock(&em_tree->lock);
49785dc562c5SJosef Bacik 	}
49795dc562c5SJosef Bacik 
49809623f9a3SChris Mason 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
4981dbf39ea4SNikolay Borisov 		ret = log_directory_changes(trans, root, BTRFS_I(inode), path,
4982dbf39ea4SNikolay Borisov 				dst_path, ctx);
49834a500fd1SYan, Zheng 		if (ret) {
49844a500fd1SYan, Zheng 			err = ret;
49854a500fd1SYan, Zheng 			goto out_unlock;
49864a500fd1SYan, Zheng 		}
4987e02119d5SChris Mason 	}
498849dae1bcSFilipe Manana 
49892f2ff0eeSFilipe Manana 	spin_lock(&BTRFS_I(inode)->lock);
49903a5f1d45SChris Mason 	BTRFS_I(inode)->logged_trans = trans->transid;
4991125c4cf9SFilipe Manana 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
49922f2ff0eeSFilipe Manana 	spin_unlock(&BTRFS_I(inode)->lock);
49934a500fd1SYan, Zheng out_unlock:
4994827463c4SMiao Xie 	if (unlikely(err))
4995827463c4SMiao Xie 		btrfs_put_logged_extents(&logged_list);
4996827463c4SMiao Xie 	else
4997827463c4SMiao Xie 		btrfs_submit_logged_extents(&logged_list, log);
4998e02119d5SChris Mason 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
4999e02119d5SChris Mason 
5000e02119d5SChris Mason 	btrfs_free_path(path);
5001e02119d5SChris Mason 	btrfs_free_path(dst_path);
50024a500fd1SYan, Zheng 	return err;
5003e02119d5SChris Mason }
5004e02119d5SChris Mason 
500512fcfd22SChris Mason /*
50062be63d5cSFilipe Manana  * Check if we must fallback to a transaction commit when logging an inode.
50072be63d5cSFilipe Manana  * This must be called after logging the inode and is used only in the context
50082be63d5cSFilipe Manana  * when fsyncing an inode requires the need to log some other inode - in which
50092be63d5cSFilipe Manana  * case we can't lock the i_mutex of each other inode we need to log as that
50102be63d5cSFilipe Manana  * can lead to deadlocks with concurrent fsync against other inodes (as we can
50112be63d5cSFilipe Manana  * log inodes up or down in the hierarchy) or rename operations for example. So
50122be63d5cSFilipe Manana  * we take the log_mutex of the inode after we have logged it and then check for
50132be63d5cSFilipe Manana  * its last_unlink_trans value - this is safe because any task setting
50142be63d5cSFilipe Manana  * last_unlink_trans must take the log_mutex and it must do this before it does
50152be63d5cSFilipe Manana  * the actual unlink operation, so if we do this check before a concurrent task
50162be63d5cSFilipe Manana  * sets last_unlink_trans it means we've logged a consistent version/state of
50172be63d5cSFilipe Manana  * all the inode items, otherwise we are not sure and must do a transaction
501801327610SNicholas D Steeves  * commit (the concurrent task might have only updated last_unlink_trans before
50192be63d5cSFilipe Manana  * we logged the inode or it might have also done the unlink).
50202be63d5cSFilipe Manana  */
50212be63d5cSFilipe Manana static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5022ab1717b2SNikolay Borisov 					  struct btrfs_inode *inode)
50232be63d5cSFilipe Manana {
5024ab1717b2SNikolay Borisov 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
50252be63d5cSFilipe Manana 	bool ret = false;
50262be63d5cSFilipe Manana 
5027ab1717b2SNikolay Borisov 	mutex_lock(&inode->log_mutex);
5028ab1717b2SNikolay Borisov 	if (inode->last_unlink_trans > fs_info->last_trans_committed) {
50292be63d5cSFilipe Manana 		/*
50302be63d5cSFilipe Manana 		 * Make sure any commits to the log are forced to be full
50312be63d5cSFilipe Manana 		 * commits.
50322be63d5cSFilipe Manana 		 */
50332be63d5cSFilipe Manana 		btrfs_set_log_full_commit(fs_info, trans);
50342be63d5cSFilipe Manana 		ret = true;
50352be63d5cSFilipe Manana 	}
5036ab1717b2SNikolay Borisov 	mutex_unlock(&inode->log_mutex);
50372be63d5cSFilipe Manana 
50382be63d5cSFilipe Manana 	return ret;
50392be63d5cSFilipe Manana }
50402be63d5cSFilipe Manana 
50412be63d5cSFilipe Manana /*
504212fcfd22SChris Mason  * follow the dentry parent pointers up the chain and see if any
504312fcfd22SChris Mason  * of the directories in it require a full commit before they can
504412fcfd22SChris Mason  * be logged.  Returns zero if nothing special needs to be done or 1 if
504512fcfd22SChris Mason  * a full commit is required.
504612fcfd22SChris Mason  */
504712fcfd22SChris Mason static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
504812fcfd22SChris Mason 					       struct inode *inode,
504912fcfd22SChris Mason 					       struct dentry *parent,
505012fcfd22SChris Mason 					       struct super_block *sb,
505112fcfd22SChris Mason 					       u64 last_committed)
5052e02119d5SChris Mason {
505312fcfd22SChris Mason 	int ret = 0;
50546a912213SJosef Bacik 	struct dentry *old_parent = NULL;
5055de2b530bSJosef Bacik 	struct inode *orig_inode = inode;
5056e02119d5SChris Mason 
5057af4176b4SChris Mason 	/*
5058af4176b4SChris Mason 	 * for regular files, if its inode is already on disk, we don't
5059af4176b4SChris Mason 	 * have to worry about the parents at all.  This is because
5060af4176b4SChris Mason 	 * we can use the last_unlink_trans field to record renames
5061af4176b4SChris Mason 	 * and other fun in this file.
5062af4176b4SChris Mason 	 */
5063af4176b4SChris Mason 	if (S_ISREG(inode->i_mode) &&
5064af4176b4SChris Mason 	    BTRFS_I(inode)->generation <= last_committed &&
5065af4176b4SChris Mason 	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
5066af4176b4SChris Mason 			goto out;
5067af4176b4SChris Mason 
506812fcfd22SChris Mason 	if (!S_ISDIR(inode->i_mode)) {
5069fc64005cSAl Viro 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
507012fcfd22SChris Mason 			goto out;
50712b0143b5SDavid Howells 		inode = d_inode(parent);
507212fcfd22SChris Mason 	}
507312fcfd22SChris Mason 
507412fcfd22SChris Mason 	while (1) {
5075de2b530bSJosef Bacik 		/*
5076de2b530bSJosef Bacik 		 * If we are logging a directory then we start with our inode,
507701327610SNicholas D Steeves 		 * not our parent's inode, so we need to skip setting the
5078de2b530bSJosef Bacik 		 * logged_trans so that further down in the log code we don't
5079de2b530bSJosef Bacik 		 * think this inode has already been logged.
5080de2b530bSJosef Bacik 		 */
5081de2b530bSJosef Bacik 		if (inode != orig_inode)
508212fcfd22SChris Mason 			BTRFS_I(inode)->logged_trans = trans->transid;
508312fcfd22SChris Mason 		smp_mb();
508412fcfd22SChris Mason 
5085ab1717b2SNikolay Borisov 		if (btrfs_must_commit_transaction(trans, BTRFS_I(inode))) {
508612fcfd22SChris Mason 			ret = 1;
508712fcfd22SChris Mason 			break;
508812fcfd22SChris Mason 		}
508912fcfd22SChris Mason 
5090fc64005cSAl Viro 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
509112fcfd22SChris Mason 			break;
509212fcfd22SChris Mason 
509344f714daSFilipe Manana 		if (IS_ROOT(parent)) {
509444f714daSFilipe Manana 			inode = d_inode(parent);
5095ab1717b2SNikolay Borisov 			if (btrfs_must_commit_transaction(trans, BTRFS_I(inode)))
509644f714daSFilipe Manana 				ret = 1;
509712fcfd22SChris Mason 			break;
509844f714daSFilipe Manana 		}
509912fcfd22SChris Mason 
51006a912213SJosef Bacik 		parent = dget_parent(parent);
51016a912213SJosef Bacik 		dput(old_parent);
51026a912213SJosef Bacik 		old_parent = parent;
51032b0143b5SDavid Howells 		inode = d_inode(parent);
510412fcfd22SChris Mason 
510512fcfd22SChris Mason 	}
51066a912213SJosef Bacik 	dput(old_parent);
510712fcfd22SChris Mason out:
5108e02119d5SChris Mason 	return ret;
5109e02119d5SChris Mason }
5110e02119d5SChris Mason 
51112f2ff0eeSFilipe Manana struct btrfs_dir_list {
51122f2ff0eeSFilipe Manana 	u64 ino;
51132f2ff0eeSFilipe Manana 	struct list_head list;
51142f2ff0eeSFilipe Manana };
51152f2ff0eeSFilipe Manana 
51162f2ff0eeSFilipe Manana /*
51172f2ff0eeSFilipe Manana  * Log the inodes of the new dentries of a directory. See log_dir_items() for
51182f2ff0eeSFilipe Manana  * details about the why it is needed.
51192f2ff0eeSFilipe Manana  * This is a recursive operation - if an existing dentry corresponds to a
51202f2ff0eeSFilipe Manana  * directory, that directory's new entries are logged too (same behaviour as
51212f2ff0eeSFilipe Manana  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
51222f2ff0eeSFilipe Manana  * the dentries point to we do not lock their i_mutex, otherwise lockdep
51232f2ff0eeSFilipe Manana  * complains about the following circular lock dependency / possible deadlock:
51242f2ff0eeSFilipe Manana  *
51252f2ff0eeSFilipe Manana  *        CPU0                                        CPU1
51262f2ff0eeSFilipe Manana  *        ----                                        ----
51272f2ff0eeSFilipe Manana  * lock(&type->i_mutex_dir_key#3/2);
51282f2ff0eeSFilipe Manana  *                                            lock(sb_internal#2);
51292f2ff0eeSFilipe Manana  *                                            lock(&type->i_mutex_dir_key#3/2);
51302f2ff0eeSFilipe Manana  * lock(&sb->s_type->i_mutex_key#14);
51312f2ff0eeSFilipe Manana  *
51322f2ff0eeSFilipe Manana  * Where sb_internal is the lock (a counter that works as a lock) acquired by
51332f2ff0eeSFilipe Manana  * sb_start_intwrite() in btrfs_start_transaction().
51342f2ff0eeSFilipe Manana  * Not locking i_mutex of the inodes is still safe because:
51352f2ff0eeSFilipe Manana  *
51362f2ff0eeSFilipe Manana  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
51372f2ff0eeSFilipe Manana  *    that while logging the inode new references (names) are added or removed
51382f2ff0eeSFilipe Manana  *    from the inode, leaving the logged inode item with a link count that does
51392f2ff0eeSFilipe Manana  *    not match the number of logged inode reference items. This is fine because
51402f2ff0eeSFilipe Manana  *    at log replay time we compute the real number of links and correct the
51412f2ff0eeSFilipe Manana  *    link count in the inode item (see replay_one_buffer() and
51422f2ff0eeSFilipe Manana  *    link_to_fixup_dir());
51432f2ff0eeSFilipe Manana  *
51442f2ff0eeSFilipe Manana  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
51452f2ff0eeSFilipe Manana  *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
51462f2ff0eeSFilipe Manana  *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
51472f2ff0eeSFilipe Manana  *    has a size that doesn't match the sum of the lengths of all the logged
51482f2ff0eeSFilipe Manana  *    names. This does not result in a problem because if a dir_item key is
51492f2ff0eeSFilipe Manana  *    logged but its matching dir_index key is not logged, at log replay time we
51502f2ff0eeSFilipe Manana  *    don't use it to replay the respective name (see replay_one_name()). On the
51512f2ff0eeSFilipe Manana  *    other hand if only the dir_index key ends up being logged, the respective
51522f2ff0eeSFilipe Manana  *    name is added to the fs/subvol tree with both the dir_item and dir_index
51532f2ff0eeSFilipe Manana  *    keys created (see replay_one_name()).
51542f2ff0eeSFilipe Manana  *    The directory's inode item with a wrong i_size is not a problem as well,
51552f2ff0eeSFilipe Manana  *    since we don't use it at log replay time to set the i_size in the inode
51562f2ff0eeSFilipe Manana  *    item of the fs/subvol tree (see overwrite_item()).
51572f2ff0eeSFilipe Manana  */
51582f2ff0eeSFilipe Manana static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
51592f2ff0eeSFilipe Manana 				struct btrfs_root *root,
516051cc0d32SNikolay Borisov 				struct btrfs_inode *start_inode,
51612f2ff0eeSFilipe Manana 				struct btrfs_log_ctx *ctx)
51622f2ff0eeSFilipe Manana {
51630b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
51642f2ff0eeSFilipe Manana 	struct btrfs_root *log = root->log_root;
51652f2ff0eeSFilipe Manana 	struct btrfs_path *path;
51662f2ff0eeSFilipe Manana 	LIST_HEAD(dir_list);
51672f2ff0eeSFilipe Manana 	struct btrfs_dir_list *dir_elem;
51682f2ff0eeSFilipe Manana 	int ret = 0;
51692f2ff0eeSFilipe Manana 
51702f2ff0eeSFilipe Manana 	path = btrfs_alloc_path();
51712f2ff0eeSFilipe Manana 	if (!path)
51722f2ff0eeSFilipe Manana 		return -ENOMEM;
51732f2ff0eeSFilipe Manana 
51742f2ff0eeSFilipe Manana 	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
51752f2ff0eeSFilipe Manana 	if (!dir_elem) {
51762f2ff0eeSFilipe Manana 		btrfs_free_path(path);
51772f2ff0eeSFilipe Manana 		return -ENOMEM;
51782f2ff0eeSFilipe Manana 	}
517951cc0d32SNikolay Borisov 	dir_elem->ino = btrfs_ino(start_inode);
51802f2ff0eeSFilipe Manana 	list_add_tail(&dir_elem->list, &dir_list);
51812f2ff0eeSFilipe Manana 
51822f2ff0eeSFilipe Manana 	while (!list_empty(&dir_list)) {
51832f2ff0eeSFilipe Manana 		struct extent_buffer *leaf;
51842f2ff0eeSFilipe Manana 		struct btrfs_key min_key;
51852f2ff0eeSFilipe Manana 		int nritems;
51862f2ff0eeSFilipe Manana 		int i;
51872f2ff0eeSFilipe Manana 
51882f2ff0eeSFilipe Manana 		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
51892f2ff0eeSFilipe Manana 					    list);
51902f2ff0eeSFilipe Manana 		if (ret)
51912f2ff0eeSFilipe Manana 			goto next_dir_inode;
51922f2ff0eeSFilipe Manana 
51932f2ff0eeSFilipe Manana 		min_key.objectid = dir_elem->ino;
51942f2ff0eeSFilipe Manana 		min_key.type = BTRFS_DIR_ITEM_KEY;
51952f2ff0eeSFilipe Manana 		min_key.offset = 0;
51962f2ff0eeSFilipe Manana again:
51972f2ff0eeSFilipe Manana 		btrfs_release_path(path);
51982f2ff0eeSFilipe Manana 		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
51992f2ff0eeSFilipe Manana 		if (ret < 0) {
52002f2ff0eeSFilipe Manana 			goto next_dir_inode;
52012f2ff0eeSFilipe Manana 		} else if (ret > 0) {
52022f2ff0eeSFilipe Manana 			ret = 0;
52032f2ff0eeSFilipe Manana 			goto next_dir_inode;
52042f2ff0eeSFilipe Manana 		}
52052f2ff0eeSFilipe Manana 
52062f2ff0eeSFilipe Manana process_leaf:
52072f2ff0eeSFilipe Manana 		leaf = path->nodes[0];
52082f2ff0eeSFilipe Manana 		nritems = btrfs_header_nritems(leaf);
52092f2ff0eeSFilipe Manana 		for (i = path->slots[0]; i < nritems; i++) {
52102f2ff0eeSFilipe Manana 			struct btrfs_dir_item *di;
52112f2ff0eeSFilipe Manana 			struct btrfs_key di_key;
52122f2ff0eeSFilipe Manana 			struct inode *di_inode;
52132f2ff0eeSFilipe Manana 			struct btrfs_dir_list *new_dir_elem;
52142f2ff0eeSFilipe Manana 			int log_mode = LOG_INODE_EXISTS;
52152f2ff0eeSFilipe Manana 			int type;
52162f2ff0eeSFilipe Manana 
52172f2ff0eeSFilipe Manana 			btrfs_item_key_to_cpu(leaf, &min_key, i);
52182f2ff0eeSFilipe Manana 			if (min_key.objectid != dir_elem->ino ||
52192f2ff0eeSFilipe Manana 			    min_key.type != BTRFS_DIR_ITEM_KEY)
52202f2ff0eeSFilipe Manana 				goto next_dir_inode;
52212f2ff0eeSFilipe Manana 
52222f2ff0eeSFilipe Manana 			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
52232f2ff0eeSFilipe Manana 			type = btrfs_dir_type(leaf, di);
52242f2ff0eeSFilipe Manana 			if (btrfs_dir_transid(leaf, di) < trans->transid &&
52252f2ff0eeSFilipe Manana 			    type != BTRFS_FT_DIR)
52262f2ff0eeSFilipe Manana 				continue;
52272f2ff0eeSFilipe Manana 			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
52282f2ff0eeSFilipe Manana 			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
52292f2ff0eeSFilipe Manana 				continue;
52302f2ff0eeSFilipe Manana 
5231ec125cfbSRobbie Ko 			btrfs_release_path(path);
52320b246afaSJeff Mahoney 			di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
52332f2ff0eeSFilipe Manana 			if (IS_ERR(di_inode)) {
52342f2ff0eeSFilipe Manana 				ret = PTR_ERR(di_inode);
52352f2ff0eeSFilipe Manana 				goto next_dir_inode;
52362f2ff0eeSFilipe Manana 			}
52372f2ff0eeSFilipe Manana 
52380f8939b8SNikolay Borisov 			if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
52392f2ff0eeSFilipe Manana 				iput(di_inode);
5240ec125cfbSRobbie Ko 				break;
52412f2ff0eeSFilipe Manana 			}
52422f2ff0eeSFilipe Manana 
52432f2ff0eeSFilipe Manana 			ctx->log_new_dentries = false;
52443f9749f6SFilipe Manana 			if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
52452f2ff0eeSFilipe Manana 				log_mode = LOG_INODE_ALL;
52462f2ff0eeSFilipe Manana 			ret = btrfs_log_inode(trans, root, di_inode,
52472f2ff0eeSFilipe Manana 					      log_mode, 0, LLONG_MAX, ctx);
52482be63d5cSFilipe Manana 			if (!ret &&
5249ab1717b2SNikolay Borisov 			    btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
52502be63d5cSFilipe Manana 				ret = 1;
52512f2ff0eeSFilipe Manana 			iput(di_inode);
52522f2ff0eeSFilipe Manana 			if (ret)
52532f2ff0eeSFilipe Manana 				goto next_dir_inode;
52542f2ff0eeSFilipe Manana 			if (ctx->log_new_dentries) {
52552f2ff0eeSFilipe Manana 				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
52562f2ff0eeSFilipe Manana 						       GFP_NOFS);
52572f2ff0eeSFilipe Manana 				if (!new_dir_elem) {
52582f2ff0eeSFilipe Manana 					ret = -ENOMEM;
52592f2ff0eeSFilipe Manana 					goto next_dir_inode;
52602f2ff0eeSFilipe Manana 				}
52612f2ff0eeSFilipe Manana 				new_dir_elem->ino = di_key.objectid;
52622f2ff0eeSFilipe Manana 				list_add_tail(&new_dir_elem->list, &dir_list);
52632f2ff0eeSFilipe Manana 			}
52642f2ff0eeSFilipe Manana 			break;
52652f2ff0eeSFilipe Manana 		}
52662f2ff0eeSFilipe Manana 		if (i == nritems) {
52672f2ff0eeSFilipe Manana 			ret = btrfs_next_leaf(log, path);
52682f2ff0eeSFilipe Manana 			if (ret < 0) {
52692f2ff0eeSFilipe Manana 				goto next_dir_inode;
52702f2ff0eeSFilipe Manana 			} else if (ret > 0) {
52712f2ff0eeSFilipe Manana 				ret = 0;
52722f2ff0eeSFilipe Manana 				goto next_dir_inode;
52732f2ff0eeSFilipe Manana 			}
52742f2ff0eeSFilipe Manana 			goto process_leaf;
52752f2ff0eeSFilipe Manana 		}
52762f2ff0eeSFilipe Manana 		if (min_key.offset < (u64)-1) {
52772f2ff0eeSFilipe Manana 			min_key.offset++;
52782f2ff0eeSFilipe Manana 			goto again;
52792f2ff0eeSFilipe Manana 		}
52802f2ff0eeSFilipe Manana next_dir_inode:
52812f2ff0eeSFilipe Manana 		list_del(&dir_elem->list);
52822f2ff0eeSFilipe Manana 		kfree(dir_elem);
52832f2ff0eeSFilipe Manana 	}
52842f2ff0eeSFilipe Manana 
52852f2ff0eeSFilipe Manana 	btrfs_free_path(path);
52862f2ff0eeSFilipe Manana 	return ret;
52872f2ff0eeSFilipe Manana }
52882f2ff0eeSFilipe Manana 
528918aa0922SFilipe Manana static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
529018aa0922SFilipe Manana 				 struct inode *inode,
529118aa0922SFilipe Manana 				 struct btrfs_log_ctx *ctx)
529218aa0922SFilipe Manana {
52930b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
529418aa0922SFilipe Manana 	int ret;
529518aa0922SFilipe Manana 	struct btrfs_path *path;
529618aa0922SFilipe Manana 	struct btrfs_key key;
529718aa0922SFilipe Manana 	struct btrfs_root *root = BTRFS_I(inode)->root;
52984a0cc7caSNikolay Borisov 	const u64 ino = btrfs_ino(BTRFS_I(inode));
529918aa0922SFilipe Manana 
530018aa0922SFilipe Manana 	path = btrfs_alloc_path();
530118aa0922SFilipe Manana 	if (!path)
530218aa0922SFilipe Manana 		return -ENOMEM;
530318aa0922SFilipe Manana 	path->skip_locking = 1;
530418aa0922SFilipe Manana 	path->search_commit_root = 1;
530518aa0922SFilipe Manana 
530618aa0922SFilipe Manana 	key.objectid = ino;
530718aa0922SFilipe Manana 	key.type = BTRFS_INODE_REF_KEY;
530818aa0922SFilipe Manana 	key.offset = 0;
530918aa0922SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
531018aa0922SFilipe Manana 	if (ret < 0)
531118aa0922SFilipe Manana 		goto out;
531218aa0922SFilipe Manana 
531318aa0922SFilipe Manana 	while (true) {
531418aa0922SFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
531518aa0922SFilipe Manana 		int slot = path->slots[0];
531618aa0922SFilipe Manana 		u32 cur_offset = 0;
531718aa0922SFilipe Manana 		u32 item_size;
531818aa0922SFilipe Manana 		unsigned long ptr;
531918aa0922SFilipe Manana 
532018aa0922SFilipe Manana 		if (slot >= btrfs_header_nritems(leaf)) {
532118aa0922SFilipe Manana 			ret = btrfs_next_leaf(root, path);
532218aa0922SFilipe Manana 			if (ret < 0)
532318aa0922SFilipe Manana 				goto out;
532418aa0922SFilipe Manana 			else if (ret > 0)
532518aa0922SFilipe Manana 				break;
532618aa0922SFilipe Manana 			continue;
532718aa0922SFilipe Manana 		}
532818aa0922SFilipe Manana 
532918aa0922SFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, slot);
533018aa0922SFilipe Manana 		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
533118aa0922SFilipe Manana 		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
533218aa0922SFilipe Manana 			break;
533318aa0922SFilipe Manana 
533418aa0922SFilipe Manana 		item_size = btrfs_item_size_nr(leaf, slot);
533518aa0922SFilipe Manana 		ptr = btrfs_item_ptr_offset(leaf, slot);
533618aa0922SFilipe Manana 		while (cur_offset < item_size) {
533718aa0922SFilipe Manana 			struct btrfs_key inode_key;
533818aa0922SFilipe Manana 			struct inode *dir_inode;
533918aa0922SFilipe Manana 
534018aa0922SFilipe Manana 			inode_key.type = BTRFS_INODE_ITEM_KEY;
534118aa0922SFilipe Manana 			inode_key.offset = 0;
534218aa0922SFilipe Manana 
534318aa0922SFilipe Manana 			if (key.type == BTRFS_INODE_EXTREF_KEY) {
534418aa0922SFilipe Manana 				struct btrfs_inode_extref *extref;
534518aa0922SFilipe Manana 
534618aa0922SFilipe Manana 				extref = (struct btrfs_inode_extref *)
534718aa0922SFilipe Manana 					(ptr + cur_offset);
534818aa0922SFilipe Manana 				inode_key.objectid = btrfs_inode_extref_parent(
534918aa0922SFilipe Manana 					leaf, extref);
535018aa0922SFilipe Manana 				cur_offset += sizeof(*extref);
535118aa0922SFilipe Manana 				cur_offset += btrfs_inode_extref_name_len(leaf,
535218aa0922SFilipe Manana 					extref);
535318aa0922SFilipe Manana 			} else {
535418aa0922SFilipe Manana 				inode_key.objectid = key.offset;
535518aa0922SFilipe Manana 				cur_offset = item_size;
535618aa0922SFilipe Manana 			}
535718aa0922SFilipe Manana 
53580b246afaSJeff Mahoney 			dir_inode = btrfs_iget(fs_info->sb, &inode_key,
535918aa0922SFilipe Manana 					       root, NULL);
536018aa0922SFilipe Manana 			/* If parent inode was deleted, skip it. */
536118aa0922SFilipe Manana 			if (IS_ERR(dir_inode))
536218aa0922SFilipe Manana 				continue;
536318aa0922SFilipe Manana 
5364657ed1aaSFilipe Manana 			if (ctx)
5365657ed1aaSFilipe Manana 				ctx->log_new_dentries = false;
536618aa0922SFilipe Manana 			ret = btrfs_log_inode(trans, root, dir_inode,
536718aa0922SFilipe Manana 					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
53682be63d5cSFilipe Manana 			if (!ret &&
5369ab1717b2SNikolay Borisov 			    btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
53702be63d5cSFilipe Manana 				ret = 1;
5371657ed1aaSFilipe Manana 			if (!ret && ctx && ctx->log_new_dentries)
5372657ed1aaSFilipe Manana 				ret = log_new_dir_dentries(trans, root,
537351cc0d32SNikolay Borisov 							   BTRFS_I(dir_inode), ctx);
537418aa0922SFilipe Manana 			iput(dir_inode);
537518aa0922SFilipe Manana 			if (ret)
537618aa0922SFilipe Manana 				goto out;
537718aa0922SFilipe Manana 		}
537818aa0922SFilipe Manana 		path->slots[0]++;
537918aa0922SFilipe Manana 	}
538018aa0922SFilipe Manana 	ret = 0;
538118aa0922SFilipe Manana out:
538218aa0922SFilipe Manana 	btrfs_free_path(path);
538318aa0922SFilipe Manana 	return ret;
538418aa0922SFilipe Manana }
538518aa0922SFilipe Manana 
5386e02119d5SChris Mason /*
5387e02119d5SChris Mason  * helper function around btrfs_log_inode to make sure newly created
5388e02119d5SChris Mason  * parent directories also end up in the log.  A minimal inode and backref
5389e02119d5SChris Mason  * only logging is done of any parent directories that are older than
5390e02119d5SChris Mason  * the last committed transaction
5391e02119d5SChris Mason  */
539248a3b636SEric Sandeen static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
539312fcfd22SChris Mason 			    	  struct btrfs_root *root, struct inode *inode,
539449dae1bcSFilipe Manana 				  struct dentry *parent,
539549dae1bcSFilipe Manana 				  const loff_t start,
539649dae1bcSFilipe Manana 				  const loff_t end,
539749dae1bcSFilipe Manana 				  int exists_only,
53988b050d35SMiao Xie 				  struct btrfs_log_ctx *ctx)
5399e02119d5SChris Mason {
54000b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
540112fcfd22SChris Mason 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
5402e02119d5SChris Mason 	struct super_block *sb;
54036a912213SJosef Bacik 	struct dentry *old_parent = NULL;
540412fcfd22SChris Mason 	int ret = 0;
54050b246afaSJeff Mahoney 	u64 last_committed = fs_info->last_trans_committed;
54062f2ff0eeSFilipe Manana 	bool log_dentries = false;
54072f2ff0eeSFilipe Manana 	struct inode *orig_inode = inode;
540812fcfd22SChris Mason 
540912fcfd22SChris Mason 	sb = inode->i_sb;
541012fcfd22SChris Mason 
54110b246afaSJeff Mahoney 	if (btrfs_test_opt(fs_info, NOTREELOG)) {
54123a5e1404SSage Weil 		ret = 1;
54133a5e1404SSage Weil 		goto end_no_trans;
54143a5e1404SSage Weil 	}
54153a5e1404SSage Weil 
5416995946ddSMiao Xie 	/*
5417995946ddSMiao Xie 	 * The prev transaction commit doesn't complete, we need do
5418995946ddSMiao Xie 	 * full commit by ourselves.
5419995946ddSMiao Xie 	 */
54200b246afaSJeff Mahoney 	if (fs_info->last_trans_log_full_commit >
54210b246afaSJeff Mahoney 	    fs_info->last_trans_committed) {
542212fcfd22SChris Mason 		ret = 1;
542312fcfd22SChris Mason 		goto end_no_trans;
542412fcfd22SChris Mason 	}
542512fcfd22SChris Mason 
542676dda93cSYan, Zheng 	if (root != BTRFS_I(inode)->root ||
542776dda93cSYan, Zheng 	    btrfs_root_refs(&root->root_item) == 0) {
542876dda93cSYan, Zheng 		ret = 1;
542976dda93cSYan, Zheng 		goto end_no_trans;
543076dda93cSYan, Zheng 	}
543176dda93cSYan, Zheng 
543212fcfd22SChris Mason 	ret = check_parent_dirs_for_sync(trans, inode, parent,
543312fcfd22SChris Mason 					 sb, last_committed);
543412fcfd22SChris Mason 	if (ret)
543512fcfd22SChris Mason 		goto end_no_trans;
5436e02119d5SChris Mason 
54370f8939b8SNikolay Borisov 	if (btrfs_inode_in_log(BTRFS_I(inode), trans->transid)) {
5438257c62e1SChris Mason 		ret = BTRFS_NO_LOG_SYNC;
5439257c62e1SChris Mason 		goto end_no_trans;
5440257c62e1SChris Mason 	}
5441257c62e1SChris Mason 
54428b050d35SMiao Xie 	ret = start_log_trans(trans, root, ctx);
54434a500fd1SYan, Zheng 	if (ret)
5444e87ac136SMiao Xie 		goto end_no_trans;
544512fcfd22SChris Mason 
54468407f553SFilipe Manana 	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
54474a500fd1SYan, Zheng 	if (ret)
54484a500fd1SYan, Zheng 		goto end_trans;
5449e02119d5SChris Mason 
5450af4176b4SChris Mason 	/*
5451af4176b4SChris Mason 	 * for regular files, if its inode is already on disk, we don't
5452af4176b4SChris Mason 	 * have to worry about the parents at all.  This is because
5453af4176b4SChris Mason 	 * we can use the last_unlink_trans field to record renames
5454af4176b4SChris Mason 	 * and other fun in this file.
5455af4176b4SChris Mason 	 */
5456af4176b4SChris Mason 	if (S_ISREG(inode->i_mode) &&
5457af4176b4SChris Mason 	    BTRFS_I(inode)->generation <= last_committed &&
54584a500fd1SYan, Zheng 	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
54594a500fd1SYan, Zheng 		ret = 0;
54604a500fd1SYan, Zheng 		goto end_trans;
54614a500fd1SYan, Zheng 	}
5462af4176b4SChris Mason 
54632f2ff0eeSFilipe Manana 	if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
54642f2ff0eeSFilipe Manana 		log_dentries = true;
54652f2ff0eeSFilipe Manana 
546618aa0922SFilipe Manana 	/*
546701327610SNicholas D Steeves 	 * On unlink we must make sure all our current and old parent directory
546818aa0922SFilipe Manana 	 * inodes are fully logged. This is to prevent leaving dangling
546918aa0922SFilipe Manana 	 * directory index entries in directories that were our parents but are
547018aa0922SFilipe Manana 	 * not anymore. Not doing this results in old parent directory being
547118aa0922SFilipe Manana 	 * impossible to delete after log replay (rmdir will always fail with
547218aa0922SFilipe Manana 	 * error -ENOTEMPTY).
547318aa0922SFilipe Manana 	 *
547418aa0922SFilipe Manana 	 * Example 1:
547518aa0922SFilipe Manana 	 *
547618aa0922SFilipe Manana 	 * mkdir testdir
547718aa0922SFilipe Manana 	 * touch testdir/foo
547818aa0922SFilipe Manana 	 * ln testdir/foo testdir/bar
547918aa0922SFilipe Manana 	 * sync
548018aa0922SFilipe Manana 	 * unlink testdir/bar
548118aa0922SFilipe Manana 	 * xfs_io -c fsync testdir/foo
548218aa0922SFilipe Manana 	 * <power failure>
548318aa0922SFilipe Manana 	 * mount fs, triggers log replay
548418aa0922SFilipe Manana 	 *
548518aa0922SFilipe Manana 	 * If we don't log the parent directory (testdir), after log replay the
548618aa0922SFilipe Manana 	 * directory still has an entry pointing to the file inode using the bar
548718aa0922SFilipe Manana 	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
548818aa0922SFilipe Manana 	 * the file inode has a link count of 1.
548918aa0922SFilipe Manana 	 *
549018aa0922SFilipe Manana 	 * Example 2:
549118aa0922SFilipe Manana 	 *
549218aa0922SFilipe Manana 	 * mkdir testdir
549318aa0922SFilipe Manana 	 * touch foo
549418aa0922SFilipe Manana 	 * ln foo testdir/foo2
549518aa0922SFilipe Manana 	 * ln foo testdir/foo3
549618aa0922SFilipe Manana 	 * sync
549718aa0922SFilipe Manana 	 * unlink testdir/foo3
549818aa0922SFilipe Manana 	 * xfs_io -c fsync foo
549918aa0922SFilipe Manana 	 * <power failure>
550018aa0922SFilipe Manana 	 * mount fs, triggers log replay
550118aa0922SFilipe Manana 	 *
550218aa0922SFilipe Manana 	 * Similar as the first example, after log replay the parent directory
550318aa0922SFilipe Manana 	 * testdir still has an entry pointing to the inode file with name foo3
550418aa0922SFilipe Manana 	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
550518aa0922SFilipe Manana 	 * and has a link count of 2.
550618aa0922SFilipe Manana 	 */
550718aa0922SFilipe Manana 	if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
550818aa0922SFilipe Manana 		ret = btrfs_log_all_parents(trans, orig_inode, ctx);
550918aa0922SFilipe Manana 		if (ret)
551018aa0922SFilipe Manana 			goto end_trans;
551118aa0922SFilipe Manana 	}
551218aa0922SFilipe Manana 
551312fcfd22SChris Mason 	while (1) {
5514fc64005cSAl Viro 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5515e02119d5SChris Mason 			break;
5516e02119d5SChris Mason 
55172b0143b5SDavid Howells 		inode = d_inode(parent);
551876dda93cSYan, Zheng 		if (root != BTRFS_I(inode)->root)
551976dda93cSYan, Zheng 			break;
552076dda93cSYan, Zheng 
552118aa0922SFilipe Manana 		if (BTRFS_I(inode)->generation > last_committed) {
552218aa0922SFilipe Manana 			ret = btrfs_log_inode(trans, root, inode,
552318aa0922SFilipe Manana 					      LOG_INODE_EXISTS,
55248407f553SFilipe Manana 					      0, LLONG_MAX, ctx);
55254a500fd1SYan, Zheng 			if (ret)
55264a500fd1SYan, Zheng 				goto end_trans;
5527e02119d5SChris Mason 		}
552876dda93cSYan, Zheng 		if (IS_ROOT(parent))
552912fcfd22SChris Mason 			break;
553012fcfd22SChris Mason 
55316a912213SJosef Bacik 		parent = dget_parent(parent);
55326a912213SJosef Bacik 		dput(old_parent);
55336a912213SJosef Bacik 		old_parent = parent;
553412fcfd22SChris Mason 	}
55352f2ff0eeSFilipe Manana 	if (log_dentries)
553651cc0d32SNikolay Borisov 		ret = log_new_dir_dentries(trans, root, BTRFS_I(orig_inode), ctx);
55372f2ff0eeSFilipe Manana 	else
553812fcfd22SChris Mason 		ret = 0;
55394a500fd1SYan, Zheng end_trans:
55406a912213SJosef Bacik 	dput(old_parent);
55414a500fd1SYan, Zheng 	if (ret < 0) {
55420b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
55434a500fd1SYan, Zheng 		ret = 1;
55444a500fd1SYan, Zheng 	}
55458b050d35SMiao Xie 
55468b050d35SMiao Xie 	if (ret)
55478b050d35SMiao Xie 		btrfs_remove_log_ctx(root, ctx);
554812fcfd22SChris Mason 	btrfs_end_log_trans(root);
554912fcfd22SChris Mason end_no_trans:
555012fcfd22SChris Mason 	return ret;
5551e02119d5SChris Mason }
5552e02119d5SChris Mason 
5553e02119d5SChris Mason /*
5554e02119d5SChris Mason  * it is not safe to log dentry if the chunk root has added new
5555e02119d5SChris Mason  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
5556e02119d5SChris Mason  * If this returns 1, you must commit the transaction to safely get your
5557e02119d5SChris Mason  * data on disk.
5558e02119d5SChris Mason  */
5559e02119d5SChris Mason int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
55608b050d35SMiao Xie 			  struct btrfs_root *root, struct dentry *dentry,
556149dae1bcSFilipe Manana 			  const loff_t start,
556249dae1bcSFilipe Manana 			  const loff_t end,
55638b050d35SMiao Xie 			  struct btrfs_log_ctx *ctx)
5564e02119d5SChris Mason {
55656a912213SJosef Bacik 	struct dentry *parent = dget_parent(dentry);
55666a912213SJosef Bacik 	int ret;
55676a912213SJosef Bacik 
55682b0143b5SDavid Howells 	ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent,
556949dae1bcSFilipe Manana 				     start, end, 0, ctx);
55706a912213SJosef Bacik 	dput(parent);
55716a912213SJosef Bacik 
55726a912213SJosef Bacik 	return ret;
5573e02119d5SChris Mason }
5574e02119d5SChris Mason 
5575e02119d5SChris Mason /*
5576e02119d5SChris Mason  * should be called during mount to recover any replay any log trees
5577e02119d5SChris Mason  * from the FS
5578e02119d5SChris Mason  */
5579e02119d5SChris Mason int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5580e02119d5SChris Mason {
5581e02119d5SChris Mason 	int ret;
5582e02119d5SChris Mason 	struct btrfs_path *path;
5583e02119d5SChris Mason 	struct btrfs_trans_handle *trans;
5584e02119d5SChris Mason 	struct btrfs_key key;
5585e02119d5SChris Mason 	struct btrfs_key found_key;
5586e02119d5SChris Mason 	struct btrfs_key tmp_key;
5587e02119d5SChris Mason 	struct btrfs_root *log;
5588e02119d5SChris Mason 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
5589e02119d5SChris Mason 	struct walk_control wc = {
5590e02119d5SChris Mason 		.process_func = process_one_buffer,
5591e02119d5SChris Mason 		.stage = 0,
5592e02119d5SChris Mason 	};
5593e02119d5SChris Mason 
5594e02119d5SChris Mason 	path = btrfs_alloc_path();
5595db5b493aSTsutomu Itoh 	if (!path)
5596db5b493aSTsutomu Itoh 		return -ENOMEM;
5597db5b493aSTsutomu Itoh 
5598afcdd129SJosef Bacik 	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5599e02119d5SChris Mason 
56004a500fd1SYan, Zheng 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
560179787eaaSJeff Mahoney 	if (IS_ERR(trans)) {
560279787eaaSJeff Mahoney 		ret = PTR_ERR(trans);
560379787eaaSJeff Mahoney 		goto error;
560479787eaaSJeff Mahoney 	}
5605e02119d5SChris Mason 
5606e02119d5SChris Mason 	wc.trans = trans;
5607e02119d5SChris Mason 	wc.pin = 1;
5608e02119d5SChris Mason 
5609db5b493aSTsutomu Itoh 	ret = walk_log_tree(trans, log_root_tree, &wc);
561079787eaaSJeff Mahoney 	if (ret) {
56115d163e0eSJeff Mahoney 		btrfs_handle_fs_error(fs_info, ret,
56125d163e0eSJeff Mahoney 			"Failed to pin buffers while recovering log root tree.");
561379787eaaSJeff Mahoney 		goto error;
561479787eaaSJeff Mahoney 	}
5615e02119d5SChris Mason 
5616e02119d5SChris Mason again:
5617e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
5618e02119d5SChris Mason 	key.offset = (u64)-1;
5619962a298fSDavid Sterba 	key.type = BTRFS_ROOT_ITEM_KEY;
5620e02119d5SChris Mason 
5621e02119d5SChris Mason 	while (1) {
5622e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
562379787eaaSJeff Mahoney 
562479787eaaSJeff Mahoney 		if (ret < 0) {
562534d97007SAnand Jain 			btrfs_handle_fs_error(fs_info, ret,
562679787eaaSJeff Mahoney 				    "Couldn't find tree log root.");
562779787eaaSJeff Mahoney 			goto error;
562879787eaaSJeff Mahoney 		}
5629e02119d5SChris Mason 		if (ret > 0) {
5630e02119d5SChris Mason 			if (path->slots[0] == 0)
5631e02119d5SChris Mason 				break;
5632e02119d5SChris Mason 			path->slots[0]--;
5633e02119d5SChris Mason 		}
5634e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
5635e02119d5SChris Mason 				      path->slots[0]);
5636b3b4aa74SDavid Sterba 		btrfs_release_path(path);
5637e02119d5SChris Mason 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
5638e02119d5SChris Mason 			break;
5639e02119d5SChris Mason 
5640cb517eabSMiao Xie 		log = btrfs_read_fs_root(log_root_tree, &found_key);
564179787eaaSJeff Mahoney 		if (IS_ERR(log)) {
564279787eaaSJeff Mahoney 			ret = PTR_ERR(log);
564334d97007SAnand Jain 			btrfs_handle_fs_error(fs_info, ret,
564479787eaaSJeff Mahoney 				    "Couldn't read tree log root.");
564579787eaaSJeff Mahoney 			goto error;
564679787eaaSJeff Mahoney 		}
5647e02119d5SChris Mason 
5648e02119d5SChris Mason 		tmp_key.objectid = found_key.offset;
5649e02119d5SChris Mason 		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
5650e02119d5SChris Mason 		tmp_key.offset = (u64)-1;
5651e02119d5SChris Mason 
5652e02119d5SChris Mason 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
565379787eaaSJeff Mahoney 		if (IS_ERR(wc.replay_dest)) {
565479787eaaSJeff Mahoney 			ret = PTR_ERR(wc.replay_dest);
5655b50c6e25SJosef Bacik 			free_extent_buffer(log->node);
5656b50c6e25SJosef Bacik 			free_extent_buffer(log->commit_root);
5657b50c6e25SJosef Bacik 			kfree(log);
56585d163e0eSJeff Mahoney 			btrfs_handle_fs_error(fs_info, ret,
56595d163e0eSJeff Mahoney 				"Couldn't read target root for tree log recovery.");
566079787eaaSJeff Mahoney 			goto error;
566179787eaaSJeff Mahoney 		}
5662e02119d5SChris Mason 
566307d400a6SYan Zheng 		wc.replay_dest->log_root = log;
56645d4f98a2SYan Zheng 		btrfs_record_root_in_trans(trans, wc.replay_dest);
5665e02119d5SChris Mason 		ret = walk_log_tree(trans, log, &wc);
5666e02119d5SChris Mason 
5667b50c6e25SJosef Bacik 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5668e02119d5SChris Mason 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
5669e02119d5SChris Mason 						      path);
5670e02119d5SChris Mason 		}
5671e02119d5SChris Mason 
5672e02119d5SChris Mason 		key.offset = found_key.offset - 1;
567307d400a6SYan Zheng 		wc.replay_dest->log_root = NULL;
5674e02119d5SChris Mason 		free_extent_buffer(log->node);
5675b263c2c8SChris Mason 		free_extent_buffer(log->commit_root);
5676e02119d5SChris Mason 		kfree(log);
5677e02119d5SChris Mason 
5678b50c6e25SJosef Bacik 		if (ret)
5679b50c6e25SJosef Bacik 			goto error;
5680b50c6e25SJosef Bacik 
5681e02119d5SChris Mason 		if (found_key.offset == 0)
5682e02119d5SChris Mason 			break;
5683e02119d5SChris Mason 	}
5684b3b4aa74SDavid Sterba 	btrfs_release_path(path);
5685e02119d5SChris Mason 
5686e02119d5SChris Mason 	/* step one is to pin it all, step two is to replay just inodes */
5687e02119d5SChris Mason 	if (wc.pin) {
5688e02119d5SChris Mason 		wc.pin = 0;
5689e02119d5SChris Mason 		wc.process_func = replay_one_buffer;
5690e02119d5SChris Mason 		wc.stage = LOG_WALK_REPLAY_INODES;
5691e02119d5SChris Mason 		goto again;
5692e02119d5SChris Mason 	}
5693e02119d5SChris Mason 	/* step three is to replay everything */
5694e02119d5SChris Mason 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
5695e02119d5SChris Mason 		wc.stage++;
5696e02119d5SChris Mason 		goto again;
5697e02119d5SChris Mason 	}
5698e02119d5SChris Mason 
5699e02119d5SChris Mason 	btrfs_free_path(path);
5700e02119d5SChris Mason 
5701abefa55aSJosef Bacik 	/* step 4: commit the transaction, which also unpins the blocks */
57023a45bb20SJeff Mahoney 	ret = btrfs_commit_transaction(trans);
5703abefa55aSJosef Bacik 	if (ret)
5704abefa55aSJosef Bacik 		return ret;
5705abefa55aSJosef Bacik 
5706e02119d5SChris Mason 	free_extent_buffer(log_root_tree->node);
5707e02119d5SChris Mason 	log_root_tree->log_root = NULL;
5708afcdd129SJosef Bacik 	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5709e02119d5SChris Mason 	kfree(log_root_tree);
571079787eaaSJeff Mahoney 
5711abefa55aSJosef Bacik 	return 0;
571279787eaaSJeff Mahoney error:
5713b50c6e25SJosef Bacik 	if (wc.trans)
57143a45bb20SJeff Mahoney 		btrfs_end_transaction(wc.trans);
571579787eaaSJeff Mahoney 	btrfs_free_path(path);
571679787eaaSJeff Mahoney 	return ret;
5717e02119d5SChris Mason }
571812fcfd22SChris Mason 
571912fcfd22SChris Mason /*
572012fcfd22SChris Mason  * there are some corner cases where we want to force a full
572112fcfd22SChris Mason  * commit instead of allowing a directory to be logged.
572212fcfd22SChris Mason  *
572312fcfd22SChris Mason  * They revolve around files there were unlinked from the directory, and
572412fcfd22SChris Mason  * this function updates the parent directory so that a full commit is
572512fcfd22SChris Mason  * properly done if it is fsync'd later after the unlinks are done.
57262be63d5cSFilipe Manana  *
57272be63d5cSFilipe Manana  * Must be called before the unlink operations (updates to the subvolume tree,
57282be63d5cSFilipe Manana  * inodes, etc) are done.
572912fcfd22SChris Mason  */
573012fcfd22SChris Mason void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
57314176bdbfSNikolay Borisov 			     struct btrfs_inode *dir, struct btrfs_inode *inode,
573212fcfd22SChris Mason 			     int for_rename)
573312fcfd22SChris Mason {
573412fcfd22SChris Mason 	/*
5735af4176b4SChris Mason 	 * when we're logging a file, if it hasn't been renamed
5736af4176b4SChris Mason 	 * or unlinked, and its inode is fully committed on disk,
5737af4176b4SChris Mason 	 * we don't have to worry about walking up the directory chain
5738af4176b4SChris Mason 	 * to log its parents.
5739af4176b4SChris Mason 	 *
5740af4176b4SChris Mason 	 * So, we use the last_unlink_trans field to put this transid
5741af4176b4SChris Mason 	 * into the file.  When the file is logged we check it and
5742af4176b4SChris Mason 	 * don't log the parents if the file is fully on disk.
5743af4176b4SChris Mason 	 */
57444176bdbfSNikolay Borisov 	mutex_lock(&inode->log_mutex);
57454176bdbfSNikolay Borisov 	inode->last_unlink_trans = trans->transid;
57464176bdbfSNikolay Borisov 	mutex_unlock(&inode->log_mutex);
5747af4176b4SChris Mason 
5748af4176b4SChris Mason 	/*
574912fcfd22SChris Mason 	 * if this directory was already logged any new
575012fcfd22SChris Mason 	 * names for this file/dir will get recorded
575112fcfd22SChris Mason 	 */
575212fcfd22SChris Mason 	smp_mb();
57534176bdbfSNikolay Borisov 	if (dir->logged_trans == trans->transid)
575412fcfd22SChris Mason 		return;
575512fcfd22SChris Mason 
575612fcfd22SChris Mason 	/*
575712fcfd22SChris Mason 	 * if the inode we're about to unlink was logged,
575812fcfd22SChris Mason 	 * the log will be properly updated for any new names
575912fcfd22SChris Mason 	 */
57604176bdbfSNikolay Borisov 	if (inode->logged_trans == trans->transid)
576112fcfd22SChris Mason 		return;
576212fcfd22SChris Mason 
576312fcfd22SChris Mason 	/*
576412fcfd22SChris Mason 	 * when renaming files across directories, if the directory
576512fcfd22SChris Mason 	 * there we're unlinking from gets fsync'd later on, there's
576612fcfd22SChris Mason 	 * no way to find the destination directory later and fsync it
576712fcfd22SChris Mason 	 * properly.  So, we have to be conservative and force commits
576812fcfd22SChris Mason 	 * so the new name gets discovered.
576912fcfd22SChris Mason 	 */
577012fcfd22SChris Mason 	if (for_rename)
577112fcfd22SChris Mason 		goto record;
577212fcfd22SChris Mason 
577312fcfd22SChris Mason 	/* we can safely do the unlink without any special recording */
577412fcfd22SChris Mason 	return;
577512fcfd22SChris Mason 
577612fcfd22SChris Mason record:
57774176bdbfSNikolay Borisov 	mutex_lock(&dir->log_mutex);
57784176bdbfSNikolay Borisov 	dir->last_unlink_trans = trans->transid;
57794176bdbfSNikolay Borisov 	mutex_unlock(&dir->log_mutex);
578012fcfd22SChris Mason }
578112fcfd22SChris Mason 
578212fcfd22SChris Mason /*
57831ec9a1aeSFilipe Manana  * Make sure that if someone attempts to fsync the parent directory of a deleted
57841ec9a1aeSFilipe Manana  * snapshot, it ends up triggering a transaction commit. This is to guarantee
57851ec9a1aeSFilipe Manana  * that after replaying the log tree of the parent directory's root we will not
57861ec9a1aeSFilipe Manana  * see the snapshot anymore and at log replay time we will not see any log tree
57871ec9a1aeSFilipe Manana  * corresponding to the deleted snapshot's root, which could lead to replaying
57881ec9a1aeSFilipe Manana  * it after replaying the log tree of the parent directory (which would replay
57891ec9a1aeSFilipe Manana  * the snapshot delete operation).
57902be63d5cSFilipe Manana  *
57912be63d5cSFilipe Manana  * Must be called before the actual snapshot destroy operation (updates to the
57922be63d5cSFilipe Manana  * parent root and tree of tree roots trees, etc) are done.
57931ec9a1aeSFilipe Manana  */
57941ec9a1aeSFilipe Manana void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
579543663557SNikolay Borisov 				   struct btrfs_inode *dir)
57961ec9a1aeSFilipe Manana {
579743663557SNikolay Borisov 	mutex_lock(&dir->log_mutex);
579843663557SNikolay Borisov 	dir->last_unlink_trans = trans->transid;
579943663557SNikolay Borisov 	mutex_unlock(&dir->log_mutex);
58001ec9a1aeSFilipe Manana }
58011ec9a1aeSFilipe Manana 
58021ec9a1aeSFilipe Manana /*
580312fcfd22SChris Mason  * Call this after adding a new name for a file and it will properly
580412fcfd22SChris Mason  * update the log to reflect the new name.
580512fcfd22SChris Mason  *
580612fcfd22SChris Mason  * It will return zero if all goes well, and it will return 1 if a
580712fcfd22SChris Mason  * full transaction commit is required.
580812fcfd22SChris Mason  */
580912fcfd22SChris Mason int btrfs_log_new_name(struct btrfs_trans_handle *trans,
58109ca5fbfbSNikolay Borisov 			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
581112fcfd22SChris Mason 			struct dentry *parent)
581212fcfd22SChris Mason {
58139ca5fbfbSNikolay Borisov 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
58149ca5fbfbSNikolay Borisov 	struct btrfs_root * root = inode->root;
581512fcfd22SChris Mason 
581612fcfd22SChris Mason 	/*
5817af4176b4SChris Mason 	 * this will force the logging code to walk the dentry chain
5818af4176b4SChris Mason 	 * up for the file
5819af4176b4SChris Mason 	 */
58209ca5fbfbSNikolay Borisov 	if (S_ISREG(inode->vfs_inode.i_mode))
58219ca5fbfbSNikolay Borisov 		inode->last_unlink_trans = trans->transid;
5822af4176b4SChris Mason 
5823af4176b4SChris Mason 	/*
582412fcfd22SChris Mason 	 * if this inode hasn't been logged and directory we're renaming it
582512fcfd22SChris Mason 	 * from hasn't been logged, we don't need to log it
582612fcfd22SChris Mason 	 */
58279ca5fbfbSNikolay Borisov 	if (inode->logged_trans <= fs_info->last_trans_committed &&
58289ca5fbfbSNikolay Borisov 	    (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
582912fcfd22SChris Mason 		return 0;
583012fcfd22SChris Mason 
58319ca5fbfbSNikolay Borisov 	return btrfs_log_inode_parent(trans, root, &inode->vfs_inode, parent, 0,
583249dae1bcSFilipe Manana 				      LLONG_MAX, 1, NULL);
583312fcfd22SChris Mason }
583412fcfd22SChris Mason 
5835