xref: /openbmc/linux/fs/btrfs/tree-log.c (revision f96d44743a44e3332f75d23d2075bb8270900e1d)
1c1d7c514SDavid Sterba // SPDX-License-Identifier: GPL-2.0
2e02119d5SChris Mason /*
3e02119d5SChris Mason  * Copyright (C) 2008 Oracle.  All rights reserved.
4e02119d5SChris Mason  */
5e02119d5SChris Mason 
6e02119d5SChris Mason #include <linux/sched.h>
75a0e3ad6STejun Heo #include <linux/slab.h>
8c6adc9ccSMiao Xie #include <linux/blkdev.h>
95dc562c5SJosef Bacik #include <linux/list_sort.h>
10c7f88c4eSJeff Layton #include <linux/iversion.h>
11602cbe91SDavid Sterba #include "misc.h"
129678c543SNikolay Borisov #include "ctree.h"
13995946ddSMiao Xie #include "tree-log.h"
14e02119d5SChris Mason #include "disk-io.h"
15e02119d5SChris Mason #include "locking.h"
16e02119d5SChris Mason #include "print-tree.h"
17f186373fSMark Fasheh #include "backref.h"
18ebb8765bSAnand Jain #include "compression.h"
19df2c95f3SQu Wenruo #include "qgroup.h"
206787bb9fSNikolay Borisov #include "block-group.h"
216787bb9fSNikolay Borisov #include "space-info.h"
22d3575156SNaohiro Aota #include "zoned.h"
23e02119d5SChris Mason 
/*
 * Magic values for the inode_only argument of btrfs_log_inode().
 */
enum {
	/* Log everything. */
	LOG_INODE_ALL = 0,
	/* Log just enough to recreate the inode during log replay. */
	LOG_INODE_EXISTS = 1,
	/*
	 * NOTE(review): the two values below are not described here;
	 * presumably they are the counterparts of the two modes above for an
	 * inode other than the one being fsynced - confirm against
	 * btrfs_log_inode().
	 */
	LOG_OTHER_INODE = 2,
	LOG_OTHER_INODE_ALL = 3,
};
36e02119d5SChris Mason 
37e02119d5SChris Mason /*
3812fcfd22SChris Mason  * directory trouble cases
3912fcfd22SChris Mason  *
4012fcfd22SChris Mason  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
4112fcfd22SChris Mason  * log, we must force a full commit before doing an fsync of the directory
4212fcfd22SChris Mason  * where the unlink was done.
4312fcfd22SChris Mason  * ---> record transid of last unlink/rename per directory
4412fcfd22SChris Mason  *
4512fcfd22SChris Mason  * mkdir foo/some_dir
4612fcfd22SChris Mason  * normal commit
4712fcfd22SChris Mason  * rename foo/some_dir foo2/some_dir
4812fcfd22SChris Mason  * mkdir foo/some_dir
4912fcfd22SChris Mason  * fsync foo/some_dir/some_file
5012fcfd22SChris Mason  *
5112fcfd22SChris Mason  * The fsync above will unlink the original some_dir without recording
5212fcfd22SChris Mason  * it in its new location (foo2).  After a crash, some_dir will be gone
5312fcfd22SChris Mason  * unless the fsync of some_file forces a full commit
5412fcfd22SChris Mason  *
5512fcfd22SChris Mason  * 2) we must log any new names for any file or dir that is in the fsync
5612fcfd22SChris Mason  * log. ---> check inode while renaming/linking.
5712fcfd22SChris Mason  *
5812fcfd22SChris Mason  * 2a) we must log any new names for any file or dir during rename
5912fcfd22SChris Mason  * when the directory they are being removed from was logged.
6012fcfd22SChris Mason  * ---> check inode and old parent dir during rename
6112fcfd22SChris Mason  *
6212fcfd22SChris Mason  *  2a is actually the more important variant.  Without the extra logging
6312fcfd22SChris Mason  *  a crash might unlink the old name without recreating the new one
6412fcfd22SChris Mason  *
6512fcfd22SChris Mason  * 3) after a crash, we must go through any directories with a link count
6612fcfd22SChris Mason  * of zero and redo the rm -rf
6712fcfd22SChris Mason  *
6812fcfd22SChris Mason  * mkdir f1/foo
6912fcfd22SChris Mason  * normal commit
7012fcfd22SChris Mason  * rm -rf f1/foo
7112fcfd22SChris Mason  * fsync(f1)
7212fcfd22SChris Mason  *
7312fcfd22SChris Mason  * The directory foo was fully removed from the FS, but fsync was never
7412fcfd22SChris Mason  * called on foo, only on its parent dir (f1).  After a crash the rm -rf must
7512fcfd22SChris Mason  * be replayed.  This must be able to recurse down the entire
7612fcfd22SChris Mason  * directory tree.  The inode link count fixup code takes care of the
7712fcfd22SChris Mason  * ugly details.
7812fcfd22SChris Mason  */
7912fcfd22SChris Mason 
/*
 * Stages of the log tree walk, in the order they are run.
 *
 * The first stage (0) only pins down the blocks we find.  The second stage
 * (1) makes sure that all the inodes found in the log are created in the
 * subvolume.  The last stage deals with directories, links, extents and all
 * the other fun semantics.
 */
enum {
	/* Only pin down the blocks found in the log. */
	LOG_WALK_PIN_ONLY = 0,
	/* Create in the subvolume every inode found in the log. */
	LOG_WALK_REPLAY_INODES = 1,
	/*
	 * NOTE(review): not described here; presumably replays directory
	 * index items ahead of the final stage - confirm against the replay
	 * code.
	 */
	LOG_WALK_REPLAY_DIR_INDEX = 2,
	/* Replay everything else: directories, links, extents, ... */
	LOG_WALK_REPLAY_ALL = 3,
};
95e02119d5SChris Mason 
/*
 * Forward declarations for file-local helpers that are referenced before
 * their definitions.
 */

/* Logs @inode into the log tree of @root; @inode_only is a LOG_INODE_* value. */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
/* NOTE(review): presumably records @objectid for the link count fixup pass. */
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
/* NOTE(review): presumably replays deletions for directory @dirid - confirm. */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
/* Used by the zoned paths below to wait on the log commit @transid of @root. */
static void wait_log_commit(struct btrfs_root *root, int transid);
109e02119d5SChris Mason 
110e02119d5SChris Mason /*
111e02119d5SChris Mason  * tree logging is a special write ahead log used to make sure that
112e02119d5SChris Mason  * fsyncs and O_SYNCs can happen without doing full tree commits.
113e02119d5SChris Mason  *
114e02119d5SChris Mason  * Full tree commits are expensive because they require commonly
115e02119d5SChris Mason  * modified blocks to be recowed, creating many dirty pages in the
116e02119d5SChris Mason  * extent tree and a 4x-6x higher write load than ext3.
117e02119d5SChris Mason  *
118e02119d5SChris Mason  * Instead of doing a tree commit on every fsync, we use the
119e02119d5SChris Mason  * key ranges and transaction ids to find items for a given file or directory
120e02119d5SChris Mason  * that have changed in this transaction.  Those items are copied into
121e02119d5SChris Mason  * a special tree (one per subvolume root), that tree is written to disk
122e02119d5SChris Mason  * and then the fsync is considered complete.
123e02119d5SChris Mason  *
124e02119d5SChris Mason  * After a crash, items are copied out of the log-tree back into the
125e02119d5SChris Mason  * subvolume tree.  Any file data extents found are recorded in the extent
126e02119d5SChris Mason  * allocation tree, and the log-tree freed.
127e02119d5SChris Mason  *
128e02119d5SChris Mason  * The log tree is read three times: once to pin down all the extents it is
129e02119d5SChris Mason  * using in RAM, once to create all the inodes logged in the tree,
130e02119d5SChris Mason  * and once to do all the other items.
131e02119d5SChris Mason  */
132e02119d5SChris Mason 
133e02119d5SChris Mason /*
134e02119d5SChris Mason  * start a sub transaction and setup the log tree
135e02119d5SChris Mason  * this increments the log tree writer count to make the people
136e02119d5SChris Mason  * syncing the tree wait for us to finish
137e02119d5SChris Mason  */
138e02119d5SChris Mason static int start_log_trans(struct btrfs_trans_handle *trans,
1398b050d35SMiao Xie 			   struct btrfs_root *root,
1408b050d35SMiao Xie 			   struct btrfs_log_ctx *ctx)
141e02119d5SChris Mason {
1420b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
14347876f7cSFilipe Manana 	struct btrfs_root *tree_root = fs_info->tree_root;
144fa1a0f42SNaohiro Aota 	const bool zoned = btrfs_is_zoned(fs_info);
14534eb2a52SZhaolei 	int ret = 0;
146fa1a0f42SNaohiro Aota 	bool created = false;
1477237f183SYan Zheng 
14847876f7cSFilipe Manana 	/*
14947876f7cSFilipe Manana 	 * First check if the log root tree was already created. If not, create
15047876f7cSFilipe Manana 	 * it before locking the root's log_mutex, just to keep lockdep happy.
15147876f7cSFilipe Manana 	 */
15247876f7cSFilipe Manana 	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
15347876f7cSFilipe Manana 		mutex_lock(&tree_root->log_mutex);
15447876f7cSFilipe Manana 		if (!fs_info->log_root_tree) {
15547876f7cSFilipe Manana 			ret = btrfs_init_log_root_tree(trans, fs_info);
156fa1a0f42SNaohiro Aota 			if (!ret) {
15747876f7cSFilipe Manana 				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
158fa1a0f42SNaohiro Aota 				created = true;
159fa1a0f42SNaohiro Aota 			}
16047876f7cSFilipe Manana 		}
16147876f7cSFilipe Manana 		mutex_unlock(&tree_root->log_mutex);
16247876f7cSFilipe Manana 		if (ret)
16347876f7cSFilipe Manana 			return ret;
16447876f7cSFilipe Manana 	}
16547876f7cSFilipe Manana 
1667237f183SYan Zheng 	mutex_lock(&root->log_mutex);
16734eb2a52SZhaolei 
168fa1a0f42SNaohiro Aota again:
1697237f183SYan Zheng 	if (root->log_root) {
170fa1a0f42SNaohiro Aota 		int index = (root->log_transid + 1) % 2;
171fa1a0f42SNaohiro Aota 
1724884b8e8SDavid Sterba 		if (btrfs_need_log_full_commit(trans)) {
17350471a38SMiao Xie 			ret = -EAGAIN;
17450471a38SMiao Xie 			goto out;
17550471a38SMiao Xie 		}
17634eb2a52SZhaolei 
177fa1a0f42SNaohiro Aota 		if (zoned && atomic_read(&root->log_commit[index])) {
178fa1a0f42SNaohiro Aota 			wait_log_commit(root, root->log_transid - 1);
179fa1a0f42SNaohiro Aota 			goto again;
180fa1a0f42SNaohiro Aota 		}
181fa1a0f42SNaohiro Aota 
182ff782e0aSJosef Bacik 		if (!root->log_start_pid) {
18327cdeb70SMiao Xie 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
18434eb2a52SZhaolei 			root->log_start_pid = current->pid;
185ff782e0aSJosef Bacik 		} else if (root->log_start_pid != current->pid) {
18627cdeb70SMiao Xie 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
187ff782e0aSJosef Bacik 		}
18834eb2a52SZhaolei 	} else {
189fa1a0f42SNaohiro Aota 		/*
190fa1a0f42SNaohiro Aota 		 * This means fs_info->log_root_tree was already created
191fa1a0f42SNaohiro Aota 		 * for some other FS trees. Do the full commit not to mix
192fa1a0f42SNaohiro Aota 		 * nodes from multiple log transactions to do sequential
193fa1a0f42SNaohiro Aota 		 * writing.
194fa1a0f42SNaohiro Aota 		 */
195fa1a0f42SNaohiro Aota 		if (zoned && !created) {
196fa1a0f42SNaohiro Aota 			ret = -EAGAIN;
197fa1a0f42SNaohiro Aota 			goto out;
198fa1a0f42SNaohiro Aota 		}
199fa1a0f42SNaohiro Aota 
200e02119d5SChris Mason 		ret = btrfs_add_log_tree(trans, root);
2014a500fd1SYan, Zheng 		if (ret)
202e87ac136SMiao Xie 			goto out;
20334eb2a52SZhaolei 
204e7a79811SFilipe Manana 		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
20527cdeb70SMiao Xie 		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
206e87ac136SMiao Xie 		root->log_start_pid = current->pid;
20734eb2a52SZhaolei 	}
20834eb2a52SZhaolei 
2097237f183SYan Zheng 	atomic_inc(&root->log_writers);
21075b463d2SFilipe Manana 	if (ctx && !ctx->logging_new_name) {
21134eb2a52SZhaolei 		int index = root->log_transid % 2;
2128b050d35SMiao Xie 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
213d1433debSMiao Xie 		ctx->log_transid = root->log_transid;
2148b050d35SMiao Xie 	}
21534eb2a52SZhaolei 
216e87ac136SMiao Xie out:
2177237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
218e87ac136SMiao Xie 	return ret;
219e02119d5SChris Mason }
220e02119d5SChris Mason 
221e02119d5SChris Mason /*
222e02119d5SChris Mason  * returns 0 if there was a log transaction running and we were able
223e02119d5SChris Mason  * to join, or returns -ENOENT if there were not transactions
224e02119d5SChris Mason  * in progress
225e02119d5SChris Mason  */
226e02119d5SChris Mason static int join_running_log_trans(struct btrfs_root *root)
227e02119d5SChris Mason {
228fa1a0f42SNaohiro Aota 	const bool zoned = btrfs_is_zoned(root->fs_info);
229e02119d5SChris Mason 	int ret = -ENOENT;
230e02119d5SChris Mason 
231e7a79811SFilipe Manana 	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
232e7a79811SFilipe Manana 		return ret;
233e7a79811SFilipe Manana 
2347237f183SYan Zheng 	mutex_lock(&root->log_mutex);
235fa1a0f42SNaohiro Aota again:
236e02119d5SChris Mason 	if (root->log_root) {
237fa1a0f42SNaohiro Aota 		int index = (root->log_transid + 1) % 2;
238fa1a0f42SNaohiro Aota 
239e02119d5SChris Mason 		ret = 0;
240fa1a0f42SNaohiro Aota 		if (zoned && atomic_read(&root->log_commit[index])) {
241fa1a0f42SNaohiro Aota 			wait_log_commit(root, root->log_transid - 1);
242fa1a0f42SNaohiro Aota 			goto again;
243fa1a0f42SNaohiro Aota 		}
2447237f183SYan Zheng 		atomic_inc(&root->log_writers);
245e02119d5SChris Mason 	}
2467237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
247e02119d5SChris Mason 	return ret;
248e02119d5SChris Mason }
249e02119d5SChris Mason 
250e02119d5SChris Mason /*
25112fcfd22SChris Mason  * This either makes the current running log transaction wait
25212fcfd22SChris Mason  * until you call btrfs_end_log_trans() or it makes any future
25312fcfd22SChris Mason  * log transactions wait until you call btrfs_end_log_trans()
25412fcfd22SChris Mason  */
25545128b08Szhong jiang void btrfs_pin_log_trans(struct btrfs_root *root)
25612fcfd22SChris Mason {
25712fcfd22SChris Mason 	atomic_inc(&root->log_writers);
25812fcfd22SChris Mason }
25912fcfd22SChris Mason 
26012fcfd22SChris Mason /*
261e02119d5SChris Mason  * indicate we're done making changes to the log tree
262e02119d5SChris Mason  * and wake up anyone waiting to do a sync
263e02119d5SChris Mason  */
264143bede5SJeff Mahoney void btrfs_end_log_trans(struct btrfs_root *root)
265e02119d5SChris Mason {
2667237f183SYan Zheng 	if (atomic_dec_and_test(&root->log_writers)) {
267093258e6SDavid Sterba 		/* atomic_dec_and_test implies a barrier */
268093258e6SDavid Sterba 		cond_wake_up_nomb(&root->log_writer_wait);
2697237f183SYan Zheng 	}
270e02119d5SChris Mason }
271e02119d5SChris Mason 
272247462a5SDavid Sterba static int btrfs_write_tree_block(struct extent_buffer *buf)
273247462a5SDavid Sterba {
274247462a5SDavid Sterba 	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
275247462a5SDavid Sterba 					buf->start + buf->len - 1);
276247462a5SDavid Sterba }
277247462a5SDavid Sterba 
278247462a5SDavid Sterba static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
279247462a5SDavid Sterba {
280247462a5SDavid Sterba 	filemap_fdatawait_range(buf->pages[0]->mapping,
281247462a5SDavid Sterba 			        buf->start, buf->start + buf->len - 1);
282247462a5SDavid Sterba }
283e02119d5SChris Mason 
/*
 * The walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part of the log
 * tree processing we are currently doing.  The other fields are state used
 * for that specific part.
 */
struct walk_control {
	/*
	 * Should we free the extent on disk when done?  This is used at
	 * transaction commit time while freeing a log tree.
	 */
	int free;

	/*
	 * Should we write out the extent buffer?  This is used while
	 * flushing the log tree to disk during a sync.
	 */
	int write;

	/*
	 * Should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync.
	 */
	int wait;

	/*
	 * Pin only walk: we record which extents on disk belong to the log
	 * trees.
	 */
	int pin;

	/* What stage of the replay code we're currently in (LOG_WALK_*). */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* The root we are currently replaying. */
	struct btrfs_root *replay_dest;

	/* The trans handle for the current replay. */
	struct btrfs_trans_handle *trans;

	/*
	 * The function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it.
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};
335e02119d5SChris Mason 
336e02119d5SChris Mason /*
337e02119d5SChris Mason  * process_func used to pin down extents, write them or wait on them
338e02119d5SChris Mason  */
339e02119d5SChris Mason static int process_one_buffer(struct btrfs_root *log,
340e02119d5SChris Mason 			      struct extent_buffer *eb,
341581c1760SQu Wenruo 			      struct walk_control *wc, u64 gen, int level)
342e02119d5SChris Mason {
3430b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
344b50c6e25SJosef Bacik 	int ret = 0;
345b50c6e25SJosef Bacik 
3468c2a1a30SJosef Bacik 	/*
3478c2a1a30SJosef Bacik 	 * If this fs is mixed then we need to be able to process the leaves to
3488c2a1a30SJosef Bacik 	 * pin down any logged extents, so we have to read the block.
3498c2a1a30SJosef Bacik 	 */
3500b246afaSJeff Mahoney 	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
351581c1760SQu Wenruo 		ret = btrfs_read_buffer(eb, gen, level, NULL);
3528c2a1a30SJosef Bacik 		if (ret)
3538c2a1a30SJosef Bacik 			return ret;
3548c2a1a30SJosef Bacik 	}
3558c2a1a30SJosef Bacik 
35604018de5SJosef Bacik 	if (wc->pin)
3579fce5704SNikolay Borisov 		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
3582ff7e61eSJeff Mahoney 						      eb->len);
359e02119d5SChris Mason 
360b50c6e25SJosef Bacik 	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
3618c2a1a30SJosef Bacik 		if (wc->pin && btrfs_header_level(eb) == 0)
362bcdc428cSDavid Sterba 			ret = btrfs_exclude_logged_extents(eb);
363e02119d5SChris Mason 		if (wc->write)
364e02119d5SChris Mason 			btrfs_write_tree_block(eb);
365e02119d5SChris Mason 		if (wc->wait)
366e02119d5SChris Mason 			btrfs_wait_tree_block_writeback(eb);
367e02119d5SChris Mason 	}
368b50c6e25SJosef Bacik 	return ret;
369e02119d5SChris Mason }
370e02119d5SChris Mason 
371e02119d5SChris Mason /*
372e02119d5SChris Mason  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
373e02119d5SChris Mason  * to the src data we are copying out.
374e02119d5SChris Mason  *
375e02119d5SChris Mason  * root is the tree we are copying into, and path is a scratch
376e02119d5SChris Mason  * path for use in this function (it should be released on entry and
377e02119d5SChris Mason  * will be released on exit).
378e02119d5SChris Mason  *
379e02119d5SChris Mason  * If the key is already in the destination tree the existing item is
380e02119d5SChris Mason  * overwritten.  If the existing item isn't big enough, it is extended.
381e02119d5SChris Mason  * If it is too large, it is truncated.
382e02119d5SChris Mason  *
383e02119d5SChris Mason  * If the key isn't in the destination yet, a new item is inserted.
384e02119d5SChris Mason  */
385e02119d5SChris Mason static noinline int overwrite_item(struct btrfs_trans_handle *trans,
386e02119d5SChris Mason 				   struct btrfs_root *root,
387e02119d5SChris Mason 				   struct btrfs_path *path,
388e02119d5SChris Mason 				   struct extent_buffer *eb, int slot,
389e02119d5SChris Mason 				   struct btrfs_key *key)
390e02119d5SChris Mason {
391e02119d5SChris Mason 	int ret;
392e02119d5SChris Mason 	u32 item_size;
393e02119d5SChris Mason 	u64 saved_i_size = 0;
394e02119d5SChris Mason 	int save_old_i_size = 0;
395e02119d5SChris Mason 	unsigned long src_ptr;
396e02119d5SChris Mason 	unsigned long dst_ptr;
397e02119d5SChris Mason 	int overwrite_root = 0;
3984bc4bee4SJosef Bacik 	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
399e02119d5SChris Mason 
400e02119d5SChris Mason 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
401e02119d5SChris Mason 		overwrite_root = 1;
402e02119d5SChris Mason 
403e02119d5SChris Mason 	item_size = btrfs_item_size_nr(eb, slot);
404e02119d5SChris Mason 	src_ptr = btrfs_item_ptr_offset(eb, slot);
405e02119d5SChris Mason 
406e02119d5SChris Mason 	/* look for the key in the destination tree */
407e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
4084bc4bee4SJosef Bacik 	if (ret < 0)
4094bc4bee4SJosef Bacik 		return ret;
4104bc4bee4SJosef Bacik 
411e02119d5SChris Mason 	if (ret == 0) {
412e02119d5SChris Mason 		char *src_copy;
413e02119d5SChris Mason 		char *dst_copy;
414e02119d5SChris Mason 		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
415e02119d5SChris Mason 						  path->slots[0]);
416e02119d5SChris Mason 		if (dst_size != item_size)
417e02119d5SChris Mason 			goto insert;
418e02119d5SChris Mason 
419e02119d5SChris Mason 		if (item_size == 0) {
420b3b4aa74SDavid Sterba 			btrfs_release_path(path);
421e02119d5SChris Mason 			return 0;
422e02119d5SChris Mason 		}
423e02119d5SChris Mason 		dst_copy = kmalloc(item_size, GFP_NOFS);
424e02119d5SChris Mason 		src_copy = kmalloc(item_size, GFP_NOFS);
4252a29edc6Sliubo 		if (!dst_copy || !src_copy) {
426b3b4aa74SDavid Sterba 			btrfs_release_path(path);
4272a29edc6Sliubo 			kfree(dst_copy);
4282a29edc6Sliubo 			kfree(src_copy);
4292a29edc6Sliubo 			return -ENOMEM;
4302a29edc6Sliubo 		}
431e02119d5SChris Mason 
432e02119d5SChris Mason 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
433e02119d5SChris Mason 
434e02119d5SChris Mason 		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
435e02119d5SChris Mason 		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
436e02119d5SChris Mason 				   item_size);
437e02119d5SChris Mason 		ret = memcmp(dst_copy, src_copy, item_size);
438e02119d5SChris Mason 
439e02119d5SChris Mason 		kfree(dst_copy);
440e02119d5SChris Mason 		kfree(src_copy);
441e02119d5SChris Mason 		/*
442e02119d5SChris Mason 		 * they have the same contents, just return, this saves
443e02119d5SChris Mason 		 * us from cowing blocks in the destination tree and doing
444e02119d5SChris Mason 		 * extra writes that may not have been done by a previous
445e02119d5SChris Mason 		 * sync
446e02119d5SChris Mason 		 */
447e02119d5SChris Mason 		if (ret == 0) {
448b3b4aa74SDavid Sterba 			btrfs_release_path(path);
449e02119d5SChris Mason 			return 0;
450e02119d5SChris Mason 		}
451e02119d5SChris Mason 
4524bc4bee4SJosef Bacik 		/*
4534bc4bee4SJosef Bacik 		 * We need to load the old nbytes into the inode so when we
4544bc4bee4SJosef Bacik 		 * replay the extents we've logged we get the right nbytes.
4554bc4bee4SJosef Bacik 		 */
4564bc4bee4SJosef Bacik 		if (inode_item) {
4574bc4bee4SJosef Bacik 			struct btrfs_inode_item *item;
4584bc4bee4SJosef Bacik 			u64 nbytes;
459d555438bSJosef Bacik 			u32 mode;
4604bc4bee4SJosef Bacik 
4614bc4bee4SJosef Bacik 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4624bc4bee4SJosef Bacik 					      struct btrfs_inode_item);
4634bc4bee4SJosef Bacik 			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
4644bc4bee4SJosef Bacik 			item = btrfs_item_ptr(eb, slot,
4654bc4bee4SJosef Bacik 					      struct btrfs_inode_item);
4664bc4bee4SJosef Bacik 			btrfs_set_inode_nbytes(eb, item, nbytes);
467d555438bSJosef Bacik 
468d555438bSJosef Bacik 			/*
469d555438bSJosef Bacik 			 * If this is a directory we need to reset the i_size to
470d555438bSJosef Bacik 			 * 0 so that we can set it up properly when replaying
471d555438bSJosef Bacik 			 * the rest of the items in this log.
472d555438bSJosef Bacik 			 */
473d555438bSJosef Bacik 			mode = btrfs_inode_mode(eb, item);
474d555438bSJosef Bacik 			if (S_ISDIR(mode))
475d555438bSJosef Bacik 				btrfs_set_inode_size(eb, item, 0);
4764bc4bee4SJosef Bacik 		}
4774bc4bee4SJosef Bacik 	} else if (inode_item) {
4784bc4bee4SJosef Bacik 		struct btrfs_inode_item *item;
479d555438bSJosef Bacik 		u32 mode;
4804bc4bee4SJosef Bacik 
4814bc4bee4SJosef Bacik 		/*
4824bc4bee4SJosef Bacik 		 * New inode, set nbytes to 0 so that the nbytes comes out
4834bc4bee4SJosef Bacik 		 * properly when we replay the extents.
4844bc4bee4SJosef Bacik 		 */
4854bc4bee4SJosef Bacik 		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
4864bc4bee4SJosef Bacik 		btrfs_set_inode_nbytes(eb, item, 0);
487d555438bSJosef Bacik 
488d555438bSJosef Bacik 		/*
489d555438bSJosef Bacik 		 * If this is a directory we need to reset the i_size to 0 so
490d555438bSJosef Bacik 		 * that we can set it up properly when replaying the rest of
491d555438bSJosef Bacik 		 * the items in this log.
492d555438bSJosef Bacik 		 */
493d555438bSJosef Bacik 		mode = btrfs_inode_mode(eb, item);
494d555438bSJosef Bacik 		if (S_ISDIR(mode))
495d555438bSJosef Bacik 			btrfs_set_inode_size(eb, item, 0);
496e02119d5SChris Mason 	}
497e02119d5SChris Mason insert:
498b3b4aa74SDavid Sterba 	btrfs_release_path(path);
499e02119d5SChris Mason 	/* try to insert the key into the destination tree */
500df8d116fSFilipe Manana 	path->skip_release_on_error = 1;
501e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path,
502e02119d5SChris Mason 				      key, item_size);
503df8d116fSFilipe Manana 	path->skip_release_on_error = 0;
504e02119d5SChris Mason 
505e02119d5SChris Mason 	/* make sure any existing item is the correct size */
506df8d116fSFilipe Manana 	if (ret == -EEXIST || ret == -EOVERFLOW) {
507e02119d5SChris Mason 		u32 found_size;
508e02119d5SChris Mason 		found_size = btrfs_item_size_nr(path->nodes[0],
509e02119d5SChris Mason 						path->slots[0]);
510143bede5SJeff Mahoney 		if (found_size > item_size)
51178ac4f9eSDavid Sterba 			btrfs_truncate_item(path, item_size, 1);
512143bede5SJeff Mahoney 		else if (found_size < item_size)
513c71dd880SDavid Sterba 			btrfs_extend_item(path, item_size - found_size);
514e02119d5SChris Mason 	} else if (ret) {
5154a500fd1SYan, Zheng 		return ret;
516e02119d5SChris Mason 	}
517e02119d5SChris Mason 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
518e02119d5SChris Mason 					path->slots[0]);
519e02119d5SChris Mason 
520e02119d5SChris Mason 	/* don't overwrite an existing inode if the generation number
521e02119d5SChris Mason 	 * was logged as zero.  This is done when the tree logging code
522e02119d5SChris Mason 	 * is just logging an inode to make sure it exists after recovery.
523e02119d5SChris Mason 	 *
524e02119d5SChris Mason 	 * Also, don't overwrite i_size on directories during replay.
525e02119d5SChris Mason 	 * log replay inserts and removes directory items based on the
526e02119d5SChris Mason 	 * state of the tree found in the subvolume, and i_size is modified
527e02119d5SChris Mason 	 * as it goes
528e02119d5SChris Mason 	 */
529e02119d5SChris Mason 	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
530e02119d5SChris Mason 		struct btrfs_inode_item *src_item;
531e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
532e02119d5SChris Mason 
533e02119d5SChris Mason 		src_item = (struct btrfs_inode_item *)src_ptr;
534e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
535e02119d5SChris Mason 
5361a4bcf47SFilipe Manana 		if (btrfs_inode_generation(eb, src_item) == 0) {
5371a4bcf47SFilipe Manana 			struct extent_buffer *dst_eb = path->nodes[0];
5382f2ff0eeSFilipe Manana 			const u64 ino_size = btrfs_inode_size(eb, src_item);
5391a4bcf47SFilipe Manana 
5402f2ff0eeSFilipe Manana 			/*
5412f2ff0eeSFilipe Manana 			 * For regular files an ino_size == 0 is used only when
5422f2ff0eeSFilipe Manana 			 * logging that an inode exists, as part of a directory
5432f2ff0eeSFilipe Manana 			 * fsync, and the inode wasn't fsynced before. In this
5442f2ff0eeSFilipe Manana 			 * case don't set the size of the inode in the fs/subvol
5452f2ff0eeSFilipe Manana 			 * tree, otherwise we would be throwing valid data away.
5462f2ff0eeSFilipe Manana 			 */
5471a4bcf47SFilipe Manana 			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
5482f2ff0eeSFilipe Manana 			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
54960d48e2eSDavid Sterba 			    ino_size != 0)
55060d48e2eSDavid Sterba 				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
551e02119d5SChris Mason 			goto no_copy;
5521a4bcf47SFilipe Manana 		}
553e02119d5SChris Mason 
554e02119d5SChris Mason 		if (overwrite_root &&
555e02119d5SChris Mason 		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
556e02119d5SChris Mason 		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
557e02119d5SChris Mason 			save_old_i_size = 1;
558e02119d5SChris Mason 			saved_i_size = btrfs_inode_size(path->nodes[0],
559e02119d5SChris Mason 							dst_item);
560e02119d5SChris Mason 		}
561e02119d5SChris Mason 	}
562e02119d5SChris Mason 
563e02119d5SChris Mason 	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
564e02119d5SChris Mason 			   src_ptr, item_size);
565e02119d5SChris Mason 
566e02119d5SChris Mason 	if (save_old_i_size) {
567e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
568e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
569e02119d5SChris Mason 		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
570e02119d5SChris Mason 	}
571e02119d5SChris Mason 
572e02119d5SChris Mason 	/* make sure the generation is filled in */
573e02119d5SChris Mason 	if (key->type == BTRFS_INODE_ITEM_KEY) {
574e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
575e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
576e02119d5SChris Mason 		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
577e02119d5SChris Mason 			btrfs_set_inode_generation(path->nodes[0], dst_item,
578e02119d5SChris Mason 						   trans->transid);
579e02119d5SChris Mason 		}
580e02119d5SChris Mason 	}
581e02119d5SChris Mason no_copy:
582e02119d5SChris Mason 	btrfs_mark_buffer_dirty(path->nodes[0]);
583b3b4aa74SDavid Sterba 	btrfs_release_path(path);
584e02119d5SChris Mason 	return 0;
585e02119d5SChris Mason }
586e02119d5SChris Mason 
587e02119d5SChris Mason /*
588e02119d5SChris Mason  * simple helper to read an inode off the disk from a given root
589e02119d5SChris Mason  * This can only be called for subvolume roots and not for the log
590e02119d5SChris Mason  */
591e02119d5SChris Mason static noinline struct inode *read_one_inode(struct btrfs_root *root,
592e02119d5SChris Mason 					     u64 objectid)
593e02119d5SChris Mason {
594e02119d5SChris Mason 	struct inode *inode;
595e02119d5SChris Mason 
5960202e83fSDavid Sterba 	inode = btrfs_iget(root->fs_info->sb, objectid, root);
5972e19f1f9SAl Viro 	if (IS_ERR(inode))
5985d4f98a2SYan Zheng 		inode = NULL;
599e02119d5SChris Mason 	return inode;
600e02119d5SChris Mason }
601e02119d5SChris Mason 
602e02119d5SChris Mason /* replays a single extent in 'eb' at 'slot' with 'key' into the
603e02119d5SChris Mason  * subvolume 'root'.  path is released on entry and should be released
604e02119d5SChris Mason  * on exit.
605e02119d5SChris Mason  *
606e02119d5SChris Mason  * extents in the log tree have not been allocated out of the extent
607e02119d5SChris Mason  * tree yet.  So, this completes the allocation, taking a reference
608e02119d5SChris Mason  * as required if the extent already exists or creating a new extent
609e02119d5SChris Mason  * if it isn't in the extent allocation tree yet.
610e02119d5SChris Mason  *
611e02119d5SChris Mason  * The extent is inserted into the file, dropping any existing extents
612e02119d5SChris Mason  * from the file that overlap the new one.
613e02119d5SChris Mason  */
614e02119d5SChris Mason static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
615e02119d5SChris Mason 				      struct btrfs_root *root,
616e02119d5SChris Mason 				      struct btrfs_path *path,
617e02119d5SChris Mason 				      struct extent_buffer *eb, int slot,
618e02119d5SChris Mason 				      struct btrfs_key *key)
619e02119d5SChris Mason {
6205893dfb9SFilipe Manana 	struct btrfs_drop_extents_args drop_args = { 0 };
6210b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
622e02119d5SChris Mason 	int found_type;
623e02119d5SChris Mason 	u64 extent_end;
624e02119d5SChris Mason 	u64 start = key->offset;
6254bc4bee4SJosef Bacik 	u64 nbytes = 0;
626e02119d5SChris Mason 	struct btrfs_file_extent_item *item;
627e02119d5SChris Mason 	struct inode *inode = NULL;
628e02119d5SChris Mason 	unsigned long size;
629e02119d5SChris Mason 	int ret = 0;
630e02119d5SChris Mason 
631e02119d5SChris Mason 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
632e02119d5SChris Mason 	found_type = btrfs_file_extent_type(eb, item);
633e02119d5SChris Mason 
634d899e052SYan Zheng 	if (found_type == BTRFS_FILE_EXTENT_REG ||
6354bc4bee4SJosef Bacik 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6364bc4bee4SJosef Bacik 		nbytes = btrfs_file_extent_num_bytes(eb, item);
6374bc4bee4SJosef Bacik 		extent_end = start + nbytes;
6384bc4bee4SJosef Bacik 
6394bc4bee4SJosef Bacik 		/*
6404bc4bee4SJosef Bacik 		 * We don't add to the inodes nbytes if we are prealloc or a
6414bc4bee4SJosef Bacik 		 * hole.
6424bc4bee4SJosef Bacik 		 */
6434bc4bee4SJosef Bacik 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6444bc4bee4SJosef Bacik 			nbytes = 0;
6454bc4bee4SJosef Bacik 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
646e41ca589SQu Wenruo 		size = btrfs_file_extent_ram_bytes(eb, item);
6474bc4bee4SJosef Bacik 		nbytes = btrfs_file_extent_ram_bytes(eb, item);
648da17066cSJeff Mahoney 		extent_end = ALIGN(start + size,
6490b246afaSJeff Mahoney 				   fs_info->sectorsize);
650e02119d5SChris Mason 	} else {
651e02119d5SChris Mason 		ret = 0;
652e02119d5SChris Mason 		goto out;
653e02119d5SChris Mason 	}
654e02119d5SChris Mason 
655e02119d5SChris Mason 	inode = read_one_inode(root, key->objectid);
656e02119d5SChris Mason 	if (!inode) {
657e02119d5SChris Mason 		ret = -EIO;
658e02119d5SChris Mason 		goto out;
659e02119d5SChris Mason 	}
660e02119d5SChris Mason 
661e02119d5SChris Mason 	/*
662e02119d5SChris Mason 	 * first check to see if we already have this extent in the
663e02119d5SChris Mason 	 * file.  This must be done before the btrfs_drop_extents run
664e02119d5SChris Mason 	 * so we don't try to drop this extent.
665e02119d5SChris Mason 	 */
666f85b7379SDavid Sterba 	ret = btrfs_lookup_file_extent(trans, root, path,
667f85b7379SDavid Sterba 			btrfs_ino(BTRFS_I(inode)), start, 0);
668e02119d5SChris Mason 
669d899e052SYan Zheng 	if (ret == 0 &&
670d899e052SYan Zheng 	    (found_type == BTRFS_FILE_EXTENT_REG ||
671d899e052SYan Zheng 	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
672e02119d5SChris Mason 		struct btrfs_file_extent_item cmp1;
673e02119d5SChris Mason 		struct btrfs_file_extent_item cmp2;
674e02119d5SChris Mason 		struct btrfs_file_extent_item *existing;
675e02119d5SChris Mason 		struct extent_buffer *leaf;
676e02119d5SChris Mason 
677e02119d5SChris Mason 		leaf = path->nodes[0];
678e02119d5SChris Mason 		existing = btrfs_item_ptr(leaf, path->slots[0],
679e02119d5SChris Mason 					  struct btrfs_file_extent_item);
680e02119d5SChris Mason 
681e02119d5SChris Mason 		read_extent_buffer(eb, &cmp1, (unsigned long)item,
682e02119d5SChris Mason 				   sizeof(cmp1));
683e02119d5SChris Mason 		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
684e02119d5SChris Mason 				   sizeof(cmp2));
685e02119d5SChris Mason 
686e02119d5SChris Mason 		/*
687e02119d5SChris Mason 		 * we already have a pointer to this exact extent,
688e02119d5SChris Mason 		 * we don't have to do anything
689e02119d5SChris Mason 		 */
690e02119d5SChris Mason 		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
691b3b4aa74SDavid Sterba 			btrfs_release_path(path);
692e02119d5SChris Mason 			goto out;
693e02119d5SChris Mason 		}
694e02119d5SChris Mason 	}
695b3b4aa74SDavid Sterba 	btrfs_release_path(path);
696e02119d5SChris Mason 
697e02119d5SChris Mason 	/* drop any overlapping extents */
6985893dfb9SFilipe Manana 	drop_args.start = start;
6995893dfb9SFilipe Manana 	drop_args.end = extent_end;
7005893dfb9SFilipe Manana 	drop_args.drop_cache = true;
7015893dfb9SFilipe Manana 	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
7023650860bSJosef Bacik 	if (ret)
7033650860bSJosef Bacik 		goto out;
704e02119d5SChris Mason 
70507d400a6SYan Zheng 	if (found_type == BTRFS_FILE_EXTENT_REG ||
70607d400a6SYan Zheng 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7075d4f98a2SYan Zheng 		u64 offset;
70807d400a6SYan Zheng 		unsigned long dest_offset;
70907d400a6SYan Zheng 		struct btrfs_key ins;
71007d400a6SYan Zheng 
7113168021cSFilipe Manana 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
7123168021cSFilipe Manana 		    btrfs_fs_incompat(fs_info, NO_HOLES))
7133168021cSFilipe Manana 			goto update_inode;
7143168021cSFilipe Manana 
71507d400a6SYan Zheng 		ret = btrfs_insert_empty_item(trans, root, path, key,
71607d400a6SYan Zheng 					      sizeof(*item));
7173650860bSJosef Bacik 		if (ret)
7183650860bSJosef Bacik 			goto out;
71907d400a6SYan Zheng 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
72007d400a6SYan Zheng 						    path->slots[0]);
72107d400a6SYan Zheng 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
72207d400a6SYan Zheng 				(unsigned long)item,  sizeof(*item));
72307d400a6SYan Zheng 
72407d400a6SYan Zheng 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
72507d400a6SYan Zheng 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
72607d400a6SYan Zheng 		ins.type = BTRFS_EXTENT_ITEM_KEY;
7275d4f98a2SYan Zheng 		offset = key->offset - btrfs_file_extent_offset(eb, item);
72807d400a6SYan Zheng 
729df2c95f3SQu Wenruo 		/*
730df2c95f3SQu Wenruo 		 * Manually record dirty extent, as here we did a shallow
731df2c95f3SQu Wenruo 		 * file extent item copy and skip normal backref update,
732df2c95f3SQu Wenruo 		 * but modifying extent tree all by ourselves.
733df2c95f3SQu Wenruo 		 * So need to manually record dirty extent for qgroup,
734df2c95f3SQu Wenruo 		 * as the owner of the file extent changed from log tree
735df2c95f3SQu Wenruo 		 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
736df2c95f3SQu Wenruo 		 */
737a95f3aafSLu Fengqi 		ret = btrfs_qgroup_trace_extent(trans,
738df2c95f3SQu Wenruo 				btrfs_file_extent_disk_bytenr(eb, item),
739df2c95f3SQu Wenruo 				btrfs_file_extent_disk_num_bytes(eb, item),
740df2c95f3SQu Wenruo 				GFP_NOFS);
741df2c95f3SQu Wenruo 		if (ret < 0)
742df2c95f3SQu Wenruo 			goto out;
743df2c95f3SQu Wenruo 
74407d400a6SYan Zheng 		if (ins.objectid > 0) {
74582fa113fSQu Wenruo 			struct btrfs_ref ref = { 0 };
74607d400a6SYan Zheng 			u64 csum_start;
74707d400a6SYan Zheng 			u64 csum_end;
74807d400a6SYan Zheng 			LIST_HEAD(ordered_sums);
74982fa113fSQu Wenruo 
75007d400a6SYan Zheng 			/*
75107d400a6SYan Zheng 			 * is this extent already allocated in the extent
75207d400a6SYan Zheng 			 * allocation tree?  If so, just add a reference
75307d400a6SYan Zheng 			 */
7542ff7e61eSJeff Mahoney 			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
75507d400a6SYan Zheng 						ins.offset);
75607d400a6SYan Zheng 			if (ret == 0) {
75782fa113fSQu Wenruo 				btrfs_init_generic_ref(&ref,
75882fa113fSQu Wenruo 						BTRFS_ADD_DELAYED_REF,
75982fa113fSQu Wenruo 						ins.objectid, ins.offset, 0);
76082fa113fSQu Wenruo 				btrfs_init_data_ref(&ref,
76182fa113fSQu Wenruo 						root->root_key.objectid,
762b06c4bf5SFilipe Manana 						key->objectid, offset);
76382fa113fSQu Wenruo 				ret = btrfs_inc_extent_ref(trans, &ref);
764b50c6e25SJosef Bacik 				if (ret)
765b50c6e25SJosef Bacik 					goto out;
76607d400a6SYan Zheng 			} else {
76707d400a6SYan Zheng 				/*
76807d400a6SYan Zheng 				 * insert the extent pointer in the extent
76907d400a6SYan Zheng 				 * allocation tree
77007d400a6SYan Zheng 				 */
7715d4f98a2SYan Zheng 				ret = btrfs_alloc_logged_file_extent(trans,
7722ff7e61eSJeff Mahoney 						root->root_key.objectid,
7735d4f98a2SYan Zheng 						key->objectid, offset, &ins);
774b50c6e25SJosef Bacik 				if (ret)
775b50c6e25SJosef Bacik 					goto out;
77607d400a6SYan Zheng 			}
777b3b4aa74SDavid Sterba 			btrfs_release_path(path);
77807d400a6SYan Zheng 
77907d400a6SYan Zheng 			if (btrfs_file_extent_compression(eb, item)) {
78007d400a6SYan Zheng 				csum_start = ins.objectid;
78107d400a6SYan Zheng 				csum_end = csum_start + ins.offset;
78207d400a6SYan Zheng 			} else {
78307d400a6SYan Zheng 				csum_start = ins.objectid +
78407d400a6SYan Zheng 					btrfs_file_extent_offset(eb, item);
78507d400a6SYan Zheng 				csum_end = csum_start +
78607d400a6SYan Zheng 					btrfs_file_extent_num_bytes(eb, item);
78707d400a6SYan Zheng 			}
78807d400a6SYan Zheng 
78907d400a6SYan Zheng 			ret = btrfs_lookup_csums_range(root->log_root,
79007d400a6SYan Zheng 						csum_start, csum_end - 1,
791a2de733cSArne Jansen 						&ordered_sums, 0);
7923650860bSJosef Bacik 			if (ret)
7933650860bSJosef Bacik 				goto out;
794b84b8390SFilipe Manana 			/*
795b84b8390SFilipe Manana 			 * Now delete all existing cums in the csum root that
796b84b8390SFilipe Manana 			 * cover our range. We do this because we can have an
797b84b8390SFilipe Manana 			 * extent that is completely referenced by one file
798b84b8390SFilipe Manana 			 * extent item and partially referenced by another
799b84b8390SFilipe Manana 			 * file extent item (like after using the clone or
800b84b8390SFilipe Manana 			 * extent_same ioctls). In this case if we end up doing
801b84b8390SFilipe Manana 			 * the replay of the one that partially references the
802b84b8390SFilipe Manana 			 * extent first, and we do not do the csum deletion
803b84b8390SFilipe Manana 			 * below, we can get 2 csum items in the csum tree that
804b84b8390SFilipe Manana 			 * overlap each other. For example, imagine our log has
805b84b8390SFilipe Manana 			 * the two following file extent items:
806b84b8390SFilipe Manana 			 *
807b84b8390SFilipe Manana 			 * key (257 EXTENT_DATA 409600)
808b84b8390SFilipe Manana 			 *     extent data disk byte 12845056 nr 102400
809b84b8390SFilipe Manana 			 *     extent data offset 20480 nr 20480 ram 102400
810b84b8390SFilipe Manana 			 *
811b84b8390SFilipe Manana 			 * key (257 EXTENT_DATA 819200)
812b84b8390SFilipe Manana 			 *     extent data disk byte 12845056 nr 102400
813b84b8390SFilipe Manana 			 *     extent data offset 0 nr 102400 ram 102400
814b84b8390SFilipe Manana 			 *
815b84b8390SFilipe Manana 			 * Where the second one fully references the 100K extent
816b84b8390SFilipe Manana 			 * that starts at disk byte 12845056, and the log tree
817b84b8390SFilipe Manana 			 * has a single csum item that covers the entire range
818b84b8390SFilipe Manana 			 * of the extent:
819b84b8390SFilipe Manana 			 *
820b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
821b84b8390SFilipe Manana 			 *
822b84b8390SFilipe Manana 			 * After the first file extent item is replayed, the
823b84b8390SFilipe Manana 			 * csum tree gets the following csum item:
824b84b8390SFilipe Manana 			 *
825b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
826b84b8390SFilipe Manana 			 *
827b84b8390SFilipe Manana 			 * Which covers the 20K sub-range starting at offset 20K
828b84b8390SFilipe Manana 			 * of our extent. Now when we replay the second file
829b84b8390SFilipe Manana 			 * extent item, if we do not delete existing csum items
830b84b8390SFilipe Manana 			 * that cover any of its blocks, we end up getting two
831b84b8390SFilipe Manana 			 * csum items in our csum tree that overlap each other:
832b84b8390SFilipe Manana 			 *
833b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
834b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
835b84b8390SFilipe Manana 			 *
836b84b8390SFilipe Manana 			 * Which is a problem, because after this anyone trying
837b84b8390SFilipe Manana 			 * to lookup up for the checksum of any block of our
838b84b8390SFilipe Manana 			 * extent starting at an offset of 40K or higher, will
839b84b8390SFilipe Manana 			 * end up looking at the second csum item only, which
840b84b8390SFilipe Manana 			 * does not contain the checksum for any block starting
841b84b8390SFilipe Manana 			 * at offset 40K or higher of our extent.
842b84b8390SFilipe Manana 			 */
84307d400a6SYan Zheng 			while (!list_empty(&ordered_sums)) {
84407d400a6SYan Zheng 				struct btrfs_ordered_sum *sums;
84507d400a6SYan Zheng 				sums = list_entry(ordered_sums.next,
84607d400a6SYan Zheng 						struct btrfs_ordered_sum,
84707d400a6SYan Zheng 						list);
8483650860bSJosef Bacik 				if (!ret)
84940e046acSFilipe Manana 					ret = btrfs_del_csums(trans,
85040e046acSFilipe Manana 							      fs_info->csum_root,
851b84b8390SFilipe Manana 							      sums->bytenr,
852b84b8390SFilipe Manana 							      sums->len);
853b84b8390SFilipe Manana 				if (!ret)
85407d400a6SYan Zheng 					ret = btrfs_csum_file_blocks(trans,
8550b246afaSJeff Mahoney 						fs_info->csum_root, sums);
85607d400a6SYan Zheng 				list_del(&sums->list);
85707d400a6SYan Zheng 				kfree(sums);
85807d400a6SYan Zheng 			}
8593650860bSJosef Bacik 			if (ret)
8603650860bSJosef Bacik 				goto out;
86107d400a6SYan Zheng 		} else {
862b3b4aa74SDavid Sterba 			btrfs_release_path(path);
86307d400a6SYan Zheng 		}
86407d400a6SYan Zheng 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
86507d400a6SYan Zheng 		/* inline extents are easy, we just overwrite them */
866e02119d5SChris Mason 		ret = overwrite_item(trans, root, path, eb, slot, key);
8673650860bSJosef Bacik 		if (ret)
8683650860bSJosef Bacik 			goto out;
86907d400a6SYan Zheng 	}
870e02119d5SChris Mason 
8719ddc959eSJosef Bacik 	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
8729ddc959eSJosef Bacik 						extent_end - start);
8739ddc959eSJosef Bacik 	if (ret)
8749ddc959eSJosef Bacik 		goto out;
8759ddc959eSJosef Bacik 
8763168021cSFilipe Manana update_inode:
8772766ff61SFilipe Manana 	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
8789a56fcd1SNikolay Borisov 	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
879e02119d5SChris Mason out:
880e02119d5SChris Mason 	if (inode)
881e02119d5SChris Mason 		iput(inode);
882e02119d5SChris Mason 	return ret;
883e02119d5SChris Mason }
884e02119d5SChris Mason 
885e02119d5SChris Mason /*
886e02119d5SChris Mason  * when cleaning up conflicts between the directory names in the
887e02119d5SChris Mason  * subvolume, directory names in the log and directory names in the
888e02119d5SChris Mason  * inode back references, we may have to unlink inodes from directories.
889e02119d5SChris Mason  *
890e02119d5SChris Mason  * This is a helper function to do the unlink of a specific directory
891e02119d5SChris Mason  * item
892e02119d5SChris Mason  */
893e02119d5SChris Mason static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
894e02119d5SChris Mason 				      struct btrfs_root *root,
895e02119d5SChris Mason 				      struct btrfs_path *path,
896207e7d92SNikolay Borisov 				      struct btrfs_inode *dir,
897e02119d5SChris Mason 				      struct btrfs_dir_item *di)
898e02119d5SChris Mason {
899e02119d5SChris Mason 	struct inode *inode;
900e02119d5SChris Mason 	char *name;
901e02119d5SChris Mason 	int name_len;
902e02119d5SChris Mason 	struct extent_buffer *leaf;
903e02119d5SChris Mason 	struct btrfs_key location;
904e02119d5SChris Mason 	int ret;
905e02119d5SChris Mason 
906e02119d5SChris Mason 	leaf = path->nodes[0];
907e02119d5SChris Mason 
908e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
909e02119d5SChris Mason 	name_len = btrfs_dir_name_len(leaf, di);
910e02119d5SChris Mason 	name = kmalloc(name_len, GFP_NOFS);
9112a29edc6Sliubo 	if (!name)
9122a29edc6Sliubo 		return -ENOMEM;
9132a29edc6Sliubo 
914e02119d5SChris Mason 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
915b3b4aa74SDavid Sterba 	btrfs_release_path(path);
916e02119d5SChris Mason 
917e02119d5SChris Mason 	inode = read_one_inode(root, location.objectid);
918c00e9493STsutomu Itoh 	if (!inode) {
9193650860bSJosef Bacik 		ret = -EIO;
9203650860bSJosef Bacik 		goto out;
921c00e9493STsutomu Itoh 	}
922e02119d5SChris Mason 
923ec051c0fSYan Zheng 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
9243650860bSJosef Bacik 	if (ret)
9253650860bSJosef Bacik 		goto out;
92612fcfd22SChris Mason 
927207e7d92SNikolay Borisov 	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
928207e7d92SNikolay Borisov 			name_len);
9293650860bSJosef Bacik 	if (ret)
9303650860bSJosef Bacik 		goto out;
931ada9af21SFilipe David Borba Manana 	else
932e5c304e6SNikolay Borisov 		ret = btrfs_run_delayed_items(trans);
9333650860bSJosef Bacik out:
9343650860bSJosef Bacik 	kfree(name);
9353650860bSJosef Bacik 	iput(inode);
936e02119d5SChris Mason 	return ret;
937e02119d5SChris Mason }
938e02119d5SChris Mason 
939e02119d5SChris Mason /*
940e02119d5SChris Mason  * helper function to see if a given name and sequence number found
941e02119d5SChris Mason  * in an inode back reference are already in a directory and correctly
942e02119d5SChris Mason  * point to this inode
943e02119d5SChris Mason  */
944e02119d5SChris Mason static noinline int inode_in_dir(struct btrfs_root *root,
945e02119d5SChris Mason 				 struct btrfs_path *path,
946e02119d5SChris Mason 				 u64 dirid, u64 objectid, u64 index,
947e02119d5SChris Mason 				 const char *name, int name_len)
948e02119d5SChris Mason {
949e02119d5SChris Mason 	struct btrfs_dir_item *di;
950e02119d5SChris Mason 	struct btrfs_key location;
951e02119d5SChris Mason 	int match = 0;
952e02119d5SChris Mason 
953e02119d5SChris Mason 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
954e02119d5SChris Mason 					 index, name, name_len, 0);
955e02119d5SChris Mason 	if (di && !IS_ERR(di)) {
956e02119d5SChris Mason 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
957e02119d5SChris Mason 		if (location.objectid != objectid)
958e02119d5SChris Mason 			goto out;
959e02119d5SChris Mason 	} else
960e02119d5SChris Mason 		goto out;
961b3b4aa74SDavid Sterba 	btrfs_release_path(path);
962e02119d5SChris Mason 
963e02119d5SChris Mason 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
964e02119d5SChris Mason 	if (di && !IS_ERR(di)) {
965e02119d5SChris Mason 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
966e02119d5SChris Mason 		if (location.objectid != objectid)
967e02119d5SChris Mason 			goto out;
968e02119d5SChris Mason 	} else
969e02119d5SChris Mason 		goto out;
970e02119d5SChris Mason 	match = 1;
971e02119d5SChris Mason out:
972b3b4aa74SDavid Sterba 	btrfs_release_path(path);
973e02119d5SChris Mason 	return match;
974e02119d5SChris Mason }
975e02119d5SChris Mason 
976e02119d5SChris Mason /*
977e02119d5SChris Mason  * helper function to check a log tree for a named back reference in
978e02119d5SChris Mason  * an inode.  This is used to decide if a back reference that is
979e02119d5SChris Mason  * found in the subvolume conflicts with what we find in the log.
980e02119d5SChris Mason  *
981e02119d5SChris Mason  * inode backreferences may have multiple refs in a single item,
982e02119d5SChris Mason  * during replay we process one reference at a time, and we don't
983e02119d5SChris Mason  * want to delete valid links to a file from the subvolume if that
984e02119d5SChris Mason  * link is also in the log.
985e02119d5SChris Mason  */
986e02119d5SChris Mason static noinline int backref_in_log(struct btrfs_root *log,
987e02119d5SChris Mason 				   struct btrfs_key *key,
988f186373fSMark Fasheh 				   u64 ref_objectid,
989df8d116fSFilipe Manana 				   const char *name, int namelen)
990e02119d5SChris Mason {
991e02119d5SChris Mason 	struct btrfs_path *path;
992e02119d5SChris Mason 	int ret;
993e02119d5SChris Mason 
994e02119d5SChris Mason 	path = btrfs_alloc_path();
9952a29edc6Sliubo 	if (!path)
9962a29edc6Sliubo 		return -ENOMEM;
9972a29edc6Sliubo 
998e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
999d3316c82SNikolay Borisov 	if (ret < 0) {
1000d3316c82SNikolay Borisov 		goto out;
1001d3316c82SNikolay Borisov 	} else if (ret == 1) {
100289cbf5f6SNikolay Borisov 		ret = 0;
1003e02119d5SChris Mason 		goto out;
100489cbf5f6SNikolay Borisov 	}
1005e02119d5SChris Mason 
100689cbf5f6SNikolay Borisov 	if (key->type == BTRFS_INODE_EXTREF_KEY)
100789cbf5f6SNikolay Borisov 		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
10081f250e92SFilipe Manana 						       path->slots[0],
10091f250e92SFilipe Manana 						       ref_objectid,
101089cbf5f6SNikolay Borisov 						       name, namelen);
101189cbf5f6SNikolay Borisov 	else
101289cbf5f6SNikolay Borisov 		ret = !!btrfs_find_name_in_backref(path->nodes[0],
101389cbf5f6SNikolay Borisov 						   path->slots[0],
101489cbf5f6SNikolay Borisov 						   name, namelen);
1015e02119d5SChris Mason out:
1016e02119d5SChris Mason 	btrfs_free_path(path);
101789cbf5f6SNikolay Borisov 	return ret;
1018e02119d5SChris Mason }
1019e02119d5SChris Mason 
10205a1d7843SJan Schmidt static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
10215a1d7843SJan Schmidt 				  struct btrfs_root *root,
10225a1d7843SJan Schmidt 				  struct btrfs_path *path,
10235a1d7843SJan Schmidt 				  struct btrfs_root *log_root,
102494c91a1fSNikolay Borisov 				  struct btrfs_inode *dir,
102594c91a1fSNikolay Borisov 				  struct btrfs_inode *inode,
1026f186373fSMark Fasheh 				  u64 inode_objectid, u64 parent_objectid,
1027f186373fSMark Fasheh 				  u64 ref_index, char *name, int namelen,
1028f186373fSMark Fasheh 				  int *search_done)
10295a1d7843SJan Schmidt {
10305a1d7843SJan Schmidt 	int ret;
10315a1d7843SJan Schmidt 	char *victim_name;
10325a1d7843SJan Schmidt 	int victim_name_len;
1033f186373fSMark Fasheh 	struct extent_buffer *leaf;
1034f186373fSMark Fasheh 	struct btrfs_dir_item *di;
1035f186373fSMark Fasheh 	struct btrfs_key search_key;
1036f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1037f186373fSMark Fasheh 
1038f186373fSMark Fasheh again:
1039f186373fSMark Fasheh 	/* Search old style refs */
1040f186373fSMark Fasheh 	search_key.objectid = inode_objectid;
1041f186373fSMark Fasheh 	search_key.type = BTRFS_INODE_REF_KEY;
1042f186373fSMark Fasheh 	search_key.offset = parent_objectid;
1043f186373fSMark Fasheh 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1044f186373fSMark Fasheh 	if (ret == 0) {
10455a1d7843SJan Schmidt 		struct btrfs_inode_ref *victim_ref;
10465a1d7843SJan Schmidt 		unsigned long ptr;
10475a1d7843SJan Schmidt 		unsigned long ptr_end;
1048f186373fSMark Fasheh 
1049f186373fSMark Fasheh 		leaf = path->nodes[0];
10505a1d7843SJan Schmidt 
10515a1d7843SJan Schmidt 		/* are we trying to overwrite a back ref for the root directory
10525a1d7843SJan Schmidt 		 * if so, just jump out, we're done
10535a1d7843SJan Schmidt 		 */
1054f186373fSMark Fasheh 		if (search_key.objectid == search_key.offset)
10555a1d7843SJan Schmidt 			return 1;
10565a1d7843SJan Schmidt 
10575a1d7843SJan Schmidt 		/* check all the names in this back reference to see
10585a1d7843SJan Schmidt 		 * if they are in the log.  if so, we allow them to stay
10595a1d7843SJan Schmidt 		 * otherwise they must be unlinked as a conflict
10605a1d7843SJan Schmidt 		 */
10615a1d7843SJan Schmidt 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
10625a1d7843SJan Schmidt 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
10635a1d7843SJan Schmidt 		while (ptr < ptr_end) {
10645a1d7843SJan Schmidt 			victim_ref = (struct btrfs_inode_ref *)ptr;
10655a1d7843SJan Schmidt 			victim_name_len = btrfs_inode_ref_name_len(leaf,
10665a1d7843SJan Schmidt 								   victim_ref);
10675a1d7843SJan Schmidt 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
10683650860bSJosef Bacik 			if (!victim_name)
10693650860bSJosef Bacik 				return -ENOMEM;
10705a1d7843SJan Schmidt 
10715a1d7843SJan Schmidt 			read_extent_buffer(leaf, victim_name,
10725a1d7843SJan Schmidt 					   (unsigned long)(victim_ref + 1),
10735a1d7843SJan Schmidt 					   victim_name_len);
10745a1d7843SJan Schmidt 
1075d3316c82SNikolay Borisov 			ret = backref_in_log(log_root, &search_key,
1076d3316c82SNikolay Borisov 					     parent_objectid, victim_name,
1077d3316c82SNikolay Borisov 					     victim_name_len);
1078d3316c82SNikolay Borisov 			if (ret < 0) {
1079d3316c82SNikolay Borisov 				kfree(victim_name);
1080d3316c82SNikolay Borisov 				return ret;
1081d3316c82SNikolay Borisov 			} else if (!ret) {
108294c91a1fSNikolay Borisov 				inc_nlink(&inode->vfs_inode);
10835a1d7843SJan Schmidt 				btrfs_release_path(path);
10845a1d7843SJan Schmidt 
108594c91a1fSNikolay Borisov 				ret = btrfs_unlink_inode(trans, root, dir, inode,
10864ec5934eSNikolay Borisov 						victim_name, victim_name_len);
1087f186373fSMark Fasheh 				kfree(victim_name);
10883650860bSJosef Bacik 				if (ret)
10893650860bSJosef Bacik 					return ret;
1090e5c304e6SNikolay Borisov 				ret = btrfs_run_delayed_items(trans);
1091ada9af21SFilipe David Borba Manana 				if (ret)
1092ada9af21SFilipe David Borba Manana 					return ret;
1093f186373fSMark Fasheh 				*search_done = 1;
1094f186373fSMark Fasheh 				goto again;
10955a1d7843SJan Schmidt 			}
10965a1d7843SJan Schmidt 			kfree(victim_name);
1097f186373fSMark Fasheh 
10985a1d7843SJan Schmidt 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
10995a1d7843SJan Schmidt 		}
11005a1d7843SJan Schmidt 
11015a1d7843SJan Schmidt 		/*
11025a1d7843SJan Schmidt 		 * NOTE: we have searched root tree and checked the
1103bb7ab3b9SAdam Buchbinder 		 * corresponding ref, it does not need to check again.
11045a1d7843SJan Schmidt 		 */
11055a1d7843SJan Schmidt 		*search_done = 1;
11065a1d7843SJan Schmidt 	}
11075a1d7843SJan Schmidt 	btrfs_release_path(path);
11085a1d7843SJan Schmidt 
1109f186373fSMark Fasheh 	/* Same search but for extended refs */
1110f186373fSMark Fasheh 	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1111f186373fSMark Fasheh 					   inode_objectid, parent_objectid, 0,
1112f186373fSMark Fasheh 					   0);
1113f186373fSMark Fasheh 	if (!IS_ERR_OR_NULL(extref)) {
1114f186373fSMark Fasheh 		u32 item_size;
1115f186373fSMark Fasheh 		u32 cur_offset = 0;
1116f186373fSMark Fasheh 		unsigned long base;
1117f186373fSMark Fasheh 		struct inode *victim_parent;
1118f186373fSMark Fasheh 
1119f186373fSMark Fasheh 		leaf = path->nodes[0];
1120f186373fSMark Fasheh 
1121f186373fSMark Fasheh 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1122f186373fSMark Fasheh 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1123f186373fSMark Fasheh 
1124f186373fSMark Fasheh 		while (cur_offset < item_size) {
1125dd9ef135SQuentin Casasnovas 			extref = (struct btrfs_inode_extref *)(base + cur_offset);
1126f186373fSMark Fasheh 
1127f186373fSMark Fasheh 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1128f186373fSMark Fasheh 
1129f186373fSMark Fasheh 			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1130f186373fSMark Fasheh 				goto next;
1131f186373fSMark Fasheh 
1132f186373fSMark Fasheh 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
11333650860bSJosef Bacik 			if (!victim_name)
11343650860bSJosef Bacik 				return -ENOMEM;
1135f186373fSMark Fasheh 			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1136f186373fSMark Fasheh 					   victim_name_len);
1137f186373fSMark Fasheh 
1138f186373fSMark Fasheh 			search_key.objectid = inode_objectid;
1139f186373fSMark Fasheh 			search_key.type = BTRFS_INODE_EXTREF_KEY;
1140f186373fSMark Fasheh 			search_key.offset = btrfs_extref_hash(parent_objectid,
1141f186373fSMark Fasheh 							      victim_name,
1142f186373fSMark Fasheh 							      victim_name_len);
1143d3316c82SNikolay Borisov 			ret = backref_in_log(log_root, &search_key,
1144f186373fSMark Fasheh 					     parent_objectid, victim_name,
1145d3316c82SNikolay Borisov 					     victim_name_len);
1146d3316c82SNikolay Borisov 			if (ret < 0) {
1147d3316c82SNikolay Borisov 				return ret;
1148d3316c82SNikolay Borisov 			} else if (!ret) {
1149f186373fSMark Fasheh 				ret = -ENOENT;
1150f186373fSMark Fasheh 				victim_parent = read_one_inode(root,
1151f186373fSMark Fasheh 						parent_objectid);
1152f186373fSMark Fasheh 				if (victim_parent) {
115394c91a1fSNikolay Borisov 					inc_nlink(&inode->vfs_inode);
1154f186373fSMark Fasheh 					btrfs_release_path(path);
1155f186373fSMark Fasheh 
1156f186373fSMark Fasheh 					ret = btrfs_unlink_inode(trans, root,
11574ec5934eSNikolay Borisov 							BTRFS_I(victim_parent),
115894c91a1fSNikolay Borisov 							inode,
1159f186373fSMark Fasheh 							victim_name,
1160f186373fSMark Fasheh 							victim_name_len);
1161ada9af21SFilipe David Borba Manana 					if (!ret)
1162ada9af21SFilipe David Borba Manana 						ret = btrfs_run_delayed_items(
1163e5c304e6SNikolay Borisov 								  trans);
1164f186373fSMark Fasheh 				}
1165f186373fSMark Fasheh 				iput(victim_parent);
1166f186373fSMark Fasheh 				kfree(victim_name);
11673650860bSJosef Bacik 				if (ret)
11683650860bSJosef Bacik 					return ret;
1169f186373fSMark Fasheh 				*search_done = 1;
1170f186373fSMark Fasheh 				goto again;
1171f186373fSMark Fasheh 			}
1172f186373fSMark Fasheh 			kfree(victim_name);
1173f186373fSMark Fasheh next:
1174f186373fSMark Fasheh 			cur_offset += victim_name_len + sizeof(*extref);
1175f186373fSMark Fasheh 		}
1176f186373fSMark Fasheh 		*search_done = 1;
1177f186373fSMark Fasheh 	}
1178f186373fSMark Fasheh 	btrfs_release_path(path);
1179f186373fSMark Fasheh 
11805a1d7843SJan Schmidt 	/* look for a conflicting sequence number */
118194c91a1fSNikolay Borisov 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1182f186373fSMark Fasheh 					 ref_index, name, namelen, 0);
11835a1d7843SJan Schmidt 	if (di && !IS_ERR(di)) {
118494c91a1fSNikolay Borisov 		ret = drop_one_dir_item(trans, root, path, dir, di);
11853650860bSJosef Bacik 		if (ret)
11863650860bSJosef Bacik 			return ret;
11875a1d7843SJan Schmidt 	}
11885a1d7843SJan Schmidt 	btrfs_release_path(path);
11895a1d7843SJan Schmidt 
119052042d8eSAndrea Gelmini 	/* look for a conflicting name */
119194c91a1fSNikolay Borisov 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
11925a1d7843SJan Schmidt 				   name, namelen, 0);
11935a1d7843SJan Schmidt 	if (di && !IS_ERR(di)) {
119494c91a1fSNikolay Borisov 		ret = drop_one_dir_item(trans, root, path, dir, di);
11953650860bSJosef Bacik 		if (ret)
11963650860bSJosef Bacik 			return ret;
11975a1d7843SJan Schmidt 	}
11985a1d7843SJan Schmidt 	btrfs_release_path(path);
11995a1d7843SJan Schmidt 
12005a1d7843SJan Schmidt 	return 0;
12015a1d7843SJan Schmidt }
1202e02119d5SChris Mason 
1203bae15d95SQu Wenruo static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1204bae15d95SQu Wenruo 			     u32 *namelen, char **name, u64 *index,
1205bae15d95SQu Wenruo 			     u64 *parent_objectid)
1206f186373fSMark Fasheh {
1207f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1208f186373fSMark Fasheh 
1209f186373fSMark Fasheh 	extref = (struct btrfs_inode_extref *)ref_ptr;
1210f186373fSMark Fasheh 
1211f186373fSMark Fasheh 	*namelen = btrfs_inode_extref_name_len(eb, extref);
1212f186373fSMark Fasheh 	*name = kmalloc(*namelen, GFP_NOFS);
1213f186373fSMark Fasheh 	if (*name == NULL)
1214f186373fSMark Fasheh 		return -ENOMEM;
1215f186373fSMark Fasheh 
1216f186373fSMark Fasheh 	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1217f186373fSMark Fasheh 			   *namelen);
1218f186373fSMark Fasheh 
12191f250e92SFilipe Manana 	if (index)
1220f186373fSMark Fasheh 		*index = btrfs_inode_extref_index(eb, extref);
1221f186373fSMark Fasheh 	if (parent_objectid)
1222f186373fSMark Fasheh 		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1223f186373fSMark Fasheh 
1224f186373fSMark Fasheh 	return 0;
1225f186373fSMark Fasheh }
1226f186373fSMark Fasheh 
1227bae15d95SQu Wenruo static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1228bae15d95SQu Wenruo 			  u32 *namelen, char **name, u64 *index)
1229f186373fSMark Fasheh {
1230f186373fSMark Fasheh 	struct btrfs_inode_ref *ref;
1231f186373fSMark Fasheh 
1232f186373fSMark Fasheh 	ref = (struct btrfs_inode_ref *)ref_ptr;
1233f186373fSMark Fasheh 
1234f186373fSMark Fasheh 	*namelen = btrfs_inode_ref_name_len(eb, ref);
1235f186373fSMark Fasheh 	*name = kmalloc(*namelen, GFP_NOFS);
1236f186373fSMark Fasheh 	if (*name == NULL)
1237f186373fSMark Fasheh 		return -ENOMEM;
1238f186373fSMark Fasheh 
1239f186373fSMark Fasheh 	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1240f186373fSMark Fasheh 
12411f250e92SFilipe Manana 	if (index)
1242f186373fSMark Fasheh 		*index = btrfs_inode_ref_index(eb, ref);
1243f186373fSMark Fasheh 
1244f186373fSMark Fasheh 	return 0;
1245f186373fSMark Fasheh }
1246f186373fSMark Fasheh 
1247e02119d5SChris Mason /*
12481f250e92SFilipe Manana  * Take an inode reference item from the log tree and iterate all names from the
12491f250e92SFilipe Manana  * inode reference item in the subvolume tree with the same key (if it exists).
12501f250e92SFilipe Manana  * For any name that is not in the inode reference item from the log tree, do a
12511f250e92SFilipe Manana  * proper unlink of that name (that is, remove its entry from the inode
12521f250e92SFilipe Manana  * reference item and both dir index keys).
12531f250e92SFilipe Manana  */
12541f250e92SFilipe Manana static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
12551f250e92SFilipe Manana 				 struct btrfs_root *root,
12561f250e92SFilipe Manana 				 struct btrfs_path *path,
12571f250e92SFilipe Manana 				 struct btrfs_inode *inode,
12581f250e92SFilipe Manana 				 struct extent_buffer *log_eb,
12591f250e92SFilipe Manana 				 int log_slot,
12601f250e92SFilipe Manana 				 struct btrfs_key *key)
12611f250e92SFilipe Manana {
12621f250e92SFilipe Manana 	int ret;
12631f250e92SFilipe Manana 	unsigned long ref_ptr;
12641f250e92SFilipe Manana 	unsigned long ref_end;
12651f250e92SFilipe Manana 	struct extent_buffer *eb;
12661f250e92SFilipe Manana 
12671f250e92SFilipe Manana again:
12681f250e92SFilipe Manana 	btrfs_release_path(path);
12691f250e92SFilipe Manana 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
12701f250e92SFilipe Manana 	if (ret > 0) {
12711f250e92SFilipe Manana 		ret = 0;
12721f250e92SFilipe Manana 		goto out;
12731f250e92SFilipe Manana 	}
12741f250e92SFilipe Manana 	if (ret < 0)
12751f250e92SFilipe Manana 		goto out;
12761f250e92SFilipe Manana 
12771f250e92SFilipe Manana 	eb = path->nodes[0];
12781f250e92SFilipe Manana 	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
12791f250e92SFilipe Manana 	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
12801f250e92SFilipe Manana 	while (ref_ptr < ref_end) {
12811f250e92SFilipe Manana 		char *name = NULL;
12821f250e92SFilipe Manana 		int namelen;
12831f250e92SFilipe Manana 		u64 parent_id;
12841f250e92SFilipe Manana 
12851f250e92SFilipe Manana 		if (key->type == BTRFS_INODE_EXTREF_KEY) {
12861f250e92SFilipe Manana 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
12871f250e92SFilipe Manana 						NULL, &parent_id);
12881f250e92SFilipe Manana 		} else {
12891f250e92SFilipe Manana 			parent_id = key->offset;
12901f250e92SFilipe Manana 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
12911f250e92SFilipe Manana 					     NULL);
12921f250e92SFilipe Manana 		}
12931f250e92SFilipe Manana 		if (ret)
12941f250e92SFilipe Manana 			goto out;
12951f250e92SFilipe Manana 
12961f250e92SFilipe Manana 		if (key->type == BTRFS_INODE_EXTREF_KEY)
12976ff49c6aSNikolay Borisov 			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
12981f250e92SFilipe Manana 							       parent_id, name,
12996ff49c6aSNikolay Borisov 							       namelen);
13001f250e92SFilipe Manana 		else
13019bb8407fSNikolay Borisov 			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
13029bb8407fSNikolay Borisov 							   name, namelen);
13031f250e92SFilipe Manana 
13041f250e92SFilipe Manana 		if (!ret) {
13051f250e92SFilipe Manana 			struct inode *dir;
13061f250e92SFilipe Manana 
13071f250e92SFilipe Manana 			btrfs_release_path(path);
13081f250e92SFilipe Manana 			dir = read_one_inode(root, parent_id);
13091f250e92SFilipe Manana 			if (!dir) {
13101f250e92SFilipe Manana 				ret = -ENOENT;
13111f250e92SFilipe Manana 				kfree(name);
13121f250e92SFilipe Manana 				goto out;
13131f250e92SFilipe Manana 			}
13141f250e92SFilipe Manana 			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
13151f250e92SFilipe Manana 						 inode, name, namelen);
13161f250e92SFilipe Manana 			kfree(name);
13171f250e92SFilipe Manana 			iput(dir);
13181f250e92SFilipe Manana 			if (ret)
13191f250e92SFilipe Manana 				goto out;
13201f250e92SFilipe Manana 			goto again;
13211f250e92SFilipe Manana 		}
13221f250e92SFilipe Manana 
13231f250e92SFilipe Manana 		kfree(name);
13241f250e92SFilipe Manana 		ref_ptr += namelen;
13251f250e92SFilipe Manana 		if (key->type == BTRFS_INODE_EXTREF_KEY)
13261f250e92SFilipe Manana 			ref_ptr += sizeof(struct btrfs_inode_extref);
13271f250e92SFilipe Manana 		else
13281f250e92SFilipe Manana 			ref_ptr += sizeof(struct btrfs_inode_ref);
13291f250e92SFilipe Manana 	}
13301f250e92SFilipe Manana 	ret = 0;
13311f250e92SFilipe Manana  out:
13321f250e92SFilipe Manana 	btrfs_release_path(path);
13331f250e92SFilipe Manana 	return ret;
13341f250e92SFilipe Manana }
13351f250e92SFilipe Manana 
13360d836392SFilipe Manana static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
13370d836392SFilipe Manana 				  const u8 ref_type, const char *name,
13380d836392SFilipe Manana 				  const int namelen)
13390d836392SFilipe Manana {
13400d836392SFilipe Manana 	struct btrfs_key key;
13410d836392SFilipe Manana 	struct btrfs_path *path;
13420d836392SFilipe Manana 	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
13430d836392SFilipe Manana 	int ret;
13440d836392SFilipe Manana 
13450d836392SFilipe Manana 	path = btrfs_alloc_path();
13460d836392SFilipe Manana 	if (!path)
13470d836392SFilipe Manana 		return -ENOMEM;
13480d836392SFilipe Manana 
13490d836392SFilipe Manana 	key.objectid = btrfs_ino(BTRFS_I(inode));
13500d836392SFilipe Manana 	key.type = ref_type;
13510d836392SFilipe Manana 	if (key.type == BTRFS_INODE_REF_KEY)
13520d836392SFilipe Manana 		key.offset = parent_id;
13530d836392SFilipe Manana 	else
13540d836392SFilipe Manana 		key.offset = btrfs_extref_hash(parent_id, name, namelen);
13550d836392SFilipe Manana 
13560d836392SFilipe Manana 	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
13570d836392SFilipe Manana 	if (ret < 0)
13580d836392SFilipe Manana 		goto out;
13590d836392SFilipe Manana 	if (ret > 0) {
13600d836392SFilipe Manana 		ret = 0;
13610d836392SFilipe Manana 		goto out;
13620d836392SFilipe Manana 	}
13630d836392SFilipe Manana 	if (key.type == BTRFS_INODE_EXTREF_KEY)
13646ff49c6aSNikolay Borisov 		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
13656ff49c6aSNikolay Borisov 				path->slots[0], parent_id, name, namelen);
13660d836392SFilipe Manana 	else
13679bb8407fSNikolay Borisov 		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
13689bb8407fSNikolay Borisov 						   name, namelen);
13690d836392SFilipe Manana 
13700d836392SFilipe Manana out:
13710d836392SFilipe Manana 	btrfs_free_path(path);
13720d836392SFilipe Manana 	return ret;
13730d836392SFilipe Manana }
13740d836392SFilipe Manana 
13756b5fc433SFilipe Manana static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
13766b5fc433SFilipe Manana 		    struct inode *dir, struct inode *inode, const char *name,
13776b5fc433SFilipe Manana 		    int namelen, u64 ref_index)
13786b5fc433SFilipe Manana {
13796b5fc433SFilipe Manana 	struct btrfs_dir_item *dir_item;
13806b5fc433SFilipe Manana 	struct btrfs_key key;
13816b5fc433SFilipe Manana 	struct btrfs_path *path;
13826b5fc433SFilipe Manana 	struct inode *other_inode = NULL;
13836b5fc433SFilipe Manana 	int ret;
13846b5fc433SFilipe Manana 
13856b5fc433SFilipe Manana 	path = btrfs_alloc_path();
13866b5fc433SFilipe Manana 	if (!path)
13876b5fc433SFilipe Manana 		return -ENOMEM;
13886b5fc433SFilipe Manana 
13896b5fc433SFilipe Manana 	dir_item = btrfs_lookup_dir_item(NULL, root, path,
13906b5fc433SFilipe Manana 					 btrfs_ino(BTRFS_I(dir)),
13916b5fc433SFilipe Manana 					 name, namelen, 0);
13926b5fc433SFilipe Manana 	if (!dir_item) {
13936b5fc433SFilipe Manana 		btrfs_release_path(path);
13946b5fc433SFilipe Manana 		goto add_link;
13956b5fc433SFilipe Manana 	} else if (IS_ERR(dir_item)) {
13966b5fc433SFilipe Manana 		ret = PTR_ERR(dir_item);
13976b5fc433SFilipe Manana 		goto out;
13986b5fc433SFilipe Manana 	}
13996b5fc433SFilipe Manana 
14006b5fc433SFilipe Manana 	/*
14016b5fc433SFilipe Manana 	 * Our inode's dentry collides with the dentry of another inode which is
14026b5fc433SFilipe Manana 	 * in the log but not yet processed since it has a higher inode number.
14036b5fc433SFilipe Manana 	 * So delete that other dentry.
14046b5fc433SFilipe Manana 	 */
14056b5fc433SFilipe Manana 	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
14066b5fc433SFilipe Manana 	btrfs_release_path(path);
14076b5fc433SFilipe Manana 	other_inode = read_one_inode(root, key.objectid);
14086b5fc433SFilipe Manana 	if (!other_inode) {
14096b5fc433SFilipe Manana 		ret = -ENOENT;
14106b5fc433SFilipe Manana 		goto out;
14116b5fc433SFilipe Manana 	}
14126b5fc433SFilipe Manana 	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
14136b5fc433SFilipe Manana 				 name, namelen);
14146b5fc433SFilipe Manana 	if (ret)
14156b5fc433SFilipe Manana 		goto out;
14166b5fc433SFilipe Manana 	/*
14176b5fc433SFilipe Manana 	 * If we dropped the link count to 0, bump it so that later the iput()
14186b5fc433SFilipe Manana 	 * on the inode will not free it. We will fixup the link count later.
14196b5fc433SFilipe Manana 	 */
14206b5fc433SFilipe Manana 	if (other_inode->i_nlink == 0)
14216b5fc433SFilipe Manana 		inc_nlink(other_inode);
14226b5fc433SFilipe Manana 
14236b5fc433SFilipe Manana 	ret = btrfs_run_delayed_items(trans);
14246b5fc433SFilipe Manana 	if (ret)
14256b5fc433SFilipe Manana 		goto out;
14266b5fc433SFilipe Manana add_link:
14276b5fc433SFilipe Manana 	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
14286b5fc433SFilipe Manana 			     name, namelen, 0, ref_index);
14296b5fc433SFilipe Manana out:
14306b5fc433SFilipe Manana 	iput(other_inode);
14316b5fc433SFilipe Manana 	btrfs_free_path(path);
14326b5fc433SFilipe Manana 
14336b5fc433SFilipe Manana 	return ret;
14346b5fc433SFilipe Manana }
14356b5fc433SFilipe Manana 
14361f250e92SFilipe Manana /*
1437e02119d5SChris Mason  * replay one inode back reference item found in the log tree.
1438e02119d5SChris Mason  * eb, slot and key refer to the buffer and key found in the log tree.
1439e02119d5SChris Mason  * root is the destination we are replaying into, and path is for temp
1440e02119d5SChris Mason  * use by this function.  (it should be released on return).
1441e02119d5SChris Mason  */
1442e02119d5SChris Mason static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1443e02119d5SChris Mason 				  struct btrfs_root *root,
1444e02119d5SChris Mason 				  struct btrfs_root *log,
1445e02119d5SChris Mason 				  struct btrfs_path *path,
1446e02119d5SChris Mason 				  struct extent_buffer *eb, int slot,
1447e02119d5SChris Mason 				  struct btrfs_key *key)
1448e02119d5SChris Mason {
144903b2f08bSGeyslan G. Bem 	struct inode *dir = NULL;
145003b2f08bSGeyslan G. Bem 	struct inode *inode = NULL;
1451e02119d5SChris Mason 	unsigned long ref_ptr;
1452e02119d5SChris Mason 	unsigned long ref_end;
145303b2f08bSGeyslan G. Bem 	char *name = NULL;
145434f3e4f2Sliubo 	int namelen;
145534f3e4f2Sliubo 	int ret;
1456c622ae60Sliubo 	int search_done = 0;
1457f186373fSMark Fasheh 	int log_ref_ver = 0;
1458f186373fSMark Fasheh 	u64 parent_objectid;
1459f186373fSMark Fasheh 	u64 inode_objectid;
1460f46dbe3dSChris Mason 	u64 ref_index = 0;
1461f186373fSMark Fasheh 	int ref_struct_size;
1462f186373fSMark Fasheh 
1463f186373fSMark Fasheh 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1464f186373fSMark Fasheh 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1465f186373fSMark Fasheh 
1466f186373fSMark Fasheh 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1467f186373fSMark Fasheh 		struct btrfs_inode_extref *r;
1468f186373fSMark Fasheh 
1469f186373fSMark Fasheh 		ref_struct_size = sizeof(struct btrfs_inode_extref);
1470f186373fSMark Fasheh 		log_ref_ver = 1;
1471f186373fSMark Fasheh 		r = (struct btrfs_inode_extref *)ref_ptr;
1472f186373fSMark Fasheh 		parent_objectid = btrfs_inode_extref_parent(eb, r);
1473f186373fSMark Fasheh 	} else {
1474f186373fSMark Fasheh 		ref_struct_size = sizeof(struct btrfs_inode_ref);
1475f186373fSMark Fasheh 		parent_objectid = key->offset;
1476f186373fSMark Fasheh 	}
1477f186373fSMark Fasheh 	inode_objectid = key->objectid;
1478e02119d5SChris Mason 
1479e02119d5SChris Mason 	/*
1480e02119d5SChris Mason 	 * it is possible that we didn't log all the parent directories
1481e02119d5SChris Mason 	 * for a given inode.  If we don't find the dir, just don't
1482e02119d5SChris Mason 	 * copy the back ref in.  The link count fixup code will take
1483e02119d5SChris Mason 	 * care of the rest
1484e02119d5SChris Mason 	 */
1485f186373fSMark Fasheh 	dir = read_one_inode(root, parent_objectid);
148603b2f08bSGeyslan G. Bem 	if (!dir) {
148703b2f08bSGeyslan G. Bem 		ret = -ENOENT;
148803b2f08bSGeyslan G. Bem 		goto out;
148903b2f08bSGeyslan G. Bem 	}
1490e02119d5SChris Mason 
1491f186373fSMark Fasheh 	inode = read_one_inode(root, inode_objectid);
1492c00e9493STsutomu Itoh 	if (!inode) {
149303b2f08bSGeyslan G. Bem 		ret = -EIO;
149403b2f08bSGeyslan G. Bem 		goto out;
1495c00e9493STsutomu Itoh 	}
1496e02119d5SChris Mason 
14975a1d7843SJan Schmidt 	while (ref_ptr < ref_end) {
1498f186373fSMark Fasheh 		if (log_ref_ver) {
1499bae15d95SQu Wenruo 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1500bae15d95SQu Wenruo 						&ref_index, &parent_objectid);
1501f186373fSMark Fasheh 			/*
1502f186373fSMark Fasheh 			 * parent object can change from one array
1503f186373fSMark Fasheh 			 * item to another.
1504f186373fSMark Fasheh 			 */
1505f186373fSMark Fasheh 			if (!dir)
1506f186373fSMark Fasheh 				dir = read_one_inode(root, parent_objectid);
150703b2f08bSGeyslan G. Bem 			if (!dir) {
150803b2f08bSGeyslan G. Bem 				ret = -ENOENT;
150903b2f08bSGeyslan G. Bem 				goto out;
151003b2f08bSGeyslan G. Bem 			}
1511f186373fSMark Fasheh 		} else {
1512bae15d95SQu Wenruo 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1513bae15d95SQu Wenruo 					     &ref_index);
1514f186373fSMark Fasheh 		}
1515f186373fSMark Fasheh 		if (ret)
151603b2f08bSGeyslan G. Bem 			goto out;
1517e02119d5SChris Mason 
1518e02119d5SChris Mason 		/* if we already have a perfect match, we're done */
1519f85b7379SDavid Sterba 		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1520f85b7379SDavid Sterba 					btrfs_ino(BTRFS_I(inode)), ref_index,
1521f85b7379SDavid Sterba 					name, namelen)) {
15225a1d7843SJan Schmidt 			/*
15235a1d7843SJan Schmidt 			 * look for a conflicting back reference in the
15245a1d7843SJan Schmidt 			 * metadata. if we find one we have to unlink that name
15255a1d7843SJan Schmidt 			 * of the file before we add our new link.  Later on, we
15265a1d7843SJan Schmidt 			 * overwrite any existing back reference, and we don't
15275a1d7843SJan Schmidt 			 * want to create dangling pointers in the directory.
15285a1d7843SJan Schmidt 			 */
15295a1d7843SJan Schmidt 
15305a1d7843SJan Schmidt 			if (!search_done) {
15315a1d7843SJan Schmidt 				ret = __add_inode_ref(trans, root, path, log,
153294c91a1fSNikolay Borisov 						      BTRFS_I(dir),
1533d75eefdfSDavid Sterba 						      BTRFS_I(inode),
1534f186373fSMark Fasheh 						      inode_objectid,
1535f186373fSMark Fasheh 						      parent_objectid,
1536f186373fSMark Fasheh 						      ref_index, name, namelen,
15375a1d7843SJan Schmidt 						      &search_done);
153803b2f08bSGeyslan G. Bem 				if (ret) {
153903b2f08bSGeyslan G. Bem 					if (ret == 1)
15403650860bSJosef Bacik 						ret = 0;
1541e02119d5SChris Mason 					goto out;
15423650860bSJosef Bacik 				}
154334f3e4f2Sliubo 			}
154434f3e4f2Sliubo 
15450d836392SFilipe Manana 			/*
15460d836392SFilipe Manana 			 * If a reference item already exists for this inode
15470d836392SFilipe Manana 			 * with the same parent and name, but different index,
15480d836392SFilipe Manana 			 * drop it and the corresponding directory index entries
15490d836392SFilipe Manana 			 * from the parent before adding the new reference item
15500d836392SFilipe Manana 			 * and dir index entries, otherwise we would fail with
15510d836392SFilipe Manana 			 * -EEXIST returned from btrfs_add_link() below.
15520d836392SFilipe Manana 			 */
15530d836392SFilipe Manana 			ret = btrfs_inode_ref_exists(inode, dir, key->type,
15540d836392SFilipe Manana 						     name, namelen);
15550d836392SFilipe Manana 			if (ret > 0) {
15560d836392SFilipe Manana 				ret = btrfs_unlink_inode(trans, root,
15570d836392SFilipe Manana 							 BTRFS_I(dir),
15580d836392SFilipe Manana 							 BTRFS_I(inode),
15590d836392SFilipe Manana 							 name, namelen);
15600d836392SFilipe Manana 				/*
15610d836392SFilipe Manana 				 * If we dropped the link count to 0, bump it so
15620d836392SFilipe Manana 				 * that later the iput() on the inode will not
15630d836392SFilipe Manana 				 * free it. We will fixup the link count later.
15640d836392SFilipe Manana 				 */
15650d836392SFilipe Manana 				if (!ret && inode->i_nlink == 0)
15660d836392SFilipe Manana 					inc_nlink(inode);
15670d836392SFilipe Manana 			}
15680d836392SFilipe Manana 			if (ret < 0)
15690d836392SFilipe Manana 				goto out;
15700d836392SFilipe Manana 
1571e02119d5SChris Mason 			/* insert our name */
15726b5fc433SFilipe Manana 			ret = add_link(trans, root, dir, inode, name, namelen,
15736b5fc433SFilipe Manana 				       ref_index);
15743650860bSJosef Bacik 			if (ret)
15753650860bSJosef Bacik 				goto out;
1576e02119d5SChris Mason 
1577*f96d4474SJosef Bacik 			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1578*f96d4474SJosef Bacik 			if (ret)
1579*f96d4474SJosef Bacik 				goto out;
15805a1d7843SJan Schmidt 		}
1581e02119d5SChris Mason 
1582f186373fSMark Fasheh 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1583e02119d5SChris Mason 		kfree(name);
158403b2f08bSGeyslan G. Bem 		name = NULL;
1585f186373fSMark Fasheh 		if (log_ref_ver) {
1586f186373fSMark Fasheh 			iput(dir);
1587f186373fSMark Fasheh 			dir = NULL;
1588f186373fSMark Fasheh 		}
15895a1d7843SJan Schmidt 	}
1590e02119d5SChris Mason 
15911f250e92SFilipe Manana 	/*
15921f250e92SFilipe Manana 	 * Before we overwrite the inode reference item in the subvolume tree
15931f250e92SFilipe Manana 	 * with the item from the log tree, we must unlink all names from the
15941f250e92SFilipe Manana 	 * parent directory that are in the subvolume's tree inode reference
15951f250e92SFilipe Manana 	 * item, otherwise we end up with an inconsistent subvolume tree where
15961f250e92SFilipe Manana 	 * dir index entries exist for a name but there is no inode reference
15971f250e92SFilipe Manana 	 * item with the same name.
15981f250e92SFilipe Manana 	 */
15991f250e92SFilipe Manana 	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
16001f250e92SFilipe Manana 				    key);
16011f250e92SFilipe Manana 	if (ret)
16021f250e92SFilipe Manana 		goto out;
16031f250e92SFilipe Manana 
1604e02119d5SChris Mason 	/* finally write the back reference in the inode */
1605e02119d5SChris Mason 	ret = overwrite_item(trans, root, path, eb, slot, key);
16065a1d7843SJan Schmidt out:
1607b3b4aa74SDavid Sterba 	btrfs_release_path(path);
160803b2f08bSGeyslan G. Bem 	kfree(name);
1609e02119d5SChris Mason 	iput(dir);
1610e02119d5SChris Mason 	iput(inode);
16113650860bSJosef Bacik 	return ret;
1612e02119d5SChris Mason }
1613e02119d5SChris Mason 
1614f186373fSMark Fasheh static int count_inode_extrefs(struct btrfs_root *root,
161536283658SNikolay Borisov 		struct btrfs_inode *inode, struct btrfs_path *path)
1616e02119d5SChris Mason {
1617f186373fSMark Fasheh 	int ret = 0;
1618f186373fSMark Fasheh 	int name_len;
1619f186373fSMark Fasheh 	unsigned int nlink = 0;
1620f186373fSMark Fasheh 	u32 item_size;
1621f186373fSMark Fasheh 	u32 cur_offset = 0;
162236283658SNikolay Borisov 	u64 inode_objectid = btrfs_ino(inode);
1623f186373fSMark Fasheh 	u64 offset = 0;
1624f186373fSMark Fasheh 	unsigned long ptr;
1625f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1626f186373fSMark Fasheh 	struct extent_buffer *leaf;
1627f186373fSMark Fasheh 
1628f186373fSMark Fasheh 	while (1) {
1629f186373fSMark Fasheh 		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1630f186373fSMark Fasheh 					    &extref, &offset);
1631f186373fSMark Fasheh 		if (ret)
1632f186373fSMark Fasheh 			break;
1633f186373fSMark Fasheh 
1634f186373fSMark Fasheh 		leaf = path->nodes[0];
1635f186373fSMark Fasheh 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1636f186373fSMark Fasheh 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
16372c2c452bSFilipe Manana 		cur_offset = 0;
1638f186373fSMark Fasheh 
1639f186373fSMark Fasheh 		while (cur_offset < item_size) {
1640f186373fSMark Fasheh 			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1641f186373fSMark Fasheh 			name_len = btrfs_inode_extref_name_len(leaf, extref);
1642f186373fSMark Fasheh 
1643f186373fSMark Fasheh 			nlink++;
1644f186373fSMark Fasheh 
1645f186373fSMark Fasheh 			cur_offset += name_len + sizeof(*extref);
1646f186373fSMark Fasheh 		}
1647f186373fSMark Fasheh 
1648f186373fSMark Fasheh 		offset++;
1649f186373fSMark Fasheh 		btrfs_release_path(path);
1650f186373fSMark Fasheh 	}
1651f186373fSMark Fasheh 	btrfs_release_path(path);
1652f186373fSMark Fasheh 
16532c2c452bSFilipe Manana 	if (ret < 0 && ret != -ENOENT)
1654f186373fSMark Fasheh 		return ret;
1655f186373fSMark Fasheh 	return nlink;
1656f186373fSMark Fasheh }
1657f186373fSMark Fasheh 
1658f186373fSMark Fasheh static int count_inode_refs(struct btrfs_root *root,
1659f329e319SNikolay Borisov 			struct btrfs_inode *inode, struct btrfs_path *path)
1660f186373fSMark Fasheh {
1661e02119d5SChris Mason 	int ret;
1662e02119d5SChris Mason 	struct btrfs_key key;
1663f186373fSMark Fasheh 	unsigned int nlink = 0;
1664e02119d5SChris Mason 	unsigned long ptr;
1665e02119d5SChris Mason 	unsigned long ptr_end;
1666e02119d5SChris Mason 	int name_len;
1667f329e319SNikolay Borisov 	u64 ino = btrfs_ino(inode);
1668e02119d5SChris Mason 
166933345d01SLi Zefan 	key.objectid = ino;
1670e02119d5SChris Mason 	key.type = BTRFS_INODE_REF_KEY;
1671e02119d5SChris Mason 	key.offset = (u64)-1;
1672e02119d5SChris Mason 
1673e02119d5SChris Mason 	while (1) {
1674e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1675e02119d5SChris Mason 		if (ret < 0)
1676e02119d5SChris Mason 			break;
1677e02119d5SChris Mason 		if (ret > 0) {
1678e02119d5SChris Mason 			if (path->slots[0] == 0)
1679e02119d5SChris Mason 				break;
1680e02119d5SChris Mason 			path->slots[0]--;
1681e02119d5SChris Mason 		}
1682e93ae26fSFilipe David Borba Manana process_slot:
1683e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key,
1684e02119d5SChris Mason 				      path->slots[0]);
168533345d01SLi Zefan 		if (key.objectid != ino ||
1686e02119d5SChris Mason 		    key.type != BTRFS_INODE_REF_KEY)
1687e02119d5SChris Mason 			break;
1688e02119d5SChris Mason 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1689e02119d5SChris Mason 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1690e02119d5SChris Mason 						   path->slots[0]);
1691e02119d5SChris Mason 		while (ptr < ptr_end) {
1692e02119d5SChris Mason 			struct btrfs_inode_ref *ref;
1693e02119d5SChris Mason 
1694e02119d5SChris Mason 			ref = (struct btrfs_inode_ref *)ptr;
1695e02119d5SChris Mason 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1696e02119d5SChris Mason 							    ref);
1697e02119d5SChris Mason 			ptr = (unsigned long)(ref + 1) + name_len;
1698e02119d5SChris Mason 			nlink++;
1699e02119d5SChris Mason 		}
1700e02119d5SChris Mason 
1701e02119d5SChris Mason 		if (key.offset == 0)
1702e02119d5SChris Mason 			break;
1703e93ae26fSFilipe David Borba Manana 		if (path->slots[0] > 0) {
1704e93ae26fSFilipe David Borba Manana 			path->slots[0]--;
1705e93ae26fSFilipe David Borba Manana 			goto process_slot;
1706e93ae26fSFilipe David Borba Manana 		}
1707e02119d5SChris Mason 		key.offset--;
1708b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1709e02119d5SChris Mason 	}
1710b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1711f186373fSMark Fasheh 
1712f186373fSMark Fasheh 	return nlink;
1713f186373fSMark Fasheh }
1714f186373fSMark Fasheh 
1715f186373fSMark Fasheh /*
1716f186373fSMark Fasheh  * There are a few corners where the link count of the file can't
1717f186373fSMark Fasheh  * be properly maintained during replay.  So, instead of adding
1718f186373fSMark Fasheh  * lots of complexity to the log code, we just scan the backrefs
1719f186373fSMark Fasheh  * for any file that has been through replay.
1720f186373fSMark Fasheh  *
1721f186373fSMark Fasheh  * The scan will update the link count on the inode to reflect the
1722f186373fSMark Fasheh  * number of back refs found.  If it goes down to zero, the iput
1723f186373fSMark Fasheh  * will free the inode.
1724f186373fSMark Fasheh  */
1725f186373fSMark Fasheh static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1726f186373fSMark Fasheh 					   struct btrfs_root *root,
1727f186373fSMark Fasheh 					   struct inode *inode)
1728f186373fSMark Fasheh {
1729f186373fSMark Fasheh 	struct btrfs_path *path;
1730f186373fSMark Fasheh 	int ret;
1731f186373fSMark Fasheh 	u64 nlink = 0;
17324a0cc7caSNikolay Borisov 	u64 ino = btrfs_ino(BTRFS_I(inode));
1733f186373fSMark Fasheh 
1734f186373fSMark Fasheh 	path = btrfs_alloc_path();
1735f186373fSMark Fasheh 	if (!path)
1736f186373fSMark Fasheh 		return -ENOMEM;
1737f186373fSMark Fasheh 
1738f329e319SNikolay Borisov 	ret = count_inode_refs(root, BTRFS_I(inode), path);
1739f186373fSMark Fasheh 	if (ret < 0)
1740f186373fSMark Fasheh 		goto out;
1741f186373fSMark Fasheh 
1742f186373fSMark Fasheh 	nlink = ret;
1743f186373fSMark Fasheh 
174436283658SNikolay Borisov 	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1745f186373fSMark Fasheh 	if (ret < 0)
1746f186373fSMark Fasheh 		goto out;
1747f186373fSMark Fasheh 
1748f186373fSMark Fasheh 	nlink += ret;
1749f186373fSMark Fasheh 
1750f186373fSMark Fasheh 	ret = 0;
1751f186373fSMark Fasheh 
1752e02119d5SChris Mason 	if (nlink != inode->i_nlink) {
1753bfe86848SMiklos Szeredi 		set_nlink(inode, nlink);
1754*f96d4474SJosef Bacik 		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1755*f96d4474SJosef Bacik 		if (ret)
1756*f96d4474SJosef Bacik 			goto out;
1757e02119d5SChris Mason 	}
17588d5bf1cbSChris Mason 	BTRFS_I(inode)->index_cnt = (u64)-1;
1759e02119d5SChris Mason 
1760c71bf099SYan, Zheng 	if (inode->i_nlink == 0) {
1761c71bf099SYan, Zheng 		if (S_ISDIR(inode->i_mode)) {
176212fcfd22SChris Mason 			ret = replay_dir_deletes(trans, root, NULL, path,
176333345d01SLi Zefan 						 ino, 1);
17643650860bSJosef Bacik 			if (ret)
17653650860bSJosef Bacik 				goto out;
176612fcfd22SChris Mason 		}
1767ecdcf3c2SNikolay Borisov 		ret = btrfs_insert_orphan_item(trans, root, ino);
1768ecdcf3c2SNikolay Borisov 		if (ret == -EEXIST)
1769ecdcf3c2SNikolay Borisov 			ret = 0;
1770c71bf099SYan, Zheng 	}
177112fcfd22SChris Mason 
1772f186373fSMark Fasheh out:
1773f186373fSMark Fasheh 	btrfs_free_path(path);
1774f186373fSMark Fasheh 	return ret;
1775e02119d5SChris Mason }
1776e02119d5SChris Mason 
1777e02119d5SChris Mason static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1778e02119d5SChris Mason 					    struct btrfs_root *root,
1779e02119d5SChris Mason 					    struct btrfs_path *path)
1780e02119d5SChris Mason {
1781e02119d5SChris Mason 	int ret;
1782e02119d5SChris Mason 	struct btrfs_key key;
1783e02119d5SChris Mason 	struct inode *inode;
1784e02119d5SChris Mason 
1785e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1786e02119d5SChris Mason 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1787e02119d5SChris Mason 	key.offset = (u64)-1;
1788e02119d5SChris Mason 	while (1) {
1789e02119d5SChris Mason 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1790e02119d5SChris Mason 		if (ret < 0)
1791e02119d5SChris Mason 			break;
1792e02119d5SChris Mason 
1793e02119d5SChris Mason 		if (ret == 1) {
1794011b28acSJosef Bacik 			ret = 0;
1795e02119d5SChris Mason 			if (path->slots[0] == 0)
1796e02119d5SChris Mason 				break;
1797e02119d5SChris Mason 			path->slots[0]--;
1798e02119d5SChris Mason 		}
1799e02119d5SChris Mason 
1800e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1801e02119d5SChris Mason 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1802e02119d5SChris Mason 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1803e02119d5SChris Mason 			break;
1804e02119d5SChris Mason 
1805e02119d5SChris Mason 		ret = btrfs_del_item(trans, root, path);
180665a246c5STsutomu Itoh 		if (ret)
1807011b28acSJosef Bacik 			break;
1808e02119d5SChris Mason 
1809b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1810e02119d5SChris Mason 		inode = read_one_inode(root, key.offset);
1811011b28acSJosef Bacik 		if (!inode) {
1812011b28acSJosef Bacik 			ret = -EIO;
1813011b28acSJosef Bacik 			break;
1814011b28acSJosef Bacik 		}
1815e02119d5SChris Mason 
1816e02119d5SChris Mason 		ret = fixup_inode_link_count(trans, root, inode);
1817e02119d5SChris Mason 		iput(inode);
18183650860bSJosef Bacik 		if (ret)
1819011b28acSJosef Bacik 			break;
1820e02119d5SChris Mason 
182112fcfd22SChris Mason 		/*
182212fcfd22SChris Mason 		 * fixup on a directory may create new entries,
182312fcfd22SChris Mason 		 * make sure we always look for the highset possible
182412fcfd22SChris Mason 		 * offset
182512fcfd22SChris Mason 		 */
182612fcfd22SChris Mason 		key.offset = (u64)-1;
1827e02119d5SChris Mason 	}
1828b3b4aa74SDavid Sterba 	btrfs_release_path(path);
182965a246c5STsutomu Itoh 	return ret;
1830e02119d5SChris Mason }
1831e02119d5SChris Mason 
1832e02119d5SChris Mason 
1833e02119d5SChris Mason /*
1834e02119d5SChris Mason  * record a given inode in the fixup dir so we can check its link
1835e02119d5SChris Mason  * count when replay is done.  The link count is incremented here
1836e02119d5SChris Mason  * so the inode won't go away until we check it
1837e02119d5SChris Mason  */
1838e02119d5SChris Mason static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1839e02119d5SChris Mason 				      struct btrfs_root *root,
1840e02119d5SChris Mason 				      struct btrfs_path *path,
1841e02119d5SChris Mason 				      u64 objectid)
1842e02119d5SChris Mason {
1843e02119d5SChris Mason 	struct btrfs_key key;
1844e02119d5SChris Mason 	int ret = 0;
1845e02119d5SChris Mason 	struct inode *inode;
1846e02119d5SChris Mason 
1847e02119d5SChris Mason 	inode = read_one_inode(root, objectid);
1848c00e9493STsutomu Itoh 	if (!inode)
1849c00e9493STsutomu Itoh 		return -EIO;
1850e02119d5SChris Mason 
1851e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1852962a298fSDavid Sterba 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1853e02119d5SChris Mason 	key.offset = objectid;
1854e02119d5SChris Mason 
1855e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1856e02119d5SChris Mason 
1857b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1858e02119d5SChris Mason 	if (ret == 0) {
18599bf7a489SJosef Bacik 		if (!inode->i_nlink)
18609bf7a489SJosef Bacik 			set_nlink(inode, 1);
18619bf7a489SJosef Bacik 		else
18628b558c5fSZach Brown 			inc_nlink(inode);
18639a56fcd1SNikolay Borisov 		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1864e02119d5SChris Mason 	} else if (ret == -EEXIST) {
1865e02119d5SChris Mason 		ret = 0;
1866e02119d5SChris Mason 	}
1867e02119d5SChris Mason 	iput(inode);
1868e02119d5SChris Mason 
1869e02119d5SChris Mason 	return ret;
1870e02119d5SChris Mason }
1871e02119d5SChris Mason 
1872e02119d5SChris Mason /*
1873e02119d5SChris Mason  * when replaying the log for a directory, we only insert names
1874e02119d5SChris Mason  * for inodes that actually exist.  This means an fsync on a directory
1875e02119d5SChris Mason  * does not implicitly fsync all the new files in it
1876e02119d5SChris Mason  */
1877e02119d5SChris Mason static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1878e02119d5SChris Mason 				    struct btrfs_root *root,
1879e02119d5SChris Mason 				    u64 dirid, u64 index,
188060d53eb3SZhaolei 				    char *name, int name_len,
1881e02119d5SChris Mason 				    struct btrfs_key *location)
1882e02119d5SChris Mason {
1883e02119d5SChris Mason 	struct inode *inode;
1884e02119d5SChris Mason 	struct inode *dir;
1885e02119d5SChris Mason 	int ret;
1886e02119d5SChris Mason 
1887e02119d5SChris Mason 	inode = read_one_inode(root, location->objectid);
1888e02119d5SChris Mason 	if (!inode)
1889e02119d5SChris Mason 		return -ENOENT;
1890e02119d5SChris Mason 
1891e02119d5SChris Mason 	dir = read_one_inode(root, dirid);
1892e02119d5SChris Mason 	if (!dir) {
1893e02119d5SChris Mason 		iput(inode);
1894e02119d5SChris Mason 		return -EIO;
1895e02119d5SChris Mason 	}
1896d555438bSJosef Bacik 
1897db0a669fSNikolay Borisov 	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1898db0a669fSNikolay Borisov 			name_len, 1, index);
1899e02119d5SChris Mason 
1900e02119d5SChris Mason 	/* FIXME, put inode into FIXUP list */
1901e02119d5SChris Mason 
1902e02119d5SChris Mason 	iput(inode);
1903e02119d5SChris Mason 	iput(dir);
1904e02119d5SChris Mason 	return ret;
1905e02119d5SChris Mason }
1906e02119d5SChris Mason 
1907e02119d5SChris Mason /*
1908e02119d5SChris Mason  * take a single entry in a log directory item and replay it into
1909e02119d5SChris Mason  * the subvolume.
1910e02119d5SChris Mason  *
1911e02119d5SChris Mason  * if a conflicting item exists in the subdirectory already,
1912e02119d5SChris Mason  * the inode it points to is unlinked and put into the link count
1913e02119d5SChris Mason  * fix up tree.
1914e02119d5SChris Mason  *
1915e02119d5SChris Mason  * If a name from the log points to a file or directory that does
1916e02119d5SChris Mason  * not exist in the FS, it is skipped.  fsyncs on directories
1917e02119d5SChris Mason  * do not force down inodes inside that directory, just changes to the
1918e02119d5SChris Mason  * names or unlinks in a directory.
1919bb53eda9SFilipe Manana  *
1920bb53eda9SFilipe Manana  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1921bb53eda9SFilipe Manana  * non-existing inode) and 1 if the name was replayed.
1922e02119d5SChris Mason  */
1923e02119d5SChris Mason static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1924e02119d5SChris Mason 				    struct btrfs_root *root,
1925e02119d5SChris Mason 				    struct btrfs_path *path,
1926e02119d5SChris Mason 				    struct extent_buffer *eb,
1927e02119d5SChris Mason 				    struct btrfs_dir_item *di,
1928e02119d5SChris Mason 				    struct btrfs_key *key)
1929e02119d5SChris Mason {
1930e02119d5SChris Mason 	char *name;
1931e02119d5SChris Mason 	int name_len;
1932e02119d5SChris Mason 	struct btrfs_dir_item *dst_di;
1933e02119d5SChris Mason 	struct btrfs_key found_key;
1934e02119d5SChris Mason 	struct btrfs_key log_key;
1935e02119d5SChris Mason 	struct inode *dir;
1936e02119d5SChris Mason 	u8 log_type;
19374bef0848SChris Mason 	int exists;
19383650860bSJosef Bacik 	int ret = 0;
1939d555438bSJosef Bacik 	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1940bb53eda9SFilipe Manana 	bool name_added = false;
1941e02119d5SChris Mason 
1942e02119d5SChris Mason 	dir = read_one_inode(root, key->objectid);
1943c00e9493STsutomu Itoh 	if (!dir)
1944c00e9493STsutomu Itoh 		return -EIO;
1945e02119d5SChris Mason 
1946e02119d5SChris Mason 	name_len = btrfs_dir_name_len(eb, di);
1947e02119d5SChris Mason 	name = kmalloc(name_len, GFP_NOFS);
19482bac325eSFilipe David Borba Manana 	if (!name) {
19492bac325eSFilipe David Borba Manana 		ret = -ENOMEM;
19502bac325eSFilipe David Borba Manana 		goto out;
19512bac325eSFilipe David Borba Manana 	}
19522a29edc6Sliubo 
1953e02119d5SChris Mason 	log_type = btrfs_dir_type(eb, di);
1954e02119d5SChris Mason 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1955e02119d5SChris Mason 		   name_len);
1956e02119d5SChris Mason 
1957e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
19584bef0848SChris Mason 	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
19594bef0848SChris Mason 	if (exists == 0)
19604bef0848SChris Mason 		exists = 1;
19614bef0848SChris Mason 	else
19624bef0848SChris Mason 		exists = 0;
1963b3b4aa74SDavid Sterba 	btrfs_release_path(path);
19644bef0848SChris Mason 
1965e02119d5SChris Mason 	if (key->type == BTRFS_DIR_ITEM_KEY) {
1966e02119d5SChris Mason 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1967e02119d5SChris Mason 				       name, name_len, 1);
1968d397712bSChris Mason 	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1969e02119d5SChris Mason 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1970e02119d5SChris Mason 						     key->objectid,
1971e02119d5SChris Mason 						     key->offset, name,
1972e02119d5SChris Mason 						     name_len, 1);
1973e02119d5SChris Mason 	} else {
19743650860bSJosef Bacik 		/* Corruption */
19753650860bSJosef Bacik 		ret = -EINVAL;
19763650860bSJosef Bacik 		goto out;
1977e02119d5SChris Mason 	}
1978c704005dSDavid Sterba 	if (IS_ERR_OR_NULL(dst_di)) {
1979e02119d5SChris Mason 		/* we need a sequence number to insert, so we only
1980e02119d5SChris Mason 		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1981e02119d5SChris Mason 		 */
1982e02119d5SChris Mason 		if (key->type != BTRFS_DIR_INDEX_KEY)
1983e02119d5SChris Mason 			goto out;
1984e02119d5SChris Mason 		goto insert;
1985e02119d5SChris Mason 	}
1986e02119d5SChris Mason 
1987e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1988e02119d5SChris Mason 	/* the existing item matches the logged item */
1989e02119d5SChris Mason 	if (found_key.objectid == log_key.objectid &&
1990e02119d5SChris Mason 	    found_key.type == log_key.type &&
1991e02119d5SChris Mason 	    found_key.offset == log_key.offset &&
1992e02119d5SChris Mason 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1993a2cc11dbSFilipe Manana 		update_size = false;
1994e02119d5SChris Mason 		goto out;
1995e02119d5SChris Mason 	}
1996e02119d5SChris Mason 
1997e02119d5SChris Mason 	/*
1998e02119d5SChris Mason 	 * don't drop the conflicting directory entry if the inode
1999e02119d5SChris Mason 	 * for the new entry doesn't exist
2000e02119d5SChris Mason 	 */
20014bef0848SChris Mason 	if (!exists)
2002e02119d5SChris Mason 		goto out;
2003e02119d5SChris Mason 
2004207e7d92SNikolay Borisov 	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
20053650860bSJosef Bacik 	if (ret)
20063650860bSJosef Bacik 		goto out;
2007e02119d5SChris Mason 
2008e02119d5SChris Mason 	if (key->type == BTRFS_DIR_INDEX_KEY)
2009e02119d5SChris Mason 		goto insert;
2010e02119d5SChris Mason out:
2011b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2012d555438bSJosef Bacik 	if (!ret && update_size) {
20136ef06d27SNikolay Borisov 		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
20149a56fcd1SNikolay Borisov 		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
2015d555438bSJosef Bacik 	}
2016e02119d5SChris Mason 	kfree(name);
2017e02119d5SChris Mason 	iput(dir);
2018bb53eda9SFilipe Manana 	if (!ret && name_added)
2019bb53eda9SFilipe Manana 		ret = 1;
20203650860bSJosef Bacik 	return ret;
2021e02119d5SChris Mason 
2022e02119d5SChris Mason insert:
2023725af92aSNikolay Borisov 	/*
2024725af92aSNikolay Borisov 	 * Check if the inode reference exists in the log for the given name,
2025725af92aSNikolay Borisov 	 * inode and parent inode
2026725af92aSNikolay Borisov 	 */
2027725af92aSNikolay Borisov 	found_key.objectid = log_key.objectid;
2028725af92aSNikolay Borisov 	found_key.type = BTRFS_INODE_REF_KEY;
2029725af92aSNikolay Borisov 	found_key.offset = key->objectid;
2030725af92aSNikolay Borisov 	ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
2031725af92aSNikolay Borisov 	if (ret < 0) {
2032725af92aSNikolay Borisov 	        goto out;
2033725af92aSNikolay Borisov 	} else if (ret) {
2034725af92aSNikolay Borisov 	        /* The dentry will be added later. */
2035725af92aSNikolay Borisov 	        ret = 0;
2036725af92aSNikolay Borisov 	        update_size = false;
2037725af92aSNikolay Borisov 	        goto out;
2038725af92aSNikolay Borisov 	}
2039725af92aSNikolay Borisov 
2040725af92aSNikolay Borisov 	found_key.objectid = log_key.objectid;
2041725af92aSNikolay Borisov 	found_key.type = BTRFS_INODE_EXTREF_KEY;
2042725af92aSNikolay Borisov 	found_key.offset = key->objectid;
2043725af92aSNikolay Borisov 	ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2044725af92aSNikolay Borisov 			     name_len);
2045725af92aSNikolay Borisov 	if (ret < 0) {
2046725af92aSNikolay Borisov 		goto out;
2047725af92aSNikolay Borisov 	} else if (ret) {
2048df8d116fSFilipe Manana 		/* The dentry will be added later. */
2049df8d116fSFilipe Manana 		ret = 0;
2050df8d116fSFilipe Manana 		update_size = false;
2051df8d116fSFilipe Manana 		goto out;
2052df8d116fSFilipe Manana 	}
2053b3b4aa74SDavid Sterba 	btrfs_release_path(path);
205460d53eb3SZhaolei 	ret = insert_one_name(trans, root, key->objectid, key->offset,
205560d53eb3SZhaolei 			      name, name_len, &log_key);
2056df8d116fSFilipe Manana 	if (ret && ret != -ENOENT && ret != -EEXIST)
20573650860bSJosef Bacik 		goto out;
2058bb53eda9SFilipe Manana 	if (!ret)
2059bb53eda9SFilipe Manana 		name_added = true;
2060d555438bSJosef Bacik 	update_size = false;
20613650860bSJosef Bacik 	ret = 0;
2062e02119d5SChris Mason 	goto out;
2063e02119d5SChris Mason }
2064e02119d5SChris Mason 
2065e02119d5SChris Mason /*
2066e02119d5SChris Mason  * find all the names in a directory item and reconcile them into
2067e02119d5SChris Mason  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
2068e02119d5SChris Mason  * one name in a directory item, but the same code gets used for
2069e02119d5SChris Mason  * both directory index types
2070e02119d5SChris Mason  */
2071e02119d5SChris Mason static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2072e02119d5SChris Mason 					struct btrfs_root *root,
2073e02119d5SChris Mason 					struct btrfs_path *path,
2074e02119d5SChris Mason 					struct extent_buffer *eb, int slot,
2075e02119d5SChris Mason 					struct btrfs_key *key)
2076e02119d5SChris Mason {
2077bb53eda9SFilipe Manana 	int ret = 0;
2078e02119d5SChris Mason 	u32 item_size = btrfs_item_size_nr(eb, slot);
2079e02119d5SChris Mason 	struct btrfs_dir_item *di;
2080e02119d5SChris Mason 	int name_len;
2081e02119d5SChris Mason 	unsigned long ptr;
2082e02119d5SChris Mason 	unsigned long ptr_end;
2083bb53eda9SFilipe Manana 	struct btrfs_path *fixup_path = NULL;
2084e02119d5SChris Mason 
2085e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(eb, slot);
2086e02119d5SChris Mason 	ptr_end = ptr + item_size;
2087e02119d5SChris Mason 	while (ptr < ptr_end) {
2088e02119d5SChris Mason 		di = (struct btrfs_dir_item *)ptr;
2089e02119d5SChris Mason 		name_len = btrfs_dir_name_len(eb, di);
2090e02119d5SChris Mason 		ret = replay_one_name(trans, root, path, eb, di, key);
2091bb53eda9SFilipe Manana 		if (ret < 0)
2092bb53eda9SFilipe Manana 			break;
2093e02119d5SChris Mason 		ptr = (unsigned long)(di + 1);
2094e02119d5SChris Mason 		ptr += name_len;
2095bb53eda9SFilipe Manana 
2096bb53eda9SFilipe Manana 		/*
2097bb53eda9SFilipe Manana 		 * If this entry refers to a non-directory (directories can not
2098bb53eda9SFilipe Manana 		 * have a link count > 1) and it was added in the transaction
2099bb53eda9SFilipe Manana 		 * that was not committed, make sure we fixup the link count of
2100bb53eda9SFilipe Manana 		 * the inode it the entry points to. Otherwise something like
2101bb53eda9SFilipe Manana 		 * the following would result in a directory pointing to an
2102bb53eda9SFilipe Manana 		 * inode with a wrong link that does not account for this dir
2103bb53eda9SFilipe Manana 		 * entry:
2104bb53eda9SFilipe Manana 		 *
2105bb53eda9SFilipe Manana 		 * mkdir testdir
2106bb53eda9SFilipe Manana 		 * touch testdir/foo
2107bb53eda9SFilipe Manana 		 * touch testdir/bar
2108bb53eda9SFilipe Manana 		 * sync
2109bb53eda9SFilipe Manana 		 *
2110bb53eda9SFilipe Manana 		 * ln testdir/bar testdir/bar_link
2111bb53eda9SFilipe Manana 		 * ln testdir/foo testdir/foo_link
2112bb53eda9SFilipe Manana 		 * xfs_io -c "fsync" testdir/bar
2113bb53eda9SFilipe Manana 		 *
2114bb53eda9SFilipe Manana 		 * <power failure>
2115bb53eda9SFilipe Manana 		 *
2116bb53eda9SFilipe Manana 		 * mount fs, log replay happens
2117bb53eda9SFilipe Manana 		 *
2118bb53eda9SFilipe Manana 		 * File foo would remain with a link count of 1 when it has two
2119bb53eda9SFilipe Manana 		 * entries pointing to it in the directory testdir. This would
2120bb53eda9SFilipe Manana 		 * make it impossible to ever delete the parent directory has
2121bb53eda9SFilipe Manana 		 * it would result in stale dentries that can never be deleted.
2122bb53eda9SFilipe Manana 		 */
2123bb53eda9SFilipe Manana 		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2124bb53eda9SFilipe Manana 			struct btrfs_key di_key;
2125bb53eda9SFilipe Manana 
2126bb53eda9SFilipe Manana 			if (!fixup_path) {
2127bb53eda9SFilipe Manana 				fixup_path = btrfs_alloc_path();
2128bb53eda9SFilipe Manana 				if (!fixup_path) {
2129bb53eda9SFilipe Manana 					ret = -ENOMEM;
2130bb53eda9SFilipe Manana 					break;
2131e02119d5SChris Mason 				}
2132bb53eda9SFilipe Manana 			}
2133bb53eda9SFilipe Manana 
2134bb53eda9SFilipe Manana 			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2135bb53eda9SFilipe Manana 			ret = link_to_fixup_dir(trans, root, fixup_path,
2136bb53eda9SFilipe Manana 						di_key.objectid);
2137bb53eda9SFilipe Manana 			if (ret)
2138bb53eda9SFilipe Manana 				break;
2139bb53eda9SFilipe Manana 		}
2140bb53eda9SFilipe Manana 		ret = 0;
2141bb53eda9SFilipe Manana 	}
2142bb53eda9SFilipe Manana 	btrfs_free_path(fixup_path);
2143bb53eda9SFilipe Manana 	return ret;
2144e02119d5SChris Mason }
2145e02119d5SChris Mason 
2146e02119d5SChris Mason /*
2147e02119d5SChris Mason  * directory replay has two parts.  There are the standard directory
2148e02119d5SChris Mason  * items in the log copied from the subvolume, and range items
2149e02119d5SChris Mason  * created in the log while the subvolume was logged.
2150e02119d5SChris Mason  *
2151e02119d5SChris Mason  * The range items tell us which parts of the key space the log
2152e02119d5SChris Mason  * is authoritative for.  During replay, if a key in the subvolume
2153e02119d5SChris Mason  * directory is in a logged range item, but not actually in the log
2154e02119d5SChris Mason  * that means it was deleted from the directory before the fsync
2155e02119d5SChris Mason  * and should be removed.
2156e02119d5SChris Mason  */
2157e02119d5SChris Mason static noinline int find_dir_range(struct btrfs_root *root,
2158e02119d5SChris Mason 				   struct btrfs_path *path,
2159e02119d5SChris Mason 				   u64 dirid, int key_type,
2160e02119d5SChris Mason 				   u64 *start_ret, u64 *end_ret)
2161e02119d5SChris Mason {
2162e02119d5SChris Mason 	struct btrfs_key key;
2163e02119d5SChris Mason 	u64 found_end;
2164e02119d5SChris Mason 	struct btrfs_dir_log_item *item;
2165e02119d5SChris Mason 	int ret;
2166e02119d5SChris Mason 	int nritems;
2167e02119d5SChris Mason 
2168e02119d5SChris Mason 	if (*start_ret == (u64)-1)
2169e02119d5SChris Mason 		return 1;
2170e02119d5SChris Mason 
2171e02119d5SChris Mason 	key.objectid = dirid;
2172e02119d5SChris Mason 	key.type = key_type;
2173e02119d5SChris Mason 	key.offset = *start_ret;
2174e02119d5SChris Mason 
2175e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2176e02119d5SChris Mason 	if (ret < 0)
2177e02119d5SChris Mason 		goto out;
2178e02119d5SChris Mason 	if (ret > 0) {
2179e02119d5SChris Mason 		if (path->slots[0] == 0)
2180e02119d5SChris Mason 			goto out;
2181e02119d5SChris Mason 		path->slots[0]--;
2182e02119d5SChris Mason 	}
2183e02119d5SChris Mason 	if (ret != 0)
2184e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2185e02119d5SChris Mason 
2186e02119d5SChris Mason 	if (key.type != key_type || key.objectid != dirid) {
2187e02119d5SChris Mason 		ret = 1;
2188e02119d5SChris Mason 		goto next;
2189e02119d5SChris Mason 	}
2190e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2191e02119d5SChris Mason 			      struct btrfs_dir_log_item);
2192e02119d5SChris Mason 	found_end = btrfs_dir_log_end(path->nodes[0], item);
2193e02119d5SChris Mason 
2194e02119d5SChris Mason 	if (*start_ret >= key.offset && *start_ret <= found_end) {
2195e02119d5SChris Mason 		ret = 0;
2196e02119d5SChris Mason 		*start_ret = key.offset;
2197e02119d5SChris Mason 		*end_ret = found_end;
2198e02119d5SChris Mason 		goto out;
2199e02119d5SChris Mason 	}
2200e02119d5SChris Mason 	ret = 1;
2201e02119d5SChris Mason next:
2202e02119d5SChris Mason 	/* check the next slot in the tree to see if it is a valid item */
2203e02119d5SChris Mason 	nritems = btrfs_header_nritems(path->nodes[0]);
22042a7bf53fSRobbie Ko 	path->slots[0]++;
2205e02119d5SChris Mason 	if (path->slots[0] >= nritems) {
2206e02119d5SChris Mason 		ret = btrfs_next_leaf(root, path);
2207e02119d5SChris Mason 		if (ret)
2208e02119d5SChris Mason 			goto out;
2209e02119d5SChris Mason 	}
2210e02119d5SChris Mason 
2211e02119d5SChris Mason 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2212e02119d5SChris Mason 
2213e02119d5SChris Mason 	if (key.type != key_type || key.objectid != dirid) {
2214e02119d5SChris Mason 		ret = 1;
2215e02119d5SChris Mason 		goto out;
2216e02119d5SChris Mason 	}
2217e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2218e02119d5SChris Mason 			      struct btrfs_dir_log_item);
2219e02119d5SChris Mason 	found_end = btrfs_dir_log_end(path->nodes[0], item);
2220e02119d5SChris Mason 	*start_ret = key.offset;
2221e02119d5SChris Mason 	*end_ret = found_end;
2222e02119d5SChris Mason 	ret = 0;
2223e02119d5SChris Mason out:
2224b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2225e02119d5SChris Mason 	return ret;
2226e02119d5SChris Mason }
2227e02119d5SChris Mason 
2228e02119d5SChris Mason /*
2229e02119d5SChris Mason  * this looks for a given directory item in the log.  If the directory
2230e02119d5SChris Mason  * item is not in the log, the item is removed and the inode it points
2231e02119d5SChris Mason  * to is unlinked
2232e02119d5SChris Mason  */
2233e02119d5SChris Mason static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2234e02119d5SChris Mason 				      struct btrfs_root *root,
2235e02119d5SChris Mason 				      struct btrfs_root *log,
2236e02119d5SChris Mason 				      struct btrfs_path *path,
2237e02119d5SChris Mason 				      struct btrfs_path *log_path,
2238e02119d5SChris Mason 				      struct inode *dir,
2239e02119d5SChris Mason 				      struct btrfs_key *dir_key)
2240e02119d5SChris Mason {
2241e02119d5SChris Mason 	int ret;
2242e02119d5SChris Mason 	struct extent_buffer *eb;
2243e02119d5SChris Mason 	int slot;
2244e02119d5SChris Mason 	u32 item_size;
2245e02119d5SChris Mason 	struct btrfs_dir_item *di;
2246e02119d5SChris Mason 	struct btrfs_dir_item *log_di;
2247e02119d5SChris Mason 	int name_len;
2248e02119d5SChris Mason 	unsigned long ptr;
2249e02119d5SChris Mason 	unsigned long ptr_end;
2250e02119d5SChris Mason 	char *name;
2251e02119d5SChris Mason 	struct inode *inode;
2252e02119d5SChris Mason 	struct btrfs_key location;
2253e02119d5SChris Mason 
2254e02119d5SChris Mason again:
2255e02119d5SChris Mason 	eb = path->nodes[0];
2256e02119d5SChris Mason 	slot = path->slots[0];
2257e02119d5SChris Mason 	item_size = btrfs_item_size_nr(eb, slot);
2258e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(eb, slot);
2259e02119d5SChris Mason 	ptr_end = ptr + item_size;
2260e02119d5SChris Mason 	while (ptr < ptr_end) {
2261e02119d5SChris Mason 		di = (struct btrfs_dir_item *)ptr;
2262e02119d5SChris Mason 		name_len = btrfs_dir_name_len(eb, di);
2263e02119d5SChris Mason 		name = kmalloc(name_len, GFP_NOFS);
2264e02119d5SChris Mason 		if (!name) {
2265e02119d5SChris Mason 			ret = -ENOMEM;
2266e02119d5SChris Mason 			goto out;
2267e02119d5SChris Mason 		}
2268e02119d5SChris Mason 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
2269e02119d5SChris Mason 				  name_len);
2270e02119d5SChris Mason 		log_di = NULL;
227112fcfd22SChris Mason 		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2272e02119d5SChris Mason 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
2273e02119d5SChris Mason 						       dir_key->objectid,
2274e02119d5SChris Mason 						       name, name_len, 0);
227512fcfd22SChris Mason 		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2276e02119d5SChris Mason 			log_di = btrfs_lookup_dir_index_item(trans, log,
2277e02119d5SChris Mason 						     log_path,
2278e02119d5SChris Mason 						     dir_key->objectid,
2279e02119d5SChris Mason 						     dir_key->offset,
2280e02119d5SChris Mason 						     name, name_len, 0);
2281e02119d5SChris Mason 		}
22828d9e220cSAl Viro 		if (!log_di || log_di == ERR_PTR(-ENOENT)) {
2283e02119d5SChris Mason 			btrfs_dir_item_key_to_cpu(eb, di, &location);
2284b3b4aa74SDavid Sterba 			btrfs_release_path(path);
2285b3b4aa74SDavid Sterba 			btrfs_release_path(log_path);
2286e02119d5SChris Mason 			inode = read_one_inode(root, location.objectid);
2287c00e9493STsutomu Itoh 			if (!inode) {
2288c00e9493STsutomu Itoh 				kfree(name);
2289c00e9493STsutomu Itoh 				return -EIO;
2290c00e9493STsutomu Itoh 			}
2291e02119d5SChris Mason 
2292e02119d5SChris Mason 			ret = link_to_fixup_dir(trans, root,
2293e02119d5SChris Mason 						path, location.objectid);
22943650860bSJosef Bacik 			if (ret) {
22953650860bSJosef Bacik 				kfree(name);
22963650860bSJosef Bacik 				iput(inode);
22973650860bSJosef Bacik 				goto out;
22983650860bSJosef Bacik 			}
22993650860bSJosef Bacik 
23008b558c5fSZach Brown 			inc_nlink(inode);
23014ec5934eSNikolay Borisov 			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
23024ec5934eSNikolay Borisov 					BTRFS_I(inode), name, name_len);
23033650860bSJosef Bacik 			if (!ret)
2304e5c304e6SNikolay Borisov 				ret = btrfs_run_delayed_items(trans);
2305e02119d5SChris Mason 			kfree(name);
2306e02119d5SChris Mason 			iput(inode);
23073650860bSJosef Bacik 			if (ret)
23083650860bSJosef Bacik 				goto out;
2309e02119d5SChris Mason 
2310e02119d5SChris Mason 			/* there might still be more names under this key
2311e02119d5SChris Mason 			 * check and repeat if required
2312e02119d5SChris Mason 			 */
2313e02119d5SChris Mason 			ret = btrfs_search_slot(NULL, root, dir_key, path,
2314e02119d5SChris Mason 						0, 0);
2315e02119d5SChris Mason 			if (ret == 0)
2316e02119d5SChris Mason 				goto again;
2317e02119d5SChris Mason 			ret = 0;
2318e02119d5SChris Mason 			goto out;
2319269d040fSFilipe David Borba Manana 		} else if (IS_ERR(log_di)) {
2320269d040fSFilipe David Borba Manana 			kfree(name);
2321269d040fSFilipe David Borba Manana 			return PTR_ERR(log_di);
2322e02119d5SChris Mason 		}
2323b3b4aa74SDavid Sterba 		btrfs_release_path(log_path);
2324e02119d5SChris Mason 		kfree(name);
2325e02119d5SChris Mason 
2326e02119d5SChris Mason 		ptr = (unsigned long)(di + 1);
2327e02119d5SChris Mason 		ptr += name_len;
2328e02119d5SChris Mason 	}
2329e02119d5SChris Mason 	ret = 0;
2330e02119d5SChris Mason out:
2331b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2332b3b4aa74SDavid Sterba 	btrfs_release_path(log_path);
2333e02119d5SChris Mason 	return ret;
2334e02119d5SChris Mason }
2335e02119d5SChris Mason 
23364f764e51SFilipe Manana static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
23374f764e51SFilipe Manana 			      struct btrfs_root *root,
23384f764e51SFilipe Manana 			      struct btrfs_root *log,
23394f764e51SFilipe Manana 			      struct btrfs_path *path,
23404f764e51SFilipe Manana 			      const u64 ino)
23414f764e51SFilipe Manana {
23424f764e51SFilipe Manana 	struct btrfs_key search_key;
23434f764e51SFilipe Manana 	struct btrfs_path *log_path;
23444f764e51SFilipe Manana 	int i;
23454f764e51SFilipe Manana 	int nritems;
23464f764e51SFilipe Manana 	int ret;
23474f764e51SFilipe Manana 
23484f764e51SFilipe Manana 	log_path = btrfs_alloc_path();
23494f764e51SFilipe Manana 	if (!log_path)
23504f764e51SFilipe Manana 		return -ENOMEM;
23514f764e51SFilipe Manana 
23524f764e51SFilipe Manana 	search_key.objectid = ino;
23534f764e51SFilipe Manana 	search_key.type = BTRFS_XATTR_ITEM_KEY;
23544f764e51SFilipe Manana 	search_key.offset = 0;
23554f764e51SFilipe Manana again:
23564f764e51SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
23574f764e51SFilipe Manana 	if (ret < 0)
23584f764e51SFilipe Manana 		goto out;
23594f764e51SFilipe Manana process_leaf:
23604f764e51SFilipe Manana 	nritems = btrfs_header_nritems(path->nodes[0]);
23614f764e51SFilipe Manana 	for (i = path->slots[0]; i < nritems; i++) {
23624f764e51SFilipe Manana 		struct btrfs_key key;
23634f764e51SFilipe Manana 		struct btrfs_dir_item *di;
23644f764e51SFilipe Manana 		struct btrfs_dir_item *log_di;
23654f764e51SFilipe Manana 		u32 total_size;
23664f764e51SFilipe Manana 		u32 cur;
23674f764e51SFilipe Manana 
23684f764e51SFilipe Manana 		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
23694f764e51SFilipe Manana 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
23704f764e51SFilipe Manana 			ret = 0;
23714f764e51SFilipe Manana 			goto out;
23724f764e51SFilipe Manana 		}
23734f764e51SFilipe Manana 
23744f764e51SFilipe Manana 		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
23754f764e51SFilipe Manana 		total_size = btrfs_item_size_nr(path->nodes[0], i);
23764f764e51SFilipe Manana 		cur = 0;
23774f764e51SFilipe Manana 		while (cur < total_size) {
23784f764e51SFilipe Manana 			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
23794f764e51SFilipe Manana 			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
23804f764e51SFilipe Manana 			u32 this_len = sizeof(*di) + name_len + data_len;
23814f764e51SFilipe Manana 			char *name;
23824f764e51SFilipe Manana 
23834f764e51SFilipe Manana 			name = kmalloc(name_len, GFP_NOFS);
23844f764e51SFilipe Manana 			if (!name) {
23854f764e51SFilipe Manana 				ret = -ENOMEM;
23864f764e51SFilipe Manana 				goto out;
23874f764e51SFilipe Manana 			}
23884f764e51SFilipe Manana 			read_extent_buffer(path->nodes[0], name,
23894f764e51SFilipe Manana 					   (unsigned long)(di + 1), name_len);
23904f764e51SFilipe Manana 
23914f764e51SFilipe Manana 			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
23924f764e51SFilipe Manana 						    name, name_len, 0);
23934f764e51SFilipe Manana 			btrfs_release_path(log_path);
23944f764e51SFilipe Manana 			if (!log_di) {
23954f764e51SFilipe Manana 				/* Doesn't exist in log tree, so delete it. */
23964f764e51SFilipe Manana 				btrfs_release_path(path);
23974f764e51SFilipe Manana 				di = btrfs_lookup_xattr(trans, root, path, ino,
23984f764e51SFilipe Manana 							name, name_len, -1);
23994f764e51SFilipe Manana 				kfree(name);
24004f764e51SFilipe Manana 				if (IS_ERR(di)) {
24014f764e51SFilipe Manana 					ret = PTR_ERR(di);
24024f764e51SFilipe Manana 					goto out;
24034f764e51SFilipe Manana 				}
24044f764e51SFilipe Manana 				ASSERT(di);
24054f764e51SFilipe Manana 				ret = btrfs_delete_one_dir_name(trans, root,
24064f764e51SFilipe Manana 								path, di);
24074f764e51SFilipe Manana 				if (ret)
24084f764e51SFilipe Manana 					goto out;
24094f764e51SFilipe Manana 				btrfs_release_path(path);
24104f764e51SFilipe Manana 				search_key = key;
24114f764e51SFilipe Manana 				goto again;
24124f764e51SFilipe Manana 			}
24134f764e51SFilipe Manana 			kfree(name);
24144f764e51SFilipe Manana 			if (IS_ERR(log_di)) {
24154f764e51SFilipe Manana 				ret = PTR_ERR(log_di);
24164f764e51SFilipe Manana 				goto out;
24174f764e51SFilipe Manana 			}
24184f764e51SFilipe Manana 			cur += this_len;
24194f764e51SFilipe Manana 			di = (struct btrfs_dir_item *)((char *)di + this_len);
24204f764e51SFilipe Manana 		}
24214f764e51SFilipe Manana 	}
24224f764e51SFilipe Manana 	ret = btrfs_next_leaf(root, path);
24234f764e51SFilipe Manana 	if (ret > 0)
24244f764e51SFilipe Manana 		ret = 0;
24254f764e51SFilipe Manana 	else if (ret == 0)
24264f764e51SFilipe Manana 		goto process_leaf;
24274f764e51SFilipe Manana out:
24284f764e51SFilipe Manana 	btrfs_free_path(log_path);
24294f764e51SFilipe Manana 	btrfs_release_path(path);
24304f764e51SFilipe Manana 	return ret;
24314f764e51SFilipe Manana }
24324f764e51SFilipe Manana 
24334f764e51SFilipe Manana 
2434e02119d5SChris Mason /*
2435e02119d5SChris Mason  * deletion replay happens before we copy any new directory items
2436e02119d5SChris Mason  * out of the log or out of backreferences from inodes.  It
2437e02119d5SChris Mason  * scans the log to find ranges of keys that log is authoritative for,
2438e02119d5SChris Mason  * and then scans the directory to find items in those ranges that are
2439e02119d5SChris Mason  * not present in the log.
2440e02119d5SChris Mason  *
2441e02119d5SChris Mason  * Anything we don't find in the log is unlinked and removed from the
2442e02119d5SChris Mason  * directory.
2443e02119d5SChris Mason  */
2444e02119d5SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2445e02119d5SChris Mason 				       struct btrfs_root *root,
2446e02119d5SChris Mason 				       struct btrfs_root *log,
2447e02119d5SChris Mason 				       struct btrfs_path *path,
244812fcfd22SChris Mason 				       u64 dirid, int del_all)
2449e02119d5SChris Mason {
2450e02119d5SChris Mason 	u64 range_start;
2451e02119d5SChris Mason 	u64 range_end;
2452e02119d5SChris Mason 	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2453e02119d5SChris Mason 	int ret = 0;
2454e02119d5SChris Mason 	struct btrfs_key dir_key;
2455e02119d5SChris Mason 	struct btrfs_key found_key;
2456e02119d5SChris Mason 	struct btrfs_path *log_path;
2457e02119d5SChris Mason 	struct inode *dir;
2458e02119d5SChris Mason 
2459e02119d5SChris Mason 	dir_key.objectid = dirid;
2460e02119d5SChris Mason 	dir_key.type = BTRFS_DIR_ITEM_KEY;
2461e02119d5SChris Mason 	log_path = btrfs_alloc_path();
2462e02119d5SChris Mason 	if (!log_path)
2463e02119d5SChris Mason 		return -ENOMEM;
2464e02119d5SChris Mason 
2465e02119d5SChris Mason 	dir = read_one_inode(root, dirid);
2466e02119d5SChris Mason 	/* it isn't an error if the inode isn't there, that can happen
2467e02119d5SChris Mason 	 * because we replay the deletes before we copy in the inode item
2468e02119d5SChris Mason 	 * from the log
2469e02119d5SChris Mason 	 */
2470e02119d5SChris Mason 	if (!dir) {
2471e02119d5SChris Mason 		btrfs_free_path(log_path);
2472e02119d5SChris Mason 		return 0;
2473e02119d5SChris Mason 	}
2474e02119d5SChris Mason again:
2475e02119d5SChris Mason 	range_start = 0;
2476e02119d5SChris Mason 	range_end = 0;
2477e02119d5SChris Mason 	while (1) {
247812fcfd22SChris Mason 		if (del_all)
247912fcfd22SChris Mason 			range_end = (u64)-1;
248012fcfd22SChris Mason 		else {
2481e02119d5SChris Mason 			ret = find_dir_range(log, path, dirid, key_type,
2482e02119d5SChris Mason 					     &range_start, &range_end);
2483e02119d5SChris Mason 			if (ret != 0)
2484e02119d5SChris Mason 				break;
248512fcfd22SChris Mason 		}
2486e02119d5SChris Mason 
2487e02119d5SChris Mason 		dir_key.offset = range_start;
2488e02119d5SChris Mason 		while (1) {
2489e02119d5SChris Mason 			int nritems;
2490e02119d5SChris Mason 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2491e02119d5SChris Mason 						0, 0);
2492e02119d5SChris Mason 			if (ret < 0)
2493e02119d5SChris Mason 				goto out;
2494e02119d5SChris Mason 
2495e02119d5SChris Mason 			nritems = btrfs_header_nritems(path->nodes[0]);
2496e02119d5SChris Mason 			if (path->slots[0] >= nritems) {
2497e02119d5SChris Mason 				ret = btrfs_next_leaf(root, path);
2498b98def7cSLiu Bo 				if (ret == 1)
2499e02119d5SChris Mason 					break;
2500b98def7cSLiu Bo 				else if (ret < 0)
2501b98def7cSLiu Bo 					goto out;
2502e02119d5SChris Mason 			}
2503e02119d5SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2504e02119d5SChris Mason 					      path->slots[0]);
2505e02119d5SChris Mason 			if (found_key.objectid != dirid ||
2506e02119d5SChris Mason 			    found_key.type != dir_key.type)
2507e02119d5SChris Mason 				goto next_type;
2508e02119d5SChris Mason 
2509e02119d5SChris Mason 			if (found_key.offset > range_end)
2510e02119d5SChris Mason 				break;
2511e02119d5SChris Mason 
2512e02119d5SChris Mason 			ret = check_item_in_log(trans, root, log, path,
251312fcfd22SChris Mason 						log_path, dir,
251412fcfd22SChris Mason 						&found_key);
25153650860bSJosef Bacik 			if (ret)
25163650860bSJosef Bacik 				goto out;
2517e02119d5SChris Mason 			if (found_key.offset == (u64)-1)
2518e02119d5SChris Mason 				break;
2519e02119d5SChris Mason 			dir_key.offset = found_key.offset + 1;
2520e02119d5SChris Mason 		}
2521b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2522e02119d5SChris Mason 		if (range_end == (u64)-1)
2523e02119d5SChris Mason 			break;
2524e02119d5SChris Mason 		range_start = range_end + 1;
2525e02119d5SChris Mason 	}
2526e02119d5SChris Mason 
2527e02119d5SChris Mason next_type:
2528e02119d5SChris Mason 	ret = 0;
2529e02119d5SChris Mason 	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2530e02119d5SChris Mason 		key_type = BTRFS_DIR_LOG_INDEX_KEY;
2531e02119d5SChris Mason 		dir_key.type = BTRFS_DIR_INDEX_KEY;
2532b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2533e02119d5SChris Mason 		goto again;
2534e02119d5SChris Mason 	}
2535e02119d5SChris Mason out:
2536b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2537e02119d5SChris Mason 	btrfs_free_path(log_path);
2538e02119d5SChris Mason 	iput(dir);
2539e02119d5SChris Mason 	return ret;
2540e02119d5SChris Mason }
2541e02119d5SChris Mason 
2542e02119d5SChris Mason /*
2543e02119d5SChris Mason  * the process_func used to replay items from the log tree.  This
2544e02119d5SChris Mason  * gets called in two different stages.  The first stage just looks
2545e02119d5SChris Mason  * for inodes and makes sure they are all copied into the subvolume.
2546e02119d5SChris Mason  *
2547e02119d5SChris Mason  * The second stage copies all the other item types from the log into
2548e02119d5SChris Mason  * the subvolume.  The two stage approach is slower, but gets rid of
2549e02119d5SChris Mason  * lots of complexity around inodes referencing other inodes that exist
2550e02119d5SChris Mason  * only in the log (references come from either directory items or inode
2551e02119d5SChris Mason  * back refs).
2552e02119d5SChris Mason  */
2553e02119d5SChris Mason static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2554581c1760SQu Wenruo 			     struct walk_control *wc, u64 gen, int level)
2555e02119d5SChris Mason {
2556e02119d5SChris Mason 	int nritems;
2557e02119d5SChris Mason 	struct btrfs_path *path;
2558e02119d5SChris Mason 	struct btrfs_root *root = wc->replay_dest;
2559e02119d5SChris Mason 	struct btrfs_key key;
2560e02119d5SChris Mason 	int i;
2561e02119d5SChris Mason 	int ret;
2562e02119d5SChris Mason 
2563581c1760SQu Wenruo 	ret = btrfs_read_buffer(eb, gen, level, NULL);
2564018642a1STsutomu Itoh 	if (ret)
2565018642a1STsutomu Itoh 		return ret;
2566e02119d5SChris Mason 
2567e02119d5SChris Mason 	level = btrfs_header_level(eb);
2568e02119d5SChris Mason 
2569e02119d5SChris Mason 	if (level != 0)
2570e02119d5SChris Mason 		return 0;
2571e02119d5SChris Mason 
2572e02119d5SChris Mason 	path = btrfs_alloc_path();
25731e5063d0SMark Fasheh 	if (!path)
25741e5063d0SMark Fasheh 		return -ENOMEM;
2575e02119d5SChris Mason 
2576e02119d5SChris Mason 	nritems = btrfs_header_nritems(eb);
2577e02119d5SChris Mason 	for (i = 0; i < nritems; i++) {
2578e02119d5SChris Mason 		btrfs_item_key_to_cpu(eb, &key, i);
2579e02119d5SChris Mason 
2580e02119d5SChris Mason 		/* inode keys are done during the first stage */
2581e02119d5SChris Mason 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2582e02119d5SChris Mason 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2583e02119d5SChris Mason 			struct btrfs_inode_item *inode_item;
2584e02119d5SChris Mason 			u32 mode;
2585e02119d5SChris Mason 
2586e02119d5SChris Mason 			inode_item = btrfs_item_ptr(eb, i,
2587e02119d5SChris Mason 					    struct btrfs_inode_item);
2588f2d72f42SFilipe Manana 			/*
2589f2d72f42SFilipe Manana 			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2590f2d72f42SFilipe Manana 			 * and never got linked before the fsync, skip it, as
2591f2d72f42SFilipe Manana 			 * replaying it is pointless since it would be deleted
2592f2d72f42SFilipe Manana 			 * later. We skip logging tmpfiles, but it's always
2593f2d72f42SFilipe Manana 			 * possible we are replaying a log created with a kernel
2594f2d72f42SFilipe Manana 			 * that used to log tmpfiles.
2595f2d72f42SFilipe Manana 			 */
2596f2d72f42SFilipe Manana 			if (btrfs_inode_nlink(eb, inode_item) == 0) {
2597f2d72f42SFilipe Manana 				wc->ignore_cur_inode = true;
2598f2d72f42SFilipe Manana 				continue;
2599f2d72f42SFilipe Manana 			} else {
2600f2d72f42SFilipe Manana 				wc->ignore_cur_inode = false;
2601f2d72f42SFilipe Manana 			}
26024f764e51SFilipe Manana 			ret = replay_xattr_deletes(wc->trans, root, log,
26034f764e51SFilipe Manana 						   path, key.objectid);
26044f764e51SFilipe Manana 			if (ret)
26054f764e51SFilipe Manana 				break;
2606e02119d5SChris Mason 			mode = btrfs_inode_mode(eb, inode_item);
2607e02119d5SChris Mason 			if (S_ISDIR(mode)) {
2608e02119d5SChris Mason 				ret = replay_dir_deletes(wc->trans,
260912fcfd22SChris Mason 					 root, log, path, key.objectid, 0);
2610b50c6e25SJosef Bacik 				if (ret)
2611b50c6e25SJosef Bacik 					break;
2612e02119d5SChris Mason 			}
2613e02119d5SChris Mason 			ret = overwrite_item(wc->trans, root, path,
2614e02119d5SChris Mason 					     eb, i, &key);
2615b50c6e25SJosef Bacik 			if (ret)
2616b50c6e25SJosef Bacik 				break;
2617e02119d5SChris Mason 
2618471d557aSFilipe Manana 			/*
2619471d557aSFilipe Manana 			 * Before replaying extents, truncate the inode to its
2620471d557aSFilipe Manana 			 * size. We need to do it now and not after log replay
2621471d557aSFilipe Manana 			 * because before an fsync we can have prealloc extents
2622471d557aSFilipe Manana 			 * added beyond the inode's i_size. If we did it after,
2623471d557aSFilipe Manana 			 * through orphan cleanup for example, we would drop
2624471d557aSFilipe Manana 			 * those prealloc extents just after replaying them.
2625e02119d5SChris Mason 			 */
2626e02119d5SChris Mason 			if (S_ISREG(mode)) {
26275893dfb9SFilipe Manana 				struct btrfs_drop_extents_args drop_args = { 0 };
2628471d557aSFilipe Manana 				struct inode *inode;
2629471d557aSFilipe Manana 				u64 from;
2630471d557aSFilipe Manana 
2631471d557aSFilipe Manana 				inode = read_one_inode(root, key.objectid);
2632471d557aSFilipe Manana 				if (!inode) {
2633471d557aSFilipe Manana 					ret = -EIO;
2634471d557aSFilipe Manana 					break;
2635471d557aSFilipe Manana 				}
2636471d557aSFilipe Manana 				from = ALIGN(i_size_read(inode),
2637471d557aSFilipe Manana 					     root->fs_info->sectorsize);
26385893dfb9SFilipe Manana 				drop_args.start = from;
26395893dfb9SFilipe Manana 				drop_args.end = (u64)-1;
26405893dfb9SFilipe Manana 				drop_args.drop_cache = true;
26415893dfb9SFilipe Manana 				ret = btrfs_drop_extents(wc->trans, root,
26425893dfb9SFilipe Manana 							 BTRFS_I(inode),
26435893dfb9SFilipe Manana 							 &drop_args);
2644471d557aSFilipe Manana 				if (!ret) {
26452766ff61SFilipe Manana 					inode_sub_bytes(inode,
26462766ff61SFilipe Manana 							drop_args.bytes_found);
2647f2d72f42SFilipe Manana 					/* Update the inode's nbytes. */
2648471d557aSFilipe Manana 					ret = btrfs_update_inode(wc->trans,
26499a56fcd1SNikolay Borisov 							root, BTRFS_I(inode));
2650471d557aSFilipe Manana 				}
2651471d557aSFilipe Manana 				iput(inode);
2652b50c6e25SJosef Bacik 				if (ret)
2653b50c6e25SJosef Bacik 					break;
2654c71bf099SYan, Zheng 			}
2655a74ac322SChris Mason 
2656e02119d5SChris Mason 			ret = link_to_fixup_dir(wc->trans, root,
2657e02119d5SChris Mason 						path, key.objectid);
2658b50c6e25SJosef Bacik 			if (ret)
2659b50c6e25SJosef Bacik 				break;
2660e02119d5SChris Mason 		}
2661dd8e7217SJosef Bacik 
2662f2d72f42SFilipe Manana 		if (wc->ignore_cur_inode)
2663f2d72f42SFilipe Manana 			continue;
2664f2d72f42SFilipe Manana 
2665dd8e7217SJosef Bacik 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2666dd8e7217SJosef Bacik 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2667dd8e7217SJosef Bacik 			ret = replay_one_dir_item(wc->trans, root, path,
2668dd8e7217SJosef Bacik 						  eb, i, &key);
2669dd8e7217SJosef Bacik 			if (ret)
2670dd8e7217SJosef Bacik 				break;
2671dd8e7217SJosef Bacik 		}
2672dd8e7217SJosef Bacik 
2673e02119d5SChris Mason 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2674e02119d5SChris Mason 			continue;
2675e02119d5SChris Mason 
2676e02119d5SChris Mason 		/* these keys are simply copied */
2677e02119d5SChris Mason 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2678e02119d5SChris Mason 			ret = overwrite_item(wc->trans, root, path,
2679e02119d5SChris Mason 					     eb, i, &key);
2680b50c6e25SJosef Bacik 			if (ret)
2681b50c6e25SJosef Bacik 				break;
26822da1c669SLiu Bo 		} else if (key.type == BTRFS_INODE_REF_KEY ||
26832da1c669SLiu Bo 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2684f186373fSMark Fasheh 			ret = add_inode_ref(wc->trans, root, log, path,
2685f186373fSMark Fasheh 					    eb, i, &key);
2686b50c6e25SJosef Bacik 			if (ret && ret != -ENOENT)
2687b50c6e25SJosef Bacik 				break;
2688b50c6e25SJosef Bacik 			ret = 0;
2689e02119d5SChris Mason 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2690e02119d5SChris Mason 			ret = replay_one_extent(wc->trans, root, path,
2691e02119d5SChris Mason 						eb, i, &key);
2692b50c6e25SJosef Bacik 			if (ret)
2693b50c6e25SJosef Bacik 				break;
2694dd8e7217SJosef Bacik 		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2695e02119d5SChris Mason 			ret = replay_one_dir_item(wc->trans, root, path,
2696e02119d5SChris Mason 						  eb, i, &key);
2697b50c6e25SJosef Bacik 			if (ret)
2698b50c6e25SJosef Bacik 				break;
2699e02119d5SChris Mason 		}
2700e02119d5SChris Mason 	}
2701e02119d5SChris Mason 	btrfs_free_path(path);
2702b50c6e25SJosef Bacik 	return ret;
2703e02119d5SChris Mason }
2704e02119d5SChris Mason 
27056787bb9fSNikolay Borisov /*
27066787bb9fSNikolay Borisov  * Correctly adjust the reserved bytes occupied by a log tree extent buffer
27076787bb9fSNikolay Borisov  */
27086787bb9fSNikolay Borisov static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
27096787bb9fSNikolay Borisov {
27106787bb9fSNikolay Borisov 	struct btrfs_block_group *cache;
27116787bb9fSNikolay Borisov 
27126787bb9fSNikolay Borisov 	cache = btrfs_lookup_block_group(fs_info, start);
27136787bb9fSNikolay Borisov 	if (!cache) {
27146787bb9fSNikolay Borisov 		btrfs_err(fs_info, "unable to find block group for %llu", start);
27156787bb9fSNikolay Borisov 		return;
27166787bb9fSNikolay Borisov 	}
27176787bb9fSNikolay Borisov 
27186787bb9fSNikolay Borisov 	spin_lock(&cache->space_info->lock);
27196787bb9fSNikolay Borisov 	spin_lock(&cache->lock);
27206787bb9fSNikolay Borisov 	cache->reserved -= fs_info->nodesize;
27216787bb9fSNikolay Borisov 	cache->space_info->bytes_reserved -= fs_info->nodesize;
27226787bb9fSNikolay Borisov 	spin_unlock(&cache->lock);
27236787bb9fSNikolay Borisov 	spin_unlock(&cache->space_info->lock);
27246787bb9fSNikolay Borisov 
27256787bb9fSNikolay Borisov 	btrfs_put_block_group(cache);
27266787bb9fSNikolay Borisov }
27276787bb9fSNikolay Borisov 
2728d397712bSChris Mason static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2729e02119d5SChris Mason 				   struct btrfs_root *root,
2730e02119d5SChris Mason 				   struct btrfs_path *path, int *level,
2731e02119d5SChris Mason 				   struct walk_control *wc)
2732e02119d5SChris Mason {
27330b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2734e02119d5SChris Mason 	u64 bytenr;
2735e02119d5SChris Mason 	u64 ptr_gen;
2736e02119d5SChris Mason 	struct extent_buffer *next;
2737e02119d5SChris Mason 	struct extent_buffer *cur;
2738e02119d5SChris Mason 	u32 blocksize;
2739e02119d5SChris Mason 	int ret = 0;
2740e02119d5SChris Mason 
2741e02119d5SChris Mason 	while (*level > 0) {
2742581c1760SQu Wenruo 		struct btrfs_key first_key;
2743581c1760SQu Wenruo 
2744e02119d5SChris Mason 		cur = path->nodes[*level];
2745e02119d5SChris Mason 
2746fae7f21cSDulshani Gunawardhana 		WARN_ON(btrfs_header_level(cur) != *level);
2747e02119d5SChris Mason 
2748e02119d5SChris Mason 		if (path->slots[*level] >=
2749e02119d5SChris Mason 		    btrfs_header_nritems(cur))
2750e02119d5SChris Mason 			break;
2751e02119d5SChris Mason 
2752e02119d5SChris Mason 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2753e02119d5SChris Mason 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2754581c1760SQu Wenruo 		btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
27550b246afaSJeff Mahoney 		blocksize = fs_info->nodesize;
2756e02119d5SChris Mason 
27573fbaf258SJosef Bacik 		next = btrfs_find_create_tree_block(fs_info, bytenr,
27583fbaf258SJosef Bacik 						    btrfs_header_owner(cur),
27593fbaf258SJosef Bacik 						    *level - 1);
2760c871b0f2SLiu Bo 		if (IS_ERR(next))
2761c871b0f2SLiu Bo 			return PTR_ERR(next);
2762e02119d5SChris Mason 
27634a500fd1SYan, Zheng 		if (*level == 1) {
2764581c1760SQu Wenruo 			ret = wc->process_func(root, next, wc, ptr_gen,
2765581c1760SQu Wenruo 					       *level - 1);
2766b50c6e25SJosef Bacik 			if (ret) {
2767b50c6e25SJosef Bacik 				free_extent_buffer(next);
27681e5063d0SMark Fasheh 				return ret;
2769b50c6e25SJosef Bacik 			}
2770e02119d5SChris Mason 
2771e02119d5SChris Mason 			path->slots[*level]++;
2772e02119d5SChris Mason 			if (wc->free) {
2773581c1760SQu Wenruo 				ret = btrfs_read_buffer(next, ptr_gen,
2774581c1760SQu Wenruo 							*level - 1, &first_key);
2775018642a1STsutomu Itoh 				if (ret) {
2776018642a1STsutomu Itoh 					free_extent_buffer(next);
2777018642a1STsutomu Itoh 					return ret;
2778018642a1STsutomu Itoh 				}
2779e02119d5SChris Mason 
2780681ae509SJosef Bacik 				if (trans) {
2781e02119d5SChris Mason 					btrfs_tree_lock(next);
27826a884d7dSDavid Sterba 					btrfs_clean_tree_block(next);
2783e02119d5SChris Mason 					btrfs_wait_tree_block_writeback(next);
2784e02119d5SChris Mason 					btrfs_tree_unlock(next);
27857bfc1007SNikolay Borisov 					ret = btrfs_pin_reserved_extent(trans,
2786a0fbf736SNikolay Borisov 							bytenr, blocksize);
27873650860bSJosef Bacik 					if (ret) {
27883650860bSJosef Bacik 						free_extent_buffer(next);
27893650860bSJosef Bacik 						return ret;
27903650860bSJosef Bacik 					}
2791d3575156SNaohiro Aota 					btrfs_redirty_list_add(
2792d3575156SNaohiro Aota 						trans->transaction, next);
279310e958d5SNikolay Borisov 				} else {
279410e958d5SNikolay Borisov 					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
279510e958d5SNikolay Borisov 						clear_extent_buffer_dirty(next);
279610e958d5SNikolay Borisov 					unaccount_log_buffer(fs_info, bytenr);
279710e958d5SNikolay Borisov 				}
2798e02119d5SChris Mason 			}
2799e02119d5SChris Mason 			free_extent_buffer(next);
2800e02119d5SChris Mason 			continue;
2801e02119d5SChris Mason 		}
2802581c1760SQu Wenruo 		ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2803018642a1STsutomu Itoh 		if (ret) {
2804018642a1STsutomu Itoh 			free_extent_buffer(next);
2805018642a1STsutomu Itoh 			return ret;
2806018642a1STsutomu Itoh 		}
2807e02119d5SChris Mason 
2808e02119d5SChris Mason 		if (path->nodes[*level-1])
2809e02119d5SChris Mason 			free_extent_buffer(path->nodes[*level-1]);
2810e02119d5SChris Mason 		path->nodes[*level-1] = next;
2811e02119d5SChris Mason 		*level = btrfs_header_level(next);
2812e02119d5SChris Mason 		path->slots[*level] = 0;
2813e02119d5SChris Mason 		cond_resched();
2814e02119d5SChris Mason 	}
28154a500fd1SYan, Zheng 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2816e02119d5SChris Mason 
2817e02119d5SChris Mason 	cond_resched();
2818e02119d5SChris Mason 	return 0;
2819e02119d5SChris Mason }
2820e02119d5SChris Mason 
2821d397712bSChris Mason static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2822e02119d5SChris Mason 				 struct btrfs_root *root,
2823e02119d5SChris Mason 				 struct btrfs_path *path, int *level,
2824e02119d5SChris Mason 				 struct walk_control *wc)
2825e02119d5SChris Mason {
28260b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2827e02119d5SChris Mason 	int i;
2828e02119d5SChris Mason 	int slot;
2829e02119d5SChris Mason 	int ret;
2830e02119d5SChris Mason 
2831e02119d5SChris Mason 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2832e02119d5SChris Mason 		slot = path->slots[i];
28334a500fd1SYan, Zheng 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2834e02119d5SChris Mason 			path->slots[i]++;
2835e02119d5SChris Mason 			*level = i;
2836e02119d5SChris Mason 			WARN_ON(*level == 0);
2837e02119d5SChris Mason 			return 0;
2838e02119d5SChris Mason 		} else {
28391e5063d0SMark Fasheh 			ret = wc->process_func(root, path->nodes[*level], wc,
2840581c1760SQu Wenruo 				 btrfs_header_generation(path->nodes[*level]),
2841581c1760SQu Wenruo 				 *level);
28421e5063d0SMark Fasheh 			if (ret)
28431e5063d0SMark Fasheh 				return ret;
28441e5063d0SMark Fasheh 
2845e02119d5SChris Mason 			if (wc->free) {
2846e02119d5SChris Mason 				struct extent_buffer *next;
2847e02119d5SChris Mason 
2848e02119d5SChris Mason 				next = path->nodes[*level];
2849e02119d5SChris Mason 
2850681ae509SJosef Bacik 				if (trans) {
2851e02119d5SChris Mason 					btrfs_tree_lock(next);
28526a884d7dSDavid Sterba 					btrfs_clean_tree_block(next);
2853e02119d5SChris Mason 					btrfs_wait_tree_block_writeback(next);
2854e02119d5SChris Mason 					btrfs_tree_unlock(next);
28557bfc1007SNikolay Borisov 					ret = btrfs_pin_reserved_extent(trans,
2856e02119d5SChris Mason 						     path->nodes[*level]->start,
2857d00aff00SChris Mason 						     path->nodes[*level]->len);
28583650860bSJosef Bacik 					if (ret)
28593650860bSJosef Bacik 						return ret;
286010e958d5SNikolay Borisov 				} else {
286110e958d5SNikolay Borisov 					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
286210e958d5SNikolay Borisov 						clear_extent_buffer_dirty(next);
286310e958d5SNikolay Borisov 
286410e958d5SNikolay Borisov 					unaccount_log_buffer(fs_info,
286510e958d5SNikolay Borisov 						path->nodes[*level]->start);
286610e958d5SNikolay Borisov 				}
2867e02119d5SChris Mason 			}
2868e02119d5SChris Mason 			free_extent_buffer(path->nodes[*level]);
2869e02119d5SChris Mason 			path->nodes[*level] = NULL;
2870e02119d5SChris Mason 			*level = i + 1;
2871e02119d5SChris Mason 		}
2872e02119d5SChris Mason 	}
2873e02119d5SChris Mason 	return 1;
2874e02119d5SChris Mason }
2875e02119d5SChris Mason 
2876e02119d5SChris Mason /*
2877e02119d5SChris Mason  * drop the reference count on the tree rooted at 'snap'.  This traverses
2878e02119d5SChris Mason  * the tree freeing any blocks that have a ref count of zero after being
2879e02119d5SChris Mason  * decremented.
2880e02119d5SChris Mason  */
2881e02119d5SChris Mason static int walk_log_tree(struct btrfs_trans_handle *trans,
2882e02119d5SChris Mason 			 struct btrfs_root *log, struct walk_control *wc)
2883e02119d5SChris Mason {
28842ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
2885e02119d5SChris Mason 	int ret = 0;
2886e02119d5SChris Mason 	int wret;
2887e02119d5SChris Mason 	int level;
2888e02119d5SChris Mason 	struct btrfs_path *path;
2889e02119d5SChris Mason 	int orig_level;
2890e02119d5SChris Mason 
2891e02119d5SChris Mason 	path = btrfs_alloc_path();
2892db5b493aSTsutomu Itoh 	if (!path)
2893db5b493aSTsutomu Itoh 		return -ENOMEM;
2894e02119d5SChris Mason 
2895e02119d5SChris Mason 	level = btrfs_header_level(log->node);
2896e02119d5SChris Mason 	orig_level = level;
2897e02119d5SChris Mason 	path->nodes[level] = log->node;
289867439dadSDavid Sterba 	atomic_inc(&log->node->refs);
2899e02119d5SChris Mason 	path->slots[level] = 0;
2900e02119d5SChris Mason 
2901e02119d5SChris Mason 	while (1) {
2902e02119d5SChris Mason 		wret = walk_down_log_tree(trans, log, path, &level, wc);
2903e02119d5SChris Mason 		if (wret > 0)
2904e02119d5SChris Mason 			break;
290579787eaaSJeff Mahoney 		if (wret < 0) {
2906e02119d5SChris Mason 			ret = wret;
290779787eaaSJeff Mahoney 			goto out;
290879787eaaSJeff Mahoney 		}
2909e02119d5SChris Mason 
2910e02119d5SChris Mason 		wret = walk_up_log_tree(trans, log, path, &level, wc);
2911e02119d5SChris Mason 		if (wret > 0)
2912e02119d5SChris Mason 			break;
291379787eaaSJeff Mahoney 		if (wret < 0) {
2914e02119d5SChris Mason 			ret = wret;
291579787eaaSJeff Mahoney 			goto out;
291679787eaaSJeff Mahoney 		}
2917e02119d5SChris Mason 	}
2918e02119d5SChris Mason 
2919e02119d5SChris Mason 	/* was the root node processed? if not, catch it here */
2920e02119d5SChris Mason 	if (path->nodes[orig_level]) {
292179787eaaSJeff Mahoney 		ret = wc->process_func(log, path->nodes[orig_level], wc,
2922581c1760SQu Wenruo 			 btrfs_header_generation(path->nodes[orig_level]),
2923581c1760SQu Wenruo 			 orig_level);
292479787eaaSJeff Mahoney 		if (ret)
292579787eaaSJeff Mahoney 			goto out;
2926e02119d5SChris Mason 		if (wc->free) {
2927e02119d5SChris Mason 			struct extent_buffer *next;
2928e02119d5SChris Mason 
2929e02119d5SChris Mason 			next = path->nodes[orig_level];
2930e02119d5SChris Mason 
2931681ae509SJosef Bacik 			if (trans) {
2932e02119d5SChris Mason 				btrfs_tree_lock(next);
29336a884d7dSDavid Sterba 				btrfs_clean_tree_block(next);
2934e02119d5SChris Mason 				btrfs_wait_tree_block_writeback(next);
2935e02119d5SChris Mason 				btrfs_tree_unlock(next);
29367bfc1007SNikolay Borisov 				ret = btrfs_pin_reserved_extent(trans,
293710e958d5SNikolay Borisov 						next->start, next->len);
293810e958d5SNikolay Borisov 				if (ret)
293910e958d5SNikolay Borisov 					goto out;
29401846430cSLiu Bo 			} else {
29411846430cSLiu Bo 				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
29421846430cSLiu Bo 					clear_extent_buffer_dirty(next);
294310e958d5SNikolay Borisov 				unaccount_log_buffer(fs_info, next->start);
2944681ae509SJosef Bacik 			}
2945e02119d5SChris Mason 		}
2946e02119d5SChris Mason 	}
2947e02119d5SChris Mason 
294879787eaaSJeff Mahoney out:
2949e02119d5SChris Mason 	btrfs_free_path(path);
2950e02119d5SChris Mason 	return ret;
2951e02119d5SChris Mason }
2952e02119d5SChris Mason 
29537237f183SYan Zheng /*
29547237f183SYan Zheng  * helper function to update the item for a given subvolumes log root
29557237f183SYan Zheng  * in the tree of log roots
29567237f183SYan Zheng  */
29577237f183SYan Zheng static int update_log_root(struct btrfs_trans_handle *trans,
29584203e968SJosef Bacik 			   struct btrfs_root *log,
29594203e968SJosef Bacik 			   struct btrfs_root_item *root_item)
29607237f183SYan Zheng {
29610b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
29627237f183SYan Zheng 	int ret;
29637237f183SYan Zheng 
29647237f183SYan Zheng 	if (log->log_transid == 1) {
29657237f183SYan Zheng 		/* insert root item on the first sync */
29660b246afaSJeff Mahoney 		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
29674203e968SJosef Bacik 				&log->root_key, root_item);
29687237f183SYan Zheng 	} else {
29690b246afaSJeff Mahoney 		ret = btrfs_update_root(trans, fs_info->log_root_tree,
29704203e968SJosef Bacik 				&log->root_key, root_item);
29717237f183SYan Zheng 	}
29727237f183SYan Zheng 	return ret;
29737237f183SYan Zheng }
29747237f183SYan Zheng 
297560d53eb3SZhaolei static void wait_log_commit(struct btrfs_root *root, int transid)
2976e02119d5SChris Mason {
2977e02119d5SChris Mason 	DEFINE_WAIT(wait);
29787237f183SYan Zheng 	int index = transid % 2;
2979e02119d5SChris Mason 
29807237f183SYan Zheng 	/*
29817237f183SYan Zheng 	 * we only allow two pending log transactions at a time,
29827237f183SYan Zheng 	 * so we know that if ours is more than 2 older than the
29837237f183SYan Zheng 	 * current transaction, we're done
29847237f183SYan Zheng 	 */
298549e83f57SLiu Bo 	for (;;) {
29867237f183SYan Zheng 		prepare_to_wait(&root->log_commit_wait[index],
29877237f183SYan Zheng 				&wait, TASK_UNINTERRUPTIBLE);
298849e83f57SLiu Bo 
298949e83f57SLiu Bo 		if (!(root->log_transid_committed < transid &&
299049e83f57SLiu Bo 		      atomic_read(&root->log_commit[index])))
299149e83f57SLiu Bo 			break;
299249e83f57SLiu Bo 
29937237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
2994e02119d5SChris Mason 		schedule();
29957237f183SYan Zheng 		mutex_lock(&root->log_mutex);
299649e83f57SLiu Bo 	}
299749e83f57SLiu Bo 	finish_wait(&root->log_commit_wait[index], &wait);
29987237f183SYan Zheng }
29997237f183SYan Zheng 
300060d53eb3SZhaolei static void wait_for_writer(struct btrfs_root *root)
30017237f183SYan Zheng {
30027237f183SYan Zheng 	DEFINE_WAIT(wait);
30038b050d35SMiao Xie 
300449e83f57SLiu Bo 	for (;;) {
300549e83f57SLiu Bo 		prepare_to_wait(&root->log_writer_wait, &wait,
300649e83f57SLiu Bo 				TASK_UNINTERRUPTIBLE);
300749e83f57SLiu Bo 		if (!atomic_read(&root->log_writers))
300849e83f57SLiu Bo 			break;
300949e83f57SLiu Bo 
30107237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
30117237f183SYan Zheng 		schedule();
3012575849ecSFilipe Manana 		mutex_lock(&root->log_mutex);
30137237f183SYan Zheng 	}
301449e83f57SLiu Bo 	finish_wait(&root->log_writer_wait, &wait);
3015e02119d5SChris Mason }
3016e02119d5SChris Mason 
30178b050d35SMiao Xie static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
30188b050d35SMiao Xie 					struct btrfs_log_ctx *ctx)
30198b050d35SMiao Xie {
30208b050d35SMiao Xie 	if (!ctx)
30218b050d35SMiao Xie 		return;
30228b050d35SMiao Xie 
30238b050d35SMiao Xie 	mutex_lock(&root->log_mutex);
30248b050d35SMiao Xie 	list_del_init(&ctx->list);
30258b050d35SMiao Xie 	mutex_unlock(&root->log_mutex);
30268b050d35SMiao Xie }
30278b050d35SMiao Xie 
30288b050d35SMiao Xie /*
30298b050d35SMiao Xie  * Invoked in log mutex context, or be sure there is no other task which
30308b050d35SMiao Xie  * can access the list.
30318b050d35SMiao Xie  */
30328b050d35SMiao Xie static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
30338b050d35SMiao Xie 					     int index, int error)
30348b050d35SMiao Xie {
30358b050d35SMiao Xie 	struct btrfs_log_ctx *ctx;
3036570dd450SChris Mason 	struct btrfs_log_ctx *safe;
30378b050d35SMiao Xie 
3038570dd450SChris Mason 	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3039570dd450SChris Mason 		list_del_init(&ctx->list);
30408b050d35SMiao Xie 		ctx->log_ret = error;
3041570dd450SChris Mason 	}
30428b050d35SMiao Xie 
30438b050d35SMiao Xie 	INIT_LIST_HEAD(&root->log_ctxs[index]);
30448b050d35SMiao Xie }
30458b050d35SMiao Xie 
/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it.  When this call is done,
 * you know that any inodes previously logged are safely on disk only
 * if it returns 0.
 *
 * Any other return value means you need to call btrfs_commit_transaction.
 * Some of the edge cases for fsyncing directories that have had unlinks
 * or renames done in the past mean that sometimes the only safe
 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
 * that has happened.
 */
3057e02119d5SChris Mason  */
3058e02119d5SChris Mason int btrfs_sync_log(struct btrfs_trans_handle *trans,
30598b050d35SMiao Xie 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3060e02119d5SChris Mason {
30617237f183SYan Zheng 	int index1;
30627237f183SYan Zheng 	int index2;
30638cef4e16SYan, Zheng 	int mark;
3064e02119d5SChris Mason 	int ret;
30650b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
3066e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
30670b246afaSJeff Mahoney 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
30684203e968SJosef Bacik 	struct btrfs_root_item new_root_item;
3069bb14a59bSMiao Xie 	int log_transid = 0;
30708b050d35SMiao Xie 	struct btrfs_log_ctx root_log_ctx;
3071c6adc9ccSMiao Xie 	struct blk_plug plug;
307247876f7cSFilipe Manana 	u64 log_root_start;
307347876f7cSFilipe Manana 	u64 log_root_level;
3074e02119d5SChris Mason 
30757237f183SYan Zheng 	mutex_lock(&root->log_mutex);
3076d1433debSMiao Xie 	log_transid = ctx->log_transid;
3077d1433debSMiao Xie 	if (root->log_transid_committed >= log_transid) {
30787237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
30798b050d35SMiao Xie 		return ctx->log_ret;
3080e02119d5SChris Mason 	}
3081d1433debSMiao Xie 
3082d1433debSMiao Xie 	index1 = log_transid % 2;
3083d1433debSMiao Xie 	if (atomic_read(&root->log_commit[index1])) {
308460d53eb3SZhaolei 		wait_log_commit(root, log_transid);
3085d1433debSMiao Xie 		mutex_unlock(&root->log_mutex);
3086d1433debSMiao Xie 		return ctx->log_ret;
3087d1433debSMiao Xie 	}
3088d1433debSMiao Xie 	ASSERT(log_transid == root->log_transid);
30897237f183SYan Zheng 	atomic_set(&root->log_commit[index1], 1);
30907237f183SYan Zheng 
30917237f183SYan Zheng 	/* wait for previous tree log sync to complete */
30927237f183SYan Zheng 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
309360d53eb3SZhaolei 		wait_log_commit(root, log_transid - 1);
309448cab2e0SMiao Xie 
309586df7eb9SYan, Zheng 	while (1) {
30962ecb7923SMiao Xie 		int batch = atomic_read(&root->log_batch);
3097cd354ad6SChris Mason 		/* when we're on an ssd, just kick the log commit out */
30980b246afaSJeff Mahoney 		if (!btrfs_test_opt(fs_info, SSD) &&
309927cdeb70SMiao Xie 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
31007237f183SYan Zheng 			mutex_unlock(&root->log_mutex);
3101e02119d5SChris Mason 			schedule_timeout_uninterruptible(1);
31027237f183SYan Zheng 			mutex_lock(&root->log_mutex);
310386df7eb9SYan, Zheng 		}
310460d53eb3SZhaolei 		wait_for_writer(root);
31052ecb7923SMiao Xie 		if (batch == atomic_read(&root->log_batch))
3106e02119d5SChris Mason 			break;
3107e02119d5SChris Mason 	}
3108d0c803c4SChris Mason 
310912fcfd22SChris Mason 	/* bail out if we need to do a full commit */
31104884b8e8SDavid Sterba 	if (btrfs_need_log_full_commit(trans)) {
311112fcfd22SChris Mason 		ret = -EAGAIN;
311212fcfd22SChris Mason 		mutex_unlock(&root->log_mutex);
311312fcfd22SChris Mason 		goto out;
311412fcfd22SChris Mason 	}
311512fcfd22SChris Mason 
31168cef4e16SYan, Zheng 	if (log_transid % 2 == 0)
31178cef4e16SYan, Zheng 		mark = EXTENT_DIRTY;
31188cef4e16SYan, Zheng 	else
31198cef4e16SYan, Zheng 		mark = EXTENT_NEW;
31208cef4e16SYan, Zheng 
3121690587d1SChris Mason 	/* we start IO on  all the marked extents here, but we don't actually
3122690587d1SChris Mason 	 * wait for them until later.
3123690587d1SChris Mason 	 */
3124c6adc9ccSMiao Xie 	blk_start_plug(&plug);
31252ff7e61eSJeff Mahoney 	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3126b528f467SNaohiro Aota 	/*
3127b528f467SNaohiro Aota 	 * -EAGAIN happens when someone, e.g., a concurrent transaction
3128b528f467SNaohiro Aota 	 *  commit, writes a dirty extent in this tree-log commit. This
3129b528f467SNaohiro Aota 	 *  concurrent write will create a hole writing out the extents,
3130b528f467SNaohiro Aota 	 *  and we cannot proceed on a zoned filesystem, requiring
3131b528f467SNaohiro Aota 	 *  sequential writing. While we can bail out to a full commit
3132b528f467SNaohiro Aota 	 *  here, but we can continue hoping the concurrent writing fills
3133b528f467SNaohiro Aota 	 *  the hole.
3134b528f467SNaohiro Aota 	 */
3135b528f467SNaohiro Aota 	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3136b528f467SNaohiro Aota 		ret = 0;
313779787eaaSJeff Mahoney 	if (ret) {
3138c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
313966642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
314090787766SDavid Sterba 		btrfs_set_log_full_commit(trans);
314179787eaaSJeff Mahoney 		mutex_unlock(&root->log_mutex);
314279787eaaSJeff Mahoney 		goto out;
314379787eaaSJeff Mahoney 	}
31447237f183SYan Zheng 
31454203e968SJosef Bacik 	/*
31464203e968SJosef Bacik 	 * We _must_ update under the root->log_mutex in order to make sure we
31474203e968SJosef Bacik 	 * have a consistent view of the log root we are trying to commit at
31484203e968SJosef Bacik 	 * this moment.
31494203e968SJosef Bacik 	 *
31504203e968SJosef Bacik 	 * We _must_ copy this into a local copy, because we are not holding the
31514203e968SJosef Bacik 	 * log_root_tree->log_mutex yet.  This is important because when we
31524203e968SJosef Bacik 	 * commit the log_root_tree we must have a consistent view of the
31534203e968SJosef Bacik 	 * log_root_tree when we update the super block to point at the
31544203e968SJosef Bacik 	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
31554203e968SJosef Bacik 	 * with the commit and possibly point at the new block which we may not
31564203e968SJosef Bacik 	 * have written out.
31574203e968SJosef Bacik 	 */
31585d4f98a2SYan Zheng 	btrfs_set_root_node(&log->root_item, log->node);
31594203e968SJosef Bacik 	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
31607237f183SYan Zheng 
31617237f183SYan Zheng 	root->log_transid++;
31627237f183SYan Zheng 	log->log_transid = root->log_transid;
3163ff782e0aSJosef Bacik 	root->log_start_pid = 0;
31647237f183SYan Zheng 	/*
31658cef4e16SYan, Zheng 	 * IO has been started, blocks of the log tree have WRITTEN flag set
31668cef4e16SYan, Zheng 	 * in their headers. new modifications of the log will be written to
31678cef4e16SYan, Zheng 	 * new positions. so it's safe to allow log writers to go in.
31687237f183SYan Zheng 	 */
31697237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
31707237f183SYan Zheng 
31713ddebf27SNaohiro Aota 	if (btrfs_is_zoned(fs_info)) {
3172e75f9fd1SNaohiro Aota 		mutex_lock(&fs_info->tree_root->log_mutex);
31733ddebf27SNaohiro Aota 		if (!log_root_tree->node) {
31743ddebf27SNaohiro Aota 			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
31753ddebf27SNaohiro Aota 			if (ret) {
3176e75f9fd1SNaohiro Aota 				mutex_unlock(&fs_info->tree_log_mutex);
31773ddebf27SNaohiro Aota 				goto out;
31783ddebf27SNaohiro Aota 			}
31793ddebf27SNaohiro Aota 		}
3180e75f9fd1SNaohiro Aota 		mutex_unlock(&fs_info->tree_root->log_mutex);
31813ddebf27SNaohiro Aota 	}
31823ddebf27SNaohiro Aota 
3183e75f9fd1SNaohiro Aota 	btrfs_init_log_ctx(&root_log_ctx, NULL);
3184e75f9fd1SNaohiro Aota 
3185e75f9fd1SNaohiro Aota 	mutex_lock(&log_root_tree->log_mutex);
3186e75f9fd1SNaohiro Aota 
3187e3d3b415SFilipe Manana 	index2 = log_root_tree->log_transid % 2;
3188e3d3b415SFilipe Manana 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3189e3d3b415SFilipe Manana 	root_log_ctx.log_transid = log_root_tree->log_transid;
3190e3d3b415SFilipe Manana 
31914203e968SJosef Bacik 	/*
31924203e968SJosef Bacik 	 * Now we are safe to update the log_root_tree because we're under the
31934203e968SJosef Bacik 	 * log_mutex, and we're a current writer so we're holding the commit
31944203e968SJosef Bacik 	 * open until we drop the log_mutex.
31954203e968SJosef Bacik 	 */
31964203e968SJosef Bacik 	ret = update_log_root(trans, log, &new_root_item);
31974a500fd1SYan, Zheng 	if (ret) {
3198d1433debSMiao Xie 		if (!list_empty(&root_log_ctx.list))
3199d1433debSMiao Xie 			list_del_init(&root_log_ctx.list);
3200d1433debSMiao Xie 
3201c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
320290787766SDavid Sterba 		btrfs_set_log_full_commit(trans);
3203995946ddSMiao Xie 
320479787eaaSJeff Mahoney 		if (ret != -ENOSPC) {
320566642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
320679787eaaSJeff Mahoney 			mutex_unlock(&log_root_tree->log_mutex);
320779787eaaSJeff Mahoney 			goto out;
320879787eaaSJeff Mahoney 		}
3209bf89d38fSJeff Mahoney 		btrfs_wait_tree_log_extents(log, mark);
32104a500fd1SYan, Zheng 		mutex_unlock(&log_root_tree->log_mutex);
32114a500fd1SYan, Zheng 		ret = -EAGAIN;
32124a500fd1SYan, Zheng 		goto out;
32134a500fd1SYan, Zheng 	}
32144a500fd1SYan, Zheng 
3215d1433debSMiao Xie 	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
32163da5ab56SForrest Liu 		blk_finish_plug(&plug);
3217cbd60aa7SChris Mason 		list_del_init(&root_log_ctx.list);
3218d1433debSMiao Xie 		mutex_unlock(&log_root_tree->log_mutex);
3219d1433debSMiao Xie 		ret = root_log_ctx.log_ret;
3220d1433debSMiao Xie 		goto out;
3221d1433debSMiao Xie 	}
32228b050d35SMiao Xie 
3223d1433debSMiao Xie 	index2 = root_log_ctx.log_transid % 2;
32247237f183SYan Zheng 	if (atomic_read(&log_root_tree->log_commit[index2])) {
3225c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
3226bf89d38fSJeff Mahoney 		ret = btrfs_wait_tree_log_extents(log, mark);
322760d53eb3SZhaolei 		wait_log_commit(log_root_tree,
3228d1433debSMiao Xie 				root_log_ctx.log_transid);
32297237f183SYan Zheng 		mutex_unlock(&log_root_tree->log_mutex);
32305ab5e44aSFilipe Manana 		if (!ret)
32318b050d35SMiao Xie 			ret = root_log_ctx.log_ret;
32327237f183SYan Zheng 		goto out;
32337237f183SYan Zheng 	}
3234d1433debSMiao Xie 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
32357237f183SYan Zheng 	atomic_set(&log_root_tree->log_commit[index2], 1);
32367237f183SYan Zheng 
323712fcfd22SChris Mason 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
323860d53eb3SZhaolei 		wait_log_commit(log_root_tree,
3239d1433debSMiao Xie 				root_log_ctx.log_transid - 1);
324012fcfd22SChris Mason 	}
32417237f183SYan Zheng 
324212fcfd22SChris Mason 	/*
324312fcfd22SChris Mason 	 * now that we've moved on to the tree of log tree roots,
324412fcfd22SChris Mason 	 * check the full commit flag again
324512fcfd22SChris Mason 	 */
32464884b8e8SDavid Sterba 	if (btrfs_need_log_full_commit(trans)) {
3247c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
3248bf89d38fSJeff Mahoney 		btrfs_wait_tree_log_extents(log, mark);
324912fcfd22SChris Mason 		mutex_unlock(&log_root_tree->log_mutex);
325012fcfd22SChris Mason 		ret = -EAGAIN;
325112fcfd22SChris Mason 		goto out_wake_log_root;
325212fcfd22SChris Mason 	}
32537237f183SYan Zheng 
32542ff7e61eSJeff Mahoney 	ret = btrfs_write_marked_extents(fs_info,
32558cef4e16SYan, Zheng 					 &log_root_tree->dirty_log_pages,
32568cef4e16SYan, Zheng 					 EXTENT_DIRTY | EXTENT_NEW);
3257c6adc9ccSMiao Xie 	blk_finish_plug(&plug);
3258b528f467SNaohiro Aota 	/*
3259b528f467SNaohiro Aota 	 * As described above, -EAGAIN indicates a hole in the extents. We
3260b528f467SNaohiro Aota 	 * cannot wait for these write outs since the waiting cause a
3261b528f467SNaohiro Aota 	 * deadlock. Bail out to the full commit instead.
3262b528f467SNaohiro Aota 	 */
3263b528f467SNaohiro Aota 	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3264b528f467SNaohiro Aota 		btrfs_set_log_full_commit(trans);
3265b528f467SNaohiro Aota 		btrfs_wait_tree_log_extents(log, mark);
3266b528f467SNaohiro Aota 		mutex_unlock(&log_root_tree->log_mutex);
3267b528f467SNaohiro Aota 		goto out_wake_log_root;
3268b528f467SNaohiro Aota 	} else if (ret) {
326990787766SDavid Sterba 		btrfs_set_log_full_commit(trans);
327066642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
327179787eaaSJeff Mahoney 		mutex_unlock(&log_root_tree->log_mutex);
327279787eaaSJeff Mahoney 		goto out_wake_log_root;
327379787eaaSJeff Mahoney 	}
3274bf89d38fSJeff Mahoney 	ret = btrfs_wait_tree_log_extents(log, mark);
32755ab5e44aSFilipe Manana 	if (!ret)
3276bf89d38fSJeff Mahoney 		ret = btrfs_wait_tree_log_extents(log_root_tree,
3277c6adc9ccSMiao Xie 						  EXTENT_NEW | EXTENT_DIRTY);
32785ab5e44aSFilipe Manana 	if (ret) {
327990787766SDavid Sterba 		btrfs_set_log_full_commit(trans);
32805ab5e44aSFilipe Manana 		mutex_unlock(&log_root_tree->log_mutex);
32815ab5e44aSFilipe Manana 		goto out_wake_log_root;
32825ab5e44aSFilipe Manana 	}
3283e02119d5SChris Mason 
328447876f7cSFilipe Manana 	log_root_start = log_root_tree->node->start;
328547876f7cSFilipe Manana 	log_root_level = btrfs_header_level(log_root_tree->node);
32867237f183SYan Zheng 	log_root_tree->log_transid++;
32877237f183SYan Zheng 	mutex_unlock(&log_root_tree->log_mutex);
32887237f183SYan Zheng 
32897237f183SYan Zheng 	/*
329047876f7cSFilipe Manana 	 * Here we are guaranteed that nobody is going to write the superblock
329147876f7cSFilipe Manana 	 * for the current transaction before us and that neither we do write
329247876f7cSFilipe Manana 	 * our superblock before the previous transaction finishes its commit
329347876f7cSFilipe Manana 	 * and writes its superblock, because:
329447876f7cSFilipe Manana 	 *
329547876f7cSFilipe Manana 	 * 1) We are holding a handle on the current transaction, so no body
329647876f7cSFilipe Manana 	 *    can commit it until we release the handle;
329747876f7cSFilipe Manana 	 *
329847876f7cSFilipe Manana 	 * 2) Before writing our superblock we acquire the tree_log_mutex, so
329947876f7cSFilipe Manana 	 *    if the previous transaction is still committing, and hasn't yet
330047876f7cSFilipe Manana 	 *    written its superblock, we wait for it to do it, because a
330147876f7cSFilipe Manana 	 *    transaction commit acquires the tree_log_mutex when the commit
330247876f7cSFilipe Manana 	 *    begins and releases it only after writing its superblock.
33037237f183SYan Zheng 	 */
330447876f7cSFilipe Manana 	mutex_lock(&fs_info->tree_log_mutex);
330547876f7cSFilipe Manana 	btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
330647876f7cSFilipe Manana 	btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3307eece6a9cSDavid Sterba 	ret = write_all_supers(fs_info, 1);
330847876f7cSFilipe Manana 	mutex_unlock(&fs_info->tree_log_mutex);
33095af3e8ccSStefan Behrens 	if (ret) {
331090787766SDavid Sterba 		btrfs_set_log_full_commit(trans);
331166642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
33125af3e8ccSStefan Behrens 		goto out_wake_log_root;
33135af3e8ccSStefan Behrens 	}
33147237f183SYan Zheng 
3315257c62e1SChris Mason 	mutex_lock(&root->log_mutex);
3316257c62e1SChris Mason 	if (root->last_log_commit < log_transid)
3317257c62e1SChris Mason 		root->last_log_commit = log_transid;
3318257c62e1SChris Mason 	mutex_unlock(&root->log_mutex);
3319257c62e1SChris Mason 
332012fcfd22SChris Mason out_wake_log_root:
3321570dd450SChris Mason 	mutex_lock(&log_root_tree->log_mutex);
33228b050d35SMiao Xie 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
33238b050d35SMiao Xie 
3324d1433debSMiao Xie 	log_root_tree->log_transid_committed++;
33257237f183SYan Zheng 	atomic_set(&log_root_tree->log_commit[index2], 0);
3326d1433debSMiao Xie 	mutex_unlock(&log_root_tree->log_mutex);
3327d1433debSMiao Xie 
332833a9eca7SDavid Sterba 	/*
3329093258e6SDavid Sterba 	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3330093258e6SDavid Sterba 	 * all the updates above are seen by the woken threads. It might not be
3331093258e6SDavid Sterba 	 * necessary, but proving that seems to be hard.
333233a9eca7SDavid Sterba 	 */
3333093258e6SDavid Sterba 	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3334e02119d5SChris Mason out:
3335d1433debSMiao Xie 	mutex_lock(&root->log_mutex);
3336570dd450SChris Mason 	btrfs_remove_all_log_ctxs(root, index1, ret);
3337d1433debSMiao Xie 	root->log_transid_committed++;
33387237f183SYan Zheng 	atomic_set(&root->log_commit[index1], 0);
3339d1433debSMiao Xie 	mutex_unlock(&root->log_mutex);
33408b050d35SMiao Xie 
334133a9eca7SDavid Sterba 	/*
3342093258e6SDavid Sterba 	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3343093258e6SDavid Sterba 	 * all the updates above are seen by the woken threads. It might not be
3344093258e6SDavid Sterba 	 * necessary, but proving that seems to be hard.
334533a9eca7SDavid Sterba 	 */
3346093258e6SDavid Sterba 	cond_wake_up(&root->log_commit_wait[index1]);
3347b31eabd8SChris Mason 	return ret;
3348e02119d5SChris Mason }
3349e02119d5SChris Mason 
33504a500fd1SYan, Zheng static void free_log_tree(struct btrfs_trans_handle *trans,
33514a500fd1SYan, Zheng 			  struct btrfs_root *log)
3352e02119d5SChris Mason {
3353e02119d5SChris Mason 	int ret;
3354e02119d5SChris Mason 	struct walk_control wc = {
3355e02119d5SChris Mason 		.free = 1,
3356e02119d5SChris Mason 		.process_func = process_one_buffer
3357e02119d5SChris Mason 	};
3358e02119d5SChris Mason 
33593ddebf27SNaohiro Aota 	if (log->node) {
3360e02119d5SChris Mason 		ret = walk_log_tree(trans, log, &wc);
3361374b0e2dSJeff Mahoney 		if (ret) {
3362374b0e2dSJeff Mahoney 			if (trans)
336366642832SJeff Mahoney 				btrfs_abort_transaction(trans, ret);
3364374b0e2dSJeff Mahoney 			else
3365374b0e2dSJeff Mahoney 				btrfs_handle_fs_error(log->fs_info, ret, NULL);
3366374b0e2dSJeff Mahoney 		}
33673ddebf27SNaohiro Aota 	}
3368e02119d5SChris Mason 
336959b0713aSFilipe Manana 	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
337055237a5fSLiu Bo 			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3371e289f03eSFilipe Manana 	extent_io_tree_release(&log->log_csum_range);
3372d3575156SNaohiro Aota 
3373d3575156SNaohiro Aota 	if (trans && log->node)
3374d3575156SNaohiro Aota 		btrfs_redirty_list_add(trans->transaction, log->node);
337500246528SJosef Bacik 	btrfs_put_root(log);
33764a500fd1SYan, Zheng }
33774a500fd1SYan, Zheng 
33784a500fd1SYan, Zheng /*
33794a500fd1SYan, Zheng  * free all the extents used by the tree log.  This should be called
33804a500fd1SYan, Zheng  * at commit time of the full transaction
33814a500fd1SYan, Zheng  */
33824a500fd1SYan, Zheng int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
33834a500fd1SYan, Zheng {
33844a500fd1SYan, Zheng 	if (root->log_root) {
33854a500fd1SYan, Zheng 		free_log_tree(trans, root->log_root);
33864a500fd1SYan, Zheng 		root->log_root = NULL;
3387e7a79811SFilipe Manana 		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
33884a500fd1SYan, Zheng 	}
33894a500fd1SYan, Zheng 	return 0;
33904a500fd1SYan, Zheng }
33914a500fd1SYan, Zheng 
33924a500fd1SYan, Zheng int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
33934a500fd1SYan, Zheng 			     struct btrfs_fs_info *fs_info)
33944a500fd1SYan, Zheng {
33954a500fd1SYan, Zheng 	if (fs_info->log_root_tree) {
33964a500fd1SYan, Zheng 		free_log_tree(trans, fs_info->log_root_tree);
33974a500fd1SYan, Zheng 		fs_info->log_root_tree = NULL;
339847876f7cSFilipe Manana 		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
33994a500fd1SYan, Zheng 	}
3400e02119d5SChris Mason 	return 0;
3401e02119d5SChris Mason }
3402e02119d5SChris Mason 
3403e02119d5SChris Mason /*
3404803f0f64SFilipe Manana  * Check if an inode was logged in the current transaction. We can't always rely
3405803f0f64SFilipe Manana  * on an inode's logged_trans value, because it's an in-memory only field and
3406803f0f64SFilipe Manana  * therefore not persisted. This means that its value is lost if the inode gets
3407803f0f64SFilipe Manana  * evicted and loaded again from disk (in which case it has a value of 0, and
3408803f0f64SFilipe Manana  * certainly it is smaller then any possible transaction ID), when that happens
3409803f0f64SFilipe Manana  * the full_sync flag is set in the inode's runtime flags, so on that case we
3410803f0f64SFilipe Manana  * assume eviction happened and ignore the logged_trans value, assuming the
3411803f0f64SFilipe Manana  * worst case, that the inode was logged before in the current transaction.
3412803f0f64SFilipe Manana  */
3413803f0f64SFilipe Manana static bool inode_logged(struct btrfs_trans_handle *trans,
3414803f0f64SFilipe Manana 			 struct btrfs_inode *inode)
3415803f0f64SFilipe Manana {
3416803f0f64SFilipe Manana 	if (inode->logged_trans == trans->transid)
3417803f0f64SFilipe Manana 		return true;
3418803f0f64SFilipe Manana 
3419803f0f64SFilipe Manana 	if (inode->last_trans == trans->transid &&
3420803f0f64SFilipe Manana 	    test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3421803f0f64SFilipe Manana 	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3422803f0f64SFilipe Manana 		return true;
3423803f0f64SFilipe Manana 
3424803f0f64SFilipe Manana 	return false;
3425803f0f64SFilipe Manana }
3426803f0f64SFilipe Manana 
3427803f0f64SFilipe Manana /*
3428e02119d5SChris Mason  * If both a file and directory are logged, and unlinks or renames are
3429e02119d5SChris Mason  * mixed in, we have a few interesting corners:
3430e02119d5SChris Mason  *
3431e02119d5SChris Mason  * create file X in dir Y
3432e02119d5SChris Mason  * link file X to X.link in dir Y
3433e02119d5SChris Mason  * fsync file X
3434e02119d5SChris Mason  * unlink file X but leave X.link
3435e02119d5SChris Mason  * fsync dir Y
3436e02119d5SChris Mason  *
3437e02119d5SChris Mason  * After a crash we would expect only X.link to exist.  But file X
3438e02119d5SChris Mason  * didn't get fsync'd again so the log has back refs for X and X.link.
3439e02119d5SChris Mason  *
3440e02119d5SChris Mason  * We solve this by removing directory entries and inode backrefs from the
3441e02119d5SChris Mason  * log when a file that was logged in the current transaction is
3442e02119d5SChris Mason  * unlinked.  Any later fsync will include the updated log entries, and
3443e02119d5SChris Mason  * we'll be able to reconstruct the proper directory items from backrefs.
3444e02119d5SChris Mason  *
3445e02119d5SChris Mason  * This optimizations allows us to avoid relogging the entire inode
3446e02119d5SChris Mason  * or the entire directory.
3447e02119d5SChris Mason  */
3448e02119d5SChris Mason int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3449e02119d5SChris Mason 				 struct btrfs_root *root,
3450e02119d5SChris Mason 				 const char *name, int name_len,
345149f34d1fSNikolay Borisov 				 struct btrfs_inode *dir, u64 index)
3452e02119d5SChris Mason {
3453e02119d5SChris Mason 	struct btrfs_root *log;
3454e02119d5SChris Mason 	struct btrfs_dir_item *di;
3455e02119d5SChris Mason 	struct btrfs_path *path;
3456e02119d5SChris Mason 	int ret;
34574a500fd1SYan, Zheng 	int err = 0;
345849f34d1fSNikolay Borisov 	u64 dir_ino = btrfs_ino(dir);
3459e02119d5SChris Mason 
3460803f0f64SFilipe Manana 	if (!inode_logged(trans, dir))
34613a5f1d45SChris Mason 		return 0;
34623a5f1d45SChris Mason 
3463e02119d5SChris Mason 	ret = join_running_log_trans(root);
3464e02119d5SChris Mason 	if (ret)
3465e02119d5SChris Mason 		return 0;
3466e02119d5SChris Mason 
346749f34d1fSNikolay Borisov 	mutex_lock(&dir->log_mutex);
3468e02119d5SChris Mason 
3469e02119d5SChris Mason 	log = root->log_root;
3470e02119d5SChris Mason 	path = btrfs_alloc_path();
3471a62f44a5STsutomu Itoh 	if (!path) {
3472a62f44a5STsutomu Itoh 		err = -ENOMEM;
3473a62f44a5STsutomu Itoh 		goto out_unlock;
3474a62f44a5STsutomu Itoh 	}
34752a29edc6Sliubo 
347633345d01SLi Zefan 	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3477e02119d5SChris Mason 				   name, name_len, -1);
34784a500fd1SYan, Zheng 	if (IS_ERR(di)) {
34794a500fd1SYan, Zheng 		err = PTR_ERR(di);
34804a500fd1SYan, Zheng 		goto fail;
34814a500fd1SYan, Zheng 	}
34824a500fd1SYan, Zheng 	if (di) {
3483e02119d5SChris Mason 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
34843650860bSJosef Bacik 		if (ret) {
34853650860bSJosef Bacik 			err = ret;
34863650860bSJosef Bacik 			goto fail;
34873650860bSJosef Bacik 		}
3488e02119d5SChris Mason 	}
3489b3b4aa74SDavid Sterba 	btrfs_release_path(path);
349033345d01SLi Zefan 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3491e02119d5SChris Mason 					 index, name, name_len, -1);
34924a500fd1SYan, Zheng 	if (IS_ERR(di)) {
34934a500fd1SYan, Zheng 		err = PTR_ERR(di);
34944a500fd1SYan, Zheng 		goto fail;
34954a500fd1SYan, Zheng 	}
34964a500fd1SYan, Zheng 	if (di) {
3497e02119d5SChris Mason 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
34983650860bSJosef Bacik 		if (ret) {
34993650860bSJosef Bacik 			err = ret;
35003650860bSJosef Bacik 			goto fail;
35013650860bSJosef Bacik 		}
3502e02119d5SChris Mason 	}
3503e02119d5SChris Mason 
3504ddffcf6fSFilipe Manana 	/*
3505ddffcf6fSFilipe Manana 	 * We do not need to update the size field of the directory's inode item
3506ddffcf6fSFilipe Manana 	 * because on log replay we update the field to reflect all existing
3507ddffcf6fSFilipe Manana 	 * entries in the directory (see overwrite_item()).
3508e02119d5SChris Mason 	 */
35094a500fd1SYan, Zheng fail:
3510e02119d5SChris Mason 	btrfs_free_path(path);
3511a62f44a5STsutomu Itoh out_unlock:
351249f34d1fSNikolay Borisov 	mutex_unlock(&dir->log_mutex);
3513fb2fecbaSJosef Bacik 	if (err == -ENOSPC) {
351490787766SDavid Sterba 		btrfs_set_log_full_commit(trans);
3515fb2fecbaSJosef Bacik 		err = 0;
3516fb2fecbaSJosef Bacik 	} else if (err < 0 && err != -ENOENT) {
3517fb2fecbaSJosef Bacik 		/* ENOENT can be returned if the entry hasn't been fsynced yet */
3518fb2fecbaSJosef Bacik 		btrfs_abort_transaction(trans, err);
3519fb2fecbaSJosef Bacik 	}
352079787eaaSJeff Mahoney 
352112fcfd22SChris Mason 	btrfs_end_log_trans(root);
3522e02119d5SChris Mason 
3523411fc6bcSAndi Kleen 	return err;
3524e02119d5SChris Mason }
3525e02119d5SChris Mason 
3526e02119d5SChris Mason /* see comments for btrfs_del_dir_entries_in_log */
3527e02119d5SChris Mason int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3528e02119d5SChris Mason 			       struct btrfs_root *root,
3529e02119d5SChris Mason 			       const char *name, int name_len,
3530a491abb2SNikolay Borisov 			       struct btrfs_inode *inode, u64 dirid)
3531e02119d5SChris Mason {
3532e02119d5SChris Mason 	struct btrfs_root *log;
3533e02119d5SChris Mason 	u64 index;
3534e02119d5SChris Mason 	int ret;
3535e02119d5SChris Mason 
3536803f0f64SFilipe Manana 	if (!inode_logged(trans, inode))
35373a5f1d45SChris Mason 		return 0;
35383a5f1d45SChris Mason 
3539e02119d5SChris Mason 	ret = join_running_log_trans(root);
3540e02119d5SChris Mason 	if (ret)
3541e02119d5SChris Mason 		return 0;
3542e02119d5SChris Mason 	log = root->log_root;
3543a491abb2SNikolay Borisov 	mutex_lock(&inode->log_mutex);
3544e02119d5SChris Mason 
3545a491abb2SNikolay Borisov 	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3546e02119d5SChris Mason 				  dirid, &index);
3547a491abb2SNikolay Borisov 	mutex_unlock(&inode->log_mutex);
35484a500fd1SYan, Zheng 	if (ret == -ENOSPC) {
354990787766SDavid Sterba 		btrfs_set_log_full_commit(trans);
35504a500fd1SYan, Zheng 		ret = 0;
355179787eaaSJeff Mahoney 	} else if (ret < 0 && ret != -ENOENT)
355266642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
355312fcfd22SChris Mason 	btrfs_end_log_trans(root);
3554e02119d5SChris Mason 
3555e02119d5SChris Mason 	return ret;
3556e02119d5SChris Mason }
3557e02119d5SChris Mason 
3558e02119d5SChris Mason /*
3559e02119d5SChris Mason  * creates a range item in the log for 'dirid'.  first_offset and
3560e02119d5SChris Mason  * last_offset tell us which parts of the key space the log should
3561e02119d5SChris Mason  * be considered authoritative for.
3562e02119d5SChris Mason  */
3563e02119d5SChris Mason static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3564e02119d5SChris Mason 				       struct btrfs_root *log,
3565e02119d5SChris Mason 				       struct btrfs_path *path,
3566e02119d5SChris Mason 				       int key_type, u64 dirid,
3567e02119d5SChris Mason 				       u64 first_offset, u64 last_offset)
3568e02119d5SChris Mason {
3569e02119d5SChris Mason 	int ret;
3570e02119d5SChris Mason 	struct btrfs_key key;
3571e02119d5SChris Mason 	struct btrfs_dir_log_item *item;
3572e02119d5SChris Mason 
3573e02119d5SChris Mason 	key.objectid = dirid;
3574e02119d5SChris Mason 	key.offset = first_offset;
3575e02119d5SChris Mason 	if (key_type == BTRFS_DIR_ITEM_KEY)
3576e02119d5SChris Mason 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
3577e02119d5SChris Mason 	else
3578e02119d5SChris Mason 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
3579e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
35804a500fd1SYan, Zheng 	if (ret)
35814a500fd1SYan, Zheng 		return ret;
3582e02119d5SChris Mason 
3583e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3584e02119d5SChris Mason 			      struct btrfs_dir_log_item);
3585e02119d5SChris Mason 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3586e02119d5SChris Mason 	btrfs_mark_buffer_dirty(path->nodes[0]);
3587b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3588e02119d5SChris Mason 	return 0;
3589e02119d5SChris Mason }
3590e02119d5SChris Mason 
3591e02119d5SChris Mason /*
3592e02119d5SChris Mason  * log all the items included in the current transaction for a given
3593e02119d5SChris Mason  * directory.  This also creates the range items in the log tree required
3594e02119d5SChris Mason  * to replay anything deleted before the fsync
3595e02119d5SChris Mason  */
3596e02119d5SChris Mason static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3597684a5773SNikolay Borisov 			  struct btrfs_root *root, struct btrfs_inode *inode,
3598e02119d5SChris Mason 			  struct btrfs_path *path,
3599e02119d5SChris Mason 			  struct btrfs_path *dst_path, int key_type,
36002f2ff0eeSFilipe Manana 			  struct btrfs_log_ctx *ctx,
3601e02119d5SChris Mason 			  u64 min_offset, u64 *last_offset_ret)
3602e02119d5SChris Mason {
3603e02119d5SChris Mason 	struct btrfs_key min_key;
3604e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
3605e02119d5SChris Mason 	struct extent_buffer *src;
36064a500fd1SYan, Zheng 	int err = 0;
3607e02119d5SChris Mason 	int ret;
3608e02119d5SChris Mason 	int i;
3609e02119d5SChris Mason 	int nritems;
3610e02119d5SChris Mason 	u64 first_offset = min_offset;
3611e02119d5SChris Mason 	u64 last_offset = (u64)-1;
3612684a5773SNikolay Borisov 	u64 ino = btrfs_ino(inode);
3613e02119d5SChris Mason 
3614e02119d5SChris Mason 	log = root->log_root;
3615e02119d5SChris Mason 
361633345d01SLi Zefan 	min_key.objectid = ino;
3617e02119d5SChris Mason 	min_key.type = key_type;
3618e02119d5SChris Mason 	min_key.offset = min_offset;
3619e02119d5SChris Mason 
36206174d3cbSFilipe David Borba Manana 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3621e02119d5SChris Mason 
3622e02119d5SChris Mason 	/*
3623e02119d5SChris Mason 	 * we didn't find anything from this transaction, see if there
3624e02119d5SChris Mason 	 * is anything at all
3625e02119d5SChris Mason 	 */
362633345d01SLi Zefan 	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
362733345d01SLi Zefan 		min_key.objectid = ino;
3628e02119d5SChris Mason 		min_key.type = key_type;
3629e02119d5SChris Mason 		min_key.offset = (u64)-1;
3630b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3631e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3632e02119d5SChris Mason 		if (ret < 0) {
3633b3b4aa74SDavid Sterba 			btrfs_release_path(path);
3634e02119d5SChris Mason 			return ret;
3635e02119d5SChris Mason 		}
363633345d01SLi Zefan 		ret = btrfs_previous_item(root, path, ino, key_type);
3637e02119d5SChris Mason 
3638e02119d5SChris Mason 		/* if ret == 0 there are items for this type,
3639e02119d5SChris Mason 		 * create a range to tell us the last key of this type.
3640e02119d5SChris Mason 		 * otherwise, there are no items in this directory after
3641e02119d5SChris Mason 		 * *min_offset, and we create a range to indicate that.
3642e02119d5SChris Mason 		 */
3643e02119d5SChris Mason 		if (ret == 0) {
3644e02119d5SChris Mason 			struct btrfs_key tmp;
3645e02119d5SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3646e02119d5SChris Mason 					      path->slots[0]);
3647d397712bSChris Mason 			if (key_type == tmp.type)
3648e02119d5SChris Mason 				first_offset = max(min_offset, tmp.offset) + 1;
3649e02119d5SChris Mason 		}
3650e02119d5SChris Mason 		goto done;
3651e02119d5SChris Mason 	}
3652e02119d5SChris Mason 
3653e02119d5SChris Mason 	/* go backward to find any previous key */
365433345d01SLi Zefan 	ret = btrfs_previous_item(root, path, ino, key_type);
3655e02119d5SChris Mason 	if (ret == 0) {
3656e02119d5SChris Mason 		struct btrfs_key tmp;
3657e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3658e02119d5SChris Mason 		if (key_type == tmp.type) {
3659e02119d5SChris Mason 			first_offset = tmp.offset;
3660e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path,
3661e02119d5SChris Mason 					     path->nodes[0], path->slots[0],
3662e02119d5SChris Mason 					     &tmp);
36634a500fd1SYan, Zheng 			if (ret) {
36644a500fd1SYan, Zheng 				err = ret;
36654a500fd1SYan, Zheng 				goto done;
36664a500fd1SYan, Zheng 			}
3667e02119d5SChris Mason 		}
3668e02119d5SChris Mason 	}
3669b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3670e02119d5SChris Mason 
36712cc83342SJosef Bacik 	/*
36722cc83342SJosef Bacik 	 * Find the first key from this transaction again.  See the note for
36732cc83342SJosef Bacik 	 * log_new_dir_dentries, if we're logging a directory recursively we
36742cc83342SJosef Bacik 	 * won't be holding its i_mutex, which means we can modify the directory
36752cc83342SJosef Bacik 	 * while we're logging it.  If we remove an entry between our first
36762cc83342SJosef Bacik 	 * search and this search we'll not find the key again and can just
36772cc83342SJosef Bacik 	 * bail.
36782cc83342SJosef Bacik 	 */
3679bb56f02fSFilipe Manana search:
3680e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
36812cc83342SJosef Bacik 	if (ret != 0)
3682e02119d5SChris Mason 		goto done;
3683e02119d5SChris Mason 
3684e02119d5SChris Mason 	/*
3685e02119d5SChris Mason 	 * we have a block from this transaction, log every item in it
3686e02119d5SChris Mason 	 * from our directory
3687e02119d5SChris Mason 	 */
3688e02119d5SChris Mason 	while (1) {
3689e02119d5SChris Mason 		struct btrfs_key tmp;
3690e02119d5SChris Mason 		src = path->nodes[0];
3691e02119d5SChris Mason 		nritems = btrfs_header_nritems(src);
3692e02119d5SChris Mason 		for (i = path->slots[0]; i < nritems; i++) {
36932f2ff0eeSFilipe Manana 			struct btrfs_dir_item *di;
36942f2ff0eeSFilipe Manana 
3695e02119d5SChris Mason 			btrfs_item_key_to_cpu(src, &min_key, i);
3696e02119d5SChris Mason 
369733345d01SLi Zefan 			if (min_key.objectid != ino || min_key.type != key_type)
3698e02119d5SChris Mason 				goto done;
3699bb56f02fSFilipe Manana 
3700bb56f02fSFilipe Manana 			if (need_resched()) {
3701bb56f02fSFilipe Manana 				btrfs_release_path(path);
3702bb56f02fSFilipe Manana 				cond_resched();
3703bb56f02fSFilipe Manana 				goto search;
3704bb56f02fSFilipe Manana 			}
3705bb56f02fSFilipe Manana 
3706e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path, src, i,
3707e02119d5SChris Mason 					     &min_key);
37084a500fd1SYan, Zheng 			if (ret) {
37094a500fd1SYan, Zheng 				err = ret;
37104a500fd1SYan, Zheng 				goto done;
37114a500fd1SYan, Zheng 			}
37122f2ff0eeSFilipe Manana 
37132f2ff0eeSFilipe Manana 			/*
37142f2ff0eeSFilipe Manana 			 * We must make sure that when we log a directory entry,
37152f2ff0eeSFilipe Manana 			 * the corresponding inode, after log replay, has a
37162f2ff0eeSFilipe Manana 			 * matching link count. For example:
37172f2ff0eeSFilipe Manana 			 *
37182f2ff0eeSFilipe Manana 			 * touch foo
37192f2ff0eeSFilipe Manana 			 * mkdir mydir
37202f2ff0eeSFilipe Manana 			 * sync
37212f2ff0eeSFilipe Manana 			 * ln foo mydir/bar
37222f2ff0eeSFilipe Manana 			 * xfs_io -c "fsync" mydir
37232f2ff0eeSFilipe Manana 			 * <crash>
37242f2ff0eeSFilipe Manana 			 * <mount fs and log replay>
37252f2ff0eeSFilipe Manana 			 *
37262f2ff0eeSFilipe Manana 			 * Would result in a fsync log that when replayed, our
37272f2ff0eeSFilipe Manana 			 * file inode would have a link count of 1, but we get
37282f2ff0eeSFilipe Manana 			 * two directory entries pointing to the same inode.
37292f2ff0eeSFilipe Manana 			 * After removing one of the names, it would not be
37302f2ff0eeSFilipe Manana 			 * possible to remove the other name, which resulted
37312f2ff0eeSFilipe Manana 			 * always in stale file handle errors, and would not
37322f2ff0eeSFilipe Manana 			 * be possible to rmdir the parent directory, since
37332f2ff0eeSFilipe Manana 			 * its i_size could never decrement to the value
37342f2ff0eeSFilipe Manana 			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
37352f2ff0eeSFilipe Manana 			 */
37362f2ff0eeSFilipe Manana 			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
37372f2ff0eeSFilipe Manana 			btrfs_dir_item_key_to_cpu(src, di, &tmp);
37382f2ff0eeSFilipe Manana 			if (ctx &&
37392f2ff0eeSFilipe Manana 			    (btrfs_dir_transid(src, di) == trans->transid ||
37402f2ff0eeSFilipe Manana 			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
37412f2ff0eeSFilipe Manana 			    tmp.type != BTRFS_ROOT_ITEM_KEY)
37422f2ff0eeSFilipe Manana 				ctx->log_new_dentries = true;
3743e02119d5SChris Mason 		}
3744e02119d5SChris Mason 		path->slots[0] = nritems;
3745e02119d5SChris Mason 
3746e02119d5SChris Mason 		/*
3747e02119d5SChris Mason 		 * look ahead to the next item and see if it is also
3748e02119d5SChris Mason 		 * from this directory and from this transaction
3749e02119d5SChris Mason 		 */
3750e02119d5SChris Mason 		ret = btrfs_next_leaf(root, path);
375180c0b421SLiu Bo 		if (ret) {
375280c0b421SLiu Bo 			if (ret == 1)
3753e02119d5SChris Mason 				last_offset = (u64)-1;
375480c0b421SLiu Bo 			else
375580c0b421SLiu Bo 				err = ret;
3756e02119d5SChris Mason 			goto done;
3757e02119d5SChris Mason 		}
3758e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
375933345d01SLi Zefan 		if (tmp.objectid != ino || tmp.type != key_type) {
3760e02119d5SChris Mason 			last_offset = (u64)-1;
3761e02119d5SChris Mason 			goto done;
3762e02119d5SChris Mason 		}
3763e02119d5SChris Mason 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3764e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path,
3765e02119d5SChris Mason 					     path->nodes[0], path->slots[0],
3766e02119d5SChris Mason 					     &tmp);
37674a500fd1SYan, Zheng 			if (ret)
37684a500fd1SYan, Zheng 				err = ret;
37694a500fd1SYan, Zheng 			else
3770e02119d5SChris Mason 				last_offset = tmp.offset;
3771e02119d5SChris Mason 			goto done;
3772e02119d5SChris Mason 		}
3773e02119d5SChris Mason 	}
3774e02119d5SChris Mason done:
3775b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3776b3b4aa74SDavid Sterba 	btrfs_release_path(dst_path);
3777e02119d5SChris Mason 
37784a500fd1SYan, Zheng 	if (err == 0) {
37794a500fd1SYan, Zheng 		*last_offset_ret = last_offset;
37804a500fd1SYan, Zheng 		/*
37814a500fd1SYan, Zheng 		 * insert the log range keys to indicate where the log
37824a500fd1SYan, Zheng 		 * is valid
37834a500fd1SYan, Zheng 		 */
37844a500fd1SYan, Zheng 		ret = insert_dir_log_key(trans, log, path, key_type,
378533345d01SLi Zefan 					 ino, first_offset, last_offset);
37864a500fd1SYan, Zheng 		if (ret)
37874a500fd1SYan, Zheng 			err = ret;
37884a500fd1SYan, Zheng 	}
37894a500fd1SYan, Zheng 	return err;
3790e02119d5SChris Mason }
3791e02119d5SChris Mason 
3792e02119d5SChris Mason /*
3793e02119d5SChris Mason  * logging directories is very similar to logging inodes, We find all the items
3794e02119d5SChris Mason  * from the current transaction and write them to the log.
3795e02119d5SChris Mason  *
3796e02119d5SChris Mason  * The recovery code scans the directory in the subvolume, and if it finds a
3797e02119d5SChris Mason  * key in the range logged that is not present in the log tree, then it means
3798e02119d5SChris Mason  * that dir entry was unlinked during the transaction.
3799e02119d5SChris Mason  *
3800e02119d5SChris Mason  * In order for that scan to work, we must include one key smaller than
3801e02119d5SChris Mason  * the smallest logged by this transaction and one key larger than the largest
3802e02119d5SChris Mason  * key logged by this transaction.
3803e02119d5SChris Mason  */
3804e02119d5SChris Mason static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3805dbf39ea4SNikolay Borisov 			  struct btrfs_root *root, struct btrfs_inode *inode,
3806e02119d5SChris Mason 			  struct btrfs_path *path,
38072f2ff0eeSFilipe Manana 			  struct btrfs_path *dst_path,
38082f2ff0eeSFilipe Manana 			  struct btrfs_log_ctx *ctx)
3809e02119d5SChris Mason {
3810e02119d5SChris Mason 	u64 min_key;
3811e02119d5SChris Mason 	u64 max_key;
3812e02119d5SChris Mason 	int ret;
3813e02119d5SChris Mason 	int key_type = BTRFS_DIR_ITEM_KEY;
3814e02119d5SChris Mason 
3815e02119d5SChris Mason again:
3816e02119d5SChris Mason 	min_key = 0;
3817e02119d5SChris Mason 	max_key = 0;
3818e02119d5SChris Mason 	while (1) {
3819dbf39ea4SNikolay Borisov 		ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3820dbf39ea4SNikolay Borisov 				ctx, min_key, &max_key);
38214a500fd1SYan, Zheng 		if (ret)
38224a500fd1SYan, Zheng 			return ret;
3823e02119d5SChris Mason 		if (max_key == (u64)-1)
3824e02119d5SChris Mason 			break;
3825e02119d5SChris Mason 		min_key = max_key + 1;
3826e02119d5SChris Mason 	}
3827e02119d5SChris Mason 
3828e02119d5SChris Mason 	if (key_type == BTRFS_DIR_ITEM_KEY) {
3829e02119d5SChris Mason 		key_type = BTRFS_DIR_INDEX_KEY;
3830e02119d5SChris Mason 		goto again;
3831e02119d5SChris Mason 	}
3832e02119d5SChris Mason 	return 0;
3833e02119d5SChris Mason }
3834e02119d5SChris Mason 
3835e02119d5SChris Mason /*
3836e02119d5SChris Mason  * a helper function to drop items from the log before we relog an
3837e02119d5SChris Mason  * inode.  max_key_type indicates the highest item type to remove.
3838e02119d5SChris Mason  * This cannot be run for file data extents because it does not
3839e02119d5SChris Mason  * free the extents they point to.
3840e02119d5SChris Mason  */
3841e02119d5SChris Mason static int drop_objectid_items(struct btrfs_trans_handle *trans,
3842e02119d5SChris Mason 				  struct btrfs_root *log,
3843e02119d5SChris Mason 				  struct btrfs_path *path,
3844e02119d5SChris Mason 				  u64 objectid, int max_key_type)
3845e02119d5SChris Mason {
3846e02119d5SChris Mason 	int ret;
3847e02119d5SChris Mason 	struct btrfs_key key;
3848e02119d5SChris Mason 	struct btrfs_key found_key;
384918ec90d6SJosef Bacik 	int start_slot;
3850e02119d5SChris Mason 
3851e02119d5SChris Mason 	key.objectid = objectid;
3852e02119d5SChris Mason 	key.type = max_key_type;
3853e02119d5SChris Mason 	key.offset = (u64)-1;
3854e02119d5SChris Mason 
3855e02119d5SChris Mason 	while (1) {
3856e02119d5SChris Mason 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
38573650860bSJosef Bacik 		BUG_ON(ret == 0); /* Logic error */
38584a500fd1SYan, Zheng 		if (ret < 0)
3859e02119d5SChris Mason 			break;
3860e02119d5SChris Mason 
3861e02119d5SChris Mason 		if (path->slots[0] == 0)
3862e02119d5SChris Mason 			break;
3863e02119d5SChris Mason 
3864e02119d5SChris Mason 		path->slots[0]--;
3865e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3866e02119d5SChris Mason 				      path->slots[0]);
3867e02119d5SChris Mason 
3868e02119d5SChris Mason 		if (found_key.objectid != objectid)
3869e02119d5SChris Mason 			break;
3870e02119d5SChris Mason 
387118ec90d6SJosef Bacik 		found_key.offset = 0;
387218ec90d6SJosef Bacik 		found_key.type = 0;
3873e3b83361SQu Wenruo 		ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
3874cbca7d59SFilipe Manana 		if (ret < 0)
3875cbca7d59SFilipe Manana 			break;
387618ec90d6SJosef Bacik 
387718ec90d6SJosef Bacik 		ret = btrfs_del_items(trans, log, path, start_slot,
387818ec90d6SJosef Bacik 				      path->slots[0] - start_slot + 1);
387918ec90d6SJosef Bacik 		/*
388018ec90d6SJosef Bacik 		 * If start slot isn't 0 then we don't need to re-search, we've
388118ec90d6SJosef Bacik 		 * found the last guy with the objectid in this tree.
388218ec90d6SJosef Bacik 		 */
388318ec90d6SJosef Bacik 		if (ret || start_slot != 0)
388465a246c5STsutomu Itoh 			break;
3885b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3886e02119d5SChris Mason 	}
3887b3b4aa74SDavid Sterba 	btrfs_release_path(path);
38885bdbeb21SJosef Bacik 	if (ret > 0)
38895bdbeb21SJosef Bacik 		ret = 0;
38904a500fd1SYan, Zheng 	return ret;
3891e02119d5SChris Mason }
3892e02119d5SChris Mason 
389394edf4aeSJosef Bacik static void fill_inode_item(struct btrfs_trans_handle *trans,
389494edf4aeSJosef Bacik 			    struct extent_buffer *leaf,
389594edf4aeSJosef Bacik 			    struct btrfs_inode_item *item,
38961a4bcf47SFilipe Manana 			    struct inode *inode, int log_inode_only,
38971a4bcf47SFilipe Manana 			    u64 logged_isize)
389894edf4aeSJosef Bacik {
38990b1c6ccaSJosef Bacik 	struct btrfs_map_token token;
390094edf4aeSJosef Bacik 
3901c82f823cSDavid Sterba 	btrfs_init_map_token(&token, leaf);
390294edf4aeSJosef Bacik 
390394edf4aeSJosef Bacik 	if (log_inode_only) {
390494edf4aeSJosef Bacik 		/* set the generation to zero so the recover code
390594edf4aeSJosef Bacik 		 * can tell the difference between an logging
390694edf4aeSJosef Bacik 		 * just to say 'this inode exists' and a logging
390794edf4aeSJosef Bacik 		 * to say 'update this inode with these values'
390894edf4aeSJosef Bacik 		 */
3909cc4c13d5SDavid Sterba 		btrfs_set_token_inode_generation(&token, item, 0);
3910cc4c13d5SDavid Sterba 		btrfs_set_token_inode_size(&token, item, logged_isize);
391194edf4aeSJosef Bacik 	} else {
3912cc4c13d5SDavid Sterba 		btrfs_set_token_inode_generation(&token, item,
3913cc4c13d5SDavid Sterba 						 BTRFS_I(inode)->generation);
3914cc4c13d5SDavid Sterba 		btrfs_set_token_inode_size(&token, item, inode->i_size);
391594edf4aeSJosef Bacik 	}
391694edf4aeSJosef Bacik 
3917cc4c13d5SDavid Sterba 	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3918cc4c13d5SDavid Sterba 	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3919cc4c13d5SDavid Sterba 	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3920cc4c13d5SDavid Sterba 	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
39210b1c6ccaSJosef Bacik 
3922cc4c13d5SDavid Sterba 	btrfs_set_token_timespec_sec(&token, &item->atime,
3923cc4c13d5SDavid Sterba 				     inode->i_atime.tv_sec);
3924cc4c13d5SDavid Sterba 	btrfs_set_token_timespec_nsec(&token, &item->atime,
3925cc4c13d5SDavid Sterba 				      inode->i_atime.tv_nsec);
39260b1c6ccaSJosef Bacik 
3927cc4c13d5SDavid Sterba 	btrfs_set_token_timespec_sec(&token, &item->mtime,
3928cc4c13d5SDavid Sterba 				     inode->i_mtime.tv_sec);
3929cc4c13d5SDavid Sterba 	btrfs_set_token_timespec_nsec(&token, &item->mtime,
3930cc4c13d5SDavid Sterba 				      inode->i_mtime.tv_nsec);
39310b1c6ccaSJosef Bacik 
3932cc4c13d5SDavid Sterba 	btrfs_set_token_timespec_sec(&token, &item->ctime,
3933cc4c13d5SDavid Sterba 				     inode->i_ctime.tv_sec);
3934cc4c13d5SDavid Sterba 	btrfs_set_token_timespec_nsec(&token, &item->ctime,
3935cc4c13d5SDavid Sterba 				      inode->i_ctime.tv_nsec);
39360b1c6ccaSJosef Bacik 
3937e593e54eSFilipe Manana 	/*
3938e593e54eSFilipe Manana 	 * We do not need to set the nbytes field, in fact during a fast fsync
3939e593e54eSFilipe Manana 	 * its value may not even be correct, since a fast fsync does not wait
3940e593e54eSFilipe Manana 	 * for ordered extent completion, which is where we update nbytes, it
3941e593e54eSFilipe Manana 	 * only waits for writeback to complete. During log replay as we find
3942e593e54eSFilipe Manana 	 * file extent items and replay them, we adjust the nbytes field of the
3943e593e54eSFilipe Manana 	 * inode item in subvolume tree as needed (see overwrite_item()).
3944e593e54eSFilipe Manana 	 */
39450b1c6ccaSJosef Bacik 
3946cc4c13d5SDavid Sterba 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3947cc4c13d5SDavid Sterba 	btrfs_set_token_inode_transid(&token, item, trans->transid);
3948cc4c13d5SDavid Sterba 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3949cc4c13d5SDavid Sterba 	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
3950cc4c13d5SDavid Sterba 	btrfs_set_token_inode_block_group(&token, item, 0);
395194edf4aeSJosef Bacik }
395294edf4aeSJosef Bacik 
3953a95249b3SJosef Bacik static int log_inode_item(struct btrfs_trans_handle *trans,
3954a95249b3SJosef Bacik 			  struct btrfs_root *log, struct btrfs_path *path,
39556d889a3bSNikolay Borisov 			  struct btrfs_inode *inode)
3956a95249b3SJosef Bacik {
3957a95249b3SJosef Bacik 	struct btrfs_inode_item *inode_item;
3958a95249b3SJosef Bacik 	int ret;
3959a95249b3SJosef Bacik 
3960efd0c405SFilipe David Borba Manana 	ret = btrfs_insert_empty_item(trans, log, path,
39616d889a3bSNikolay Borisov 				      &inode->location, sizeof(*inode_item));
3962a95249b3SJosef Bacik 	if (ret && ret != -EEXIST)
3963a95249b3SJosef Bacik 		return ret;
3964a95249b3SJosef Bacik 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3965a95249b3SJosef Bacik 				    struct btrfs_inode_item);
39666d889a3bSNikolay Borisov 	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
39676d889a3bSNikolay Borisov 			0, 0);
3968a95249b3SJosef Bacik 	btrfs_release_path(path);
3969a95249b3SJosef Bacik 	return 0;
3970a95249b3SJosef Bacik }
3971a95249b3SJosef Bacik 
397240e046acSFilipe Manana static int log_csums(struct btrfs_trans_handle *trans,
39733ebac17cSFilipe Manana 		     struct btrfs_inode *inode,
397440e046acSFilipe Manana 		     struct btrfs_root *log_root,
397540e046acSFilipe Manana 		     struct btrfs_ordered_sum *sums)
397640e046acSFilipe Manana {
3977e289f03eSFilipe Manana 	const u64 lock_end = sums->bytenr + sums->len - 1;
3978e289f03eSFilipe Manana 	struct extent_state *cached_state = NULL;
397940e046acSFilipe Manana 	int ret;
398040e046acSFilipe Manana 
398140e046acSFilipe Manana 	/*
39823ebac17cSFilipe Manana 	 * If this inode was not used for reflink operations in the current
39833ebac17cSFilipe Manana 	 * transaction with new extents, then do the fast path, no need to
39843ebac17cSFilipe Manana 	 * worry about logging checksum items with overlapping ranges.
39853ebac17cSFilipe Manana 	 */
39863ebac17cSFilipe Manana 	if (inode->last_reflink_trans < trans->transid)
39873ebac17cSFilipe Manana 		return btrfs_csum_file_blocks(trans, log_root, sums);
39883ebac17cSFilipe Manana 
39893ebac17cSFilipe Manana 	/*
3990e289f03eSFilipe Manana 	 * Serialize logging for checksums. This is to avoid racing with the
3991e289f03eSFilipe Manana 	 * same checksum being logged by another task that is logging another
3992e289f03eSFilipe Manana 	 * file which happens to refer to the same extent as well. Such races
3993e289f03eSFilipe Manana 	 * can leave checksum items in the log with overlapping ranges.
3994e289f03eSFilipe Manana 	 */
3995e289f03eSFilipe Manana 	ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
3996e289f03eSFilipe Manana 			       lock_end, &cached_state);
3997e289f03eSFilipe Manana 	if (ret)
3998e289f03eSFilipe Manana 		return ret;
3999e289f03eSFilipe Manana 	/*
400040e046acSFilipe Manana 	 * Due to extent cloning, we might have logged a csum item that covers a
400140e046acSFilipe Manana 	 * subrange of a cloned extent, and later we can end up logging a csum
400240e046acSFilipe Manana 	 * item for a larger subrange of the same extent or the entire range.
400340e046acSFilipe Manana 	 * This would leave csum items in the log tree that cover the same range
400440e046acSFilipe Manana 	 * and break the searches for checksums in the log tree, resulting in
400540e046acSFilipe Manana 	 * some checksums missing in the fs/subvolume tree. So just delete (or
400640e046acSFilipe Manana 	 * trim and adjust) any existing csum items in the log for this range.
400740e046acSFilipe Manana 	 */
400840e046acSFilipe Manana 	ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
4009e289f03eSFilipe Manana 	if (!ret)
4010e289f03eSFilipe Manana 		ret = btrfs_csum_file_blocks(trans, log_root, sums);
401140e046acSFilipe Manana 
4012e289f03eSFilipe Manana 	unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
4013e289f03eSFilipe Manana 			     &cached_state);
4014e289f03eSFilipe Manana 
4015e289f03eSFilipe Manana 	return ret;
401640e046acSFilipe Manana }
401740e046acSFilipe Manana 
/*
 * Copy @nr consecutive items, starting at slot @start_slot of the leaf
 * @src_path->nodes[0], into the log tree of @inode's root.
 *
 * Room for all the items is reserved in the log with a single
 * btrfs_insert_empty_items() call.  Inode items are not copied byte for byte
 * but rebuilt from the in-memory inode through fill_inode_item(), where
 * @inode_only == LOG_INODE_EXISTS selects the "inode exists" form and
 * @logged_isize is the size stored in that case.  Every other item is copied
 * verbatim from the source leaf.
 *
 * Unless the inode has BTRFS_INODE_NODATASUM set, the checksums of regular,
 * non-hole file extents whose generation is not older than the current
 * transaction are looked up in the csum root and, once @dst_path has been
 * released, written to the log with log_csums().
 *
 * Returns 0 on success, -ENOMEM if the temporary key/size array cannot be
 * allocated, or the error returned by one of the helpers.
 */
401831ff1cd2SChris Mason static noinline int copy_items(struct btrfs_trans_handle *trans,
401944d70e19SNikolay Borisov 			       struct btrfs_inode *inode,
402031ff1cd2SChris Mason 			       struct btrfs_path *dst_path,
40210e56315cSFilipe Manana 			       struct btrfs_path *src_path,
40221a4bcf47SFilipe Manana 			       int start_slot, int nr, int inode_only,
40231a4bcf47SFilipe Manana 			       u64 logged_isize)
402431ff1cd2SChris Mason {
40253ffbd68cSDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
402631ff1cd2SChris Mason 	unsigned long src_offset;
402731ff1cd2SChris Mason 	unsigned long dst_offset;
402844d70e19SNikolay Borisov 	struct btrfs_root *log = inode->root->log_root;
402931ff1cd2SChris Mason 	struct btrfs_file_extent_item *extent;
403031ff1cd2SChris Mason 	struct btrfs_inode_item *inode_item;
403116e7549fSJosef Bacik 	struct extent_buffer *src = src_path->nodes[0];
403231ff1cd2SChris Mason 	int ret;
403331ff1cd2SChris Mason 	struct btrfs_key *ins_keys;
403431ff1cd2SChris Mason 	u32 *ins_sizes;
403531ff1cd2SChris Mason 	char *ins_data;
403631ff1cd2SChris Mason 	int i;
4037d20f7043SChris Mason 	struct list_head ordered_sums;
403844d70e19SNikolay Borisov 	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
4039d20f7043SChris Mason 
4040d20f7043SChris Mason 	INIT_LIST_HEAD(&ordered_sums);
404131ff1cd2SChris Mason 
	/*
	 * A single allocation backs both temporary arrays: the nr u32 item
	 * sizes come first, immediately followed by the nr keys.
	 */
404231ff1cd2SChris Mason 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
404331ff1cd2SChris Mason 			   nr * sizeof(u32), GFP_NOFS);
40442a29edc6Sliubo 	if (!ins_data)
40452a29edc6Sliubo 		return -ENOMEM;
40462a29edc6Sliubo 
404731ff1cd2SChris Mason 	ins_sizes = (u32 *)ins_data;
404831ff1cd2SChris Mason 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
404931ff1cd2SChris Mason 
	/* Gather the size and key of every source item to be copied. */
405031ff1cd2SChris Mason 	for (i = 0; i < nr; i++) {
405131ff1cd2SChris Mason 		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
405231ff1cd2SChris Mason 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
405331ff1cd2SChris Mason 	}
405431ff1cd2SChris Mason 	ret = btrfs_insert_empty_items(trans, log, dst_path,
405531ff1cd2SChris Mason 				       ins_keys, ins_sizes, nr);
40564a500fd1SYan, Zheng 	if (ret) {
40574a500fd1SYan, Zheng 		kfree(ins_data);
40584a500fd1SYan, Zheng 		return ret;
40594a500fd1SYan, Zheng 	}
406031ff1cd2SChris Mason 
	/*
	 * dst_path->slots[0] advances together with i in the loop header, so
	 * the 'continue' statements below still move on to the next
	 * destination slot.
	 */
40615d4f98a2SYan Zheng 	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
406231ff1cd2SChris Mason 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
406331ff1cd2SChris Mason 						   dst_path->slots[0]);
406431ff1cd2SChris Mason 
406531ff1cd2SChris Mason 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
406631ff1cd2SChris Mason 
		/* Inode items are rebuilt from the in-memory inode. */
406794edf4aeSJosef Bacik 		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
406831ff1cd2SChris Mason 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
406931ff1cd2SChris Mason 						    dst_path->slots[0],
407031ff1cd2SChris Mason 						    struct btrfs_inode_item);
407194edf4aeSJosef Bacik 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
4072f85b7379SDavid Sterba 					&inode->vfs_inode,
4073f85b7379SDavid Sterba 					inode_only == LOG_INODE_EXISTS,
40741a4bcf47SFilipe Manana 					logged_isize);
407594edf4aeSJosef Bacik 		} else {
407694edf4aeSJosef Bacik 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
407794edf4aeSJosef Bacik 					   src_offset, ins_sizes[i]);
407831ff1cd2SChris Mason 		}
407994edf4aeSJosef Bacik 
408031ff1cd2SChris Mason 		/* collect the data checksums of regular file extents whose
408131ff1cd2SChris Mason 		 * generation is not older than this transaction, so they can
408231ff1cd2SChris Mason 		 * be logged after the loop (skipped for NODATASUM inodes)
408331ff1cd2SChris Mason 		 */
4084962a298fSDavid Sterba 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
4085d2794405SLiu Bo 		    !skip_csum) {
408631ff1cd2SChris Mason 			int found_type;
408731ff1cd2SChris Mason 			extent = btrfs_item_ptr(src, start_slot + i,
408831ff1cd2SChris Mason 						struct btrfs_file_extent_item);
408931ff1cd2SChris Mason 
40908e531cdfSliubo 			if (btrfs_file_extent_generation(src, extent) < trans->transid)
40918e531cdfSliubo 				continue;
40928e531cdfSliubo 
409331ff1cd2SChris Mason 			found_type = btrfs_file_extent_type(src, extent);
40946f1fed77SJosef Bacik 			if (found_type == BTRFS_FILE_EXTENT_REG) {
				/*
				 * ds/dl: disk bytenr and disk length of the
				 * extent; cs/cl: offset into the extent and
				 * number of bytes referenced by this item.
				 */
40955d4f98a2SYan Zheng 				u64 ds, dl, cs, cl;
40965d4f98a2SYan Zheng 				ds = btrfs_file_extent_disk_bytenr(src,
409731ff1cd2SChris Mason 								extent);
40985d4f98a2SYan Zheng 				/* ds == 0 is a hole */
40995d4f98a2SYan Zheng 				if (ds == 0)
41005d4f98a2SYan Zheng 					continue;
41015d4f98a2SYan Zheng 
41025d4f98a2SYan Zheng 				dl = btrfs_file_extent_disk_num_bytes(src,
410331ff1cd2SChris Mason 								extent);
41045d4f98a2SYan Zheng 				cs = btrfs_file_extent_offset(src, extent);
41055d4f98a2SYan Zheng 				cl = btrfs_file_extent_num_bytes(src,
4106a419aef8SJoe Perches 								extent);
				/*
				 * For a compressed extent the csums of the
				 * entire on-disk extent are looked up.
				 */
4107580afd76SChris Mason 				if (btrfs_file_extent_compression(src,
4108580afd76SChris Mason 								  extent)) {
4109580afd76SChris Mason 					cs = 0;
4110580afd76SChris Mason 					cl = dl;
4111580afd76SChris Mason 				}
41125d4f98a2SYan Zheng 
411307d400a6SYan Zheng 				ret = btrfs_lookup_csums_range(
41140b246afaSJeff Mahoney 						fs_info->csum_root,
411507d400a6SYan Zheng 						ds + cs, ds + cs + cl - 1,
4116a2de733cSArne Jansen 						&ordered_sums, 0);
				/*
				 * Stop copying on failure; ret is returned
				 * after the cleanup below has run.
				 */
41174f26433eSFilipe Manana 				if (ret)
41184f26433eSFilipe Manana 					break;
411931ff1cd2SChris Mason 			}
412031ff1cd2SChris Mason 		}
412131ff1cd2SChris Mason 	}
412231ff1cd2SChris Mason 
412331ff1cd2SChris Mason 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4124b3b4aa74SDavid Sterba 	btrfs_release_path(dst_path);
412531ff1cd2SChris Mason 	kfree(ins_data);
4126d20f7043SChris Mason 
4127d20f7043SChris Mason 	/*
4128d20f7043SChris Mason 	 * we have to do this after the loop above to avoid changing the
4129d20f7043SChris Mason 	 * log tree while trying to change the log tree.
4130d20f7043SChris Mason 	 */
	/*
	 * The list is always drained so that every collected sum is freed;
	 * once ret is non-zero no further log_csums() calls are made.
	 */
4131d20f7043SChris Mason 	while (!list_empty(&ordered_sums)) {
4132d20f7043SChris Mason 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4133d20f7043SChris Mason 						   struct btrfs_ordered_sum,
4134d20f7043SChris Mason 						   list);
41354a500fd1SYan, Zheng 		if (!ret)
41363ebac17cSFilipe Manana 			ret = log_csums(trans, inode, log, sums);
4137d20f7043SChris Mason 		list_del(&sums->list);
4138d20f7043SChris Mason 		kfree(sums);
4139d20f7043SChris Mason 	}
414016e7549fSJosef Bacik 
41414a500fd1SYan, Zheng 	return ret;
414231ff1cd2SChris Mason }
414331ff1cd2SChris Mason 
41445dc562c5SJosef Bacik static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
41455dc562c5SJosef Bacik {
41465dc562c5SJosef Bacik 	struct extent_map *em1, *em2;
41475dc562c5SJosef Bacik 
41485dc562c5SJosef Bacik 	em1 = list_entry(a, struct extent_map, list);
41495dc562c5SJosef Bacik 	em2 = list_entry(b, struct extent_map, list);
41505dc562c5SJosef Bacik 
41515dc562c5SJosef Bacik 	if (em1->start < em2->start)
41525dc562c5SJosef Bacik 		return -1;
41535dc562c5SJosef Bacik 	else if (em1->start > em2->start)
41545dc562c5SJosef Bacik 		return 1;
41555dc562c5SJosef Bacik 	return 0;
41565dc562c5SJosef Bacik }
41575dc562c5SJosef Bacik 
/*
 * Log the data checksums for the modified range of the extent map @em.
 *
 * Nothing is logged for NODATASUM inodes, prealloc extents or holes.
 *
 * The ordered extents on @ctx->ordered_extents are consulted first: for each
 * one that overlaps [mod_start, mod_start + mod_len) the csums attached to it
 * are logged (at most once per ordered extent, guarded by the
 * BTRFS_ORDERED_LOGGED_CSUM flag) and the range still left to handle is
 * trimmed.  Whatever remains afterwards is looked up in the csum root and
 * logged from there.
 *
 * Returns 0 on success or the error returned by log_csums() or
 * btrfs_lookup_csums_range().
 */
4158e7175a69SJosef Bacik static int log_extent_csums(struct btrfs_trans_handle *trans,
4159e7175a69SJosef Bacik 			    struct btrfs_inode *inode,
4160a9ecb653SNikolay Borisov 			    struct btrfs_root *log_root,
416148778179SFilipe Manana 			    const struct extent_map *em,
416248778179SFilipe Manana 			    struct btrfs_log_ctx *ctx)
41635dc562c5SJosef Bacik {
416448778179SFilipe Manana 	struct btrfs_ordered_extent *ordered;
41652ab28f32SJosef Bacik 	u64 csum_offset;
41662ab28f32SJosef Bacik 	u64 csum_len;
	/* Local copies: the range is narrowed as ordered extents cover it. */
416748778179SFilipe Manana 	u64 mod_start = em->mod_start;
416848778179SFilipe Manana 	u64 mod_len = em->mod_len;
41698407f553SFilipe Manana 	LIST_HEAD(ordered_sums);
41708407f553SFilipe Manana 	int ret = 0;
417109a2a8f9SJosef Bacik 
	/* These extents have no checksums to log. */
4172e7175a69SJosef Bacik 	if (inode->flags & BTRFS_INODE_NODATASUM ||
4173e7175a69SJosef Bacik 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
41748407f553SFilipe Manana 	    em->block_start == EXTENT_MAP_HOLE)
417570c8a91cSJosef Bacik 		return 0;
417670c8a91cSJosef Bacik 
	/*
	 * NOTE(review): the early 'break' on mod_end <= ordered->file_offset
	 * presumably relies on the list being sorted by file offset - confirm
	 * against the code that builds ctx->ordered_extents.
	 */
417748778179SFilipe Manana 	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
417848778179SFilipe Manana 		const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
417948778179SFilipe Manana 		const u64 mod_end = mod_start + mod_len;
418048778179SFilipe Manana 		struct btrfs_ordered_sum *sums;
418148778179SFilipe Manana 
		/* The whole range is already covered by ordered extents. */
418248778179SFilipe Manana 		if (mod_len == 0)
418348778179SFilipe Manana 			break;
418448778179SFilipe Manana 
		/* Ordered extent ends before the range: try the next one. */
418548778179SFilipe Manana 		if (ordered_end <= mod_start)
418648778179SFilipe Manana 			continue;
		/* Ordered extent starts at or past the end of the range. */
418748778179SFilipe Manana 		if (mod_end <= ordered->file_offset)
418848778179SFilipe Manana 			break;
418948778179SFilipe Manana 
419048778179SFilipe Manana 		/*
419148778179SFilipe Manana 		 * We are going to copy all the csums on this ordered extent, so
419248778179SFilipe Manana 		 * go ahead and adjust mod_start and mod_len in case this ordered
419348778179SFilipe Manana 		 * extent has already been logged.
419448778179SFilipe Manana 		 */
419548778179SFilipe Manana 		if (ordered->file_offset > mod_start) {
			/* Ordered extent covers the tail: keep only the head. */
419648778179SFilipe Manana 			if (ordered_end >= mod_end)
419748778179SFilipe Manana 				mod_len = ordered->file_offset - mod_start;
419848778179SFilipe Manana 			/*
419948778179SFilipe Manana 			 * If we have this case
420048778179SFilipe Manana 			 *
420148778179SFilipe Manana 			 * |--------- logged extent ---------|
420248778179SFilipe Manana 			 *       |----- ordered extent ----|
420348778179SFilipe Manana 			 *
420448778179SFilipe Manana 			 * Just don't mess with mod_start and mod_len, we'll
420548778179SFilipe Manana 			 * just end up logging more csums than we need and it
420648778179SFilipe Manana 			 * will be ok.
420748778179SFilipe Manana 			 */
420848778179SFilipe Manana 		} else {
			/*
			 * Ordered extent covers the head of the range: keep
			 * only what lies past its end, or nothing if it covers
			 * the whole range.
			 */
420948778179SFilipe Manana 			if (ordered_end < mod_end) {
421048778179SFilipe Manana 				mod_len = mod_end - ordered_end;
421148778179SFilipe Manana 				mod_start = ordered_end;
421248778179SFilipe Manana 			} else {
421348778179SFilipe Manana 				mod_len = 0;
421448778179SFilipe Manana 			}
421548778179SFilipe Manana 		}
421648778179SFilipe Manana 
421748778179SFilipe Manana 		/*
421848778179SFilipe Manana 		 * To keep us from looping for the above case of an ordered
421948778179SFilipe Manana 		 * extent that falls inside of the logged extent.
422048778179SFilipe Manana 		 */
422148778179SFilipe Manana 		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
422248778179SFilipe Manana 			continue;
422348778179SFilipe Manana 
		/* Log every csum attached to this ordered extent. */
422448778179SFilipe Manana 		list_for_each_entry(sums, &ordered->list, list) {
422548778179SFilipe Manana 			ret = log_csums(trans, inode, log_root, sums);
422648778179SFilipe Manana 			if (ret)
422748778179SFilipe Manana 				return ret;
422848778179SFilipe Manana 		}
422948778179SFilipe Manana 	}
423048778179SFilipe Manana 
423148778179SFilipe Manana 	/* We're done, found all csums in the ordered extents. */
423248778179SFilipe Manana 	if (mod_len == 0)
423348778179SFilipe Manana 		return 0;
423448778179SFilipe Manana 
4235e7175a69SJosef Bacik 	/* If we're compressed we have to save the entire range of csums. */
4236488111aaSFilipe David Borba Manana 	if (em->compress_type) {
4237488111aaSFilipe David Borba Manana 		csum_offset = 0;
42388407f553SFilipe Manana 		csum_len = max(em->block_len, em->orig_block_len);
4239488111aaSFilipe David Borba Manana 	} else {
424048778179SFilipe Manana 		csum_offset = mod_start - em->start;
424148778179SFilipe Manana 		csum_len = mod_len;
4242488111aaSFilipe David Borba Manana 	}
42432ab28f32SJosef Bacik 
424470c8a91cSJosef Bacik 	/* block start is already adjusted for the file extent offset. */
4245a9ecb653SNikolay Borisov 	ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
424670c8a91cSJosef Bacik 				       em->block_start + csum_offset,
424770c8a91cSJosef Bacik 				       em->block_start + csum_offset +
424870c8a91cSJosef Bacik 				       csum_len - 1, &ordered_sums, 0);
42495dc562c5SJosef Bacik 	if (ret)
42505dc562c5SJosef Bacik 		return ret;
425170c8a91cSJosef Bacik 
	/*
	 * The list is always drained so that every sum is freed; once ret is
	 * non-zero no further log_csums() calls are made.
	 */
425270c8a91cSJosef Bacik 	while (!list_empty(&ordered_sums)) {
425370c8a91cSJosef Bacik 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
425470c8a91cSJosef Bacik 						   struct btrfs_ordered_sum,
425570c8a91cSJosef Bacik 						   list);
425670c8a91cSJosef Bacik 		if (!ret)
42573ebac17cSFilipe Manana 			ret = log_csums(trans, inode, log_root, sums);
425870c8a91cSJosef Bacik 		list_del(&sums->list);
425970c8a91cSJosef Bacik 		kfree(sums);
42605dc562c5SJosef Bacik 	}
42615dc562c5SJosef Bacik 
426270c8a91cSJosef Bacik 	return ret;
42635dc562c5SJosef Bacik }
42645dc562c5SJosef Bacik 
42658407f553SFilipe Manana static int log_one_extent(struct btrfs_trans_handle *trans,
42669d122629SNikolay Borisov 			  struct btrfs_inode *inode, struct btrfs_root *root,
42678407f553SFilipe Manana 			  const struct extent_map *em,
42688407f553SFilipe Manana 			  struct btrfs_path *path,
42698407f553SFilipe Manana 			  struct btrfs_log_ctx *ctx)
42708407f553SFilipe Manana {
42715893dfb9SFilipe Manana 	struct btrfs_drop_extents_args drop_args = { 0 };
42728407f553SFilipe Manana 	struct btrfs_root *log = root->log_root;
42738407f553SFilipe Manana 	struct btrfs_file_extent_item *fi;
42748407f553SFilipe Manana 	struct extent_buffer *leaf;
42758407f553SFilipe Manana 	struct btrfs_map_token token;
42768407f553SFilipe Manana 	struct btrfs_key key;
42778407f553SFilipe Manana 	u64 extent_offset = em->start - em->orig_start;
42788407f553SFilipe Manana 	u64 block_len;
42798407f553SFilipe Manana 	int ret;
42808407f553SFilipe Manana 
428148778179SFilipe Manana 	ret = log_extent_csums(trans, inode, log, em, ctx);
42828407f553SFilipe Manana 	if (ret)
42838407f553SFilipe Manana 		return ret;
42848407f553SFilipe Manana 
42855893dfb9SFilipe Manana 	drop_args.path = path;
42865893dfb9SFilipe Manana 	drop_args.start = em->start;
42875893dfb9SFilipe Manana 	drop_args.end = em->start + em->len;
42885893dfb9SFilipe Manana 	drop_args.replace_extent = true;
42895893dfb9SFilipe Manana 	drop_args.extent_item_size = sizeof(*fi);
42905893dfb9SFilipe Manana 	ret = btrfs_drop_extents(trans, log, inode, &drop_args);
42918407f553SFilipe Manana 	if (ret)
42928407f553SFilipe Manana 		return ret;
42938407f553SFilipe Manana 
42945893dfb9SFilipe Manana 	if (!drop_args.extent_inserted) {
42959d122629SNikolay Borisov 		key.objectid = btrfs_ino(inode);
42968407f553SFilipe Manana 		key.type = BTRFS_EXTENT_DATA_KEY;
42978407f553SFilipe Manana 		key.offset = em->start;
42988407f553SFilipe Manana 
42998407f553SFilipe Manana 		ret = btrfs_insert_empty_item(trans, log, path, &key,
43008407f553SFilipe Manana 					      sizeof(*fi));
43018407f553SFilipe Manana 		if (ret)
43028407f553SFilipe Manana 			return ret;
43038407f553SFilipe Manana 	}
43048407f553SFilipe Manana 	leaf = path->nodes[0];
4305c82f823cSDavid Sterba 	btrfs_init_map_token(&token, leaf);
43068407f553SFilipe Manana 	fi = btrfs_item_ptr(leaf, path->slots[0],
43078407f553SFilipe Manana 			    struct btrfs_file_extent_item);
43088407f553SFilipe Manana 
4309cc4c13d5SDavid Sterba 	btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
43108407f553SFilipe Manana 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4311cc4c13d5SDavid Sterba 		btrfs_set_token_file_extent_type(&token, fi,
4312cc4c13d5SDavid Sterba 						 BTRFS_FILE_EXTENT_PREALLOC);
43138407f553SFilipe Manana 	else
4314cc4c13d5SDavid Sterba 		btrfs_set_token_file_extent_type(&token, fi,
4315cc4c13d5SDavid Sterba 						 BTRFS_FILE_EXTENT_REG);
43168407f553SFilipe Manana 
43178407f553SFilipe Manana 	block_len = max(em->block_len, em->orig_block_len);
43188407f553SFilipe Manana 	if (em->compress_type != BTRFS_COMPRESS_NONE) {
4319cc4c13d5SDavid Sterba 		btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4320cc4c13d5SDavid Sterba 							em->block_start);
4321cc4c13d5SDavid Sterba 		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
43228407f553SFilipe Manana 	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4323cc4c13d5SDavid Sterba 		btrfs_set_token_file_extent_disk_bytenr(&token, fi,
43248407f553SFilipe Manana 							em->block_start -
4325cc4c13d5SDavid Sterba 							extent_offset);
4326cc4c13d5SDavid Sterba 		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
43278407f553SFilipe Manana 	} else {
4328cc4c13d5SDavid Sterba 		btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
4329cc4c13d5SDavid Sterba 		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
43308407f553SFilipe Manana 	}
43318407f553SFilipe Manana 
4332cc4c13d5SDavid Sterba 	btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
4333cc4c13d5SDavid Sterba 	btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
4334cc4c13d5SDavid Sterba 	btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
4335cc4c13d5SDavid Sterba 	btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
4336cc4c13d5SDavid Sterba 	btrfs_set_token_file_extent_encryption(&token, fi, 0);
4337cc4c13d5SDavid Sterba 	btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
43388407f553SFilipe Manana 	btrfs_mark_buffer_dirty(leaf);
43398407f553SFilipe Manana 
43408407f553SFilipe Manana 	btrfs_release_path(path);
43418407f553SFilipe Manana 
43428407f553SFilipe Manana 	return ret;
43438407f553SFilipe Manana }
43448407f553SFilipe Manana 
434531d11b83SFilipe Manana /*
434631d11b83SFilipe Manana  * Log all prealloc extents beyond the inode's i_size to make sure we do not
434731d11b83SFilipe Manana  * lose them after doing a fast fsync and replaying the log. We scan the
434831d11b83SFilipe Manana  * subvolume's root instead of iterating the inode's extent map tree because
434931d11b83SFilipe Manana  * otherwise we can log incorrect extent items based on extent map conversion.
435031d11b83SFilipe Manana  * That can happen due to the fact that extent maps are merged when they
435131d11b83SFilipe Manana  * are not in the extent map tree's list of modified extents.
435231d11b83SFilipe Manana  */
435331d11b83SFilipe Manana static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
435431d11b83SFilipe Manana 				      struct btrfs_inode *inode,
435531d11b83SFilipe Manana 				      struct btrfs_path *path)
435631d11b83SFilipe Manana {
435731d11b83SFilipe Manana 	struct btrfs_root *root = inode->root;
435831d11b83SFilipe Manana 	struct btrfs_key key;
435931d11b83SFilipe Manana 	const u64 i_size = i_size_read(&inode->vfs_inode);
436031d11b83SFilipe Manana 	const u64 ino = btrfs_ino(inode);
436131d11b83SFilipe Manana 	struct btrfs_path *dst_path = NULL;
43620e56315cSFilipe Manana 	bool dropped_extents = false;
4363f135cea3SFilipe Manana 	u64 truncate_offset = i_size;
4364f135cea3SFilipe Manana 	struct extent_buffer *leaf;
4365f135cea3SFilipe Manana 	int slot;
436631d11b83SFilipe Manana 	int ins_nr = 0;
436731d11b83SFilipe Manana 	int start_slot;
436831d11b83SFilipe Manana 	int ret;
436931d11b83SFilipe Manana 
437031d11b83SFilipe Manana 	if (!(inode->flags & BTRFS_INODE_PREALLOC))
437131d11b83SFilipe Manana 		return 0;
437231d11b83SFilipe Manana 
437331d11b83SFilipe Manana 	key.objectid = ino;
437431d11b83SFilipe Manana 	key.type = BTRFS_EXTENT_DATA_KEY;
437531d11b83SFilipe Manana 	key.offset = i_size;
437631d11b83SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
437731d11b83SFilipe Manana 	if (ret < 0)
437831d11b83SFilipe Manana 		goto out;
437931d11b83SFilipe Manana 
4380f135cea3SFilipe Manana 	/*
4381f135cea3SFilipe Manana 	 * We must check if there is a prealloc extent that starts before the
4382f135cea3SFilipe Manana 	 * i_size and crosses the i_size boundary. This is to ensure later we
4383f135cea3SFilipe Manana 	 * truncate down to the end of that extent and not to the i_size, as
4384f135cea3SFilipe Manana 	 * otherwise we end up losing part of the prealloc extent after a log
4385f135cea3SFilipe Manana 	 * replay and with an implicit hole if there is another prealloc extent
4386f135cea3SFilipe Manana 	 * that starts at an offset beyond i_size.
4387f135cea3SFilipe Manana 	 */
4388f135cea3SFilipe Manana 	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4389f135cea3SFilipe Manana 	if (ret < 0)
4390f135cea3SFilipe Manana 		goto out;
4391f135cea3SFilipe Manana 
4392f135cea3SFilipe Manana 	if (ret == 0) {
4393f135cea3SFilipe Manana 		struct btrfs_file_extent_item *ei;
4394f135cea3SFilipe Manana 
4395f135cea3SFilipe Manana 		leaf = path->nodes[0];
4396f135cea3SFilipe Manana 		slot = path->slots[0];
4397f135cea3SFilipe Manana 		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4398f135cea3SFilipe Manana 
4399f135cea3SFilipe Manana 		if (btrfs_file_extent_type(leaf, ei) ==
4400f135cea3SFilipe Manana 		    BTRFS_FILE_EXTENT_PREALLOC) {
4401f135cea3SFilipe Manana 			u64 extent_end;
4402f135cea3SFilipe Manana 
4403f135cea3SFilipe Manana 			btrfs_item_key_to_cpu(leaf, &key, slot);
4404f135cea3SFilipe Manana 			extent_end = key.offset +
4405f135cea3SFilipe Manana 				btrfs_file_extent_num_bytes(leaf, ei);
4406f135cea3SFilipe Manana 
4407f135cea3SFilipe Manana 			if (extent_end > i_size)
4408f135cea3SFilipe Manana 				truncate_offset = extent_end;
4409f135cea3SFilipe Manana 		}
4410f135cea3SFilipe Manana 	} else {
4411f135cea3SFilipe Manana 		ret = 0;
4412f135cea3SFilipe Manana 	}
4413f135cea3SFilipe Manana 
441431d11b83SFilipe Manana 	while (true) {
4415f135cea3SFilipe Manana 		leaf = path->nodes[0];
4416f135cea3SFilipe Manana 		slot = path->slots[0];
441731d11b83SFilipe Manana 
441831d11b83SFilipe Manana 		if (slot >= btrfs_header_nritems(leaf)) {
441931d11b83SFilipe Manana 			if (ins_nr > 0) {
442031d11b83SFilipe Manana 				ret = copy_items(trans, inode, dst_path, path,
44210e56315cSFilipe Manana 						 start_slot, ins_nr, 1, 0);
442231d11b83SFilipe Manana 				if (ret < 0)
442331d11b83SFilipe Manana 					goto out;
442431d11b83SFilipe Manana 				ins_nr = 0;
442531d11b83SFilipe Manana 			}
442631d11b83SFilipe Manana 			ret = btrfs_next_leaf(root, path);
442731d11b83SFilipe Manana 			if (ret < 0)
442831d11b83SFilipe Manana 				goto out;
442931d11b83SFilipe Manana 			if (ret > 0) {
443031d11b83SFilipe Manana 				ret = 0;
443131d11b83SFilipe Manana 				break;
443231d11b83SFilipe Manana 			}
443331d11b83SFilipe Manana 			continue;
443431d11b83SFilipe Manana 		}
443531d11b83SFilipe Manana 
443631d11b83SFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, slot);
443731d11b83SFilipe Manana 		if (key.objectid > ino)
443831d11b83SFilipe Manana 			break;
443931d11b83SFilipe Manana 		if (WARN_ON_ONCE(key.objectid < ino) ||
444031d11b83SFilipe Manana 		    key.type < BTRFS_EXTENT_DATA_KEY ||
444131d11b83SFilipe Manana 		    key.offset < i_size) {
444231d11b83SFilipe Manana 			path->slots[0]++;
444331d11b83SFilipe Manana 			continue;
444431d11b83SFilipe Manana 		}
44450e56315cSFilipe Manana 		if (!dropped_extents) {
444631d11b83SFilipe Manana 			/*
444731d11b83SFilipe Manana 			 * Avoid logging extent items logged in past fsync calls
444831d11b83SFilipe Manana 			 * and leading to duplicate keys in the log tree.
444931d11b83SFilipe Manana 			 */
445031d11b83SFilipe Manana 			do {
445131d11b83SFilipe Manana 				ret = btrfs_truncate_inode_items(trans,
445231d11b83SFilipe Manana 							 root->log_root,
445350743398SNikolay Borisov 							 inode, truncate_offset,
445431d11b83SFilipe Manana 							 BTRFS_EXTENT_DATA_KEY);
445531d11b83SFilipe Manana 			} while (ret == -EAGAIN);
445631d11b83SFilipe Manana 			if (ret)
445731d11b83SFilipe Manana 				goto out;
44580e56315cSFilipe Manana 			dropped_extents = true;
445931d11b83SFilipe Manana 		}
446031d11b83SFilipe Manana 		if (ins_nr == 0)
446131d11b83SFilipe Manana 			start_slot = slot;
446231d11b83SFilipe Manana 		ins_nr++;
446331d11b83SFilipe Manana 		path->slots[0]++;
446431d11b83SFilipe Manana 		if (!dst_path) {
446531d11b83SFilipe Manana 			dst_path = btrfs_alloc_path();
446631d11b83SFilipe Manana 			if (!dst_path) {
446731d11b83SFilipe Manana 				ret = -ENOMEM;
446831d11b83SFilipe Manana 				goto out;
446931d11b83SFilipe Manana 			}
447031d11b83SFilipe Manana 		}
447131d11b83SFilipe Manana 	}
44720bc2d3c0SFilipe Manana 	if (ins_nr > 0)
44730e56315cSFilipe Manana 		ret = copy_items(trans, inode, dst_path, path,
447431d11b83SFilipe Manana 				 start_slot, ins_nr, 1, 0);
447531d11b83SFilipe Manana out:
447631d11b83SFilipe Manana 	btrfs_release_path(path);
447731d11b83SFilipe Manana 	btrfs_free_path(dst_path);
447831d11b83SFilipe Manana 	return ret;
447931d11b83SFilipe Manana }
448031d11b83SFilipe Manana 
44815dc562c5SJosef Bacik static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
44825dc562c5SJosef Bacik 				     struct btrfs_root *root,
44839d122629SNikolay Borisov 				     struct btrfs_inode *inode,
4484827463c4SMiao Xie 				     struct btrfs_path *path,
448548778179SFilipe Manana 				     struct btrfs_log_ctx *ctx)
44865dc562c5SJosef Bacik {
448748778179SFilipe Manana 	struct btrfs_ordered_extent *ordered;
448848778179SFilipe Manana 	struct btrfs_ordered_extent *tmp;
44895dc562c5SJosef Bacik 	struct extent_map *em, *n;
44905dc562c5SJosef Bacik 	struct list_head extents;
44919d122629SNikolay Borisov 	struct extent_map_tree *tree = &inode->extent_tree;
44925dc562c5SJosef Bacik 	int ret = 0;
44932ab28f32SJosef Bacik 	int num = 0;
44945dc562c5SJosef Bacik 
44955dc562c5SJosef Bacik 	INIT_LIST_HEAD(&extents);
44965dc562c5SJosef Bacik 
44975dc562c5SJosef Bacik 	write_lock(&tree->lock);
44985dc562c5SJosef Bacik 
44995dc562c5SJosef Bacik 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
45005dc562c5SJosef Bacik 		list_del_init(&em->list);
45012ab28f32SJosef Bacik 		/*
45022ab28f32SJosef Bacik 		 * Just an arbitrary number, this can be really CPU intensive
45032ab28f32SJosef Bacik 		 * once we start getting a lot of extents, and really once we
45042ab28f32SJosef Bacik 		 * have a bunch of extents we just want to commit since it will
45052ab28f32SJosef Bacik 		 * be faster.
45062ab28f32SJosef Bacik 		 */
45072ab28f32SJosef Bacik 		if (++num > 32768) {
45082ab28f32SJosef Bacik 			list_del_init(&tree->modified_extents);
45092ab28f32SJosef Bacik 			ret = -EFBIG;
45102ab28f32SJosef Bacik 			goto process;
45112ab28f32SJosef Bacik 		}
45122ab28f32SJosef Bacik 
45135f96bfb7SFilipe Manana 		if (em->generation < trans->transid)
45145dc562c5SJosef Bacik 			continue;
45158c6c5928SJosef Bacik 
451631d11b83SFilipe Manana 		/* We log prealloc extents beyond eof later. */
451731d11b83SFilipe Manana 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
451831d11b83SFilipe Manana 		    em->start >= i_size_read(&inode->vfs_inode))
451931d11b83SFilipe Manana 			continue;
452031d11b83SFilipe Manana 
4521ff44c6e3SJosef Bacik 		/* Need a ref to keep it from getting evicted from cache */
4522490b54d6SElena Reshetova 		refcount_inc(&em->refs);
4523ff44c6e3SJosef Bacik 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
45245dc562c5SJosef Bacik 		list_add_tail(&em->list, &extents);
45252ab28f32SJosef Bacik 		num++;
45265dc562c5SJosef Bacik 	}
45275dc562c5SJosef Bacik 
45285dc562c5SJosef Bacik 	list_sort(NULL, &extents, extent_cmp);
45292ab28f32SJosef Bacik process:
45305dc562c5SJosef Bacik 	while (!list_empty(&extents)) {
45315dc562c5SJosef Bacik 		em = list_entry(extents.next, struct extent_map, list);
45325dc562c5SJosef Bacik 
45335dc562c5SJosef Bacik 		list_del_init(&em->list);
45345dc562c5SJosef Bacik 
45355dc562c5SJosef Bacik 		/*
45365dc562c5SJosef Bacik 		 * If we had an error we just need to delete everybody from our
45375dc562c5SJosef Bacik 		 * private list.
45385dc562c5SJosef Bacik 		 */
4539ff44c6e3SJosef Bacik 		if (ret) {
4540201a9038SJosef Bacik 			clear_em_logging(tree, em);
4541ff44c6e3SJosef Bacik 			free_extent_map(em);
45425dc562c5SJosef Bacik 			continue;
4543ff44c6e3SJosef Bacik 		}
4544ff44c6e3SJosef Bacik 
4545ff44c6e3SJosef Bacik 		write_unlock(&tree->lock);
45465dc562c5SJosef Bacik 
4547a2120a47SJosef Bacik 		ret = log_one_extent(trans, inode, root, em, path, ctx);
4548ff44c6e3SJosef Bacik 		write_lock(&tree->lock);
4549201a9038SJosef Bacik 		clear_em_logging(tree, em);
4550201a9038SJosef Bacik 		free_extent_map(em);
45515dc562c5SJosef Bacik 	}
4552ff44c6e3SJosef Bacik 	WARN_ON(!list_empty(&extents));
4553ff44c6e3SJosef Bacik 	write_unlock(&tree->lock);
45545dc562c5SJosef Bacik 
45555dc562c5SJosef Bacik 	btrfs_release_path(path);
455631d11b83SFilipe Manana 	if (!ret)
455731d11b83SFilipe Manana 		ret = btrfs_log_prealloc_extents(trans, inode, path);
455848778179SFilipe Manana 	if (ret)
45595dc562c5SJosef Bacik 		return ret;
456048778179SFilipe Manana 
456148778179SFilipe Manana 	/*
456248778179SFilipe Manana 	 * We have logged all extents successfully, now make sure the commit of
456348778179SFilipe Manana 	 * the current transaction waits for the ordered extents to complete
456448778179SFilipe Manana 	 * before it commits and wipes out the log trees, otherwise we would
456548778179SFilipe Manana 	 * lose data if an ordered extents completes after the transaction
456648778179SFilipe Manana 	 * commits and a power failure happens after the transaction commit.
456748778179SFilipe Manana 	 */
456848778179SFilipe Manana 	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
456948778179SFilipe Manana 		list_del_init(&ordered->log_list);
457048778179SFilipe Manana 		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
457148778179SFilipe Manana 
457248778179SFilipe Manana 		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
457348778179SFilipe Manana 			spin_lock_irq(&inode->ordered_tree.lock);
457448778179SFilipe Manana 			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
457548778179SFilipe Manana 				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
457648778179SFilipe Manana 				atomic_inc(&trans->transaction->pending_ordered);
457748778179SFilipe Manana 			}
457848778179SFilipe Manana 			spin_unlock_irq(&inode->ordered_tree.lock);
457948778179SFilipe Manana 		}
458048778179SFilipe Manana 		btrfs_put_ordered_extent(ordered);
458148778179SFilipe Manana 	}
458248778179SFilipe Manana 
458348778179SFilipe Manana 	return 0;
45845dc562c5SJosef Bacik }
45855dc562c5SJosef Bacik 
4586481b01c0SNikolay Borisov static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
45871a4bcf47SFilipe Manana 			     struct btrfs_path *path, u64 *size_ret)
45881a4bcf47SFilipe Manana {
45891a4bcf47SFilipe Manana 	struct btrfs_key key;
45901a4bcf47SFilipe Manana 	int ret;
45911a4bcf47SFilipe Manana 
4592481b01c0SNikolay Borisov 	key.objectid = btrfs_ino(inode);
45931a4bcf47SFilipe Manana 	key.type = BTRFS_INODE_ITEM_KEY;
45941a4bcf47SFilipe Manana 	key.offset = 0;
45951a4bcf47SFilipe Manana 
45961a4bcf47SFilipe Manana 	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
45971a4bcf47SFilipe Manana 	if (ret < 0) {
45981a4bcf47SFilipe Manana 		return ret;
45991a4bcf47SFilipe Manana 	} else if (ret > 0) {
46002f2ff0eeSFilipe Manana 		*size_ret = 0;
46011a4bcf47SFilipe Manana 	} else {
46021a4bcf47SFilipe Manana 		struct btrfs_inode_item *item;
46031a4bcf47SFilipe Manana 
46041a4bcf47SFilipe Manana 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
46051a4bcf47SFilipe Manana 				      struct btrfs_inode_item);
46061a4bcf47SFilipe Manana 		*size_ret = btrfs_inode_size(path->nodes[0], item);
4607bf504110SFilipe Manana 		/*
4608bf504110SFilipe Manana 		 * If the in-memory inode's i_size is smaller then the inode
4609bf504110SFilipe Manana 		 * size stored in the btree, return the inode's i_size, so
4610bf504110SFilipe Manana 		 * that we get a correct inode size after replaying the log
4611bf504110SFilipe Manana 		 * when before a power failure we had a shrinking truncate
4612bf504110SFilipe Manana 		 * followed by addition of a new name (rename / new hard link).
4613bf504110SFilipe Manana 		 * Otherwise return the inode size from the btree, to avoid
4614bf504110SFilipe Manana 		 * data loss when replaying a log due to previously doing a
4615bf504110SFilipe Manana 		 * write that expands the inode's size and logging a new name
4616bf504110SFilipe Manana 		 * immediately after.
4617bf504110SFilipe Manana 		 */
4618bf504110SFilipe Manana 		if (*size_ret > inode->vfs_inode.i_size)
4619bf504110SFilipe Manana 			*size_ret = inode->vfs_inode.i_size;
46201a4bcf47SFilipe Manana 	}
46211a4bcf47SFilipe Manana 
46221a4bcf47SFilipe Manana 	btrfs_release_path(path);
46231a4bcf47SFilipe Manana 	return 0;
46241a4bcf47SFilipe Manana }
46251a4bcf47SFilipe Manana 
462636283bf7SFilipe Manana /*
462736283bf7SFilipe Manana  * At the moment we always log all xattrs. This is to figure out at log replay
462836283bf7SFilipe Manana  * time which xattrs must have their deletion replayed. If a xattr is missing
462936283bf7SFilipe Manana  * in the log tree and exists in the fs/subvol tree, we delete it. This is
463036283bf7SFilipe Manana  * because if a xattr is deleted, the inode is fsynced and a power failure
463136283bf7SFilipe Manana  * happens, causing the log to be replayed the next time the fs is mounted,
463236283bf7SFilipe Manana  * we want the xattr to not exist anymore (same behaviour as other filesystems
463336283bf7SFilipe Manana  * with a journal, ext3/4, xfs, f2fs, etc).
463436283bf7SFilipe Manana  */
463536283bf7SFilipe Manana static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
463636283bf7SFilipe Manana 				struct btrfs_root *root,
46371a93c36aSNikolay Borisov 				struct btrfs_inode *inode,
463836283bf7SFilipe Manana 				struct btrfs_path *path,
463936283bf7SFilipe Manana 				struct btrfs_path *dst_path)
464036283bf7SFilipe Manana {
464136283bf7SFilipe Manana 	int ret;
464236283bf7SFilipe Manana 	struct btrfs_key key;
46431a93c36aSNikolay Borisov 	const u64 ino = btrfs_ino(inode);
464436283bf7SFilipe Manana 	int ins_nr = 0;
464536283bf7SFilipe Manana 	int start_slot = 0;
4646f2f121abSFilipe Manana 	bool found_xattrs = false;
4647f2f121abSFilipe Manana 
4648f2f121abSFilipe Manana 	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
4649f2f121abSFilipe Manana 		return 0;
465036283bf7SFilipe Manana 
465136283bf7SFilipe Manana 	key.objectid = ino;
465236283bf7SFilipe Manana 	key.type = BTRFS_XATTR_ITEM_KEY;
465336283bf7SFilipe Manana 	key.offset = 0;
465436283bf7SFilipe Manana 
465536283bf7SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
465636283bf7SFilipe Manana 	if (ret < 0)
465736283bf7SFilipe Manana 		return ret;
465836283bf7SFilipe Manana 
465936283bf7SFilipe Manana 	while (true) {
466036283bf7SFilipe Manana 		int slot = path->slots[0];
466136283bf7SFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
466236283bf7SFilipe Manana 		int nritems = btrfs_header_nritems(leaf);
466336283bf7SFilipe Manana 
466436283bf7SFilipe Manana 		if (slot >= nritems) {
466536283bf7SFilipe Manana 			if (ins_nr > 0) {
46661a93c36aSNikolay Borisov 				ret = copy_items(trans, inode, dst_path, path,
46670e56315cSFilipe Manana 						 start_slot, ins_nr, 1, 0);
466836283bf7SFilipe Manana 				if (ret < 0)
466936283bf7SFilipe Manana 					return ret;
467036283bf7SFilipe Manana 				ins_nr = 0;
467136283bf7SFilipe Manana 			}
467236283bf7SFilipe Manana 			ret = btrfs_next_leaf(root, path);
467336283bf7SFilipe Manana 			if (ret < 0)
467436283bf7SFilipe Manana 				return ret;
467536283bf7SFilipe Manana 			else if (ret > 0)
467636283bf7SFilipe Manana 				break;
467736283bf7SFilipe Manana 			continue;
467836283bf7SFilipe Manana 		}
467936283bf7SFilipe Manana 
468036283bf7SFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, slot);
468136283bf7SFilipe Manana 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
468236283bf7SFilipe Manana 			break;
468336283bf7SFilipe Manana 
468436283bf7SFilipe Manana 		if (ins_nr == 0)
468536283bf7SFilipe Manana 			start_slot = slot;
468636283bf7SFilipe Manana 		ins_nr++;
468736283bf7SFilipe Manana 		path->slots[0]++;
4688f2f121abSFilipe Manana 		found_xattrs = true;
468936283bf7SFilipe Manana 		cond_resched();
469036283bf7SFilipe Manana 	}
469136283bf7SFilipe Manana 	if (ins_nr > 0) {
46921a93c36aSNikolay Borisov 		ret = copy_items(trans, inode, dst_path, path,
46930e56315cSFilipe Manana 				 start_slot, ins_nr, 1, 0);
469436283bf7SFilipe Manana 		if (ret < 0)
469536283bf7SFilipe Manana 			return ret;
469636283bf7SFilipe Manana 	}
469736283bf7SFilipe Manana 
4698f2f121abSFilipe Manana 	if (!found_xattrs)
4699f2f121abSFilipe Manana 		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
4700f2f121abSFilipe Manana 
470136283bf7SFilipe Manana 	return 0;
470236283bf7SFilipe Manana }
470336283bf7SFilipe Manana 
4704a89ca6f2SFilipe Manana /*
47050e56315cSFilipe Manana  * When using the NO_HOLES feature if we punched a hole that causes the
47060e56315cSFilipe Manana  * deletion of entire leafs or all the extent items of the first leaf (the one
47070e56315cSFilipe Manana  * that contains the inode item and references) we may end up not processing
47080e56315cSFilipe Manana  * any extents, because there are no leafs with a generation matching the
47090e56315cSFilipe Manana  * current transaction that have extent items for our inode. So we need to find
47100e56315cSFilipe Manana  * if any holes exist and then log them. We also need to log holes after any
47110e56315cSFilipe Manana  * truncate operation that changes the inode's size.
4712a89ca6f2SFilipe Manana  */
47130e56315cSFilipe Manana static int btrfs_log_holes(struct btrfs_trans_handle *trans,
4714a89ca6f2SFilipe Manana 			   struct btrfs_root *root,
4715a0308dd7SNikolay Borisov 			   struct btrfs_inode *inode,
47167af59743SFilipe Manana 			   struct btrfs_path *path)
4717a89ca6f2SFilipe Manana {
47180b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
4719a89ca6f2SFilipe Manana 	struct btrfs_key key;
4720a0308dd7SNikolay Borisov 	const u64 ino = btrfs_ino(inode);
4721a0308dd7SNikolay Borisov 	const u64 i_size = i_size_read(&inode->vfs_inode);
47227af59743SFilipe Manana 	u64 prev_extent_end = 0;
47230e56315cSFilipe Manana 	int ret;
4724a89ca6f2SFilipe Manana 
47250e56315cSFilipe Manana 	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
4726a89ca6f2SFilipe Manana 		return 0;
4727a89ca6f2SFilipe Manana 
4728a89ca6f2SFilipe Manana 	key.objectid = ino;
4729a89ca6f2SFilipe Manana 	key.type = BTRFS_EXTENT_DATA_KEY;
47307af59743SFilipe Manana 	key.offset = 0;
4731a89ca6f2SFilipe Manana 
4732a89ca6f2SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4733a89ca6f2SFilipe Manana 	if (ret < 0)
4734a89ca6f2SFilipe Manana 		return ret;
4735a89ca6f2SFilipe Manana 
47360e56315cSFilipe Manana 	while (true) {
47370e56315cSFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
4738a89ca6f2SFilipe Manana 
47390e56315cSFilipe Manana 		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
47400e56315cSFilipe Manana 			ret = btrfs_next_leaf(root, path);
47410e56315cSFilipe Manana 			if (ret < 0)
47420e56315cSFilipe Manana 				return ret;
47430e56315cSFilipe Manana 			if (ret > 0) {
47440e56315cSFilipe Manana 				ret = 0;
47450e56315cSFilipe Manana 				break;
47460e56315cSFilipe Manana 			}
47470e56315cSFilipe Manana 			leaf = path->nodes[0];
47480e56315cSFilipe Manana 		}
47490e56315cSFilipe Manana 
47500e56315cSFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
47510e56315cSFilipe Manana 		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
47520e56315cSFilipe Manana 			break;
47530e56315cSFilipe Manana 
47540e56315cSFilipe Manana 		/* We have a hole, log it. */
47550e56315cSFilipe Manana 		if (prev_extent_end < key.offset) {
47567af59743SFilipe Manana 			const u64 hole_len = key.offset - prev_extent_end;
47570e56315cSFilipe Manana 
4758a89ca6f2SFilipe Manana 			/*
47590e56315cSFilipe Manana 			 * Release the path to avoid deadlocks with other code
47600e56315cSFilipe Manana 			 * paths that search the root while holding locks on
47610e56315cSFilipe Manana 			 * leafs from the log root.
4762a89ca6f2SFilipe Manana 			 */
47630e56315cSFilipe Manana 			btrfs_release_path(path);
47640e56315cSFilipe Manana 			ret = btrfs_insert_file_extent(trans, root->log_root,
47650e56315cSFilipe Manana 						       ino, prev_extent_end, 0,
47660e56315cSFilipe Manana 						       0, hole_len, 0, hole_len,
47670e56315cSFilipe Manana 						       0, 0, 0);
47680e56315cSFilipe Manana 			if (ret < 0)
47690e56315cSFilipe Manana 				return ret;
47700e56315cSFilipe Manana 
47710e56315cSFilipe Manana 			/*
47720e56315cSFilipe Manana 			 * Search for the same key again in the root. Since it's
47730e56315cSFilipe Manana 			 * an extent item and we are holding the inode lock, the
47740e56315cSFilipe Manana 			 * key must still exist. If it doesn't just emit warning
47750e56315cSFilipe Manana 			 * and return an error to fall back to a transaction
47760e56315cSFilipe Manana 			 * commit.
47770e56315cSFilipe Manana 			 */
47780e56315cSFilipe Manana 			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
47790e56315cSFilipe Manana 			if (ret < 0)
47800e56315cSFilipe Manana 				return ret;
47810e56315cSFilipe Manana 			if (WARN_ON(ret > 0))
47820e56315cSFilipe Manana 				return -ENOENT;
47830e56315cSFilipe Manana 			leaf = path->nodes[0];
47840e56315cSFilipe Manana 		}
4785a89ca6f2SFilipe Manana 
47867af59743SFilipe Manana 		prev_extent_end = btrfs_file_extent_end(path);
47870e56315cSFilipe Manana 		path->slots[0]++;
47880e56315cSFilipe Manana 		cond_resched();
47890e56315cSFilipe Manana 	}
47900e56315cSFilipe Manana 
47917af59743SFilipe Manana 	if (prev_extent_end < i_size) {
47920e56315cSFilipe Manana 		u64 hole_len;
47930e56315cSFilipe Manana 
4794a89ca6f2SFilipe Manana 		btrfs_release_path(path);
47957af59743SFilipe Manana 		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
47960e56315cSFilipe Manana 		ret = btrfs_insert_file_extent(trans, root->log_root,
47970e56315cSFilipe Manana 					       ino, prev_extent_end, 0, 0,
47980e56315cSFilipe Manana 					       hole_len, 0, hole_len,
47990e56315cSFilipe Manana 					       0, 0, 0);
48000e56315cSFilipe Manana 		if (ret < 0)
4801a89ca6f2SFilipe Manana 			return ret;
4802a89ca6f2SFilipe Manana 	}
4803a89ca6f2SFilipe Manana 
48040e56315cSFilipe Manana 	return 0;
48050e56315cSFilipe Manana }
48060e56315cSFilipe Manana 
480756f23fdbSFilipe Manana /*
480856f23fdbSFilipe Manana  * When we are logging a new inode X, check if it doesn't have a reference that
480956f23fdbSFilipe Manana  * matches the reference from some other inode Y created in a past transaction
481056f23fdbSFilipe Manana  * and that was renamed in the current transaction. If we don't do this, then at
481156f23fdbSFilipe Manana  * log replay time we can lose inode Y (and all its files if it's a directory):
481256f23fdbSFilipe Manana  *
481356f23fdbSFilipe Manana  * mkdir /mnt/x
481456f23fdbSFilipe Manana  * echo "hello world" > /mnt/x/foobar
481556f23fdbSFilipe Manana  * sync
481656f23fdbSFilipe Manana  * mv /mnt/x /mnt/y
481756f23fdbSFilipe Manana  * mkdir /mnt/x                 # or touch /mnt/x
481856f23fdbSFilipe Manana  * xfs_io -c fsync /mnt/x
481956f23fdbSFilipe Manana  * <power fail>
482056f23fdbSFilipe Manana  * mount fs, trigger log replay
482156f23fdbSFilipe Manana  *
482256f23fdbSFilipe Manana  * After the log replay procedure, we would lose the first directory and all its
482356f23fdbSFilipe Manana  * files (file foobar).
482456f23fdbSFilipe Manana  * For the case where inode Y is not a directory we simply end up losing it:
482556f23fdbSFilipe Manana  *
482656f23fdbSFilipe Manana  * echo "123" > /mnt/foo
482756f23fdbSFilipe Manana  * sync
482856f23fdbSFilipe Manana  * mv /mnt/foo /mnt/bar
482956f23fdbSFilipe Manana  * echo "abc" > /mnt/foo
483056f23fdbSFilipe Manana  * xfs_io -c fsync /mnt/foo
483156f23fdbSFilipe Manana  * <power fail>
483256f23fdbSFilipe Manana  *
483356f23fdbSFilipe Manana  * We also need this for cases where a snapshot entry is replaced by some other
483456f23fdbSFilipe Manana  * entry (file or directory) otherwise we end up with an unreplayable log due to
483556f23fdbSFilipe Manana  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
483656f23fdbSFilipe Manana  * if it were a regular entry:
483756f23fdbSFilipe Manana  *
483856f23fdbSFilipe Manana  * mkdir /mnt/x
483956f23fdbSFilipe Manana  * btrfs subvolume snapshot /mnt /mnt/x/snap
484056f23fdbSFilipe Manana  * btrfs subvolume delete /mnt/x/snap
484156f23fdbSFilipe Manana  * rmdir /mnt/x
484256f23fdbSFilipe Manana  * mkdir /mnt/x
484356f23fdbSFilipe Manana  * fsync /mnt/x or fsync some new file inside it
484456f23fdbSFilipe Manana  * <power fail>
484556f23fdbSFilipe Manana  *
484656f23fdbSFilipe Manana  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
484756f23fdbSFilipe Manana  * the same transaction.
484856f23fdbSFilipe Manana  */
484956f23fdbSFilipe Manana static int btrfs_check_ref_name_override(struct extent_buffer *eb,
485056f23fdbSFilipe Manana 					 const int slot,
485156f23fdbSFilipe Manana 					 const struct btrfs_key *key,
48524791c8f1SNikolay Borisov 					 struct btrfs_inode *inode,
4853a3baaf0dSFilipe Manana 					 u64 *other_ino, u64 *other_parent)
485456f23fdbSFilipe Manana {
485556f23fdbSFilipe Manana 	int ret;
485656f23fdbSFilipe Manana 	struct btrfs_path *search_path;
485756f23fdbSFilipe Manana 	char *name = NULL;
485856f23fdbSFilipe Manana 	u32 name_len = 0;
485956f23fdbSFilipe Manana 	u32 item_size = btrfs_item_size_nr(eb, slot);
486056f23fdbSFilipe Manana 	u32 cur_offset = 0;
486156f23fdbSFilipe Manana 	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
486256f23fdbSFilipe Manana 
486356f23fdbSFilipe Manana 	search_path = btrfs_alloc_path();
486456f23fdbSFilipe Manana 	if (!search_path)
486556f23fdbSFilipe Manana 		return -ENOMEM;
486656f23fdbSFilipe Manana 	search_path->search_commit_root = 1;
486756f23fdbSFilipe Manana 	search_path->skip_locking = 1;
486856f23fdbSFilipe Manana 
486956f23fdbSFilipe Manana 	while (cur_offset < item_size) {
487056f23fdbSFilipe Manana 		u64 parent;
487156f23fdbSFilipe Manana 		u32 this_name_len;
487256f23fdbSFilipe Manana 		u32 this_len;
487356f23fdbSFilipe Manana 		unsigned long name_ptr;
487456f23fdbSFilipe Manana 		struct btrfs_dir_item *di;
487556f23fdbSFilipe Manana 
487656f23fdbSFilipe Manana 		if (key->type == BTRFS_INODE_REF_KEY) {
487756f23fdbSFilipe Manana 			struct btrfs_inode_ref *iref;
487856f23fdbSFilipe Manana 
487956f23fdbSFilipe Manana 			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
488056f23fdbSFilipe Manana 			parent = key->offset;
488156f23fdbSFilipe Manana 			this_name_len = btrfs_inode_ref_name_len(eb, iref);
488256f23fdbSFilipe Manana 			name_ptr = (unsigned long)(iref + 1);
488356f23fdbSFilipe Manana 			this_len = sizeof(*iref) + this_name_len;
488456f23fdbSFilipe Manana 		} else {
488556f23fdbSFilipe Manana 			struct btrfs_inode_extref *extref;
488656f23fdbSFilipe Manana 
488756f23fdbSFilipe Manana 			extref = (struct btrfs_inode_extref *)(ptr +
488856f23fdbSFilipe Manana 							       cur_offset);
488956f23fdbSFilipe Manana 			parent = btrfs_inode_extref_parent(eb, extref);
489056f23fdbSFilipe Manana 			this_name_len = btrfs_inode_extref_name_len(eb, extref);
489156f23fdbSFilipe Manana 			name_ptr = (unsigned long)&extref->name;
489256f23fdbSFilipe Manana 			this_len = sizeof(*extref) + this_name_len;
489356f23fdbSFilipe Manana 		}
489456f23fdbSFilipe Manana 
489556f23fdbSFilipe Manana 		if (this_name_len > name_len) {
489656f23fdbSFilipe Manana 			char *new_name;
489756f23fdbSFilipe Manana 
489856f23fdbSFilipe Manana 			new_name = krealloc(name, this_name_len, GFP_NOFS);
489956f23fdbSFilipe Manana 			if (!new_name) {
490056f23fdbSFilipe Manana 				ret = -ENOMEM;
490156f23fdbSFilipe Manana 				goto out;
490256f23fdbSFilipe Manana 			}
490356f23fdbSFilipe Manana 			name_len = this_name_len;
490456f23fdbSFilipe Manana 			name = new_name;
490556f23fdbSFilipe Manana 		}
490656f23fdbSFilipe Manana 
490756f23fdbSFilipe Manana 		read_extent_buffer(eb, name, name_ptr, this_name_len);
49084791c8f1SNikolay Borisov 		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
49094791c8f1SNikolay Borisov 				parent, name, this_name_len, 0);
491056f23fdbSFilipe Manana 		if (di && !IS_ERR(di)) {
491144f714daSFilipe Manana 			struct btrfs_key di_key;
491244f714daSFilipe Manana 
491344f714daSFilipe Manana 			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
491444f714daSFilipe Manana 						  di, &di_key);
491544f714daSFilipe Manana 			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
49166b5fc433SFilipe Manana 				if (di_key.objectid != key->objectid) {
491756f23fdbSFilipe Manana 					ret = 1;
491844f714daSFilipe Manana 					*other_ino = di_key.objectid;
4919a3baaf0dSFilipe Manana 					*other_parent = parent;
492044f714daSFilipe Manana 				} else {
49216b5fc433SFilipe Manana 					ret = 0;
49226b5fc433SFilipe Manana 				}
49236b5fc433SFilipe Manana 			} else {
492444f714daSFilipe Manana 				ret = -EAGAIN;
492544f714daSFilipe Manana 			}
492656f23fdbSFilipe Manana 			goto out;
492756f23fdbSFilipe Manana 		} else if (IS_ERR(di)) {
492856f23fdbSFilipe Manana 			ret = PTR_ERR(di);
492956f23fdbSFilipe Manana 			goto out;
493056f23fdbSFilipe Manana 		}
493156f23fdbSFilipe Manana 		btrfs_release_path(search_path);
493256f23fdbSFilipe Manana 
493356f23fdbSFilipe Manana 		cur_offset += this_len;
493456f23fdbSFilipe Manana 	}
493556f23fdbSFilipe Manana 	ret = 0;
493656f23fdbSFilipe Manana out:
493756f23fdbSFilipe Manana 	btrfs_free_path(search_path);
493856f23fdbSFilipe Manana 	kfree(name);
493956f23fdbSFilipe Manana 	return ret;
494056f23fdbSFilipe Manana }
494156f23fdbSFilipe Manana 
49426b5fc433SFilipe Manana struct btrfs_ino_list {
49436b5fc433SFilipe Manana 	u64 ino;
4944a3baaf0dSFilipe Manana 	u64 parent;
49456b5fc433SFilipe Manana 	struct list_head list;
49466b5fc433SFilipe Manana };
49476b5fc433SFilipe Manana 
49486b5fc433SFilipe Manana static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
49496b5fc433SFilipe Manana 				  struct btrfs_root *root,
49506b5fc433SFilipe Manana 				  struct btrfs_path *path,
49516b5fc433SFilipe Manana 				  struct btrfs_log_ctx *ctx,
4952a3baaf0dSFilipe Manana 				  u64 ino, u64 parent)
49536b5fc433SFilipe Manana {
49546b5fc433SFilipe Manana 	struct btrfs_ino_list *ino_elem;
49556b5fc433SFilipe Manana 	LIST_HEAD(inode_list);
49566b5fc433SFilipe Manana 	int ret = 0;
49576b5fc433SFilipe Manana 
49586b5fc433SFilipe Manana 	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
49596b5fc433SFilipe Manana 	if (!ino_elem)
49606b5fc433SFilipe Manana 		return -ENOMEM;
49616b5fc433SFilipe Manana 	ino_elem->ino = ino;
4962a3baaf0dSFilipe Manana 	ino_elem->parent = parent;
49636b5fc433SFilipe Manana 	list_add_tail(&ino_elem->list, &inode_list);
49646b5fc433SFilipe Manana 
49656b5fc433SFilipe Manana 	while (!list_empty(&inode_list)) {
49666b5fc433SFilipe Manana 		struct btrfs_fs_info *fs_info = root->fs_info;
49676b5fc433SFilipe Manana 		struct btrfs_key key;
49686b5fc433SFilipe Manana 		struct inode *inode;
49696b5fc433SFilipe Manana 
49706b5fc433SFilipe Manana 		ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
49716b5fc433SFilipe Manana 					    list);
49726b5fc433SFilipe Manana 		ino = ino_elem->ino;
4973a3baaf0dSFilipe Manana 		parent = ino_elem->parent;
49746b5fc433SFilipe Manana 		list_del(&ino_elem->list);
49756b5fc433SFilipe Manana 		kfree(ino_elem);
49766b5fc433SFilipe Manana 		if (ret)
49776b5fc433SFilipe Manana 			continue;
49786b5fc433SFilipe Manana 
49796b5fc433SFilipe Manana 		btrfs_release_path(path);
49806b5fc433SFilipe Manana 
49810202e83fSDavid Sterba 		inode = btrfs_iget(fs_info->sb, ino, root);
49826b5fc433SFilipe Manana 		/*
49836b5fc433SFilipe Manana 		 * If the other inode that had a conflicting dir entry was
4984a3baaf0dSFilipe Manana 		 * deleted in the current transaction, we need to log its parent
4985a3baaf0dSFilipe Manana 		 * directory.
49866b5fc433SFilipe Manana 		 */
49876b5fc433SFilipe Manana 		if (IS_ERR(inode)) {
49886b5fc433SFilipe Manana 			ret = PTR_ERR(inode);
4989a3baaf0dSFilipe Manana 			if (ret == -ENOENT) {
49900202e83fSDavid Sterba 				inode = btrfs_iget(fs_info->sb, parent, root);
4991a3baaf0dSFilipe Manana 				if (IS_ERR(inode)) {
4992a3baaf0dSFilipe Manana 					ret = PTR_ERR(inode);
4993a3baaf0dSFilipe Manana 				} else {
4994a3baaf0dSFilipe Manana 					ret = btrfs_log_inode(trans, root,
4995a3baaf0dSFilipe Manana 						      BTRFS_I(inode),
4996a3baaf0dSFilipe Manana 						      LOG_OTHER_INODE_ALL,
499748778179SFilipe Manana 						      ctx);
4998410f954cSFilipe Manana 					btrfs_add_delayed_iput(inode);
4999a3baaf0dSFilipe Manana 				}
5000a3baaf0dSFilipe Manana 			}
50016b5fc433SFilipe Manana 			continue;
50026b5fc433SFilipe Manana 		}
50036b5fc433SFilipe Manana 		/*
5004b5e4ff9dSFilipe Manana 		 * If the inode was already logged skip it - otherwise we can
5005b5e4ff9dSFilipe Manana 		 * hit an infinite loop. Example:
5006b5e4ff9dSFilipe Manana 		 *
5007b5e4ff9dSFilipe Manana 		 * From the commit root (previous transaction) we have the
5008b5e4ff9dSFilipe Manana 		 * following inodes:
5009b5e4ff9dSFilipe Manana 		 *
5010b5e4ff9dSFilipe Manana 		 * inode 257 a directory
5011b5e4ff9dSFilipe Manana 		 * inode 258 with references "zz" and "zz_link" on inode 257
5012b5e4ff9dSFilipe Manana 		 * inode 259 with reference "a" on inode 257
5013b5e4ff9dSFilipe Manana 		 *
5014b5e4ff9dSFilipe Manana 		 * And in the current (uncommitted) transaction we have:
5015b5e4ff9dSFilipe Manana 		 *
5016b5e4ff9dSFilipe Manana 		 * inode 257 a directory, unchanged
5017b5e4ff9dSFilipe Manana 		 * inode 258 with references "a" and "a2" on inode 257
5018b5e4ff9dSFilipe Manana 		 * inode 259 with reference "zz_link" on inode 257
5019b5e4ff9dSFilipe Manana 		 * inode 261 with reference "zz" on inode 257
5020b5e4ff9dSFilipe Manana 		 *
5021b5e4ff9dSFilipe Manana 		 * When logging inode 261 the following infinite loop could
5022b5e4ff9dSFilipe Manana 		 * happen if we don't skip already logged inodes:
5023b5e4ff9dSFilipe Manana 		 *
5024b5e4ff9dSFilipe Manana 		 * - we detect inode 258 as a conflicting inode, with inode 261
5025b5e4ff9dSFilipe Manana 		 *   on reference "zz", and log it;
5026b5e4ff9dSFilipe Manana 		 *
5027b5e4ff9dSFilipe Manana 		 * - we detect inode 259 as a conflicting inode, with inode 258
5028b5e4ff9dSFilipe Manana 		 *   on reference "a", and log it;
5029b5e4ff9dSFilipe Manana 		 *
5030b5e4ff9dSFilipe Manana 		 * - we detect inode 258 as a conflicting inode, with inode 259
5031b5e4ff9dSFilipe Manana 		 *   on reference "zz_link", and log it - again! After this we
5032b5e4ff9dSFilipe Manana 		 *   repeat the above steps forever.
5033b5e4ff9dSFilipe Manana 		 */
5034b5e4ff9dSFilipe Manana 		spin_lock(&BTRFS_I(inode)->lock);
5035b5e4ff9dSFilipe Manana 		/*
5036b5e4ff9dSFilipe Manana 		 * Check the inode's logged_trans only instead of
5037b5e4ff9dSFilipe Manana 		 * btrfs_inode_in_log(). This is because the last_log_commit of
5038b5e4ff9dSFilipe Manana 		 * the inode is not updated when we only log that it exists and
5039260db43cSRandy Dunlap 		 * it has the full sync bit set (see btrfs_log_inode()).
5040b5e4ff9dSFilipe Manana 		 */
5041b5e4ff9dSFilipe Manana 		if (BTRFS_I(inode)->logged_trans == trans->transid) {
5042b5e4ff9dSFilipe Manana 			spin_unlock(&BTRFS_I(inode)->lock);
5043b5e4ff9dSFilipe Manana 			btrfs_add_delayed_iput(inode);
5044b5e4ff9dSFilipe Manana 			continue;
5045b5e4ff9dSFilipe Manana 		}
5046b5e4ff9dSFilipe Manana 		spin_unlock(&BTRFS_I(inode)->lock);
5047b5e4ff9dSFilipe Manana 		/*
50486b5fc433SFilipe Manana 		 * We are safe logging the other inode without acquiring its
50496b5fc433SFilipe Manana 		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
50506b5fc433SFilipe Manana 		 * are safe against concurrent renames of the other inode as
50516b5fc433SFilipe Manana 		 * well because during a rename we pin the log and update the
50526b5fc433SFilipe Manana 		 * log with the new name before we unpin it.
50536b5fc433SFilipe Manana 		 */
50546b5fc433SFilipe Manana 		ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
505548778179SFilipe Manana 				      LOG_OTHER_INODE, ctx);
50566b5fc433SFilipe Manana 		if (ret) {
5057410f954cSFilipe Manana 			btrfs_add_delayed_iput(inode);
50586b5fc433SFilipe Manana 			continue;
50596b5fc433SFilipe Manana 		}
50606b5fc433SFilipe Manana 
50616b5fc433SFilipe Manana 		key.objectid = ino;
50626b5fc433SFilipe Manana 		key.type = BTRFS_INODE_REF_KEY;
50636b5fc433SFilipe Manana 		key.offset = 0;
50646b5fc433SFilipe Manana 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
50656b5fc433SFilipe Manana 		if (ret < 0) {
5066410f954cSFilipe Manana 			btrfs_add_delayed_iput(inode);
50676b5fc433SFilipe Manana 			continue;
50686b5fc433SFilipe Manana 		}
50696b5fc433SFilipe Manana 
50706b5fc433SFilipe Manana 		while (true) {
50716b5fc433SFilipe Manana 			struct extent_buffer *leaf = path->nodes[0];
50726b5fc433SFilipe Manana 			int slot = path->slots[0];
50736b5fc433SFilipe Manana 			u64 other_ino = 0;
5074a3baaf0dSFilipe Manana 			u64 other_parent = 0;
50756b5fc433SFilipe Manana 
50766b5fc433SFilipe Manana 			if (slot >= btrfs_header_nritems(leaf)) {
50776b5fc433SFilipe Manana 				ret = btrfs_next_leaf(root, path);
50786b5fc433SFilipe Manana 				if (ret < 0) {
50796b5fc433SFilipe Manana 					break;
50806b5fc433SFilipe Manana 				} else if (ret > 0) {
50816b5fc433SFilipe Manana 					ret = 0;
50826b5fc433SFilipe Manana 					break;
50836b5fc433SFilipe Manana 				}
50846b5fc433SFilipe Manana 				continue;
50856b5fc433SFilipe Manana 			}
50866b5fc433SFilipe Manana 
50876b5fc433SFilipe Manana 			btrfs_item_key_to_cpu(leaf, &key, slot);
50886b5fc433SFilipe Manana 			if (key.objectid != ino ||
50896b5fc433SFilipe Manana 			    (key.type != BTRFS_INODE_REF_KEY &&
50906b5fc433SFilipe Manana 			     key.type != BTRFS_INODE_EXTREF_KEY)) {
50916b5fc433SFilipe Manana 				ret = 0;
50926b5fc433SFilipe Manana 				break;
50936b5fc433SFilipe Manana 			}
50946b5fc433SFilipe Manana 
50956b5fc433SFilipe Manana 			ret = btrfs_check_ref_name_override(leaf, slot, &key,
5096a3baaf0dSFilipe Manana 					BTRFS_I(inode), &other_ino,
5097a3baaf0dSFilipe Manana 					&other_parent);
50986b5fc433SFilipe Manana 			if (ret < 0)
50996b5fc433SFilipe Manana 				break;
51006b5fc433SFilipe Manana 			if (ret > 0) {
51016b5fc433SFilipe Manana 				ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
51026b5fc433SFilipe Manana 				if (!ino_elem) {
51036b5fc433SFilipe Manana 					ret = -ENOMEM;
51046b5fc433SFilipe Manana 					break;
51056b5fc433SFilipe Manana 				}
51066b5fc433SFilipe Manana 				ino_elem->ino = other_ino;
5107a3baaf0dSFilipe Manana 				ino_elem->parent = other_parent;
51086b5fc433SFilipe Manana 				list_add_tail(&ino_elem->list, &inode_list);
51096b5fc433SFilipe Manana 				ret = 0;
51106b5fc433SFilipe Manana 			}
51116b5fc433SFilipe Manana 			path->slots[0]++;
51126b5fc433SFilipe Manana 		}
5113410f954cSFilipe Manana 		btrfs_add_delayed_iput(inode);
51146b5fc433SFilipe Manana 	}
51156b5fc433SFilipe Manana 
51166b5fc433SFilipe Manana 	return ret;
51176b5fc433SFilipe Manana }
51186b5fc433SFilipe Manana 
5119da447009SFilipe Manana static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5120da447009SFilipe Manana 				   struct btrfs_inode *inode,
5121da447009SFilipe Manana 				   struct btrfs_key *min_key,
5122da447009SFilipe Manana 				   const struct btrfs_key *max_key,
5123da447009SFilipe Manana 				   struct btrfs_path *path,
5124da447009SFilipe Manana 				   struct btrfs_path *dst_path,
5125da447009SFilipe Manana 				   const u64 logged_isize,
5126da447009SFilipe Manana 				   const bool recursive_logging,
5127da447009SFilipe Manana 				   const int inode_only,
5128da447009SFilipe Manana 				   struct btrfs_log_ctx *ctx,
5129da447009SFilipe Manana 				   bool *need_log_inode_item)
5130da447009SFilipe Manana {
5131da447009SFilipe Manana 	struct btrfs_root *root = inode->root;
5132da447009SFilipe Manana 	int ins_start_slot = 0;
5133da447009SFilipe Manana 	int ins_nr = 0;
5134da447009SFilipe Manana 	int ret;
5135da447009SFilipe Manana 
5136da447009SFilipe Manana 	while (1) {
5137da447009SFilipe Manana 		ret = btrfs_search_forward(root, min_key, path, trans->transid);
5138da447009SFilipe Manana 		if (ret < 0)
5139da447009SFilipe Manana 			return ret;
5140da447009SFilipe Manana 		if (ret > 0) {
5141da447009SFilipe Manana 			ret = 0;
5142da447009SFilipe Manana 			break;
5143da447009SFilipe Manana 		}
5144da447009SFilipe Manana again:
5145da447009SFilipe Manana 		/* Note, ins_nr might be > 0 here, cleanup outside the loop */
5146da447009SFilipe Manana 		if (min_key->objectid != max_key->objectid)
5147da447009SFilipe Manana 			break;
5148da447009SFilipe Manana 		if (min_key->type > max_key->type)
5149da447009SFilipe Manana 			break;
5150da447009SFilipe Manana 
5151da447009SFilipe Manana 		if (min_key->type == BTRFS_INODE_ITEM_KEY)
5152da447009SFilipe Manana 			*need_log_inode_item = false;
5153da447009SFilipe Manana 
5154da447009SFilipe Manana 		if ((min_key->type == BTRFS_INODE_REF_KEY ||
5155da447009SFilipe Manana 		     min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5156da447009SFilipe Manana 		    inode->generation == trans->transid &&
5157da447009SFilipe Manana 		    !recursive_logging) {
5158da447009SFilipe Manana 			u64 other_ino = 0;
5159da447009SFilipe Manana 			u64 other_parent = 0;
5160da447009SFilipe Manana 
5161da447009SFilipe Manana 			ret = btrfs_check_ref_name_override(path->nodes[0],
5162da447009SFilipe Manana 					path->slots[0], min_key, inode,
5163da447009SFilipe Manana 					&other_ino, &other_parent);
5164da447009SFilipe Manana 			if (ret < 0) {
5165da447009SFilipe Manana 				return ret;
5166da447009SFilipe Manana 			} else if (ret > 0 && ctx &&
5167da447009SFilipe Manana 				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5168da447009SFilipe Manana 				if (ins_nr > 0) {
5169da447009SFilipe Manana 					ins_nr++;
5170da447009SFilipe Manana 				} else {
5171da447009SFilipe Manana 					ins_nr = 1;
5172da447009SFilipe Manana 					ins_start_slot = path->slots[0];
5173da447009SFilipe Manana 				}
5174da447009SFilipe Manana 				ret = copy_items(trans, inode, dst_path, path,
5175da447009SFilipe Manana 						 ins_start_slot, ins_nr,
5176da447009SFilipe Manana 						 inode_only, logged_isize);
5177da447009SFilipe Manana 				if (ret < 0)
5178da447009SFilipe Manana 					return ret;
5179da447009SFilipe Manana 				ins_nr = 0;
5180da447009SFilipe Manana 
5181da447009SFilipe Manana 				ret = log_conflicting_inodes(trans, root, path,
5182da447009SFilipe Manana 						ctx, other_ino, other_parent);
5183da447009SFilipe Manana 				if (ret)
5184da447009SFilipe Manana 					return ret;
5185da447009SFilipe Manana 				btrfs_release_path(path);
5186da447009SFilipe Manana 				goto next_key;
5187da447009SFilipe Manana 			}
5188da447009SFilipe Manana 		}
5189da447009SFilipe Manana 
5190da447009SFilipe Manana 		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5191da447009SFilipe Manana 		if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5192da447009SFilipe Manana 			if (ins_nr == 0)
5193da447009SFilipe Manana 				goto next_slot;
5194da447009SFilipe Manana 			ret = copy_items(trans, inode, dst_path, path,
5195da447009SFilipe Manana 					 ins_start_slot,
5196da447009SFilipe Manana 					 ins_nr, inode_only, logged_isize);
5197da447009SFilipe Manana 			if (ret < 0)
5198da447009SFilipe Manana 				return ret;
5199da447009SFilipe Manana 			ins_nr = 0;
5200da447009SFilipe Manana 			goto next_slot;
5201da447009SFilipe Manana 		}
5202da447009SFilipe Manana 
5203da447009SFilipe Manana 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5204da447009SFilipe Manana 			ins_nr++;
5205da447009SFilipe Manana 			goto next_slot;
5206da447009SFilipe Manana 		} else if (!ins_nr) {
5207da447009SFilipe Manana 			ins_start_slot = path->slots[0];
5208da447009SFilipe Manana 			ins_nr = 1;
5209da447009SFilipe Manana 			goto next_slot;
5210da447009SFilipe Manana 		}
5211da447009SFilipe Manana 
5212da447009SFilipe Manana 		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5213da447009SFilipe Manana 				 ins_nr, inode_only, logged_isize);
5214da447009SFilipe Manana 		if (ret < 0)
5215da447009SFilipe Manana 			return ret;
5216da447009SFilipe Manana 		ins_nr = 1;
5217da447009SFilipe Manana 		ins_start_slot = path->slots[0];
5218da447009SFilipe Manana next_slot:
5219da447009SFilipe Manana 		path->slots[0]++;
5220da447009SFilipe Manana 		if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5221da447009SFilipe Manana 			btrfs_item_key_to_cpu(path->nodes[0], min_key,
5222da447009SFilipe Manana 					      path->slots[0]);
5223da447009SFilipe Manana 			goto again;
5224da447009SFilipe Manana 		}
5225da447009SFilipe Manana 		if (ins_nr) {
5226da447009SFilipe Manana 			ret = copy_items(trans, inode, dst_path, path,
5227da447009SFilipe Manana 					 ins_start_slot, ins_nr, inode_only,
5228da447009SFilipe Manana 					 logged_isize);
5229da447009SFilipe Manana 			if (ret < 0)
5230da447009SFilipe Manana 				return ret;
5231da447009SFilipe Manana 			ins_nr = 0;
5232da447009SFilipe Manana 		}
5233da447009SFilipe Manana 		btrfs_release_path(path);
5234da447009SFilipe Manana next_key:
5235da447009SFilipe Manana 		if (min_key->offset < (u64)-1) {
5236da447009SFilipe Manana 			min_key->offset++;
5237da447009SFilipe Manana 		} else if (min_key->type < max_key->type) {
5238da447009SFilipe Manana 			min_key->type++;
5239da447009SFilipe Manana 			min_key->offset = 0;
5240da447009SFilipe Manana 		} else {
5241da447009SFilipe Manana 			break;
5242da447009SFilipe Manana 		}
5243da447009SFilipe Manana 	}
5244da447009SFilipe Manana 	if (ins_nr)
5245da447009SFilipe Manana 		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5246da447009SFilipe Manana 				 ins_nr, inode_only, logged_isize);
5247da447009SFilipe Manana 
5248da447009SFilipe Manana 	return ret;
5249da447009SFilipe Manana }
5250da447009SFilipe Manana 
5251e02119d5SChris Mason /* log a single inode in the tree log.
5252e02119d5SChris Mason  * At least one parent directory for this inode must exist in the tree
5253e02119d5SChris Mason  * or be logged already.
5254e02119d5SChris Mason  *
5255e02119d5SChris Mason  * Any items from this inode changed by the current transaction are copied
5256e02119d5SChris Mason  * to the log tree.  An extra reference is taken on any extents in this
5257e02119d5SChris Mason  * file, allowing us to avoid a whole pile of corner cases around logging
5258e02119d5SChris Mason  * blocks that have been removed from the tree.
5259e02119d5SChris Mason  *
5260e02119d5SChris Mason  * See LOG_INODE_ALL and related defines for a description of what inode_only
5261e02119d5SChris Mason  * does.
5262e02119d5SChris Mason  *
5263e02119d5SChris Mason  * This handles both files and directories.
5264e02119d5SChris Mason  */
526512fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans,
5266a59108a7SNikolay Borisov 			   struct btrfs_root *root, struct btrfs_inode *inode,
526749dae1bcSFilipe Manana 			   int inode_only,
52688407f553SFilipe Manana 			   struct btrfs_log_ctx *ctx)
5269e02119d5SChris Mason {
5270e02119d5SChris Mason 	struct btrfs_path *path;
5271e02119d5SChris Mason 	struct btrfs_path *dst_path;
5272e02119d5SChris Mason 	struct btrfs_key min_key;
5273e02119d5SChris Mason 	struct btrfs_key max_key;
5274e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
52754a500fd1SYan, Zheng 	int err = 0;
52768c8648ddSFilipe Manana 	int ret = 0;
52775dc562c5SJosef Bacik 	bool fast_search = false;
5278a59108a7SNikolay Borisov 	u64 ino = btrfs_ino(inode);
5279a59108a7SNikolay Borisov 	struct extent_map_tree *em_tree = &inode->extent_tree;
52801a4bcf47SFilipe Manana 	u64 logged_isize = 0;
5281e4545de5SFilipe Manana 	bool need_log_inode_item = true;
52829a8fca62SFilipe Manana 	bool xattrs_logged = false;
5283a3baaf0dSFilipe Manana 	bool recursive_logging = false;
5284e02119d5SChris Mason 
5285e02119d5SChris Mason 	path = btrfs_alloc_path();
52865df67083STsutomu Itoh 	if (!path)
52875df67083STsutomu Itoh 		return -ENOMEM;
5288e02119d5SChris Mason 	dst_path = btrfs_alloc_path();
52895df67083STsutomu Itoh 	if (!dst_path) {
52905df67083STsutomu Itoh 		btrfs_free_path(path);
52915df67083STsutomu Itoh 		return -ENOMEM;
52925df67083STsutomu Itoh 	}
5293e02119d5SChris Mason 
529433345d01SLi Zefan 	min_key.objectid = ino;
5295e02119d5SChris Mason 	min_key.type = BTRFS_INODE_ITEM_KEY;
5296e02119d5SChris Mason 	min_key.offset = 0;
5297e02119d5SChris Mason 
529833345d01SLi Zefan 	max_key.objectid = ino;
529912fcfd22SChris Mason 
530012fcfd22SChris Mason 
53015dc562c5SJosef Bacik 	/* today the code can only do partial logging of directories */
5302a59108a7SNikolay Borisov 	if (S_ISDIR(inode->vfs_inode.i_mode) ||
53035269b67eSMiao Xie 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5304a59108a7SNikolay Borisov 		       &inode->runtime_flags) &&
5305781feef7SLiu Bo 	     inode_only >= LOG_INODE_EXISTS))
5306e02119d5SChris Mason 		max_key.type = BTRFS_XATTR_ITEM_KEY;
5307e02119d5SChris Mason 	else
5308e02119d5SChris Mason 		max_key.type = (u8)-1;
5309e02119d5SChris Mason 	max_key.offset = (u64)-1;
5310e02119d5SChris Mason 
53112c2c452bSFilipe Manana 	/*
53125aa7d1a7SFilipe Manana 	 * Only run delayed items if we are a directory. We want to make sure
53135aa7d1a7SFilipe Manana 	 * all directory indexes hit the fs/subvolume tree so we can find them
53145aa7d1a7SFilipe Manana 	 * and figure out which index ranges have to be logged.
53155aa7d1a7SFilipe Manana 	 *
53168c8648ddSFilipe Manana 	 * Otherwise commit the delayed inode only if the full sync flag is set,
53178c8648ddSFilipe Manana 	 * as we want to make sure an up to date version is in the subvolume
53188c8648ddSFilipe Manana 	 * tree so copy_inode_items_to_log() / copy_items() can find it and copy
53198c8648ddSFilipe Manana 	 * it to the log tree. For a non full sync, we always log the inode item
53208c8648ddSFilipe Manana 	 * based on the in-memory struct btrfs_inode which is always up to date.
53212c2c452bSFilipe Manana 	 */
53225aa7d1a7SFilipe Manana 	if (S_ISDIR(inode->vfs_inode.i_mode))
5323a59108a7SNikolay Borisov 		ret = btrfs_commit_inode_delayed_items(trans, inode);
53248c8648ddSFilipe Manana 	else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5325a59108a7SNikolay Borisov 		ret = btrfs_commit_inode_delayed_inode(inode);
53262c2c452bSFilipe Manana 
532716cdcec7SMiao Xie 	if (ret) {
532816cdcec7SMiao Xie 		btrfs_free_path(path);
532916cdcec7SMiao Xie 		btrfs_free_path(dst_path);
533016cdcec7SMiao Xie 		return ret;
533116cdcec7SMiao Xie 	}
533216cdcec7SMiao Xie 
5333a3baaf0dSFilipe Manana 	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5334a3baaf0dSFilipe Manana 		recursive_logging = true;
5335a3baaf0dSFilipe Manana 		if (inode_only == LOG_OTHER_INODE)
5336781feef7SLiu Bo 			inode_only = LOG_INODE_EXISTS;
5337a3baaf0dSFilipe Manana 		else
5338a3baaf0dSFilipe Manana 			inode_only = LOG_INODE_ALL;
5339a59108a7SNikolay Borisov 		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
5340781feef7SLiu Bo 	} else {
5341a59108a7SNikolay Borisov 		mutex_lock(&inode->log_mutex);
5342781feef7SLiu Bo 	}
5343e02119d5SChris Mason 
53445e33a2bdSFilipe Manana 	/*
534564d6b281SFilipe Manana 	 * This is for cases where logging a directory could result in losing a
534664d6b281SFilipe Manana 	 * file after replaying the log. For example, if we move a file from a
534764d6b281SFilipe Manana 	 * directory A to a directory B, then fsync directory A, we have no way
534864d6b281SFilipe Manana 	 * to know the file was moved from A to B, so logging just A would
534964d6b281SFilipe Manana 	 * result in losing the file after a log replay.
535064d6b281SFilipe Manana 	 */
535164d6b281SFilipe Manana 	if (S_ISDIR(inode->vfs_inode.i_mode) &&
535264d6b281SFilipe Manana 	    inode_only == LOG_INODE_ALL &&
535364d6b281SFilipe Manana 	    inode->last_unlink_trans >= trans->transid) {
535464d6b281SFilipe Manana 		btrfs_set_log_full_commit(trans);
535564d6b281SFilipe Manana 		err = 1;
535664d6b281SFilipe Manana 		goto out_unlock;
535764d6b281SFilipe Manana 	}
535864d6b281SFilipe Manana 
535964d6b281SFilipe Manana 	/*
5360e02119d5SChris Mason 	 * a brute force approach to making sure we get the most uptodate
5361e02119d5SChris Mason 	 * copies of everything.
5362e02119d5SChris Mason 	 */
5363a59108a7SNikolay Borisov 	if (S_ISDIR(inode->vfs_inode.i_mode)) {
5364e02119d5SChris Mason 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5365e02119d5SChris Mason 
5366ab12313aSFilipe Manana 		clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
53674f764e51SFilipe Manana 		if (inode_only == LOG_INODE_EXISTS)
53684f764e51SFilipe Manana 			max_key_type = BTRFS_XATTR_ITEM_KEY;
536933345d01SLi Zefan 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
5370e02119d5SChris Mason 	} else {
53711a4bcf47SFilipe Manana 		if (inode_only == LOG_INODE_EXISTS) {
53721a4bcf47SFilipe Manana 			/*
53731a4bcf47SFilipe Manana 			 * Make sure the new inode item we write to the log has
53741a4bcf47SFilipe Manana 			 * the same isize as the current one (if it exists).
53751a4bcf47SFilipe Manana 			 * This is necessary to prevent data loss after log
53761a4bcf47SFilipe Manana 			 * replay, and also to prevent doing a wrong expanding
53771a4bcf47SFilipe Manana 			 * truncate - for e.g. create file, write 4K into offset
53781a4bcf47SFilipe Manana 			 * 0, fsync, write 4K into offset 4096, add hard link,
53791a4bcf47SFilipe Manana 			 * fsync some other file (to sync log), power fail - if
53801a4bcf47SFilipe Manana 			 * we use the inode's current i_size, after log replay
53811a4bcf47SFilipe Manana 			 * we get a 8Kb file, with the last 4Kb extent as a hole
53821a4bcf47SFilipe Manana 			 * (zeroes), as if an expanding truncate happened,
53831a4bcf47SFilipe Manana 			 * instead of getting a file of 4Kb only.
53841a4bcf47SFilipe Manana 			 */
5385a59108a7SNikolay Borisov 			err = logged_inode_size(log, inode, path, &logged_isize);
53861a4bcf47SFilipe Manana 			if (err)
53871a4bcf47SFilipe Manana 				goto out_unlock;
53881a4bcf47SFilipe Manana 		}
5389a742994aSFilipe Manana 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5390a59108a7SNikolay Borisov 			     &inode->runtime_flags)) {
5391a742994aSFilipe Manana 			if (inode_only == LOG_INODE_EXISTS) {
53924f764e51SFilipe Manana 				max_key.type = BTRFS_XATTR_ITEM_KEY;
5393a742994aSFilipe Manana 				ret = drop_objectid_items(trans, log, path, ino,
5394a742994aSFilipe Manana 							  max_key.type);
5395a742994aSFilipe Manana 			} else {
5396a742994aSFilipe Manana 				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5397a59108a7SNikolay Borisov 					  &inode->runtime_flags);
5398e9976151SJosef Bacik 				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5399a59108a7SNikolay Borisov 					  &inode->runtime_flags);
540028ed1345SChris Mason 				while(1) {
540128ed1345SChris Mason 					ret = btrfs_truncate_inode_items(trans,
540250743398SNikolay Borisov 						log, inode, 0, 0);
540328ed1345SChris Mason 					if (ret != -EAGAIN)
540428ed1345SChris Mason 						break;
540528ed1345SChris Mason 				}
5406a742994aSFilipe Manana 			}
54074f764e51SFilipe Manana 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5408a59108a7SNikolay Borisov 					      &inode->runtime_flags) ||
54096cfab851SJosef Bacik 			   inode_only == LOG_INODE_EXISTS) {
54104f764e51SFilipe Manana 			if (inode_only == LOG_INODE_ALL)
5411a95249b3SJosef Bacik 				fast_search = true;
5412a95249b3SJosef Bacik 			max_key.type = BTRFS_XATTR_ITEM_KEY;
5413a95249b3SJosef Bacik 			ret = drop_objectid_items(trans, log, path, ino,
5414a95249b3SJosef Bacik 						  max_key.type);
54155dc562c5SJosef Bacik 		} else {
5416183f37faSLiu Bo 			if (inode_only == LOG_INODE_ALL)
54175dc562c5SJosef Bacik 				fast_search = true;
5418a95249b3SJosef Bacik 			goto log_extents;
5419a95249b3SJosef Bacik 		}
5420a95249b3SJosef Bacik 
5421e02119d5SChris Mason 	}
54224a500fd1SYan, Zheng 	if (ret) {
54234a500fd1SYan, Zheng 		err = ret;
54244a500fd1SYan, Zheng 		goto out_unlock;
54254a500fd1SYan, Zheng 	}
5426e02119d5SChris Mason 
5427da447009SFilipe Manana 	err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5428da447009SFilipe Manana 				      path, dst_path, logged_isize,
54297af59743SFilipe Manana 				      recursive_logging, inode_only, ctx,
54307af59743SFilipe Manana 				      &need_log_inode_item);
543144f714daSFilipe Manana 	if (err)
543244f714daSFilipe Manana 		goto out_unlock;
54335dc562c5SJosef Bacik 
543436283bf7SFilipe Manana 	btrfs_release_path(path);
543536283bf7SFilipe Manana 	btrfs_release_path(dst_path);
5436a59108a7SNikolay Borisov 	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
543736283bf7SFilipe Manana 	if (err)
543836283bf7SFilipe Manana 		goto out_unlock;
54399a8fca62SFilipe Manana 	xattrs_logged = true;
5440a89ca6f2SFilipe Manana 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5441a89ca6f2SFilipe Manana 		btrfs_release_path(path);
5442a89ca6f2SFilipe Manana 		btrfs_release_path(dst_path);
54437af59743SFilipe Manana 		err = btrfs_log_holes(trans, root, inode, path);
5444a89ca6f2SFilipe Manana 		if (err)
5445a89ca6f2SFilipe Manana 			goto out_unlock;
5446a89ca6f2SFilipe Manana 	}
5447a95249b3SJosef Bacik log_extents:
5448f3b15ccdSJosef Bacik 	btrfs_release_path(path);
54495dc562c5SJosef Bacik 	btrfs_release_path(dst_path);
5450e4545de5SFilipe Manana 	if (need_log_inode_item) {
5451a59108a7SNikolay Borisov 		err = log_inode_item(trans, log, dst_path, inode);
54529a8fca62SFilipe Manana 		if (!err && !xattrs_logged) {
54539a8fca62SFilipe Manana 			err = btrfs_log_all_xattrs(trans, root, inode, path,
54549a8fca62SFilipe Manana 						   dst_path);
54559a8fca62SFilipe Manana 			btrfs_release_path(path);
54569a8fca62SFilipe Manana 		}
5457e4545de5SFilipe Manana 		if (err)
5458e4545de5SFilipe Manana 			goto out_unlock;
5459e4545de5SFilipe Manana 	}
5460f3b15ccdSJosef Bacik 	if (fast_search) {
5461a59108a7SNikolay Borisov 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
546248778179SFilipe Manana 						ctx);
54635dc562c5SJosef Bacik 		if (ret) {
54645dc562c5SJosef Bacik 			err = ret;
54655dc562c5SJosef Bacik 			goto out_unlock;
54665dc562c5SJosef Bacik 		}
5467d006a048SJosef Bacik 	} else if (inode_only == LOG_INODE_ALL) {
546806d3d22bSLiu Bo 		struct extent_map *em, *n;
546906d3d22bSLiu Bo 
547049dae1bcSFilipe Manana 		write_lock(&em_tree->lock);
547148778179SFilipe Manana 		list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
547206d3d22bSLiu Bo 			list_del_init(&em->list);
547349dae1bcSFilipe Manana 		write_unlock(&em_tree->lock);
54745dc562c5SJosef Bacik 	}
54755dc562c5SJosef Bacik 
5476a59108a7SNikolay Borisov 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5477a59108a7SNikolay Borisov 		ret = log_directory_changes(trans, root, inode, path, dst_path,
5478a59108a7SNikolay Borisov 					ctx);
54794a500fd1SYan, Zheng 		if (ret) {
54804a500fd1SYan, Zheng 			err = ret;
54814a500fd1SYan, Zheng 			goto out_unlock;
54824a500fd1SYan, Zheng 		}
5483e02119d5SChris Mason 	}
548449dae1bcSFilipe Manana 
5485d1d832a0SFilipe Manana 	/*
548675b463d2SFilipe Manana 	 * If we are logging that an ancestor inode exists as part of logging a
548775b463d2SFilipe Manana 	 * new name from a link or rename operation, don't mark the inode as
548875b463d2SFilipe Manana 	 * logged - otherwise if an explicit fsync is made against an ancestor,
548975b463d2SFilipe Manana 	 * the fsync considers the inode in the log and doesn't sync the log,
549075b463d2SFilipe Manana 	 * resulting in the ancestor missing after a power failure unless the
549175b463d2SFilipe Manana 	 * log was synced as part of an fsync against any other unrelated inode.
549275b463d2SFilipe Manana 	 * So keep it simple for this case and just don't flag the ancestors as
549375b463d2SFilipe Manana 	 * logged.
5494d1d832a0SFilipe Manana 	 */
549575b463d2SFilipe Manana 	if (!ctx ||
549675b463d2SFilipe Manana 	    !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
549775b463d2SFilipe Manana 	      &inode->vfs_inode != ctx->inode)) {
5498a59108a7SNikolay Borisov 		spin_lock(&inode->lock);
5499a59108a7SNikolay Borisov 		inode->logged_trans = trans->transid;
550075b463d2SFilipe Manana 		/*
550175b463d2SFilipe Manana 		 * Don't update last_log_commit if we logged that an inode exists
550275b463d2SFilipe Manana 		 * after it was loaded to memory (full_sync bit set).
550375b463d2SFilipe Manana 		 * This is to prevent data loss when we do a write to the inode,
550475b463d2SFilipe Manana 		 * then the inode gets evicted after all delalloc was flushed,
550575b463d2SFilipe Manana 		 * then we log it exists (due to a rename for example) and then
550675b463d2SFilipe Manana 		 * fsync it. This last fsync would do nothing (not logging the
550775b463d2SFilipe Manana 		 * extents previously written).
550875b463d2SFilipe Manana 		 */
5509d1d832a0SFilipe Manana 		if (inode_only != LOG_INODE_EXISTS ||
5510d1d832a0SFilipe Manana 		    !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5511a59108a7SNikolay Borisov 			inode->last_log_commit = inode->last_sub_trans;
5512a59108a7SNikolay Borisov 		spin_unlock(&inode->lock);
551375b463d2SFilipe Manana 	}
55144a500fd1SYan, Zheng out_unlock:
5515a59108a7SNikolay Borisov 	mutex_unlock(&inode->log_mutex);
5516e02119d5SChris Mason 
5517e02119d5SChris Mason 	btrfs_free_path(path);
5518e02119d5SChris Mason 	btrfs_free_path(dst_path);
55194a500fd1SYan, Zheng 	return err;
5520e02119d5SChris Mason }
5521e02119d5SChris Mason 
552212fcfd22SChris Mason /*
5523ab12313aSFilipe Manana  * Check if we need to log an inode. This is used in contexts where while
5524ab12313aSFilipe Manana  * logging an inode we need to log another inode (either that it exists or in
5525ab12313aSFilipe Manana  * full mode). This is used instead of btrfs_inode_in_log() because the later
5526ab12313aSFilipe Manana  * requires the inode to be in the log and have the log transaction committed,
5527ab12313aSFilipe Manana  * while here we do not care if the log transaction was already committed - our
5528ab12313aSFilipe Manana  * caller will commit the log later - and we want to avoid logging an inode
5529ab12313aSFilipe Manana  * multiple times when multiple tasks have joined the same log transaction.
5530ab12313aSFilipe Manana  */
5531ab12313aSFilipe Manana static bool need_log_inode(struct btrfs_trans_handle *trans,
5532ab12313aSFilipe Manana 			   struct btrfs_inode *inode)
5533ab12313aSFilipe Manana {
5534ab12313aSFilipe Manana 	/*
5535ab12313aSFilipe Manana 	 * If this inode does not have new/updated/deleted xattrs since the last
5536ab12313aSFilipe Manana 	 * time it was logged and is flagged as logged in the current transaction,
5537ab12313aSFilipe Manana 	 * we can skip logging it. As for new/deleted names, those are updated in
5538ab12313aSFilipe Manana 	 * the log by link/unlink/rename operations.
5539ab12313aSFilipe Manana 	 * In case the inode was logged and then evicted and reloaded, its
5540ab12313aSFilipe Manana 	 * logged_trans will be 0, in which case we have to fully log it since
5541ab12313aSFilipe Manana 	 * logged_trans is a transient field, not persisted.
5542ab12313aSFilipe Manana 	 */
5543ab12313aSFilipe Manana 	if (inode->logged_trans == trans->transid &&
5544ab12313aSFilipe Manana 	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5545ab12313aSFilipe Manana 		return false;
5546ab12313aSFilipe Manana 
5547ab12313aSFilipe Manana 	return true;
5548ab12313aSFilipe Manana }
5549ab12313aSFilipe Manana 
55502f2ff0eeSFilipe Manana struct btrfs_dir_list {
55512f2ff0eeSFilipe Manana 	u64 ino;
55522f2ff0eeSFilipe Manana 	struct list_head list;
55532f2ff0eeSFilipe Manana };
55542f2ff0eeSFilipe Manana 
55552f2ff0eeSFilipe Manana /*
55562f2ff0eeSFilipe Manana  * Log the inodes of the new dentries of a directory. See log_dir_items() for
55572f2ff0eeSFilipe Manana  * details about the why it is needed.
55582f2ff0eeSFilipe Manana  * This is a recursive operation - if an existing dentry corresponds to a
55592f2ff0eeSFilipe Manana  * directory, that directory's new entries are logged too (same behaviour as
55602f2ff0eeSFilipe Manana  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
55612f2ff0eeSFilipe Manana  * the dentries point to we do not lock their i_mutex, otherwise lockdep
55622f2ff0eeSFilipe Manana  * complains about the following circular lock dependency / possible deadlock:
55632f2ff0eeSFilipe Manana  *
55642f2ff0eeSFilipe Manana  *        CPU0                                        CPU1
55652f2ff0eeSFilipe Manana  *        ----                                        ----
55662f2ff0eeSFilipe Manana  * lock(&type->i_mutex_dir_key#3/2);
55672f2ff0eeSFilipe Manana  *                                            lock(sb_internal#2);
55682f2ff0eeSFilipe Manana  *                                            lock(&type->i_mutex_dir_key#3/2);
55692f2ff0eeSFilipe Manana  * lock(&sb->s_type->i_mutex_key#14);
55702f2ff0eeSFilipe Manana  *
55712f2ff0eeSFilipe Manana  * Where sb_internal is the lock (a counter that works as a lock) acquired by
55722f2ff0eeSFilipe Manana  * sb_start_intwrite() in btrfs_start_transaction().
55732f2ff0eeSFilipe Manana  * Not locking i_mutex of the inodes is still safe because:
55742f2ff0eeSFilipe Manana  *
55752f2ff0eeSFilipe Manana  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
55762f2ff0eeSFilipe Manana  *    that while logging the inode new references (names) are added or removed
55772f2ff0eeSFilipe Manana  *    from the inode, leaving the logged inode item with a link count that does
55782f2ff0eeSFilipe Manana  *    not match the number of logged inode reference items. This is fine because
55792f2ff0eeSFilipe Manana  *    at log replay time we compute the real number of links and correct the
55802f2ff0eeSFilipe Manana  *    link count in the inode item (see replay_one_buffer() and
55812f2ff0eeSFilipe Manana  *    link_to_fixup_dir());
55822f2ff0eeSFilipe Manana  *
55832f2ff0eeSFilipe Manana  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
55842f2ff0eeSFilipe Manana  *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
55852f2ff0eeSFilipe Manana  *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
55862f2ff0eeSFilipe Manana  *    has a size that doesn't match the sum of the lengths of all the logged
55872f2ff0eeSFilipe Manana  *    names. This does not result in a problem because if a dir_item key is
55882f2ff0eeSFilipe Manana  *    logged but its matching dir_index key is not logged, at log replay time we
55892f2ff0eeSFilipe Manana  *    don't use it to replay the respective name (see replay_one_name()). On the
55902f2ff0eeSFilipe Manana  *    other hand if only the dir_index key ends up being logged, the respective
55912f2ff0eeSFilipe Manana  *    name is added to the fs/subvol tree with both the dir_item and dir_index
55922f2ff0eeSFilipe Manana  *    keys created (see replay_one_name()).
55932f2ff0eeSFilipe Manana  *    The directory's inode item with a wrong i_size is not a problem as well,
55942f2ff0eeSFilipe Manana  *    since we don't use it at log replay time to set the i_size in the inode
55952f2ff0eeSFilipe Manana  *    item of the fs/subvol tree (see overwrite_item()).
55962f2ff0eeSFilipe Manana  */
55972f2ff0eeSFilipe Manana static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
55982f2ff0eeSFilipe Manana 				struct btrfs_root *root,
559951cc0d32SNikolay Borisov 				struct btrfs_inode *start_inode,
56002f2ff0eeSFilipe Manana 				struct btrfs_log_ctx *ctx)
56012f2ff0eeSFilipe Manana {
56020b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
56032f2ff0eeSFilipe Manana 	struct btrfs_root *log = root->log_root;
56042f2ff0eeSFilipe Manana 	struct btrfs_path *path;
56052f2ff0eeSFilipe Manana 	LIST_HEAD(dir_list);
56062f2ff0eeSFilipe Manana 	struct btrfs_dir_list *dir_elem;
56072f2ff0eeSFilipe Manana 	int ret = 0;
56082f2ff0eeSFilipe Manana 
56092f2ff0eeSFilipe Manana 	path = btrfs_alloc_path();
56102f2ff0eeSFilipe Manana 	if (!path)
56112f2ff0eeSFilipe Manana 		return -ENOMEM;
56122f2ff0eeSFilipe Manana 
56132f2ff0eeSFilipe Manana 	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
56142f2ff0eeSFilipe Manana 	if (!dir_elem) {
56152f2ff0eeSFilipe Manana 		btrfs_free_path(path);
56162f2ff0eeSFilipe Manana 		return -ENOMEM;
56172f2ff0eeSFilipe Manana 	}
561851cc0d32SNikolay Borisov 	dir_elem->ino = btrfs_ino(start_inode);
56192f2ff0eeSFilipe Manana 	list_add_tail(&dir_elem->list, &dir_list);
56202f2ff0eeSFilipe Manana 
56212f2ff0eeSFilipe Manana 	while (!list_empty(&dir_list)) {
56222f2ff0eeSFilipe Manana 		struct extent_buffer *leaf;
56232f2ff0eeSFilipe Manana 		struct btrfs_key min_key;
56242f2ff0eeSFilipe Manana 		int nritems;
56252f2ff0eeSFilipe Manana 		int i;
56262f2ff0eeSFilipe Manana 
56272f2ff0eeSFilipe Manana 		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
56282f2ff0eeSFilipe Manana 					    list);
56292f2ff0eeSFilipe Manana 		if (ret)
56302f2ff0eeSFilipe Manana 			goto next_dir_inode;
56312f2ff0eeSFilipe Manana 
56322f2ff0eeSFilipe Manana 		min_key.objectid = dir_elem->ino;
56332f2ff0eeSFilipe Manana 		min_key.type = BTRFS_DIR_ITEM_KEY;
56342f2ff0eeSFilipe Manana 		min_key.offset = 0;
56352f2ff0eeSFilipe Manana again:
56362f2ff0eeSFilipe Manana 		btrfs_release_path(path);
56372f2ff0eeSFilipe Manana 		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
56382f2ff0eeSFilipe Manana 		if (ret < 0) {
56392f2ff0eeSFilipe Manana 			goto next_dir_inode;
56402f2ff0eeSFilipe Manana 		} else if (ret > 0) {
56412f2ff0eeSFilipe Manana 			ret = 0;
56422f2ff0eeSFilipe Manana 			goto next_dir_inode;
56432f2ff0eeSFilipe Manana 		}
56442f2ff0eeSFilipe Manana 
56452f2ff0eeSFilipe Manana process_leaf:
56462f2ff0eeSFilipe Manana 		leaf = path->nodes[0];
56472f2ff0eeSFilipe Manana 		nritems = btrfs_header_nritems(leaf);
56482f2ff0eeSFilipe Manana 		for (i = path->slots[0]; i < nritems; i++) {
56492f2ff0eeSFilipe Manana 			struct btrfs_dir_item *di;
56502f2ff0eeSFilipe Manana 			struct btrfs_key di_key;
56512f2ff0eeSFilipe Manana 			struct inode *di_inode;
56522f2ff0eeSFilipe Manana 			struct btrfs_dir_list *new_dir_elem;
56532f2ff0eeSFilipe Manana 			int log_mode = LOG_INODE_EXISTS;
56542f2ff0eeSFilipe Manana 			int type;
56552f2ff0eeSFilipe Manana 
56562f2ff0eeSFilipe Manana 			btrfs_item_key_to_cpu(leaf, &min_key, i);
56572f2ff0eeSFilipe Manana 			if (min_key.objectid != dir_elem->ino ||
56582f2ff0eeSFilipe Manana 			    min_key.type != BTRFS_DIR_ITEM_KEY)
56592f2ff0eeSFilipe Manana 				goto next_dir_inode;
56602f2ff0eeSFilipe Manana 
56612f2ff0eeSFilipe Manana 			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
56622f2ff0eeSFilipe Manana 			type = btrfs_dir_type(leaf, di);
56632f2ff0eeSFilipe Manana 			if (btrfs_dir_transid(leaf, di) < trans->transid &&
56642f2ff0eeSFilipe Manana 			    type != BTRFS_FT_DIR)
56652f2ff0eeSFilipe Manana 				continue;
56662f2ff0eeSFilipe Manana 			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
56672f2ff0eeSFilipe Manana 			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
56682f2ff0eeSFilipe Manana 				continue;
56692f2ff0eeSFilipe Manana 
5670ec125cfbSRobbie Ko 			btrfs_release_path(path);
56710202e83fSDavid Sterba 			di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
56722f2ff0eeSFilipe Manana 			if (IS_ERR(di_inode)) {
56732f2ff0eeSFilipe Manana 				ret = PTR_ERR(di_inode);
56742f2ff0eeSFilipe Manana 				goto next_dir_inode;
56752f2ff0eeSFilipe Manana 			}
56762f2ff0eeSFilipe Manana 
56770e44cb3fSFilipe Manana 			if (!need_log_inode(trans, BTRFS_I(di_inode))) {
5678410f954cSFilipe Manana 				btrfs_add_delayed_iput(di_inode);
5679ec125cfbSRobbie Ko 				break;
56802f2ff0eeSFilipe Manana 			}
56812f2ff0eeSFilipe Manana 
56822f2ff0eeSFilipe Manana 			ctx->log_new_dentries = false;
56833f9749f6SFilipe Manana 			if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
56842f2ff0eeSFilipe Manana 				log_mode = LOG_INODE_ALL;
5685a59108a7SNikolay Borisov 			ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
568648778179SFilipe Manana 					      log_mode, ctx);
5687410f954cSFilipe Manana 			btrfs_add_delayed_iput(di_inode);
56882f2ff0eeSFilipe Manana 			if (ret)
56892f2ff0eeSFilipe Manana 				goto next_dir_inode;
56902f2ff0eeSFilipe Manana 			if (ctx->log_new_dentries) {
56912f2ff0eeSFilipe Manana 				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
56922f2ff0eeSFilipe Manana 						       GFP_NOFS);
56932f2ff0eeSFilipe Manana 				if (!new_dir_elem) {
56942f2ff0eeSFilipe Manana 					ret = -ENOMEM;
56952f2ff0eeSFilipe Manana 					goto next_dir_inode;
56962f2ff0eeSFilipe Manana 				}
56972f2ff0eeSFilipe Manana 				new_dir_elem->ino = di_key.objectid;
56982f2ff0eeSFilipe Manana 				list_add_tail(&new_dir_elem->list, &dir_list);
56992f2ff0eeSFilipe Manana 			}
57002f2ff0eeSFilipe Manana 			break;
57012f2ff0eeSFilipe Manana 		}
57022f2ff0eeSFilipe Manana 		if (i == nritems) {
57032f2ff0eeSFilipe Manana 			ret = btrfs_next_leaf(log, path);
57042f2ff0eeSFilipe Manana 			if (ret < 0) {
57052f2ff0eeSFilipe Manana 				goto next_dir_inode;
57062f2ff0eeSFilipe Manana 			} else if (ret > 0) {
57072f2ff0eeSFilipe Manana 				ret = 0;
57082f2ff0eeSFilipe Manana 				goto next_dir_inode;
57092f2ff0eeSFilipe Manana 			}
57102f2ff0eeSFilipe Manana 			goto process_leaf;
57112f2ff0eeSFilipe Manana 		}
57122f2ff0eeSFilipe Manana 		if (min_key.offset < (u64)-1) {
57132f2ff0eeSFilipe Manana 			min_key.offset++;
57142f2ff0eeSFilipe Manana 			goto again;
57152f2ff0eeSFilipe Manana 		}
57162f2ff0eeSFilipe Manana next_dir_inode:
57172f2ff0eeSFilipe Manana 		list_del(&dir_elem->list);
57182f2ff0eeSFilipe Manana 		kfree(dir_elem);
57192f2ff0eeSFilipe Manana 	}
57202f2ff0eeSFilipe Manana 
57212f2ff0eeSFilipe Manana 	btrfs_free_path(path);
57222f2ff0eeSFilipe Manana 	return ret;
57232f2ff0eeSFilipe Manana }
57242f2ff0eeSFilipe Manana 
572518aa0922SFilipe Manana static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5726d0a0b78dSNikolay Borisov 				 struct btrfs_inode *inode,
572718aa0922SFilipe Manana 				 struct btrfs_log_ctx *ctx)
572818aa0922SFilipe Manana {
57293ffbd68cSDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
573018aa0922SFilipe Manana 	int ret;
573118aa0922SFilipe Manana 	struct btrfs_path *path;
573218aa0922SFilipe Manana 	struct btrfs_key key;
5733d0a0b78dSNikolay Borisov 	struct btrfs_root *root = inode->root;
5734d0a0b78dSNikolay Borisov 	const u64 ino = btrfs_ino(inode);
573518aa0922SFilipe Manana 
573618aa0922SFilipe Manana 	path = btrfs_alloc_path();
573718aa0922SFilipe Manana 	if (!path)
573818aa0922SFilipe Manana 		return -ENOMEM;
573918aa0922SFilipe Manana 	path->skip_locking = 1;
574018aa0922SFilipe Manana 	path->search_commit_root = 1;
574118aa0922SFilipe Manana 
574218aa0922SFilipe Manana 	key.objectid = ino;
574318aa0922SFilipe Manana 	key.type = BTRFS_INODE_REF_KEY;
574418aa0922SFilipe Manana 	key.offset = 0;
574518aa0922SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
574618aa0922SFilipe Manana 	if (ret < 0)
574718aa0922SFilipe Manana 		goto out;
574818aa0922SFilipe Manana 
574918aa0922SFilipe Manana 	while (true) {
575018aa0922SFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
575118aa0922SFilipe Manana 		int slot = path->slots[0];
575218aa0922SFilipe Manana 		u32 cur_offset = 0;
575318aa0922SFilipe Manana 		u32 item_size;
575418aa0922SFilipe Manana 		unsigned long ptr;
575518aa0922SFilipe Manana 
575618aa0922SFilipe Manana 		if (slot >= btrfs_header_nritems(leaf)) {
575718aa0922SFilipe Manana 			ret = btrfs_next_leaf(root, path);
575818aa0922SFilipe Manana 			if (ret < 0)
575918aa0922SFilipe Manana 				goto out;
576018aa0922SFilipe Manana 			else if (ret > 0)
576118aa0922SFilipe Manana 				break;
576218aa0922SFilipe Manana 			continue;
576318aa0922SFilipe Manana 		}
576418aa0922SFilipe Manana 
576518aa0922SFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, slot);
576618aa0922SFilipe Manana 		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
576718aa0922SFilipe Manana 		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
576818aa0922SFilipe Manana 			break;
576918aa0922SFilipe Manana 
577018aa0922SFilipe Manana 		item_size = btrfs_item_size_nr(leaf, slot);
577118aa0922SFilipe Manana 		ptr = btrfs_item_ptr_offset(leaf, slot);
577218aa0922SFilipe Manana 		while (cur_offset < item_size) {
577318aa0922SFilipe Manana 			struct btrfs_key inode_key;
577418aa0922SFilipe Manana 			struct inode *dir_inode;
577518aa0922SFilipe Manana 
577618aa0922SFilipe Manana 			inode_key.type = BTRFS_INODE_ITEM_KEY;
577718aa0922SFilipe Manana 			inode_key.offset = 0;
577818aa0922SFilipe Manana 
577918aa0922SFilipe Manana 			if (key.type == BTRFS_INODE_EXTREF_KEY) {
578018aa0922SFilipe Manana 				struct btrfs_inode_extref *extref;
578118aa0922SFilipe Manana 
578218aa0922SFilipe Manana 				extref = (struct btrfs_inode_extref *)
578318aa0922SFilipe Manana 					(ptr + cur_offset);
578418aa0922SFilipe Manana 				inode_key.objectid = btrfs_inode_extref_parent(
578518aa0922SFilipe Manana 					leaf, extref);
578618aa0922SFilipe Manana 				cur_offset += sizeof(*extref);
578718aa0922SFilipe Manana 				cur_offset += btrfs_inode_extref_name_len(leaf,
578818aa0922SFilipe Manana 					extref);
578918aa0922SFilipe Manana 			} else {
579018aa0922SFilipe Manana 				inode_key.objectid = key.offset;
579118aa0922SFilipe Manana 				cur_offset = item_size;
579218aa0922SFilipe Manana 			}
579318aa0922SFilipe Manana 
57940202e83fSDavid Sterba 			dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
57950202e83fSDavid Sterba 					       root);
57960f375eedSFilipe Manana 			/*
57970f375eedSFilipe Manana 			 * If the parent inode was deleted, return an error to
57980f375eedSFilipe Manana 			 * fallback to a transaction commit. This is to prevent
57990f375eedSFilipe Manana 			 * getting an inode that was moved from one parent A to
58000f375eedSFilipe Manana 			 * a parent B, got its former parent A deleted and then
58010f375eedSFilipe Manana 			 * it got fsync'ed, from existing at both parents after
58020f375eedSFilipe Manana 			 * a log replay (and the old parent still existing).
58030f375eedSFilipe Manana 			 * Example:
58040f375eedSFilipe Manana 			 *
58050f375eedSFilipe Manana 			 * mkdir /mnt/A
58060f375eedSFilipe Manana 			 * mkdir /mnt/B
58070f375eedSFilipe Manana 			 * touch /mnt/B/bar
58080f375eedSFilipe Manana 			 * sync
58090f375eedSFilipe Manana 			 * mv /mnt/B/bar /mnt/A/bar
58100f375eedSFilipe Manana 			 * mv -T /mnt/A /mnt/B
58110f375eedSFilipe Manana 			 * fsync /mnt/B/bar
58120f375eedSFilipe Manana 			 * <power fail>
58130f375eedSFilipe Manana 			 *
58140f375eedSFilipe Manana 			 * If we ignore the old parent B which got deleted,
58150f375eedSFilipe Manana 			 * after a log replay we would have file bar linked
58160f375eedSFilipe Manana 			 * at both parents and the old parent B would still
58170f375eedSFilipe Manana 			 * exist.
58180f375eedSFilipe Manana 			 */
58190f375eedSFilipe Manana 			if (IS_ERR(dir_inode)) {
58200f375eedSFilipe Manana 				ret = PTR_ERR(dir_inode);
58210f375eedSFilipe Manana 				goto out;
58220f375eedSFilipe Manana 			}
582318aa0922SFilipe Manana 
58243e6a86a1SFilipe Manana 			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
58253e6a86a1SFilipe Manana 				btrfs_add_delayed_iput(dir_inode);
58263e6a86a1SFilipe Manana 				continue;
58273e6a86a1SFilipe Manana 			}
58283e6a86a1SFilipe Manana 
5829657ed1aaSFilipe Manana 			if (ctx)
5830657ed1aaSFilipe Manana 				ctx->log_new_dentries = false;
5831a59108a7SNikolay Borisov 			ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
583248778179SFilipe Manana 					      LOG_INODE_ALL, ctx);
5833657ed1aaSFilipe Manana 			if (!ret && ctx && ctx->log_new_dentries)
5834657ed1aaSFilipe Manana 				ret = log_new_dir_dentries(trans, root,
583551cc0d32SNikolay Borisov 						   BTRFS_I(dir_inode), ctx);
5836410f954cSFilipe Manana 			btrfs_add_delayed_iput(dir_inode);
583718aa0922SFilipe Manana 			if (ret)
583818aa0922SFilipe Manana 				goto out;
583918aa0922SFilipe Manana 		}
584018aa0922SFilipe Manana 		path->slots[0]++;
584118aa0922SFilipe Manana 	}
584218aa0922SFilipe Manana 	ret = 0;
584318aa0922SFilipe Manana out:
584418aa0922SFilipe Manana 	btrfs_free_path(path);
584518aa0922SFilipe Manana 	return ret;
584618aa0922SFilipe Manana }
584718aa0922SFilipe Manana 
5848b8aa330dSFilipe Manana static int log_new_ancestors(struct btrfs_trans_handle *trans,
5849b8aa330dSFilipe Manana 			     struct btrfs_root *root,
5850b8aa330dSFilipe Manana 			     struct btrfs_path *path,
5851b8aa330dSFilipe Manana 			     struct btrfs_log_ctx *ctx)
5852b8aa330dSFilipe Manana {
5853b8aa330dSFilipe Manana 	struct btrfs_key found_key;
5854b8aa330dSFilipe Manana 
5855b8aa330dSFilipe Manana 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5856b8aa330dSFilipe Manana 
5857b8aa330dSFilipe Manana 	while (true) {
5858b8aa330dSFilipe Manana 		struct btrfs_fs_info *fs_info = root->fs_info;
5859b8aa330dSFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
5860b8aa330dSFilipe Manana 		int slot = path->slots[0];
5861b8aa330dSFilipe Manana 		struct btrfs_key search_key;
5862b8aa330dSFilipe Manana 		struct inode *inode;
58630202e83fSDavid Sterba 		u64 ino;
5864b8aa330dSFilipe Manana 		int ret = 0;
5865b8aa330dSFilipe Manana 
5866b8aa330dSFilipe Manana 		btrfs_release_path(path);
5867b8aa330dSFilipe Manana 
58680202e83fSDavid Sterba 		ino = found_key.offset;
58690202e83fSDavid Sterba 
5870b8aa330dSFilipe Manana 		search_key.objectid = found_key.offset;
5871b8aa330dSFilipe Manana 		search_key.type = BTRFS_INODE_ITEM_KEY;
5872b8aa330dSFilipe Manana 		search_key.offset = 0;
58730202e83fSDavid Sterba 		inode = btrfs_iget(fs_info->sb, ino, root);
5874b8aa330dSFilipe Manana 		if (IS_ERR(inode))
5875b8aa330dSFilipe Manana 			return PTR_ERR(inode);
5876b8aa330dSFilipe Manana 
5877ab12313aSFilipe Manana 		if (BTRFS_I(inode)->generation >= trans->transid &&
5878ab12313aSFilipe Manana 		    need_log_inode(trans, BTRFS_I(inode)))
5879b8aa330dSFilipe Manana 			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
588048778179SFilipe Manana 					      LOG_INODE_EXISTS, ctx);
5881410f954cSFilipe Manana 		btrfs_add_delayed_iput(inode);
5882b8aa330dSFilipe Manana 		if (ret)
5883b8aa330dSFilipe Manana 			return ret;
5884b8aa330dSFilipe Manana 
5885b8aa330dSFilipe Manana 		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5886b8aa330dSFilipe Manana 			break;
5887b8aa330dSFilipe Manana 
5888b8aa330dSFilipe Manana 		search_key.type = BTRFS_INODE_REF_KEY;
5889b8aa330dSFilipe Manana 		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5890b8aa330dSFilipe Manana 		if (ret < 0)
5891b8aa330dSFilipe Manana 			return ret;
5892b8aa330dSFilipe Manana 
5893b8aa330dSFilipe Manana 		leaf = path->nodes[0];
5894b8aa330dSFilipe Manana 		slot = path->slots[0];
5895b8aa330dSFilipe Manana 		if (slot >= btrfs_header_nritems(leaf)) {
5896b8aa330dSFilipe Manana 			ret = btrfs_next_leaf(root, path);
5897b8aa330dSFilipe Manana 			if (ret < 0)
5898b8aa330dSFilipe Manana 				return ret;
5899b8aa330dSFilipe Manana 			else if (ret > 0)
5900b8aa330dSFilipe Manana 				return -ENOENT;
5901b8aa330dSFilipe Manana 			leaf = path->nodes[0];
5902b8aa330dSFilipe Manana 			slot = path->slots[0];
5903b8aa330dSFilipe Manana 		}
5904b8aa330dSFilipe Manana 
5905b8aa330dSFilipe Manana 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5906b8aa330dSFilipe Manana 		if (found_key.objectid != search_key.objectid ||
5907b8aa330dSFilipe Manana 		    found_key.type != BTRFS_INODE_REF_KEY)
5908b8aa330dSFilipe Manana 			return -ENOENT;
5909b8aa330dSFilipe Manana 	}
5910b8aa330dSFilipe Manana 	return 0;
5911b8aa330dSFilipe Manana }
5912b8aa330dSFilipe Manana 
5913b8aa330dSFilipe Manana static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
5914b8aa330dSFilipe Manana 				  struct btrfs_inode *inode,
5915b8aa330dSFilipe Manana 				  struct dentry *parent,
5916b8aa330dSFilipe Manana 				  struct btrfs_log_ctx *ctx)
5917b8aa330dSFilipe Manana {
5918b8aa330dSFilipe Manana 	struct btrfs_root *root = inode->root;
5919b8aa330dSFilipe Manana 	struct dentry *old_parent = NULL;
5920b8aa330dSFilipe Manana 	struct super_block *sb = inode->vfs_inode.i_sb;
5921b8aa330dSFilipe Manana 	int ret = 0;
5922b8aa330dSFilipe Manana 
5923b8aa330dSFilipe Manana 	while (true) {
5924b8aa330dSFilipe Manana 		if (!parent || d_really_is_negative(parent) ||
5925b8aa330dSFilipe Manana 		    sb != parent->d_sb)
5926b8aa330dSFilipe Manana 			break;
5927b8aa330dSFilipe Manana 
5928b8aa330dSFilipe Manana 		inode = BTRFS_I(d_inode(parent));
5929b8aa330dSFilipe Manana 		if (root != inode->root)
5930b8aa330dSFilipe Manana 			break;
5931b8aa330dSFilipe Manana 
5932ab12313aSFilipe Manana 		if (inode->generation >= trans->transid &&
5933ab12313aSFilipe Manana 		    need_log_inode(trans, inode)) {
5934b8aa330dSFilipe Manana 			ret = btrfs_log_inode(trans, root, inode,
593548778179SFilipe Manana 					      LOG_INODE_EXISTS, ctx);
5936b8aa330dSFilipe Manana 			if (ret)
5937b8aa330dSFilipe Manana 				break;
5938b8aa330dSFilipe Manana 		}
5939b8aa330dSFilipe Manana 		if (IS_ROOT(parent))
5940b8aa330dSFilipe Manana 			break;
5941b8aa330dSFilipe Manana 
5942b8aa330dSFilipe Manana 		parent = dget_parent(parent);
5943b8aa330dSFilipe Manana 		dput(old_parent);
5944b8aa330dSFilipe Manana 		old_parent = parent;
5945b8aa330dSFilipe Manana 	}
5946b8aa330dSFilipe Manana 	dput(old_parent);
5947b8aa330dSFilipe Manana 
5948b8aa330dSFilipe Manana 	return ret;
5949b8aa330dSFilipe Manana }
5950b8aa330dSFilipe Manana 
5951b8aa330dSFilipe Manana static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
5952b8aa330dSFilipe Manana 				 struct btrfs_inode *inode,
5953b8aa330dSFilipe Manana 				 struct dentry *parent,
5954b8aa330dSFilipe Manana 				 struct btrfs_log_ctx *ctx)
5955b8aa330dSFilipe Manana {
5956b8aa330dSFilipe Manana 	struct btrfs_root *root = inode->root;
5957b8aa330dSFilipe Manana 	const u64 ino = btrfs_ino(inode);
5958b8aa330dSFilipe Manana 	struct btrfs_path *path;
5959b8aa330dSFilipe Manana 	struct btrfs_key search_key;
5960b8aa330dSFilipe Manana 	int ret;
5961b8aa330dSFilipe Manana 
5962b8aa330dSFilipe Manana 	/*
5963b8aa330dSFilipe Manana 	 * For a single hard link case, go through a fast path that does not
5964b8aa330dSFilipe Manana 	 * need to iterate the fs/subvolume tree.
5965b8aa330dSFilipe Manana 	 */
5966b8aa330dSFilipe Manana 	if (inode->vfs_inode.i_nlink < 2)
5967b8aa330dSFilipe Manana 		return log_new_ancestors_fast(trans, inode, parent, ctx);
5968b8aa330dSFilipe Manana 
5969b8aa330dSFilipe Manana 	path = btrfs_alloc_path();
5970b8aa330dSFilipe Manana 	if (!path)
5971b8aa330dSFilipe Manana 		return -ENOMEM;
5972b8aa330dSFilipe Manana 
5973b8aa330dSFilipe Manana 	search_key.objectid = ino;
5974b8aa330dSFilipe Manana 	search_key.type = BTRFS_INODE_REF_KEY;
5975b8aa330dSFilipe Manana 	search_key.offset = 0;
5976b8aa330dSFilipe Manana again:
5977b8aa330dSFilipe Manana 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5978b8aa330dSFilipe Manana 	if (ret < 0)
5979b8aa330dSFilipe Manana 		goto out;
5980b8aa330dSFilipe Manana 	if (ret == 0)
5981b8aa330dSFilipe Manana 		path->slots[0]++;
5982b8aa330dSFilipe Manana 
5983b8aa330dSFilipe Manana 	while (true) {
5984b8aa330dSFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
5985b8aa330dSFilipe Manana 		int slot = path->slots[0];
5986b8aa330dSFilipe Manana 		struct btrfs_key found_key;
5987b8aa330dSFilipe Manana 
5988b8aa330dSFilipe Manana 		if (slot >= btrfs_header_nritems(leaf)) {
5989b8aa330dSFilipe Manana 			ret = btrfs_next_leaf(root, path);
5990b8aa330dSFilipe Manana 			if (ret < 0)
5991b8aa330dSFilipe Manana 				goto out;
5992b8aa330dSFilipe Manana 			else if (ret > 0)
5993b8aa330dSFilipe Manana 				break;
5994b8aa330dSFilipe Manana 			continue;
5995b8aa330dSFilipe Manana 		}
5996b8aa330dSFilipe Manana 
5997b8aa330dSFilipe Manana 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5998b8aa330dSFilipe Manana 		if (found_key.objectid != ino ||
5999b8aa330dSFilipe Manana 		    found_key.type > BTRFS_INODE_EXTREF_KEY)
6000b8aa330dSFilipe Manana 			break;
6001b8aa330dSFilipe Manana 
6002b8aa330dSFilipe Manana 		/*
6003b8aa330dSFilipe Manana 		 * Don't deal with extended references because they are rare
6004b8aa330dSFilipe Manana 		 * cases and too complex to deal with (we would need to keep
6005b8aa330dSFilipe Manana 		 * track of which subitem we are processing for each item in
6006b8aa330dSFilipe Manana 		 * this loop, etc). So just return some error to fallback to
6007b8aa330dSFilipe Manana 		 * a transaction commit.
6008b8aa330dSFilipe Manana 		 */
6009b8aa330dSFilipe Manana 		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6010b8aa330dSFilipe Manana 			ret = -EMLINK;
6011b8aa330dSFilipe Manana 			goto out;
6012b8aa330dSFilipe Manana 		}
6013b8aa330dSFilipe Manana 
6014b8aa330dSFilipe Manana 		/*
6015b8aa330dSFilipe Manana 		 * Logging ancestors needs to do more searches on the fs/subvol
6016b8aa330dSFilipe Manana 		 * tree, so it releases the path as needed to avoid deadlocks.
6017b8aa330dSFilipe Manana 		 * Keep track of the last inode ref key and resume from that key
6018b8aa330dSFilipe Manana 		 * after logging all new ancestors for the current hard link.
6019b8aa330dSFilipe Manana 		 */
6020b8aa330dSFilipe Manana 		memcpy(&search_key, &found_key, sizeof(search_key));
6021b8aa330dSFilipe Manana 
6022b8aa330dSFilipe Manana 		ret = log_new_ancestors(trans, root, path, ctx);
6023b8aa330dSFilipe Manana 		if (ret)
6024b8aa330dSFilipe Manana 			goto out;
6025b8aa330dSFilipe Manana 		btrfs_release_path(path);
6026b8aa330dSFilipe Manana 		goto again;
6027b8aa330dSFilipe Manana 	}
6028b8aa330dSFilipe Manana 	ret = 0;
6029b8aa330dSFilipe Manana out:
6030b8aa330dSFilipe Manana 	btrfs_free_path(path);
6031b8aa330dSFilipe Manana 	return ret;
6032b8aa330dSFilipe Manana }
6033b8aa330dSFilipe Manana 
6034e02119d5SChris Mason /*
6035e02119d5SChris Mason  * helper function around btrfs_log_inode to make sure newly created
6036e02119d5SChris Mason  * parent directories also end up in the log.  A minimal inode and backref
6037e02119d5SChris Mason  * only logging is done of any parent directories that are older than
6038e02119d5SChris Mason  * the last committed transaction
6039e02119d5SChris Mason  */
604048a3b636SEric Sandeen static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
604119df27a9SNikolay Borisov 				  struct btrfs_inode *inode,
604249dae1bcSFilipe Manana 				  struct dentry *parent,
604341a1eadaSEdmund Nadolski 				  int inode_only,
60448b050d35SMiao Xie 				  struct btrfs_log_ctx *ctx)
6045e02119d5SChris Mason {
6046f882274bSNikolay Borisov 	struct btrfs_root *root = inode->root;
60470b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
604812fcfd22SChris Mason 	int ret = 0;
60492f2ff0eeSFilipe Manana 	bool log_dentries = false;
605012fcfd22SChris Mason 
60510b246afaSJeff Mahoney 	if (btrfs_test_opt(fs_info, NOTREELOG)) {
60523a5e1404SSage Weil 		ret = 1;
60533a5e1404SSage Weil 		goto end_no_trans;
60543a5e1404SSage Weil 	}
60553a5e1404SSage Weil 
6056f882274bSNikolay Borisov 	if (btrfs_root_refs(&root->root_item) == 0) {
605776dda93cSYan, Zheng 		ret = 1;
605876dda93cSYan, Zheng 		goto end_no_trans;
605976dda93cSYan, Zheng 	}
606076dda93cSYan, Zheng 
6061f2d72f42SFilipe Manana 	/*
6062f2d72f42SFilipe Manana 	 * Skip already logged inodes or inodes corresponding to tmpfiles
6063f2d72f42SFilipe Manana 	 * (since logging them is pointless, a link count of 0 means they
6064f2d72f42SFilipe Manana 	 * will never be accessible).
6065f2d72f42SFilipe Manana 	 */
6066626e9f41SFilipe Manana 	if ((btrfs_inode_in_log(inode, trans->transid) &&
6067626e9f41SFilipe Manana 	     list_empty(&ctx->ordered_extents)) ||
6068f2d72f42SFilipe Manana 	    inode->vfs_inode.i_nlink == 0) {
6069257c62e1SChris Mason 		ret = BTRFS_NO_LOG_SYNC;
6070257c62e1SChris Mason 		goto end_no_trans;
6071257c62e1SChris Mason 	}
6072257c62e1SChris Mason 
60738b050d35SMiao Xie 	ret = start_log_trans(trans, root, ctx);
60744a500fd1SYan, Zheng 	if (ret)
6075e87ac136SMiao Xie 		goto end_no_trans;
607612fcfd22SChris Mason 
607748778179SFilipe Manana 	ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
60784a500fd1SYan, Zheng 	if (ret)
60794a500fd1SYan, Zheng 		goto end_trans;
6080e02119d5SChris Mason 
6081af4176b4SChris Mason 	/*
6082af4176b4SChris Mason 	 * for regular files, if its inode is already on disk, we don't
6083af4176b4SChris Mason 	 * have to worry about the parents at all.  This is because
6084af4176b4SChris Mason 	 * we can use the last_unlink_trans field to record renames
6085af4176b4SChris Mason 	 * and other fun in this file.
6086af4176b4SChris Mason 	 */
608719df27a9SNikolay Borisov 	if (S_ISREG(inode->vfs_inode.i_mode) &&
608847d3db41SFilipe Manana 	    inode->generation < trans->transid &&
608947d3db41SFilipe Manana 	    inode->last_unlink_trans < trans->transid) {
60904a500fd1SYan, Zheng 		ret = 0;
60914a500fd1SYan, Zheng 		goto end_trans;
60924a500fd1SYan, Zheng 	}
6093af4176b4SChris Mason 
609419df27a9SNikolay Borisov 	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
60952f2ff0eeSFilipe Manana 		log_dentries = true;
60962f2ff0eeSFilipe Manana 
609718aa0922SFilipe Manana 	/*
609801327610SNicholas D Steeves 	 * On unlink we must make sure all our current and old parent directory
609918aa0922SFilipe Manana 	 * inodes are fully logged. This is to prevent leaving dangling
610018aa0922SFilipe Manana 	 * directory index entries in directories that were our parents but are
610118aa0922SFilipe Manana 	 * not anymore. Not doing this results in old parent directory being
610218aa0922SFilipe Manana 	 * impossible to delete after log replay (rmdir will always fail with
610318aa0922SFilipe Manana 	 * error -ENOTEMPTY).
610418aa0922SFilipe Manana 	 *
610518aa0922SFilipe Manana 	 * Example 1:
610618aa0922SFilipe Manana 	 *
610718aa0922SFilipe Manana 	 * mkdir testdir
610818aa0922SFilipe Manana 	 * touch testdir/foo
610918aa0922SFilipe Manana 	 * ln testdir/foo testdir/bar
611018aa0922SFilipe Manana 	 * sync
611118aa0922SFilipe Manana 	 * unlink testdir/bar
611218aa0922SFilipe Manana 	 * xfs_io -c fsync testdir/foo
611318aa0922SFilipe Manana 	 * <power failure>
611418aa0922SFilipe Manana 	 * mount fs, triggers log replay
611518aa0922SFilipe Manana 	 *
611618aa0922SFilipe Manana 	 * If we don't log the parent directory (testdir), after log replay the
611718aa0922SFilipe Manana 	 * directory still has an entry pointing to the file inode using the bar
611818aa0922SFilipe Manana 	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
611918aa0922SFilipe Manana 	 * the file inode has a link count of 1.
612018aa0922SFilipe Manana 	 *
612118aa0922SFilipe Manana 	 * Example 2:
612218aa0922SFilipe Manana 	 *
612318aa0922SFilipe Manana 	 * mkdir testdir
612418aa0922SFilipe Manana 	 * touch foo
612518aa0922SFilipe Manana 	 * ln foo testdir/foo2
612618aa0922SFilipe Manana 	 * ln foo testdir/foo3
612718aa0922SFilipe Manana 	 * sync
612818aa0922SFilipe Manana 	 * unlink testdir/foo3
612918aa0922SFilipe Manana 	 * xfs_io -c fsync foo
613018aa0922SFilipe Manana 	 * <power failure>
613118aa0922SFilipe Manana 	 * mount fs, triggers log replay
613218aa0922SFilipe Manana 	 *
613318aa0922SFilipe Manana 	 * Similar as the first example, after log replay the parent directory
613418aa0922SFilipe Manana 	 * testdir still has an entry pointing to the inode file with name foo3
613518aa0922SFilipe Manana 	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
613618aa0922SFilipe Manana 	 * and has a link count of 2.
613718aa0922SFilipe Manana 	 */
613847d3db41SFilipe Manana 	if (inode->last_unlink_trans >= trans->transid) {
6139b8aa330dSFilipe Manana 		ret = btrfs_log_all_parents(trans, inode, ctx);
614018aa0922SFilipe Manana 		if (ret)
614118aa0922SFilipe Manana 			goto end_trans;
614218aa0922SFilipe Manana 	}
614318aa0922SFilipe Manana 
6144b8aa330dSFilipe Manana 	ret = log_all_new_ancestors(trans, inode, parent, ctx);
61454a500fd1SYan, Zheng 	if (ret)
61464a500fd1SYan, Zheng 		goto end_trans;
614712fcfd22SChris Mason 
61482f2ff0eeSFilipe Manana 	if (log_dentries)
6149b8aa330dSFilipe Manana 		ret = log_new_dir_dentries(trans, root, inode, ctx);
61502f2ff0eeSFilipe Manana 	else
615112fcfd22SChris Mason 		ret = 0;
61524a500fd1SYan, Zheng end_trans:
61534a500fd1SYan, Zheng 	if (ret < 0) {
615490787766SDavid Sterba 		btrfs_set_log_full_commit(trans);
61554a500fd1SYan, Zheng 		ret = 1;
61564a500fd1SYan, Zheng 	}
61578b050d35SMiao Xie 
61588b050d35SMiao Xie 	if (ret)
61598b050d35SMiao Xie 		btrfs_remove_log_ctx(root, ctx);
616012fcfd22SChris Mason 	btrfs_end_log_trans(root);
616112fcfd22SChris Mason end_no_trans:
616212fcfd22SChris Mason 	return ret;
6163e02119d5SChris Mason }
6164e02119d5SChris Mason 
6165e02119d5SChris Mason /*
6166e02119d5SChris Mason  * it is not safe to log dentry if the chunk root has added new
6167e02119d5SChris Mason  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
6168e02119d5SChris Mason  * If this returns 1, you must commit the transaction to safely get your
6169e02119d5SChris Mason  * data on disk.
6170e02119d5SChris Mason  */
6171e02119d5SChris Mason int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
6172e5b84f7aSNikolay Borisov 			  struct dentry *dentry,
61738b050d35SMiao Xie 			  struct btrfs_log_ctx *ctx)
6174e02119d5SChris Mason {
61756a912213SJosef Bacik 	struct dentry *parent = dget_parent(dentry);
61766a912213SJosef Bacik 	int ret;
61776a912213SJosef Bacik 
6178f882274bSNikolay Borisov 	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
617948778179SFilipe Manana 				     LOG_INODE_ALL, ctx);
61806a912213SJosef Bacik 	dput(parent);
61816a912213SJosef Bacik 
61826a912213SJosef Bacik 	return ret;
6183e02119d5SChris Mason }
6184e02119d5SChris Mason 
6185e02119d5SChris Mason /*
6186e02119d5SChris Mason  * should be called during mount to recover any replay any log trees
6187e02119d5SChris Mason  * from the FS
6188e02119d5SChris Mason  */
6189e02119d5SChris Mason int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
6190e02119d5SChris Mason {
6191e02119d5SChris Mason 	int ret;
6192e02119d5SChris Mason 	struct btrfs_path *path;
6193e02119d5SChris Mason 	struct btrfs_trans_handle *trans;
6194e02119d5SChris Mason 	struct btrfs_key key;
6195e02119d5SChris Mason 	struct btrfs_key found_key;
6196e02119d5SChris Mason 	struct btrfs_root *log;
6197e02119d5SChris Mason 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
6198e02119d5SChris Mason 	struct walk_control wc = {
6199e02119d5SChris Mason 		.process_func = process_one_buffer,
6200430a6626SDavid Sterba 		.stage = LOG_WALK_PIN_ONLY,
6201e02119d5SChris Mason 	};
6202e02119d5SChris Mason 
6203e02119d5SChris Mason 	path = btrfs_alloc_path();
6204db5b493aSTsutomu Itoh 	if (!path)
6205db5b493aSTsutomu Itoh 		return -ENOMEM;
6206db5b493aSTsutomu Itoh 
6207afcdd129SJosef Bacik 	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6208e02119d5SChris Mason 
62094a500fd1SYan, Zheng 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
621079787eaaSJeff Mahoney 	if (IS_ERR(trans)) {
621179787eaaSJeff Mahoney 		ret = PTR_ERR(trans);
621279787eaaSJeff Mahoney 		goto error;
621379787eaaSJeff Mahoney 	}
6214e02119d5SChris Mason 
6215e02119d5SChris Mason 	wc.trans = trans;
6216e02119d5SChris Mason 	wc.pin = 1;
6217e02119d5SChris Mason 
6218db5b493aSTsutomu Itoh 	ret = walk_log_tree(trans, log_root_tree, &wc);
621979787eaaSJeff Mahoney 	if (ret) {
62205d163e0eSJeff Mahoney 		btrfs_handle_fs_error(fs_info, ret,
62215d163e0eSJeff Mahoney 			"Failed to pin buffers while recovering log root tree.");
622279787eaaSJeff Mahoney 		goto error;
622379787eaaSJeff Mahoney 	}
6224e02119d5SChris Mason 
6225e02119d5SChris Mason again:
6226e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
6227e02119d5SChris Mason 	key.offset = (u64)-1;
6228962a298fSDavid Sterba 	key.type = BTRFS_ROOT_ITEM_KEY;
6229e02119d5SChris Mason 
6230e02119d5SChris Mason 	while (1) {
6231e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
623279787eaaSJeff Mahoney 
623379787eaaSJeff Mahoney 		if (ret < 0) {
623434d97007SAnand Jain 			btrfs_handle_fs_error(fs_info, ret,
623579787eaaSJeff Mahoney 				    "Couldn't find tree log root.");
623679787eaaSJeff Mahoney 			goto error;
623779787eaaSJeff Mahoney 		}
6238e02119d5SChris Mason 		if (ret > 0) {
6239e02119d5SChris Mason 			if (path->slots[0] == 0)
6240e02119d5SChris Mason 				break;
6241e02119d5SChris Mason 			path->slots[0]--;
6242e02119d5SChris Mason 		}
6243e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
6244e02119d5SChris Mason 				      path->slots[0]);
6245b3b4aa74SDavid Sterba 		btrfs_release_path(path);
6246e02119d5SChris Mason 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
6247e02119d5SChris Mason 			break;
6248e02119d5SChris Mason 
624962a2c73eSJosef Bacik 		log = btrfs_read_tree_root(log_root_tree, &found_key);
625079787eaaSJeff Mahoney 		if (IS_ERR(log)) {
625179787eaaSJeff Mahoney 			ret = PTR_ERR(log);
625234d97007SAnand Jain 			btrfs_handle_fs_error(fs_info, ret,
625379787eaaSJeff Mahoney 				    "Couldn't read tree log root.");
625479787eaaSJeff Mahoney 			goto error;
625579787eaaSJeff Mahoney 		}
6256e02119d5SChris Mason 
625756e9357aSDavid Sterba 		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
625856e9357aSDavid Sterba 						   true);
625979787eaaSJeff Mahoney 		if (IS_ERR(wc.replay_dest)) {
626079787eaaSJeff Mahoney 			ret = PTR_ERR(wc.replay_dest);
62619bc574deSJosef Bacik 
62629bc574deSJosef Bacik 			/*
62639bc574deSJosef Bacik 			 * We didn't find the subvol, likely because it was
62649bc574deSJosef Bacik 			 * deleted.  This is ok, simply skip this log and go to
62659bc574deSJosef Bacik 			 * the next one.
62669bc574deSJosef Bacik 			 *
62679bc574deSJosef Bacik 			 * We need to exclude the root because we can't have
62689bc574deSJosef Bacik 			 * other log replays overwriting this log as we'll read
62699bc574deSJosef Bacik 			 * it back in a few more times.  This will keep our
62709bc574deSJosef Bacik 			 * block from being modified, and we'll just bail for
62719bc574deSJosef Bacik 			 * each subsequent pass.
62729bc574deSJosef Bacik 			 */
62739bc574deSJosef Bacik 			if (ret == -ENOENT)
62749fce5704SNikolay Borisov 				ret = btrfs_pin_extent_for_log_replay(trans,
62759bc574deSJosef Bacik 							log->node->start,
62769bc574deSJosef Bacik 							log->node->len);
627700246528SJosef Bacik 			btrfs_put_root(log);
62789bc574deSJosef Bacik 
62799bc574deSJosef Bacik 			if (!ret)
62809bc574deSJosef Bacik 				goto next;
62815d163e0eSJeff Mahoney 			btrfs_handle_fs_error(fs_info, ret,
62825d163e0eSJeff Mahoney 				"Couldn't read target root for tree log recovery.");
628379787eaaSJeff Mahoney 			goto error;
628479787eaaSJeff Mahoney 		}
6285e02119d5SChris Mason 
628607d400a6SYan Zheng 		wc.replay_dest->log_root = log;
62872002ae11SJosef Bacik 		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
62882002ae11SJosef Bacik 		if (ret)
62892002ae11SJosef Bacik 			/* The loop needs to continue due to the root refs */
62902002ae11SJosef Bacik 			btrfs_handle_fs_error(fs_info, ret,
62912002ae11SJosef Bacik 				"failed to record the log root in transaction");
62922002ae11SJosef Bacik 		else
6293e02119d5SChris Mason 			ret = walk_log_tree(trans, log, &wc);
6294e02119d5SChris Mason 
6295b50c6e25SJosef Bacik 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6296e02119d5SChris Mason 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
6297e02119d5SChris Mason 						      path);
6298e02119d5SChris Mason 		}
6299e02119d5SChris Mason 
6300900c9981SLiu Bo 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6301900c9981SLiu Bo 			struct btrfs_root *root = wc.replay_dest;
6302900c9981SLiu Bo 
6303900c9981SLiu Bo 			btrfs_release_path(path);
6304900c9981SLiu Bo 
6305900c9981SLiu Bo 			/*
6306900c9981SLiu Bo 			 * We have just replayed everything, and the highest
6307900c9981SLiu Bo 			 * objectid of fs roots probably has changed in case
6308900c9981SLiu Bo 			 * some inode_item's got replayed.
6309900c9981SLiu Bo 			 *
6310900c9981SLiu Bo 			 * root->objectid_mutex is not acquired as log replay
6311900c9981SLiu Bo 			 * could only happen during mount.
6312900c9981SLiu Bo 			 */
6313453e4873SNikolay Borisov 			ret = btrfs_init_root_free_objectid(root);
6314900c9981SLiu Bo 		}
6315900c9981SLiu Bo 
631607d400a6SYan Zheng 		wc.replay_dest->log_root = NULL;
631700246528SJosef Bacik 		btrfs_put_root(wc.replay_dest);
631800246528SJosef Bacik 		btrfs_put_root(log);
6319e02119d5SChris Mason 
6320b50c6e25SJosef Bacik 		if (ret)
6321b50c6e25SJosef Bacik 			goto error;
63229bc574deSJosef Bacik next:
6323e02119d5SChris Mason 		if (found_key.offset == 0)
6324e02119d5SChris Mason 			break;
63259bc574deSJosef Bacik 		key.offset = found_key.offset - 1;
6326e02119d5SChris Mason 	}
6327b3b4aa74SDavid Sterba 	btrfs_release_path(path);
6328e02119d5SChris Mason 
6329e02119d5SChris Mason 	/* step one is to pin it all, step two is to replay just inodes */
6330e02119d5SChris Mason 	if (wc.pin) {
6331e02119d5SChris Mason 		wc.pin = 0;
6332e02119d5SChris Mason 		wc.process_func = replay_one_buffer;
6333e02119d5SChris Mason 		wc.stage = LOG_WALK_REPLAY_INODES;
6334e02119d5SChris Mason 		goto again;
6335e02119d5SChris Mason 	}
6336e02119d5SChris Mason 	/* step three is to replay everything */
6337e02119d5SChris Mason 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
6338e02119d5SChris Mason 		wc.stage++;
6339e02119d5SChris Mason 		goto again;
6340e02119d5SChris Mason 	}
6341e02119d5SChris Mason 
6342e02119d5SChris Mason 	btrfs_free_path(path);
6343e02119d5SChris Mason 
6344abefa55aSJosef Bacik 	/* step 4: commit the transaction, which also unpins the blocks */
63453a45bb20SJeff Mahoney 	ret = btrfs_commit_transaction(trans);
6346abefa55aSJosef Bacik 	if (ret)
6347abefa55aSJosef Bacik 		return ret;
6348abefa55aSJosef Bacik 
6349e02119d5SChris Mason 	log_root_tree->log_root = NULL;
6350afcdd129SJosef Bacik 	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
635100246528SJosef Bacik 	btrfs_put_root(log_root_tree);
635279787eaaSJeff Mahoney 
6353abefa55aSJosef Bacik 	return 0;
635479787eaaSJeff Mahoney error:
6355b50c6e25SJosef Bacik 	if (wc.trans)
63563a45bb20SJeff Mahoney 		btrfs_end_transaction(wc.trans);
635779787eaaSJeff Mahoney 	btrfs_free_path(path);
635879787eaaSJeff Mahoney 	return ret;
6359e02119d5SChris Mason }
636012fcfd22SChris Mason 
636112fcfd22SChris Mason /*
636212fcfd22SChris Mason  * there are some corner cases where we want to force a full
636312fcfd22SChris Mason  * commit instead of allowing a directory to be logged.
636412fcfd22SChris Mason  *
636512fcfd22SChris Mason  * They revolve around files there were unlinked from the directory, and
636612fcfd22SChris Mason  * this function updates the parent directory so that a full commit is
636712fcfd22SChris Mason  * properly done if it is fsync'd later after the unlinks are done.
63682be63d5cSFilipe Manana  *
63692be63d5cSFilipe Manana  * Must be called before the unlink operations (updates to the subvolume tree,
63702be63d5cSFilipe Manana  * inodes, etc) are done.
637112fcfd22SChris Mason  */
637212fcfd22SChris Mason void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
63734176bdbfSNikolay Borisov 			     struct btrfs_inode *dir, struct btrfs_inode *inode,
637412fcfd22SChris Mason 			     int for_rename)
637512fcfd22SChris Mason {
637612fcfd22SChris Mason 	/*
6377af4176b4SChris Mason 	 * when we're logging a file, if it hasn't been renamed
6378af4176b4SChris Mason 	 * or unlinked, and its inode is fully committed on disk,
6379af4176b4SChris Mason 	 * we don't have to worry about walking up the directory chain
6380af4176b4SChris Mason 	 * to log its parents.
6381af4176b4SChris Mason 	 *
6382af4176b4SChris Mason 	 * So, we use the last_unlink_trans field to put this transid
6383af4176b4SChris Mason 	 * into the file.  When the file is logged we check it and
6384af4176b4SChris Mason 	 * don't log the parents if the file is fully on disk.
6385af4176b4SChris Mason 	 */
63864176bdbfSNikolay Borisov 	mutex_lock(&inode->log_mutex);
63874176bdbfSNikolay Borisov 	inode->last_unlink_trans = trans->transid;
63884176bdbfSNikolay Borisov 	mutex_unlock(&inode->log_mutex);
6389af4176b4SChris Mason 
6390af4176b4SChris Mason 	/*
639112fcfd22SChris Mason 	 * if this directory was already logged any new
639212fcfd22SChris Mason 	 * names for this file/dir will get recorded
639312fcfd22SChris Mason 	 */
63944176bdbfSNikolay Borisov 	if (dir->logged_trans == trans->transid)
639512fcfd22SChris Mason 		return;
639612fcfd22SChris Mason 
639712fcfd22SChris Mason 	/*
639812fcfd22SChris Mason 	 * if the inode we're about to unlink was logged,
639912fcfd22SChris Mason 	 * the log will be properly updated for any new names
640012fcfd22SChris Mason 	 */
64014176bdbfSNikolay Borisov 	if (inode->logged_trans == trans->transid)
640212fcfd22SChris Mason 		return;
640312fcfd22SChris Mason 
640412fcfd22SChris Mason 	/*
640512fcfd22SChris Mason 	 * when renaming files across directories, if the directory
640612fcfd22SChris Mason 	 * there we're unlinking from gets fsync'd later on, there's
640712fcfd22SChris Mason 	 * no way to find the destination directory later and fsync it
640812fcfd22SChris Mason 	 * properly.  So, we have to be conservative and force commits
640912fcfd22SChris Mason 	 * so the new name gets discovered.
641012fcfd22SChris Mason 	 */
641112fcfd22SChris Mason 	if (for_rename)
641212fcfd22SChris Mason 		goto record;
641312fcfd22SChris Mason 
641412fcfd22SChris Mason 	/* we can safely do the unlink without any special recording */
641512fcfd22SChris Mason 	return;
641612fcfd22SChris Mason 
641712fcfd22SChris Mason record:
64184176bdbfSNikolay Borisov 	mutex_lock(&dir->log_mutex);
64194176bdbfSNikolay Borisov 	dir->last_unlink_trans = trans->transid;
64204176bdbfSNikolay Borisov 	mutex_unlock(&dir->log_mutex);
642112fcfd22SChris Mason }
642212fcfd22SChris Mason 
642312fcfd22SChris Mason /*
64241ec9a1aeSFilipe Manana  * Make sure that if someone attempts to fsync the parent directory of a deleted
64251ec9a1aeSFilipe Manana  * snapshot, it ends up triggering a transaction commit. This is to guarantee
64261ec9a1aeSFilipe Manana  * that after replaying the log tree of the parent directory's root we will not
64271ec9a1aeSFilipe Manana  * see the snapshot anymore and at log replay time we will not see any log tree
64281ec9a1aeSFilipe Manana  * corresponding to the deleted snapshot's root, which could lead to replaying
64291ec9a1aeSFilipe Manana  * it after replaying the log tree of the parent directory (which would replay
64301ec9a1aeSFilipe Manana  * the snapshot delete operation).
64312be63d5cSFilipe Manana  *
64322be63d5cSFilipe Manana  * Must be called before the actual snapshot destroy operation (updates to the
64332be63d5cSFilipe Manana  * parent root and tree of tree roots trees, etc) are done.
64341ec9a1aeSFilipe Manana  */
64351ec9a1aeSFilipe Manana void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
643643663557SNikolay Borisov 				   struct btrfs_inode *dir)
64371ec9a1aeSFilipe Manana {
643843663557SNikolay Borisov 	mutex_lock(&dir->log_mutex);
643943663557SNikolay Borisov 	dir->last_unlink_trans = trans->transid;
644043663557SNikolay Borisov 	mutex_unlock(&dir->log_mutex);
64411ec9a1aeSFilipe Manana }
64421ec9a1aeSFilipe Manana 
64431ec9a1aeSFilipe Manana /*
644412fcfd22SChris Mason  * Call this after adding a new name for a file and it will properly
644512fcfd22SChris Mason  * update the log to reflect the new name.
644612fcfd22SChris Mason  */
644775b463d2SFilipe Manana void btrfs_log_new_name(struct btrfs_trans_handle *trans,
64489ca5fbfbSNikolay Borisov 			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
644975b463d2SFilipe Manana 			struct dentry *parent)
645012fcfd22SChris Mason {
645175b463d2SFilipe Manana 	struct btrfs_log_ctx ctx;
645212fcfd22SChris Mason 
645312fcfd22SChris Mason 	/*
6454af4176b4SChris Mason 	 * this will force the logging code to walk the dentry chain
6455af4176b4SChris Mason 	 * up for the file
6456af4176b4SChris Mason 	 */
64579a6509c4SFilipe Manana 	if (!S_ISDIR(inode->vfs_inode.i_mode))
64589ca5fbfbSNikolay Borisov 		inode->last_unlink_trans = trans->transid;
6459af4176b4SChris Mason 
6460af4176b4SChris Mason 	/*
646112fcfd22SChris Mason 	 * if this inode hasn't been logged and directory we're renaming it
646212fcfd22SChris Mason 	 * from hasn't been logged, we don't need to log it
646312fcfd22SChris Mason 	 */
6464de53d892SFilipe Manana 	if (inode->logged_trans < trans->transid &&
6465de53d892SFilipe Manana 	    (!old_dir || old_dir->logged_trans < trans->transid))
646675b463d2SFilipe Manana 		return;
646712fcfd22SChris Mason 
646854a40fc3SFilipe Manana 	/*
646954a40fc3SFilipe Manana 	 * If we are doing a rename (old_dir is not NULL) from a directory that
647054a40fc3SFilipe Manana 	 * was previously logged, make sure the next log attempt on the directory
647154a40fc3SFilipe Manana 	 * is not skipped and logs the inode again. This is because the log may
647254a40fc3SFilipe Manana 	 * not currently be authoritative for a range including the old
647354a40fc3SFilipe Manana 	 * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
647454a40fc3SFilipe Manana 	 * sure after a log replay we do not end up with both the new and old
647554a40fc3SFilipe Manana 	 * dentries around (in case the inode is a directory we would have a
647654a40fc3SFilipe Manana 	 * directory with two hard links and 2 inode references for different
647754a40fc3SFilipe Manana 	 * parents). The next log attempt of old_dir will happen at
647854a40fc3SFilipe Manana 	 * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
647954a40fc3SFilipe Manana 	 * below, because we have previously set inode->last_unlink_trans to the
648054a40fc3SFilipe Manana 	 * current transaction ID, either here or at btrfs_record_unlink_dir() in
648154a40fc3SFilipe Manana 	 * case inode is a directory.
648254a40fc3SFilipe Manana 	 */
648354a40fc3SFilipe Manana 	if (old_dir)
648454a40fc3SFilipe Manana 		old_dir->logged_trans = 0;
648554a40fc3SFilipe Manana 
648675b463d2SFilipe Manana 	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
648775b463d2SFilipe Manana 	ctx.logging_new_name = true;
648875b463d2SFilipe Manana 	/*
648975b463d2SFilipe Manana 	 * We don't care about the return value. If we fail to log the new name
649075b463d2SFilipe Manana 	 * then we know the next attempt to sync the log will fallback to a full
649175b463d2SFilipe Manana 	 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
649275b463d2SFilipe Manana 	 * we don't need to worry about getting a log committed that has an
649375b463d2SFilipe Manana 	 * inconsistent state after a rename operation.
649475b463d2SFilipe Manana 	 */
649548778179SFilipe Manana 	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
649612fcfd22SChris Mason }
649712fcfd22SChris Mason 
6498