xref: /openbmc/linux/fs/btrfs/tree-log.c (revision a95f3aafd6a2d0e8de834c95e91066825e3e7787)
1c1d7c514SDavid Sterba // SPDX-License-Identifier: GPL-2.0
2e02119d5SChris Mason /*
3e02119d5SChris Mason  * Copyright (C) 2008 Oracle.  All rights reserved.
4e02119d5SChris Mason  */
5e02119d5SChris Mason 
6e02119d5SChris Mason #include <linux/sched.h>
75a0e3ad6STejun Heo #include <linux/slab.h>
8c6adc9ccSMiao Xie #include <linux/blkdev.h>
95dc562c5SJosef Bacik #include <linux/list_sort.h>
10c7f88c4eSJeff Layton #include <linux/iversion.h>
119678c543SNikolay Borisov #include "ctree.h"
12995946ddSMiao Xie #include "tree-log.h"
13e02119d5SChris Mason #include "disk-io.h"
14e02119d5SChris Mason #include "locking.h"
15e02119d5SChris Mason #include "print-tree.h"
16f186373fSMark Fasheh #include "backref.h"
17ebb8765bSAnand Jain #include "compression.h"
18df2c95f3SQu Wenruo #include "qgroup.h"
19900c9981SLiu Bo #include "inode-map.h"
20e02119d5SChris Mason 
21e02119d5SChris Mason /* magic values for the inode_only field in btrfs_log_inode:
22e02119d5SChris Mason  *
23e02119d5SChris Mason  * LOG_INODE_ALL means to log everything
24e02119d5SChris Mason  * LOG_INODE_EXISTS means to log just enough to recreate the inode
25e02119d5SChris Mason  * during log replay
26e02119d5SChris Mason  */
27e02119d5SChris Mason #define LOG_INODE_ALL 0
28e02119d5SChris Mason #define LOG_INODE_EXISTS 1
29781feef7SLiu Bo #define LOG_OTHER_INODE 2
30e02119d5SChris Mason 
31e02119d5SChris Mason /*
3212fcfd22SChris Mason  * directory trouble cases
3312fcfd22SChris Mason  *
3412fcfd22SChris Mason  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
3512fcfd22SChris Mason  * log, we must force a full commit before doing an fsync of the directory
3612fcfd22SChris Mason  * where the unlink was done.
3712fcfd22SChris Mason  * ---> record transid of last unlink/rename per directory
3812fcfd22SChris Mason  *
3912fcfd22SChris Mason  * mkdir foo/some_dir
4012fcfd22SChris Mason  * normal commit
4112fcfd22SChris Mason  * rename foo/some_dir foo2/some_dir
4212fcfd22SChris Mason  * mkdir foo/some_dir
4312fcfd22SChris Mason  * fsync foo/some_dir/some_file
4412fcfd22SChris Mason  *
4512fcfd22SChris Mason  * The fsync above will unlink the original some_dir without recording
4612fcfd22SChris Mason  * it in its new location (foo2).  After a crash, some_dir will be gone
4712fcfd22SChris Mason  * unless the fsync of some_file forces a full commit
4812fcfd22SChris Mason  *
4912fcfd22SChris Mason  * 2) we must log any new names for any file or dir that is in the fsync
5012fcfd22SChris Mason  * log. ---> check inode while renaming/linking.
5112fcfd22SChris Mason  *
5212fcfd22SChris Mason  * 2a) we must log any new names for any file or dir during rename
5312fcfd22SChris Mason  * when the directory they are being removed from was logged.
5412fcfd22SChris Mason  * ---> check inode and old parent dir during rename
5512fcfd22SChris Mason  *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one
5812fcfd22SChris Mason  *
5912fcfd22SChris Mason  * 3) after a crash, we must go through any directories with a link count
6012fcfd22SChris Mason  * of zero and redo the rm -rf
6112fcfd22SChris Mason  *
6212fcfd22SChris Mason  * mkdir f1/foo
6312fcfd22SChris Mason  * normal commit
6412fcfd22SChris Mason  * rm -rf f1/foo
6512fcfd22SChris Mason  * fsync(f1)
6612fcfd22SChris Mason  *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on f1/foo, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
7212fcfd22SChris Mason  */
7312fcfd22SChris Mason 
7412fcfd22SChris Mason /*
75e02119d5SChris Mason  * stages for the tree walking.  The first
76e02119d5SChris Mason  * stage (0) is to only pin down the blocks we find
77e02119d5SChris Mason  * the second stage (1) is to make sure that all the inodes
78e02119d5SChris Mason  * we find in the log are created in the subvolume.
79e02119d5SChris Mason  *
80e02119d5SChris Mason  * The last stage is to deal with directories and links and extents
81e02119d5SChris Mason  * and all the other fun semantics
82e02119d5SChris Mason  */
83e02119d5SChris Mason #define LOG_WALK_PIN_ONLY 0
84e02119d5SChris Mason #define LOG_WALK_REPLAY_INODES 1
85dd8e7217SJosef Bacik #define LOG_WALK_REPLAY_DIR_INDEX 2
86dd8e7217SJosef Bacik #define LOG_WALK_REPLAY_ALL 3
87e02119d5SChris Mason 
8812fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans,
89a59108a7SNikolay Borisov 			   struct btrfs_root *root, struct btrfs_inode *inode,
9049dae1bcSFilipe Manana 			   int inode_only,
9149dae1bcSFilipe Manana 			   const loff_t start,
928407f553SFilipe Manana 			   const loff_t end,
938407f553SFilipe Manana 			   struct btrfs_log_ctx *ctx);
94ec051c0fSYan Zheng static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
95ec051c0fSYan Zheng 			     struct btrfs_root *root,
96ec051c0fSYan Zheng 			     struct btrfs_path *path, u64 objectid);
9712fcfd22SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
9812fcfd22SChris Mason 				       struct btrfs_root *root,
9912fcfd22SChris Mason 				       struct btrfs_root *log,
10012fcfd22SChris Mason 				       struct btrfs_path *path,
10112fcfd22SChris Mason 				       u64 dirid, int del_all);
102e02119d5SChris Mason 
103e02119d5SChris Mason /*
104e02119d5SChris Mason  * tree logging is a special write ahead log used to make sure that
105e02119d5SChris Mason  * fsyncs and O_SYNCs can happen without doing full tree commits.
106e02119d5SChris Mason  *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
110e02119d5SChris Mason  *
111e02119d5SChris Mason  * Instead of doing a tree commit on every fsync, we use the
112e02119d5SChris Mason  * key ranges and transaction ids to find items for a given file or directory
113e02119d5SChris Mason  * that have changed in this transaction.  Those items are copied into
114e02119d5SChris Mason  * a special tree (one per subvolume root), that tree is written to disk
115e02119d5SChris Mason  * and then the fsync is considered complete.
116e02119d5SChris Mason  *
117e02119d5SChris Mason  * After a crash, items are copied out of the log-tree back into the
118e02119d5SChris Mason  * subvolume tree.  Any file data extents found are recorded in the extent
119e02119d5SChris Mason  * allocation tree, and the log-tree freed.
120e02119d5SChris Mason  *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
124e02119d5SChris Mason  */
125e02119d5SChris Mason 
126e02119d5SChris Mason /*
127e02119d5SChris Mason  * start a sub transaction and setup the log tree
128e02119d5SChris Mason  * this increments the log tree writer count to make the people
129e02119d5SChris Mason  * syncing the tree wait for us to finish
130e02119d5SChris Mason  */
131e02119d5SChris Mason static int start_log_trans(struct btrfs_trans_handle *trans,
1328b050d35SMiao Xie 			   struct btrfs_root *root,
1338b050d35SMiao Xie 			   struct btrfs_log_ctx *ctx)
134e02119d5SChris Mason {
1350b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
13634eb2a52SZhaolei 	int ret = 0;
1377237f183SYan Zheng 
1387237f183SYan Zheng 	mutex_lock(&root->log_mutex);
13934eb2a52SZhaolei 
1407237f183SYan Zheng 	if (root->log_root) {
1410b246afaSJeff Mahoney 		if (btrfs_need_log_full_commit(fs_info, trans)) {
14250471a38SMiao Xie 			ret = -EAGAIN;
14350471a38SMiao Xie 			goto out;
14450471a38SMiao Xie 		}
14534eb2a52SZhaolei 
146ff782e0aSJosef Bacik 		if (!root->log_start_pid) {
14727cdeb70SMiao Xie 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
14834eb2a52SZhaolei 			root->log_start_pid = current->pid;
149ff782e0aSJosef Bacik 		} else if (root->log_start_pid != current->pid) {
15027cdeb70SMiao Xie 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
151ff782e0aSJosef Bacik 		}
15234eb2a52SZhaolei 	} else {
1530b246afaSJeff Mahoney 		mutex_lock(&fs_info->tree_log_mutex);
1540b246afaSJeff Mahoney 		if (!fs_info->log_root_tree)
1550b246afaSJeff Mahoney 			ret = btrfs_init_log_root_tree(trans, fs_info);
1560b246afaSJeff Mahoney 		mutex_unlock(&fs_info->tree_log_mutex);
1574a500fd1SYan, Zheng 		if (ret)
158e87ac136SMiao Xie 			goto out;
159e87ac136SMiao Xie 
160e02119d5SChris Mason 		ret = btrfs_add_log_tree(trans, root);
1614a500fd1SYan, Zheng 		if (ret)
162e87ac136SMiao Xie 			goto out;
16334eb2a52SZhaolei 
16427cdeb70SMiao Xie 		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
165e87ac136SMiao Xie 		root->log_start_pid = current->pid;
16634eb2a52SZhaolei 	}
16734eb2a52SZhaolei 
1682ecb7923SMiao Xie 	atomic_inc(&root->log_batch);
1697237f183SYan Zheng 	atomic_inc(&root->log_writers);
1708b050d35SMiao Xie 	if (ctx) {
17134eb2a52SZhaolei 		int index = root->log_transid % 2;
1728b050d35SMiao Xie 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
173d1433debSMiao Xie 		ctx->log_transid = root->log_transid;
1748b050d35SMiao Xie 	}
17534eb2a52SZhaolei 
176e87ac136SMiao Xie out:
1777237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
178e87ac136SMiao Xie 	return ret;
179e02119d5SChris Mason }
180e02119d5SChris Mason 
181e02119d5SChris Mason /*
182e02119d5SChris Mason  * returns 0 if there was a log transaction running and we were able
183e02119d5SChris Mason  * to join, or returns -ENOENT if there were not transactions
184e02119d5SChris Mason  * in progress
185e02119d5SChris Mason  */
186e02119d5SChris Mason static int join_running_log_trans(struct btrfs_root *root)
187e02119d5SChris Mason {
188e02119d5SChris Mason 	int ret = -ENOENT;
189e02119d5SChris Mason 
190e02119d5SChris Mason 	smp_mb();
191e02119d5SChris Mason 	if (!root->log_root)
192e02119d5SChris Mason 		return -ENOENT;
193e02119d5SChris Mason 
1947237f183SYan Zheng 	mutex_lock(&root->log_mutex);
195e02119d5SChris Mason 	if (root->log_root) {
196e02119d5SChris Mason 		ret = 0;
1977237f183SYan Zheng 		atomic_inc(&root->log_writers);
198e02119d5SChris Mason 	}
1997237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
200e02119d5SChris Mason 	return ret;
201e02119d5SChris Mason }
202e02119d5SChris Mason 
203e02119d5SChris Mason /*
20412fcfd22SChris Mason  * This either makes the current running log transaction wait
20512fcfd22SChris Mason  * until you call btrfs_end_log_trans() or it makes any future
20612fcfd22SChris Mason  * log transactions wait until you call btrfs_end_log_trans()
20712fcfd22SChris Mason  */
20812fcfd22SChris Mason int btrfs_pin_log_trans(struct btrfs_root *root)
20912fcfd22SChris Mason {
21012fcfd22SChris Mason 	int ret = -ENOENT;
21112fcfd22SChris Mason 
21212fcfd22SChris Mason 	mutex_lock(&root->log_mutex);
21312fcfd22SChris Mason 	atomic_inc(&root->log_writers);
21412fcfd22SChris Mason 	mutex_unlock(&root->log_mutex);
21512fcfd22SChris Mason 	return ret;
21612fcfd22SChris Mason }
21712fcfd22SChris Mason 
21812fcfd22SChris Mason /*
219e02119d5SChris Mason  * indicate we're done making changes to the log tree
220e02119d5SChris Mason  * and wake up anyone waiting to do a sync
221e02119d5SChris Mason  */
222143bede5SJeff Mahoney void btrfs_end_log_trans(struct btrfs_root *root)
223e02119d5SChris Mason {
2247237f183SYan Zheng 	if (atomic_dec_and_test(&root->log_writers)) {
225093258e6SDavid Sterba 		/* atomic_dec_and_test implies a barrier */
226093258e6SDavid Sterba 		cond_wake_up_nomb(&root->log_writer_wait);
2277237f183SYan Zheng 	}
228e02119d5SChris Mason }
229e02119d5SChris Mason 
230e02119d5SChris Mason 
231e02119d5SChris Mason /*
232e02119d5SChris Mason  * the walk control struct is used to pass state down the chain when
233e02119d5SChris Mason  * processing the log tree.  The stage field tells us which part
234e02119d5SChris Mason  * of the log tree processing we are currently doing.  The others
235e02119d5SChris Mason  * are state fields used for that specific part
236e02119d5SChris Mason  */
237e02119d5SChris Mason struct walk_control {
238e02119d5SChris Mason 	/* should we free the extent on disk when done?  This is used
239e02119d5SChris Mason 	 * at transaction commit time while freeing a log tree
240e02119d5SChris Mason 	 */
241e02119d5SChris Mason 	int free;
242e02119d5SChris Mason 
243e02119d5SChris Mason 	/* should we write out the extent buffer?  This is used
244e02119d5SChris Mason 	 * while flushing the log tree to disk during a sync
245e02119d5SChris Mason 	 */
246e02119d5SChris Mason 	int write;
247e02119d5SChris Mason 
248e02119d5SChris Mason 	/* should we wait for the extent buffer io to finish?  Also used
249e02119d5SChris Mason 	 * while flushing the log tree to disk for a sync
250e02119d5SChris Mason 	 */
251e02119d5SChris Mason 	int wait;
252e02119d5SChris Mason 
253e02119d5SChris Mason 	/* pin only walk, we record which extents on disk belong to the
254e02119d5SChris Mason 	 * log trees
255e02119d5SChris Mason 	 */
256e02119d5SChris Mason 	int pin;
257e02119d5SChris Mason 
258e02119d5SChris Mason 	/* what stage of the replay code we're currently in */
259e02119d5SChris Mason 	int stage;
260e02119d5SChris Mason 
261e02119d5SChris Mason 	/* the root we are currently replaying */
262e02119d5SChris Mason 	struct btrfs_root *replay_dest;
263e02119d5SChris Mason 
264e02119d5SChris Mason 	/* the trans handle for the current replay */
265e02119d5SChris Mason 	struct btrfs_trans_handle *trans;
266e02119d5SChris Mason 
267e02119d5SChris Mason 	/* the function that gets used to process blocks we find in the
268e02119d5SChris Mason 	 * tree.  Note the extent_buffer might not be up to date when it is
269e02119d5SChris Mason 	 * passed in, and it must be checked or read if you need the data
270e02119d5SChris Mason 	 * inside it
271e02119d5SChris Mason 	 */
272e02119d5SChris Mason 	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
273581c1760SQu Wenruo 			    struct walk_control *wc, u64 gen, int level);
274e02119d5SChris Mason };
275e02119d5SChris Mason 
276e02119d5SChris Mason /*
277e02119d5SChris Mason  * process_func used to pin down extents, write them or wait on them
278e02119d5SChris Mason  */
279e02119d5SChris Mason static int process_one_buffer(struct btrfs_root *log,
280e02119d5SChris Mason 			      struct extent_buffer *eb,
281581c1760SQu Wenruo 			      struct walk_control *wc, u64 gen, int level)
282e02119d5SChris Mason {
2830b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
284b50c6e25SJosef Bacik 	int ret = 0;
285b50c6e25SJosef Bacik 
2868c2a1a30SJosef Bacik 	/*
2878c2a1a30SJosef Bacik 	 * If this fs is mixed then we need to be able to process the leaves to
2888c2a1a30SJosef Bacik 	 * pin down any logged extents, so we have to read the block.
2898c2a1a30SJosef Bacik 	 */
2900b246afaSJeff Mahoney 	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
291581c1760SQu Wenruo 		ret = btrfs_read_buffer(eb, gen, level, NULL);
2928c2a1a30SJosef Bacik 		if (ret)
2938c2a1a30SJosef Bacik 			return ret;
2948c2a1a30SJosef Bacik 	}
2958c2a1a30SJosef Bacik 
29604018de5SJosef Bacik 	if (wc->pin)
2972ff7e61eSJeff Mahoney 		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
2982ff7e61eSJeff Mahoney 						      eb->len);
299e02119d5SChris Mason 
300b50c6e25SJosef Bacik 	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
3018c2a1a30SJosef Bacik 		if (wc->pin && btrfs_header_level(eb) == 0)
3022ff7e61eSJeff Mahoney 			ret = btrfs_exclude_logged_extents(fs_info, eb);
303e02119d5SChris Mason 		if (wc->write)
304e02119d5SChris Mason 			btrfs_write_tree_block(eb);
305e02119d5SChris Mason 		if (wc->wait)
306e02119d5SChris Mason 			btrfs_wait_tree_block_writeback(eb);
307e02119d5SChris Mason 	}
308b50c6e25SJosef Bacik 	return ret;
309e02119d5SChris Mason }
310e02119d5SChris Mason 
311e02119d5SChris Mason /*
312e02119d5SChris Mason  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
313e02119d5SChris Mason  * to the src data we are copying out.
314e02119d5SChris Mason  *
315e02119d5SChris Mason  * root is the tree we are copying into, and path is a scratch
316e02119d5SChris Mason  * path for use in this function (it should be released on entry and
317e02119d5SChris Mason  * will be released on exit).
318e02119d5SChris Mason  *
319e02119d5SChris Mason  * If the key is already in the destination tree the existing item is
320e02119d5SChris Mason  * overwritten.  If the existing item isn't big enough, it is extended.
321e02119d5SChris Mason  * If it is too large, it is truncated.
322e02119d5SChris Mason  *
323e02119d5SChris Mason  * If the key isn't in the destination yet, a new item is inserted.
324e02119d5SChris Mason  */
325e02119d5SChris Mason static noinline int overwrite_item(struct btrfs_trans_handle *trans,
326e02119d5SChris Mason 				   struct btrfs_root *root,
327e02119d5SChris Mason 				   struct btrfs_path *path,
328e02119d5SChris Mason 				   struct extent_buffer *eb, int slot,
329e02119d5SChris Mason 				   struct btrfs_key *key)
330e02119d5SChris Mason {
3312ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
332e02119d5SChris Mason 	int ret;
333e02119d5SChris Mason 	u32 item_size;
334e02119d5SChris Mason 	u64 saved_i_size = 0;
335e02119d5SChris Mason 	int save_old_i_size = 0;
336e02119d5SChris Mason 	unsigned long src_ptr;
337e02119d5SChris Mason 	unsigned long dst_ptr;
338e02119d5SChris Mason 	int overwrite_root = 0;
3394bc4bee4SJosef Bacik 	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
340e02119d5SChris Mason 
341e02119d5SChris Mason 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
342e02119d5SChris Mason 		overwrite_root = 1;
343e02119d5SChris Mason 
344e02119d5SChris Mason 	item_size = btrfs_item_size_nr(eb, slot);
345e02119d5SChris Mason 	src_ptr = btrfs_item_ptr_offset(eb, slot);
346e02119d5SChris Mason 
347e02119d5SChris Mason 	/* look for the key in the destination tree */
348e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
3494bc4bee4SJosef Bacik 	if (ret < 0)
3504bc4bee4SJosef Bacik 		return ret;
3514bc4bee4SJosef Bacik 
352e02119d5SChris Mason 	if (ret == 0) {
353e02119d5SChris Mason 		char *src_copy;
354e02119d5SChris Mason 		char *dst_copy;
355e02119d5SChris Mason 		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
356e02119d5SChris Mason 						  path->slots[0]);
357e02119d5SChris Mason 		if (dst_size != item_size)
358e02119d5SChris Mason 			goto insert;
359e02119d5SChris Mason 
360e02119d5SChris Mason 		if (item_size == 0) {
361b3b4aa74SDavid Sterba 			btrfs_release_path(path);
362e02119d5SChris Mason 			return 0;
363e02119d5SChris Mason 		}
364e02119d5SChris Mason 		dst_copy = kmalloc(item_size, GFP_NOFS);
365e02119d5SChris Mason 		src_copy = kmalloc(item_size, GFP_NOFS);
3662a29edc6Sliubo 		if (!dst_copy || !src_copy) {
367b3b4aa74SDavid Sterba 			btrfs_release_path(path);
3682a29edc6Sliubo 			kfree(dst_copy);
3692a29edc6Sliubo 			kfree(src_copy);
3702a29edc6Sliubo 			return -ENOMEM;
3712a29edc6Sliubo 		}
372e02119d5SChris Mason 
373e02119d5SChris Mason 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
374e02119d5SChris Mason 
375e02119d5SChris Mason 		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
376e02119d5SChris Mason 		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
377e02119d5SChris Mason 				   item_size);
378e02119d5SChris Mason 		ret = memcmp(dst_copy, src_copy, item_size);
379e02119d5SChris Mason 
380e02119d5SChris Mason 		kfree(dst_copy);
381e02119d5SChris Mason 		kfree(src_copy);
382e02119d5SChris Mason 		/*
383e02119d5SChris Mason 		 * they have the same contents, just return, this saves
384e02119d5SChris Mason 		 * us from cowing blocks in the destination tree and doing
385e02119d5SChris Mason 		 * extra writes that may not have been done by a previous
386e02119d5SChris Mason 		 * sync
387e02119d5SChris Mason 		 */
388e02119d5SChris Mason 		if (ret == 0) {
389b3b4aa74SDavid Sterba 			btrfs_release_path(path);
390e02119d5SChris Mason 			return 0;
391e02119d5SChris Mason 		}
392e02119d5SChris Mason 
3934bc4bee4SJosef Bacik 		/*
3944bc4bee4SJosef Bacik 		 * We need to load the old nbytes into the inode so when we
3954bc4bee4SJosef Bacik 		 * replay the extents we've logged we get the right nbytes.
3964bc4bee4SJosef Bacik 		 */
3974bc4bee4SJosef Bacik 		if (inode_item) {
3984bc4bee4SJosef Bacik 			struct btrfs_inode_item *item;
3994bc4bee4SJosef Bacik 			u64 nbytes;
400d555438bSJosef Bacik 			u32 mode;
4014bc4bee4SJosef Bacik 
4024bc4bee4SJosef Bacik 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4034bc4bee4SJosef Bacik 					      struct btrfs_inode_item);
4044bc4bee4SJosef Bacik 			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
4054bc4bee4SJosef Bacik 			item = btrfs_item_ptr(eb, slot,
4064bc4bee4SJosef Bacik 					      struct btrfs_inode_item);
4074bc4bee4SJosef Bacik 			btrfs_set_inode_nbytes(eb, item, nbytes);
408d555438bSJosef Bacik 
409d555438bSJosef Bacik 			/*
410d555438bSJosef Bacik 			 * If this is a directory we need to reset the i_size to
411d555438bSJosef Bacik 			 * 0 so that we can set it up properly when replaying
412d555438bSJosef Bacik 			 * the rest of the items in this log.
413d555438bSJosef Bacik 			 */
414d555438bSJosef Bacik 			mode = btrfs_inode_mode(eb, item);
415d555438bSJosef Bacik 			if (S_ISDIR(mode))
416d555438bSJosef Bacik 				btrfs_set_inode_size(eb, item, 0);
4174bc4bee4SJosef Bacik 		}
4184bc4bee4SJosef Bacik 	} else if (inode_item) {
4194bc4bee4SJosef Bacik 		struct btrfs_inode_item *item;
420d555438bSJosef Bacik 		u32 mode;
4214bc4bee4SJosef Bacik 
4224bc4bee4SJosef Bacik 		/*
4234bc4bee4SJosef Bacik 		 * New inode, set nbytes to 0 so that the nbytes comes out
4244bc4bee4SJosef Bacik 		 * properly when we replay the extents.
4254bc4bee4SJosef Bacik 		 */
4264bc4bee4SJosef Bacik 		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
4274bc4bee4SJosef Bacik 		btrfs_set_inode_nbytes(eb, item, 0);
428d555438bSJosef Bacik 
429d555438bSJosef Bacik 		/*
430d555438bSJosef Bacik 		 * If this is a directory we need to reset the i_size to 0 so
431d555438bSJosef Bacik 		 * that we can set it up properly when replaying the rest of
432d555438bSJosef Bacik 		 * the items in this log.
433d555438bSJosef Bacik 		 */
434d555438bSJosef Bacik 		mode = btrfs_inode_mode(eb, item);
435d555438bSJosef Bacik 		if (S_ISDIR(mode))
436d555438bSJosef Bacik 			btrfs_set_inode_size(eb, item, 0);
437e02119d5SChris Mason 	}
438e02119d5SChris Mason insert:
439b3b4aa74SDavid Sterba 	btrfs_release_path(path);
440e02119d5SChris Mason 	/* try to insert the key into the destination tree */
441df8d116fSFilipe Manana 	path->skip_release_on_error = 1;
442e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path,
443e02119d5SChris Mason 				      key, item_size);
444df8d116fSFilipe Manana 	path->skip_release_on_error = 0;
445e02119d5SChris Mason 
446e02119d5SChris Mason 	/* make sure any existing item is the correct size */
447df8d116fSFilipe Manana 	if (ret == -EEXIST || ret == -EOVERFLOW) {
448e02119d5SChris Mason 		u32 found_size;
449e02119d5SChris Mason 		found_size = btrfs_item_size_nr(path->nodes[0],
450e02119d5SChris Mason 						path->slots[0]);
451143bede5SJeff Mahoney 		if (found_size > item_size)
4522ff7e61eSJeff Mahoney 			btrfs_truncate_item(fs_info, path, item_size, 1);
453143bede5SJeff Mahoney 		else if (found_size < item_size)
4542ff7e61eSJeff Mahoney 			btrfs_extend_item(fs_info, path,
45587b29b20SYan Zheng 					  item_size - found_size);
456e02119d5SChris Mason 	} else if (ret) {
4574a500fd1SYan, Zheng 		return ret;
458e02119d5SChris Mason 	}
459e02119d5SChris Mason 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
460e02119d5SChris Mason 					path->slots[0]);
461e02119d5SChris Mason 
462e02119d5SChris Mason 	/* don't overwrite an existing inode if the generation number
463e02119d5SChris Mason 	 * was logged as zero.  This is done when the tree logging code
464e02119d5SChris Mason 	 * is just logging an inode to make sure it exists after recovery.
465e02119d5SChris Mason 	 *
466e02119d5SChris Mason 	 * Also, don't overwrite i_size on directories during replay.
467e02119d5SChris Mason 	 * log replay inserts and removes directory items based on the
468e02119d5SChris Mason 	 * state of the tree found in the subvolume, and i_size is modified
469e02119d5SChris Mason 	 * as it goes
470e02119d5SChris Mason 	 */
471e02119d5SChris Mason 	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
472e02119d5SChris Mason 		struct btrfs_inode_item *src_item;
473e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
474e02119d5SChris Mason 
475e02119d5SChris Mason 		src_item = (struct btrfs_inode_item *)src_ptr;
476e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
477e02119d5SChris Mason 
4781a4bcf47SFilipe Manana 		if (btrfs_inode_generation(eb, src_item) == 0) {
4791a4bcf47SFilipe Manana 			struct extent_buffer *dst_eb = path->nodes[0];
4802f2ff0eeSFilipe Manana 			const u64 ino_size = btrfs_inode_size(eb, src_item);
4811a4bcf47SFilipe Manana 
4822f2ff0eeSFilipe Manana 			/*
4832f2ff0eeSFilipe Manana 			 * For regular files an ino_size == 0 is used only when
4842f2ff0eeSFilipe Manana 			 * logging that an inode exists, as part of a directory
4852f2ff0eeSFilipe Manana 			 * fsync, and the inode wasn't fsynced before. In this
4862f2ff0eeSFilipe Manana 			 * case don't set the size of the inode in the fs/subvol
4872f2ff0eeSFilipe Manana 			 * tree, otherwise we would be throwing valid data away.
4882f2ff0eeSFilipe Manana 			 */
4891a4bcf47SFilipe Manana 			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
4902f2ff0eeSFilipe Manana 			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
4912f2ff0eeSFilipe Manana 			    ino_size != 0) {
4921a4bcf47SFilipe Manana 				struct btrfs_map_token token;
4931a4bcf47SFilipe Manana 
4941a4bcf47SFilipe Manana 				btrfs_init_map_token(&token);
4951a4bcf47SFilipe Manana 				btrfs_set_token_inode_size(dst_eb, dst_item,
4961a4bcf47SFilipe Manana 							   ino_size, &token);
4971a4bcf47SFilipe Manana 			}
498e02119d5SChris Mason 			goto no_copy;
4991a4bcf47SFilipe Manana 		}
500e02119d5SChris Mason 
501e02119d5SChris Mason 		if (overwrite_root &&
502e02119d5SChris Mason 		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
503e02119d5SChris Mason 		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
504e02119d5SChris Mason 			save_old_i_size = 1;
505e02119d5SChris Mason 			saved_i_size = btrfs_inode_size(path->nodes[0],
506e02119d5SChris Mason 							dst_item);
507e02119d5SChris Mason 		}
508e02119d5SChris Mason 	}
509e02119d5SChris Mason 
510e02119d5SChris Mason 	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
511e02119d5SChris Mason 			   src_ptr, item_size);
512e02119d5SChris Mason 
513e02119d5SChris Mason 	if (save_old_i_size) {
514e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
515e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
516e02119d5SChris Mason 		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
517e02119d5SChris Mason 	}
518e02119d5SChris Mason 
519e02119d5SChris Mason 	/* make sure the generation is filled in */
520e02119d5SChris Mason 	if (key->type == BTRFS_INODE_ITEM_KEY) {
521e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
522e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
523e02119d5SChris Mason 		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
524e02119d5SChris Mason 			btrfs_set_inode_generation(path->nodes[0], dst_item,
525e02119d5SChris Mason 						   trans->transid);
526e02119d5SChris Mason 		}
527e02119d5SChris Mason 	}
528e02119d5SChris Mason no_copy:
529e02119d5SChris Mason 	btrfs_mark_buffer_dirty(path->nodes[0]);
530b3b4aa74SDavid Sterba 	btrfs_release_path(path);
531e02119d5SChris Mason 	return 0;
532e02119d5SChris Mason }
533e02119d5SChris Mason 
534e02119d5SChris Mason /*
535e02119d5SChris Mason  * simple helper to read an inode off the disk from a given root
536e02119d5SChris Mason  * This can only be called for subvolume roots and not for the log
537e02119d5SChris Mason  */
538e02119d5SChris Mason static noinline struct inode *read_one_inode(struct btrfs_root *root,
539e02119d5SChris Mason 					     u64 objectid)
540e02119d5SChris Mason {
5415d4f98a2SYan Zheng 	struct btrfs_key key;
542e02119d5SChris Mason 	struct inode *inode;
543e02119d5SChris Mason 
5445d4f98a2SYan Zheng 	key.objectid = objectid;
5455d4f98a2SYan Zheng 	key.type = BTRFS_INODE_ITEM_KEY;
5465d4f98a2SYan Zheng 	key.offset = 0;
54773f73415SJosef Bacik 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
5485d4f98a2SYan Zheng 	if (IS_ERR(inode)) {
5495d4f98a2SYan Zheng 		inode = NULL;
5505d4f98a2SYan Zheng 	} else if (is_bad_inode(inode)) {
551e02119d5SChris Mason 		iput(inode);
552e02119d5SChris Mason 		inode = NULL;
553e02119d5SChris Mason 	}
554e02119d5SChris Mason 	return inode;
555e02119d5SChris Mason }
556e02119d5SChris Mason 
557e02119d5SChris Mason /* replays a single extent in 'eb' at 'slot' with 'key' into the
558e02119d5SChris Mason  * subvolume 'root'.  path is released on entry and should be released
559e02119d5SChris Mason  * on exit.
560e02119d5SChris Mason  *
561e02119d5SChris Mason  * extents in the log tree have not been allocated out of the extent
562e02119d5SChris Mason  * tree yet.  So, this completes the allocation, taking a reference
563e02119d5SChris Mason  * as required if the extent already exists or creating a new extent
564e02119d5SChris Mason  * if it isn't in the extent allocation tree yet.
565e02119d5SChris Mason  *
566e02119d5SChris Mason  * The extent is inserted into the file, dropping any existing extents
567e02119d5SChris Mason  * from the file that overlap the new one.
568e02119d5SChris Mason  */
569e02119d5SChris Mason static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
570e02119d5SChris Mason 				      struct btrfs_root *root,
571e02119d5SChris Mason 				      struct btrfs_path *path,
572e02119d5SChris Mason 				      struct extent_buffer *eb, int slot,
573e02119d5SChris Mason 				      struct btrfs_key *key)
574e02119d5SChris Mason {
5750b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
576e02119d5SChris Mason 	int found_type;
577e02119d5SChris Mason 	u64 extent_end;
578e02119d5SChris Mason 	u64 start = key->offset;
5794bc4bee4SJosef Bacik 	u64 nbytes = 0;
580e02119d5SChris Mason 	struct btrfs_file_extent_item *item;
581e02119d5SChris Mason 	struct inode *inode = NULL;
582e02119d5SChris Mason 	unsigned long size;
583e02119d5SChris Mason 	int ret = 0;
584e02119d5SChris Mason 
585e02119d5SChris Mason 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
586e02119d5SChris Mason 	found_type = btrfs_file_extent_type(eb, item);
587e02119d5SChris Mason 
588d899e052SYan Zheng 	if (found_type == BTRFS_FILE_EXTENT_REG ||
5894bc4bee4SJosef Bacik 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5904bc4bee4SJosef Bacik 		nbytes = btrfs_file_extent_num_bytes(eb, item);
5914bc4bee4SJosef Bacik 		extent_end = start + nbytes;
5924bc4bee4SJosef Bacik 
5934bc4bee4SJosef Bacik 		/*
5944bc4bee4SJosef Bacik 		 * We don't add to the inodes nbytes if we are prealloc or a
5954bc4bee4SJosef Bacik 		 * hole.
5964bc4bee4SJosef Bacik 		 */
5974bc4bee4SJosef Bacik 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5984bc4bee4SJosef Bacik 			nbytes = 0;
5994bc4bee4SJosef Bacik 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
600e41ca589SQu Wenruo 		size = btrfs_file_extent_ram_bytes(eb, item);
6014bc4bee4SJosef Bacik 		nbytes = btrfs_file_extent_ram_bytes(eb, item);
602da17066cSJeff Mahoney 		extent_end = ALIGN(start + size,
6030b246afaSJeff Mahoney 				   fs_info->sectorsize);
604e02119d5SChris Mason 	} else {
605e02119d5SChris Mason 		ret = 0;
606e02119d5SChris Mason 		goto out;
607e02119d5SChris Mason 	}
608e02119d5SChris Mason 
609e02119d5SChris Mason 	inode = read_one_inode(root, key->objectid);
610e02119d5SChris Mason 	if (!inode) {
611e02119d5SChris Mason 		ret = -EIO;
612e02119d5SChris Mason 		goto out;
613e02119d5SChris Mason 	}
614e02119d5SChris Mason 
615e02119d5SChris Mason 	/*
616e02119d5SChris Mason 	 * first check to see if we already have this extent in the
617e02119d5SChris Mason 	 * file.  This must be done before the btrfs_drop_extents run
618e02119d5SChris Mason 	 * so we don't try to drop this extent.
619e02119d5SChris Mason 	 */
620f85b7379SDavid Sterba 	ret = btrfs_lookup_file_extent(trans, root, path,
621f85b7379SDavid Sterba 			btrfs_ino(BTRFS_I(inode)), start, 0);
622e02119d5SChris Mason 
623d899e052SYan Zheng 	if (ret == 0 &&
624d899e052SYan Zheng 	    (found_type == BTRFS_FILE_EXTENT_REG ||
625d899e052SYan Zheng 	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
626e02119d5SChris Mason 		struct btrfs_file_extent_item cmp1;
627e02119d5SChris Mason 		struct btrfs_file_extent_item cmp2;
628e02119d5SChris Mason 		struct btrfs_file_extent_item *existing;
629e02119d5SChris Mason 		struct extent_buffer *leaf;
630e02119d5SChris Mason 
631e02119d5SChris Mason 		leaf = path->nodes[0];
632e02119d5SChris Mason 		existing = btrfs_item_ptr(leaf, path->slots[0],
633e02119d5SChris Mason 					  struct btrfs_file_extent_item);
634e02119d5SChris Mason 
635e02119d5SChris Mason 		read_extent_buffer(eb, &cmp1, (unsigned long)item,
636e02119d5SChris Mason 				   sizeof(cmp1));
637e02119d5SChris Mason 		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
638e02119d5SChris Mason 				   sizeof(cmp2));
639e02119d5SChris Mason 
640e02119d5SChris Mason 		/*
641e02119d5SChris Mason 		 * we already have a pointer to this exact extent,
642e02119d5SChris Mason 		 * we don't have to do anything
643e02119d5SChris Mason 		 */
644e02119d5SChris Mason 		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
645b3b4aa74SDavid Sterba 			btrfs_release_path(path);
646e02119d5SChris Mason 			goto out;
647e02119d5SChris Mason 		}
648e02119d5SChris Mason 	}
649b3b4aa74SDavid Sterba 	btrfs_release_path(path);
650e02119d5SChris Mason 
651e02119d5SChris Mason 	/* drop any overlapping extents */
6522671485dSJosef Bacik 	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
6533650860bSJosef Bacik 	if (ret)
6543650860bSJosef Bacik 		goto out;
655e02119d5SChris Mason 
65607d400a6SYan Zheng 	if (found_type == BTRFS_FILE_EXTENT_REG ||
65707d400a6SYan Zheng 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6585d4f98a2SYan Zheng 		u64 offset;
65907d400a6SYan Zheng 		unsigned long dest_offset;
66007d400a6SYan Zheng 		struct btrfs_key ins;
66107d400a6SYan Zheng 
6623168021cSFilipe Manana 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
6633168021cSFilipe Manana 		    btrfs_fs_incompat(fs_info, NO_HOLES))
6643168021cSFilipe Manana 			goto update_inode;
6653168021cSFilipe Manana 
66607d400a6SYan Zheng 		ret = btrfs_insert_empty_item(trans, root, path, key,
66707d400a6SYan Zheng 					      sizeof(*item));
6683650860bSJosef Bacik 		if (ret)
6693650860bSJosef Bacik 			goto out;
67007d400a6SYan Zheng 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
67107d400a6SYan Zheng 						    path->slots[0]);
67207d400a6SYan Zheng 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
67307d400a6SYan Zheng 				(unsigned long)item,  sizeof(*item));
67407d400a6SYan Zheng 
67507d400a6SYan Zheng 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
67607d400a6SYan Zheng 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
67707d400a6SYan Zheng 		ins.type = BTRFS_EXTENT_ITEM_KEY;
6785d4f98a2SYan Zheng 		offset = key->offset - btrfs_file_extent_offset(eb, item);
67907d400a6SYan Zheng 
680df2c95f3SQu Wenruo 		/*
681df2c95f3SQu Wenruo 		 * Manually record dirty extent, as here we did a shallow
682df2c95f3SQu Wenruo 		 * file extent item copy and skip normal backref update,
683df2c95f3SQu Wenruo 		 * but modifying extent tree all by ourselves.
684df2c95f3SQu Wenruo 		 * So need to manually record dirty extent for qgroup,
685df2c95f3SQu Wenruo 		 * as the owner of the file extent changed from log tree
686df2c95f3SQu Wenruo 		 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
687df2c95f3SQu Wenruo 		 */
688*a95f3aafSLu Fengqi 		ret = btrfs_qgroup_trace_extent(trans,
689df2c95f3SQu Wenruo 				btrfs_file_extent_disk_bytenr(eb, item),
690df2c95f3SQu Wenruo 				btrfs_file_extent_disk_num_bytes(eb, item),
691df2c95f3SQu Wenruo 				GFP_NOFS);
692df2c95f3SQu Wenruo 		if (ret < 0)
693df2c95f3SQu Wenruo 			goto out;
694df2c95f3SQu Wenruo 
69507d400a6SYan Zheng 		if (ins.objectid > 0) {
69607d400a6SYan Zheng 			u64 csum_start;
69707d400a6SYan Zheng 			u64 csum_end;
69807d400a6SYan Zheng 			LIST_HEAD(ordered_sums);
69907d400a6SYan Zheng 			/*
70007d400a6SYan Zheng 			 * is this extent already allocated in the extent
70107d400a6SYan Zheng 			 * allocation tree?  If so, just add a reference
70207d400a6SYan Zheng 			 */
7032ff7e61eSJeff Mahoney 			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
70407d400a6SYan Zheng 						ins.offset);
70507d400a6SYan Zheng 			if (ret == 0) {
70684f7d8e6SJosef Bacik 				ret = btrfs_inc_extent_ref(trans, root,
70707d400a6SYan Zheng 						ins.objectid, ins.offset,
7085d4f98a2SYan Zheng 						0, root->root_key.objectid,
709b06c4bf5SFilipe Manana 						key->objectid, offset);
710b50c6e25SJosef Bacik 				if (ret)
711b50c6e25SJosef Bacik 					goto out;
71207d400a6SYan Zheng 			} else {
71307d400a6SYan Zheng 				/*
71407d400a6SYan Zheng 				 * insert the extent pointer in the extent
71507d400a6SYan Zheng 				 * allocation tree
71607d400a6SYan Zheng 				 */
7175d4f98a2SYan Zheng 				ret = btrfs_alloc_logged_file_extent(trans,
7182ff7e61eSJeff Mahoney 						root->root_key.objectid,
7195d4f98a2SYan Zheng 						key->objectid, offset, &ins);
720b50c6e25SJosef Bacik 				if (ret)
721b50c6e25SJosef Bacik 					goto out;
72207d400a6SYan Zheng 			}
723b3b4aa74SDavid Sterba 			btrfs_release_path(path);
72407d400a6SYan Zheng 
72507d400a6SYan Zheng 			if (btrfs_file_extent_compression(eb, item)) {
72607d400a6SYan Zheng 				csum_start = ins.objectid;
72707d400a6SYan Zheng 				csum_end = csum_start + ins.offset;
72807d400a6SYan Zheng 			} else {
72907d400a6SYan Zheng 				csum_start = ins.objectid +
73007d400a6SYan Zheng 					btrfs_file_extent_offset(eb, item);
73107d400a6SYan Zheng 				csum_end = csum_start +
73207d400a6SYan Zheng 					btrfs_file_extent_num_bytes(eb, item);
73307d400a6SYan Zheng 			}
73407d400a6SYan Zheng 
73507d400a6SYan Zheng 			ret = btrfs_lookup_csums_range(root->log_root,
73607d400a6SYan Zheng 						csum_start, csum_end - 1,
737a2de733cSArne Jansen 						&ordered_sums, 0);
7383650860bSJosef Bacik 			if (ret)
7393650860bSJosef Bacik 				goto out;
740b84b8390SFilipe Manana 			/*
741b84b8390SFilipe Manana 			 * Now delete all existing cums in the csum root that
742b84b8390SFilipe Manana 			 * cover our range. We do this because we can have an
743b84b8390SFilipe Manana 			 * extent that is completely referenced by one file
744b84b8390SFilipe Manana 			 * extent item and partially referenced by another
745b84b8390SFilipe Manana 			 * file extent item (like after using the clone or
746b84b8390SFilipe Manana 			 * extent_same ioctls). In this case if we end up doing
747b84b8390SFilipe Manana 			 * the replay of the one that partially references the
748b84b8390SFilipe Manana 			 * extent first, and we do not do the csum deletion
749b84b8390SFilipe Manana 			 * below, we can get 2 csum items in the csum tree that
750b84b8390SFilipe Manana 			 * overlap each other. For example, imagine our log has
751b84b8390SFilipe Manana 			 * the two following file extent items:
752b84b8390SFilipe Manana 			 *
753b84b8390SFilipe Manana 			 * key (257 EXTENT_DATA 409600)
754b84b8390SFilipe Manana 			 *     extent data disk byte 12845056 nr 102400
755b84b8390SFilipe Manana 			 *     extent data offset 20480 nr 20480 ram 102400
756b84b8390SFilipe Manana 			 *
757b84b8390SFilipe Manana 			 * key (257 EXTENT_DATA 819200)
758b84b8390SFilipe Manana 			 *     extent data disk byte 12845056 nr 102400
759b84b8390SFilipe Manana 			 *     extent data offset 0 nr 102400 ram 102400
760b84b8390SFilipe Manana 			 *
761b84b8390SFilipe Manana 			 * Where the second one fully references the 100K extent
762b84b8390SFilipe Manana 			 * that starts at disk byte 12845056, and the log tree
763b84b8390SFilipe Manana 			 * has a single csum item that covers the entire range
764b84b8390SFilipe Manana 			 * of the extent:
765b84b8390SFilipe Manana 			 *
766b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
767b84b8390SFilipe Manana 			 *
768b84b8390SFilipe Manana 			 * After the first file extent item is replayed, the
769b84b8390SFilipe Manana 			 * csum tree gets the following csum item:
770b84b8390SFilipe Manana 			 *
771b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
772b84b8390SFilipe Manana 			 *
773b84b8390SFilipe Manana 			 * Which covers the 20K sub-range starting at offset 20K
774b84b8390SFilipe Manana 			 * of our extent. Now when we replay the second file
775b84b8390SFilipe Manana 			 * extent item, if we do not delete existing csum items
776b84b8390SFilipe Manana 			 * that cover any of its blocks, we end up getting two
777b84b8390SFilipe Manana 			 * csum items in our csum tree that overlap each other:
778b84b8390SFilipe Manana 			 *
779b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
780b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
781b84b8390SFilipe Manana 			 *
782b84b8390SFilipe Manana 			 * Which is a problem, because after this anyone trying
783b84b8390SFilipe Manana 			 * to lookup up for the checksum of any block of our
784b84b8390SFilipe Manana 			 * extent starting at an offset of 40K or higher, will
785b84b8390SFilipe Manana 			 * end up looking at the second csum item only, which
786b84b8390SFilipe Manana 			 * does not contain the checksum for any block starting
787b84b8390SFilipe Manana 			 * at offset 40K or higher of our extent.
788b84b8390SFilipe Manana 			 */
78907d400a6SYan Zheng 			while (!list_empty(&ordered_sums)) {
79007d400a6SYan Zheng 				struct btrfs_ordered_sum *sums;
79107d400a6SYan Zheng 				sums = list_entry(ordered_sums.next,
79207d400a6SYan Zheng 						struct btrfs_ordered_sum,
79307d400a6SYan Zheng 						list);
7943650860bSJosef Bacik 				if (!ret)
7950b246afaSJeff Mahoney 					ret = btrfs_del_csums(trans, fs_info,
796b84b8390SFilipe Manana 							      sums->bytenr,
797b84b8390SFilipe Manana 							      sums->len);
798b84b8390SFilipe Manana 				if (!ret)
79907d400a6SYan Zheng 					ret = btrfs_csum_file_blocks(trans,
8000b246afaSJeff Mahoney 						fs_info->csum_root, sums);
80107d400a6SYan Zheng 				list_del(&sums->list);
80207d400a6SYan Zheng 				kfree(sums);
80307d400a6SYan Zheng 			}
8043650860bSJosef Bacik 			if (ret)
8053650860bSJosef Bacik 				goto out;
80607d400a6SYan Zheng 		} else {
807b3b4aa74SDavid Sterba 			btrfs_release_path(path);
80807d400a6SYan Zheng 		}
80907d400a6SYan Zheng 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
81007d400a6SYan Zheng 		/* inline extents are easy, we just overwrite them */
811e02119d5SChris Mason 		ret = overwrite_item(trans, root, path, eb, slot, key);
8123650860bSJosef Bacik 		if (ret)
8133650860bSJosef Bacik 			goto out;
81407d400a6SYan Zheng 	}
815e02119d5SChris Mason 
8164bc4bee4SJosef Bacik 	inode_add_bytes(inode, nbytes);
8173168021cSFilipe Manana update_inode:
818b9959295STsutomu Itoh 	ret = btrfs_update_inode(trans, root, inode);
819e02119d5SChris Mason out:
820e02119d5SChris Mason 	if (inode)
821e02119d5SChris Mason 		iput(inode);
822e02119d5SChris Mason 	return ret;
823e02119d5SChris Mason }
824e02119d5SChris Mason 
825e02119d5SChris Mason /*
826e02119d5SChris Mason  * when cleaning up conflicts between the directory names in the
827e02119d5SChris Mason  * subvolume, directory names in the log and directory names in the
828e02119d5SChris Mason  * inode back references, we may have to unlink inodes from directories.
829e02119d5SChris Mason  *
830e02119d5SChris Mason  * This is a helper function to do the unlink of a specific directory
831e02119d5SChris Mason  * item
832e02119d5SChris Mason  */
833e02119d5SChris Mason static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
834e02119d5SChris Mason 				      struct btrfs_root *root,
835e02119d5SChris Mason 				      struct btrfs_path *path,
836207e7d92SNikolay Borisov 				      struct btrfs_inode *dir,
837e02119d5SChris Mason 				      struct btrfs_dir_item *di)
838e02119d5SChris Mason {
839e02119d5SChris Mason 	struct inode *inode;
840e02119d5SChris Mason 	char *name;
841e02119d5SChris Mason 	int name_len;
842e02119d5SChris Mason 	struct extent_buffer *leaf;
843e02119d5SChris Mason 	struct btrfs_key location;
844e02119d5SChris Mason 	int ret;
845e02119d5SChris Mason 
846e02119d5SChris Mason 	leaf = path->nodes[0];
847e02119d5SChris Mason 
848e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
849e02119d5SChris Mason 	name_len = btrfs_dir_name_len(leaf, di);
850e02119d5SChris Mason 	name = kmalloc(name_len, GFP_NOFS);
8512a29edc6Sliubo 	if (!name)
8522a29edc6Sliubo 		return -ENOMEM;
8532a29edc6Sliubo 
854e02119d5SChris Mason 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
855b3b4aa74SDavid Sterba 	btrfs_release_path(path);
856e02119d5SChris Mason 
857e02119d5SChris Mason 	inode = read_one_inode(root, location.objectid);
858c00e9493STsutomu Itoh 	if (!inode) {
8593650860bSJosef Bacik 		ret = -EIO;
8603650860bSJosef Bacik 		goto out;
861c00e9493STsutomu Itoh 	}
862e02119d5SChris Mason 
863ec051c0fSYan Zheng 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
8643650860bSJosef Bacik 	if (ret)
8653650860bSJosef Bacik 		goto out;
86612fcfd22SChris Mason 
867207e7d92SNikolay Borisov 	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
868207e7d92SNikolay Borisov 			name_len);
8693650860bSJosef Bacik 	if (ret)
8703650860bSJosef Bacik 		goto out;
871ada9af21SFilipe David Borba Manana 	else
872e5c304e6SNikolay Borisov 		ret = btrfs_run_delayed_items(trans);
8733650860bSJosef Bacik out:
8743650860bSJosef Bacik 	kfree(name);
8753650860bSJosef Bacik 	iput(inode);
876e02119d5SChris Mason 	return ret;
877e02119d5SChris Mason }
878e02119d5SChris Mason 
879e02119d5SChris Mason /*
880e02119d5SChris Mason  * helper function to see if a given name and sequence number found
881e02119d5SChris Mason  * in an inode back reference are already in a directory and correctly
882e02119d5SChris Mason  * point to this inode
883e02119d5SChris Mason  */
884e02119d5SChris Mason static noinline int inode_in_dir(struct btrfs_root *root,
885e02119d5SChris Mason 				 struct btrfs_path *path,
886e02119d5SChris Mason 				 u64 dirid, u64 objectid, u64 index,
887e02119d5SChris Mason 				 const char *name, int name_len)
888e02119d5SChris Mason {
889e02119d5SChris Mason 	struct btrfs_dir_item *di;
890e02119d5SChris Mason 	struct btrfs_key location;
891e02119d5SChris Mason 	int match = 0;
892e02119d5SChris Mason 
893e02119d5SChris Mason 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
894e02119d5SChris Mason 					 index, name, name_len, 0);
895e02119d5SChris Mason 	if (di && !IS_ERR(di)) {
896e02119d5SChris Mason 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
897e02119d5SChris Mason 		if (location.objectid != objectid)
898e02119d5SChris Mason 			goto out;
899e02119d5SChris Mason 	} else
900e02119d5SChris Mason 		goto out;
901b3b4aa74SDavid Sterba 	btrfs_release_path(path);
902e02119d5SChris Mason 
903e02119d5SChris Mason 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
904e02119d5SChris Mason 	if (di && !IS_ERR(di)) {
905e02119d5SChris Mason 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
906e02119d5SChris Mason 		if (location.objectid != objectid)
907e02119d5SChris Mason 			goto out;
908e02119d5SChris Mason 	} else
909e02119d5SChris Mason 		goto out;
910e02119d5SChris Mason 	match = 1;
911e02119d5SChris Mason out:
912b3b4aa74SDavid Sterba 	btrfs_release_path(path);
913e02119d5SChris Mason 	return match;
914e02119d5SChris Mason }
915e02119d5SChris Mason 
916e02119d5SChris Mason /*
917e02119d5SChris Mason  * helper function to check a log tree for a named back reference in
918e02119d5SChris Mason  * an inode.  This is used to decide if a back reference that is
919e02119d5SChris Mason  * found in the subvolume conflicts with what we find in the log.
920e02119d5SChris Mason  *
921e02119d5SChris Mason  * inode backreferences may have multiple refs in a single item,
922e02119d5SChris Mason  * during replay we process one reference at a time, and we don't
923e02119d5SChris Mason  * want to delete valid links to a file from the subvolume if that
924e02119d5SChris Mason  * link is also in the log.
925e02119d5SChris Mason  */
926e02119d5SChris Mason static noinline int backref_in_log(struct btrfs_root *log,
927e02119d5SChris Mason 				   struct btrfs_key *key,
928f186373fSMark Fasheh 				   u64 ref_objectid,
929df8d116fSFilipe Manana 				   const char *name, int namelen)
930e02119d5SChris Mason {
931e02119d5SChris Mason 	struct btrfs_path *path;
932e02119d5SChris Mason 	struct btrfs_inode_ref *ref;
933e02119d5SChris Mason 	unsigned long ptr;
934e02119d5SChris Mason 	unsigned long ptr_end;
935e02119d5SChris Mason 	unsigned long name_ptr;
936e02119d5SChris Mason 	int found_name_len;
937e02119d5SChris Mason 	int item_size;
938e02119d5SChris Mason 	int ret;
939e02119d5SChris Mason 	int match = 0;
940e02119d5SChris Mason 
941e02119d5SChris Mason 	path = btrfs_alloc_path();
9422a29edc6Sliubo 	if (!path)
9432a29edc6Sliubo 		return -ENOMEM;
9442a29edc6Sliubo 
945e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
946e02119d5SChris Mason 	if (ret != 0)
947e02119d5SChris Mason 		goto out;
948e02119d5SChris Mason 
949e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
950f186373fSMark Fasheh 
951f186373fSMark Fasheh 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
9521f250e92SFilipe Manana 		if (btrfs_find_name_in_ext_backref(path->nodes[0],
9531f250e92SFilipe Manana 						   path->slots[0],
9541f250e92SFilipe Manana 						   ref_objectid,
955f186373fSMark Fasheh 						   name, namelen, NULL))
956f186373fSMark Fasheh 			match = 1;
957f186373fSMark Fasheh 
958f186373fSMark Fasheh 		goto out;
959f186373fSMark Fasheh 	}
960f186373fSMark Fasheh 
961f186373fSMark Fasheh 	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
962e02119d5SChris Mason 	ptr_end = ptr + item_size;
963e02119d5SChris Mason 	while (ptr < ptr_end) {
964e02119d5SChris Mason 		ref = (struct btrfs_inode_ref *)ptr;
965e02119d5SChris Mason 		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
966e02119d5SChris Mason 		if (found_name_len == namelen) {
967e02119d5SChris Mason 			name_ptr = (unsigned long)(ref + 1);
968e02119d5SChris Mason 			ret = memcmp_extent_buffer(path->nodes[0], name,
969e02119d5SChris Mason 						   name_ptr, namelen);
970e02119d5SChris Mason 			if (ret == 0) {
971e02119d5SChris Mason 				match = 1;
972e02119d5SChris Mason 				goto out;
973e02119d5SChris Mason 			}
974e02119d5SChris Mason 		}
975e02119d5SChris Mason 		ptr = (unsigned long)(ref + 1) + found_name_len;
976e02119d5SChris Mason 	}
977e02119d5SChris Mason out:
978e02119d5SChris Mason 	btrfs_free_path(path);
979e02119d5SChris Mason 	return match;
980e02119d5SChris Mason }
981e02119d5SChris Mason 
9825a1d7843SJan Schmidt static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
9835a1d7843SJan Schmidt 				  struct btrfs_root *root,
9845a1d7843SJan Schmidt 				  struct btrfs_path *path,
9855a1d7843SJan Schmidt 				  struct btrfs_root *log_root,
98694c91a1fSNikolay Borisov 				  struct btrfs_inode *dir,
98794c91a1fSNikolay Borisov 				  struct btrfs_inode *inode,
988f186373fSMark Fasheh 				  u64 inode_objectid, u64 parent_objectid,
989f186373fSMark Fasheh 				  u64 ref_index, char *name, int namelen,
990f186373fSMark Fasheh 				  int *search_done)
9915a1d7843SJan Schmidt {
9925a1d7843SJan Schmidt 	int ret;
9935a1d7843SJan Schmidt 	char *victim_name;
9945a1d7843SJan Schmidt 	int victim_name_len;
995f186373fSMark Fasheh 	struct extent_buffer *leaf;
996f186373fSMark Fasheh 	struct btrfs_dir_item *di;
997f186373fSMark Fasheh 	struct btrfs_key search_key;
998f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
999f186373fSMark Fasheh 
1000f186373fSMark Fasheh again:
1001f186373fSMark Fasheh 	/* Search old style refs */
1002f186373fSMark Fasheh 	search_key.objectid = inode_objectid;
1003f186373fSMark Fasheh 	search_key.type = BTRFS_INODE_REF_KEY;
1004f186373fSMark Fasheh 	search_key.offset = parent_objectid;
1005f186373fSMark Fasheh 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1006f186373fSMark Fasheh 	if (ret == 0) {
10075a1d7843SJan Schmidt 		struct btrfs_inode_ref *victim_ref;
10085a1d7843SJan Schmidt 		unsigned long ptr;
10095a1d7843SJan Schmidt 		unsigned long ptr_end;
1010f186373fSMark Fasheh 
1011f186373fSMark Fasheh 		leaf = path->nodes[0];
10125a1d7843SJan Schmidt 
10135a1d7843SJan Schmidt 		/* are we trying to overwrite a back ref for the root directory
10145a1d7843SJan Schmidt 		 * if so, just jump out, we're done
10155a1d7843SJan Schmidt 		 */
1016f186373fSMark Fasheh 		if (search_key.objectid == search_key.offset)
10175a1d7843SJan Schmidt 			return 1;
10185a1d7843SJan Schmidt 
10195a1d7843SJan Schmidt 		/* check all the names in this back reference to see
10205a1d7843SJan Schmidt 		 * if they are in the log.  if so, we allow them to stay
10215a1d7843SJan Schmidt 		 * otherwise they must be unlinked as a conflict
10225a1d7843SJan Schmidt 		 */
10235a1d7843SJan Schmidt 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
10245a1d7843SJan Schmidt 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
10255a1d7843SJan Schmidt 		while (ptr < ptr_end) {
10265a1d7843SJan Schmidt 			victim_ref = (struct btrfs_inode_ref *)ptr;
10275a1d7843SJan Schmidt 			victim_name_len = btrfs_inode_ref_name_len(leaf,
10285a1d7843SJan Schmidt 								   victim_ref);
10295a1d7843SJan Schmidt 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
10303650860bSJosef Bacik 			if (!victim_name)
10313650860bSJosef Bacik 				return -ENOMEM;
10325a1d7843SJan Schmidt 
10335a1d7843SJan Schmidt 			read_extent_buffer(leaf, victim_name,
10345a1d7843SJan Schmidt 					   (unsigned long)(victim_ref + 1),
10355a1d7843SJan Schmidt 					   victim_name_len);
10365a1d7843SJan Schmidt 
1037f186373fSMark Fasheh 			if (!backref_in_log(log_root, &search_key,
1038f186373fSMark Fasheh 					    parent_objectid,
1039f186373fSMark Fasheh 					    victim_name,
10405a1d7843SJan Schmidt 					    victim_name_len)) {
104194c91a1fSNikolay Borisov 				inc_nlink(&inode->vfs_inode);
10425a1d7843SJan Schmidt 				btrfs_release_path(path);
10435a1d7843SJan Schmidt 
104494c91a1fSNikolay Borisov 				ret = btrfs_unlink_inode(trans, root, dir, inode,
10454ec5934eSNikolay Borisov 						victim_name, victim_name_len);
1046f186373fSMark Fasheh 				kfree(victim_name);
10473650860bSJosef Bacik 				if (ret)
10483650860bSJosef Bacik 					return ret;
1049e5c304e6SNikolay Borisov 				ret = btrfs_run_delayed_items(trans);
1050ada9af21SFilipe David Borba Manana 				if (ret)
1051ada9af21SFilipe David Borba Manana 					return ret;
1052f186373fSMark Fasheh 				*search_done = 1;
1053f186373fSMark Fasheh 				goto again;
10545a1d7843SJan Schmidt 			}
10555a1d7843SJan Schmidt 			kfree(victim_name);
1056f186373fSMark Fasheh 
10575a1d7843SJan Schmidt 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
10585a1d7843SJan Schmidt 		}
10595a1d7843SJan Schmidt 
10605a1d7843SJan Schmidt 		/*
10615a1d7843SJan Schmidt 		 * NOTE: we have searched root tree and checked the
1062bb7ab3b9SAdam Buchbinder 		 * corresponding ref, it does not need to check again.
10635a1d7843SJan Schmidt 		 */
10645a1d7843SJan Schmidt 		*search_done = 1;
10655a1d7843SJan Schmidt 	}
10665a1d7843SJan Schmidt 	btrfs_release_path(path);
10675a1d7843SJan Schmidt 
1068f186373fSMark Fasheh 	/* Same search but for extended refs */
1069f186373fSMark Fasheh 	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1070f186373fSMark Fasheh 					   inode_objectid, parent_objectid, 0,
1071f186373fSMark Fasheh 					   0);
1072f186373fSMark Fasheh 	if (!IS_ERR_OR_NULL(extref)) {
1073f186373fSMark Fasheh 		u32 item_size;
1074f186373fSMark Fasheh 		u32 cur_offset = 0;
1075f186373fSMark Fasheh 		unsigned long base;
1076f186373fSMark Fasheh 		struct inode *victim_parent;
1077f186373fSMark Fasheh 
1078f186373fSMark Fasheh 		leaf = path->nodes[0];
1079f186373fSMark Fasheh 
1080f186373fSMark Fasheh 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1081f186373fSMark Fasheh 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1082f186373fSMark Fasheh 
1083f186373fSMark Fasheh 		while (cur_offset < item_size) {
1084dd9ef135SQuentin Casasnovas 			extref = (struct btrfs_inode_extref *)(base + cur_offset);
1085f186373fSMark Fasheh 
1086f186373fSMark Fasheh 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1087f186373fSMark Fasheh 
1088f186373fSMark Fasheh 			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1089f186373fSMark Fasheh 				goto next;
1090f186373fSMark Fasheh 
1091f186373fSMark Fasheh 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
10923650860bSJosef Bacik 			if (!victim_name)
10933650860bSJosef Bacik 				return -ENOMEM;
1094f186373fSMark Fasheh 			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1095f186373fSMark Fasheh 					   victim_name_len);
1096f186373fSMark Fasheh 
1097f186373fSMark Fasheh 			search_key.objectid = inode_objectid;
1098f186373fSMark Fasheh 			search_key.type = BTRFS_INODE_EXTREF_KEY;
1099f186373fSMark Fasheh 			search_key.offset = btrfs_extref_hash(parent_objectid,
1100f186373fSMark Fasheh 							      victim_name,
1101f186373fSMark Fasheh 							      victim_name_len);
1102f186373fSMark Fasheh 			ret = 0;
1103f186373fSMark Fasheh 			if (!backref_in_log(log_root, &search_key,
1104f186373fSMark Fasheh 					    parent_objectid, victim_name,
1105f186373fSMark Fasheh 					    victim_name_len)) {
1106f186373fSMark Fasheh 				ret = -ENOENT;
1107f186373fSMark Fasheh 				victim_parent = read_one_inode(root,
1108f186373fSMark Fasheh 						parent_objectid);
1109f186373fSMark Fasheh 				if (victim_parent) {
111094c91a1fSNikolay Borisov 					inc_nlink(&inode->vfs_inode);
1111f186373fSMark Fasheh 					btrfs_release_path(path);
1112f186373fSMark Fasheh 
1113f186373fSMark Fasheh 					ret = btrfs_unlink_inode(trans, root,
11144ec5934eSNikolay Borisov 							BTRFS_I(victim_parent),
111594c91a1fSNikolay Borisov 							inode,
1116f186373fSMark Fasheh 							victim_name,
1117f186373fSMark Fasheh 							victim_name_len);
1118ada9af21SFilipe David Borba Manana 					if (!ret)
1119ada9af21SFilipe David Borba Manana 						ret = btrfs_run_delayed_items(
1120e5c304e6SNikolay Borisov 								  trans);
1121f186373fSMark Fasheh 				}
1122f186373fSMark Fasheh 				iput(victim_parent);
1123f186373fSMark Fasheh 				kfree(victim_name);
11243650860bSJosef Bacik 				if (ret)
11253650860bSJosef Bacik 					return ret;
1126f186373fSMark Fasheh 				*search_done = 1;
1127f186373fSMark Fasheh 				goto again;
1128f186373fSMark Fasheh 			}
1129f186373fSMark Fasheh 			kfree(victim_name);
1130f186373fSMark Fasheh next:
1131f186373fSMark Fasheh 			cur_offset += victim_name_len + sizeof(*extref);
1132f186373fSMark Fasheh 		}
1133f186373fSMark Fasheh 		*search_done = 1;
1134f186373fSMark Fasheh 	}
1135f186373fSMark Fasheh 	btrfs_release_path(path);
1136f186373fSMark Fasheh 
11375a1d7843SJan Schmidt 	/* look for a conflicting sequence number */
113894c91a1fSNikolay Borisov 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1139f186373fSMark Fasheh 					 ref_index, name, namelen, 0);
11405a1d7843SJan Schmidt 	if (di && !IS_ERR(di)) {
114194c91a1fSNikolay Borisov 		ret = drop_one_dir_item(trans, root, path, dir, di);
11423650860bSJosef Bacik 		if (ret)
11433650860bSJosef Bacik 			return ret;
11445a1d7843SJan Schmidt 	}
11455a1d7843SJan Schmidt 	btrfs_release_path(path);
11465a1d7843SJan Schmidt 
11475a1d7843SJan Schmidt 	/* look for a conflicing name */
114894c91a1fSNikolay Borisov 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
11495a1d7843SJan Schmidt 				   name, namelen, 0);
11505a1d7843SJan Schmidt 	if (di && !IS_ERR(di)) {
115194c91a1fSNikolay Borisov 		ret = drop_one_dir_item(trans, root, path, dir, di);
11523650860bSJosef Bacik 		if (ret)
11533650860bSJosef Bacik 			return ret;
11545a1d7843SJan Schmidt 	}
11555a1d7843SJan Schmidt 	btrfs_release_path(path);
11565a1d7843SJan Schmidt 
11575a1d7843SJan Schmidt 	return 0;
11585a1d7843SJan Schmidt }
1159e02119d5SChris Mason 
1160bae15d95SQu Wenruo static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1161bae15d95SQu Wenruo 			     u32 *namelen, char **name, u64 *index,
1162bae15d95SQu Wenruo 			     u64 *parent_objectid)
1163f186373fSMark Fasheh {
1164f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1165f186373fSMark Fasheh 
1166f186373fSMark Fasheh 	extref = (struct btrfs_inode_extref *)ref_ptr;
1167f186373fSMark Fasheh 
1168f186373fSMark Fasheh 	*namelen = btrfs_inode_extref_name_len(eb, extref);
1169f186373fSMark Fasheh 	*name = kmalloc(*namelen, GFP_NOFS);
1170f186373fSMark Fasheh 	if (*name == NULL)
1171f186373fSMark Fasheh 		return -ENOMEM;
1172f186373fSMark Fasheh 
1173f186373fSMark Fasheh 	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1174f186373fSMark Fasheh 			   *namelen);
1175f186373fSMark Fasheh 
11761f250e92SFilipe Manana 	if (index)
1177f186373fSMark Fasheh 		*index = btrfs_inode_extref_index(eb, extref);
1178f186373fSMark Fasheh 	if (parent_objectid)
1179f186373fSMark Fasheh 		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1180f186373fSMark Fasheh 
1181f186373fSMark Fasheh 	return 0;
1182f186373fSMark Fasheh }
1183f186373fSMark Fasheh 
1184bae15d95SQu Wenruo static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1185bae15d95SQu Wenruo 			  u32 *namelen, char **name, u64 *index)
1186f186373fSMark Fasheh {
1187f186373fSMark Fasheh 	struct btrfs_inode_ref *ref;
1188f186373fSMark Fasheh 
1189f186373fSMark Fasheh 	ref = (struct btrfs_inode_ref *)ref_ptr;
1190f186373fSMark Fasheh 
1191f186373fSMark Fasheh 	*namelen = btrfs_inode_ref_name_len(eb, ref);
1192f186373fSMark Fasheh 	*name = kmalloc(*namelen, GFP_NOFS);
1193f186373fSMark Fasheh 	if (*name == NULL)
1194f186373fSMark Fasheh 		return -ENOMEM;
1195f186373fSMark Fasheh 
1196f186373fSMark Fasheh 	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1197f186373fSMark Fasheh 
11981f250e92SFilipe Manana 	if (index)
1199f186373fSMark Fasheh 		*index = btrfs_inode_ref_index(eb, ref);
1200f186373fSMark Fasheh 
1201f186373fSMark Fasheh 	return 0;
1202f186373fSMark Fasheh }
1203f186373fSMark Fasheh 
1204e02119d5SChris Mason /*
12051f250e92SFilipe Manana  * Take an inode reference item from the log tree and iterate all names from the
12061f250e92SFilipe Manana  * inode reference item in the subvolume tree with the same key (if it exists).
12071f250e92SFilipe Manana  * For any name that is not in the inode reference item from the log tree, do a
12081f250e92SFilipe Manana  * proper unlink of that name (that is, remove its entry from the inode
12091f250e92SFilipe Manana  * reference item and both dir index keys).
12101f250e92SFilipe Manana  */
12111f250e92SFilipe Manana static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
12121f250e92SFilipe Manana 				 struct btrfs_root *root,
12131f250e92SFilipe Manana 				 struct btrfs_path *path,
12141f250e92SFilipe Manana 				 struct btrfs_inode *inode,
12151f250e92SFilipe Manana 				 struct extent_buffer *log_eb,
12161f250e92SFilipe Manana 				 int log_slot,
12171f250e92SFilipe Manana 				 struct btrfs_key *key)
12181f250e92SFilipe Manana {
12191f250e92SFilipe Manana 	int ret;
12201f250e92SFilipe Manana 	unsigned long ref_ptr;
12211f250e92SFilipe Manana 	unsigned long ref_end;
12221f250e92SFilipe Manana 	struct extent_buffer *eb;
12231f250e92SFilipe Manana 
12241f250e92SFilipe Manana again:
12251f250e92SFilipe Manana 	btrfs_release_path(path);
12261f250e92SFilipe Manana 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
12271f250e92SFilipe Manana 	if (ret > 0) {
12281f250e92SFilipe Manana 		ret = 0;
12291f250e92SFilipe Manana 		goto out;
12301f250e92SFilipe Manana 	}
12311f250e92SFilipe Manana 	if (ret < 0)
12321f250e92SFilipe Manana 		goto out;
12331f250e92SFilipe Manana 
12341f250e92SFilipe Manana 	eb = path->nodes[0];
12351f250e92SFilipe Manana 	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
12361f250e92SFilipe Manana 	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
12371f250e92SFilipe Manana 	while (ref_ptr < ref_end) {
12381f250e92SFilipe Manana 		char *name = NULL;
12391f250e92SFilipe Manana 		int namelen;
12401f250e92SFilipe Manana 		u64 parent_id;
12411f250e92SFilipe Manana 
12421f250e92SFilipe Manana 		if (key->type == BTRFS_INODE_EXTREF_KEY) {
12431f250e92SFilipe Manana 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
12441f250e92SFilipe Manana 						NULL, &parent_id);
12451f250e92SFilipe Manana 		} else {
12461f250e92SFilipe Manana 			parent_id = key->offset;
12471f250e92SFilipe Manana 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
12481f250e92SFilipe Manana 					     NULL);
12491f250e92SFilipe Manana 		}
12501f250e92SFilipe Manana 		if (ret)
12511f250e92SFilipe Manana 			goto out;
12521f250e92SFilipe Manana 
12531f250e92SFilipe Manana 		if (key->type == BTRFS_INODE_EXTREF_KEY)
12541f250e92SFilipe Manana 			ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
12551f250e92SFilipe Manana 							     parent_id, name,
12561f250e92SFilipe Manana 							     namelen, NULL);
12571f250e92SFilipe Manana 		else
12581f250e92SFilipe Manana 			ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
12591f250e92SFilipe Manana 							 namelen, NULL);
12601f250e92SFilipe Manana 
12611f250e92SFilipe Manana 		if (!ret) {
12621f250e92SFilipe Manana 			struct inode *dir;
12631f250e92SFilipe Manana 
12641f250e92SFilipe Manana 			btrfs_release_path(path);
12651f250e92SFilipe Manana 			dir = read_one_inode(root, parent_id);
12661f250e92SFilipe Manana 			if (!dir) {
12671f250e92SFilipe Manana 				ret = -ENOENT;
12681f250e92SFilipe Manana 				kfree(name);
12691f250e92SFilipe Manana 				goto out;
12701f250e92SFilipe Manana 			}
12711f250e92SFilipe Manana 			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
12721f250e92SFilipe Manana 						 inode, name, namelen);
12731f250e92SFilipe Manana 			kfree(name);
12741f250e92SFilipe Manana 			iput(dir);
12751f250e92SFilipe Manana 			if (ret)
12761f250e92SFilipe Manana 				goto out;
12771f250e92SFilipe Manana 			goto again;
12781f250e92SFilipe Manana 		}
12791f250e92SFilipe Manana 
12801f250e92SFilipe Manana 		kfree(name);
12811f250e92SFilipe Manana 		ref_ptr += namelen;
12821f250e92SFilipe Manana 		if (key->type == BTRFS_INODE_EXTREF_KEY)
12831f250e92SFilipe Manana 			ref_ptr += sizeof(struct btrfs_inode_extref);
12841f250e92SFilipe Manana 		else
12851f250e92SFilipe Manana 			ref_ptr += sizeof(struct btrfs_inode_ref);
12861f250e92SFilipe Manana 	}
12871f250e92SFilipe Manana 	ret = 0;
12881f250e92SFilipe Manana  out:
12891f250e92SFilipe Manana 	btrfs_release_path(path);
12901f250e92SFilipe Manana 	return ret;
12911f250e92SFilipe Manana }
12921f250e92SFilipe Manana 
12931f250e92SFilipe Manana /*
1294e02119d5SChris Mason  * replay one inode back reference item found in the log tree.
1295e02119d5SChris Mason  * eb, slot and key refer to the buffer and key found in the log tree.
1296e02119d5SChris Mason  * root is the destination we are replaying into, and path is for temp
1297e02119d5SChris Mason  * use by this function.  (it should be released on return).
1298e02119d5SChris Mason  */
1299e02119d5SChris Mason static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1300e02119d5SChris Mason 				  struct btrfs_root *root,
1301e02119d5SChris Mason 				  struct btrfs_root *log,
1302e02119d5SChris Mason 				  struct btrfs_path *path,
1303e02119d5SChris Mason 				  struct extent_buffer *eb, int slot,
1304e02119d5SChris Mason 				  struct btrfs_key *key)
1305e02119d5SChris Mason {
130603b2f08bSGeyslan G. Bem 	struct inode *dir = NULL;
130703b2f08bSGeyslan G. Bem 	struct inode *inode = NULL;
1308e02119d5SChris Mason 	unsigned long ref_ptr;
1309e02119d5SChris Mason 	unsigned long ref_end;
131003b2f08bSGeyslan G. Bem 	char *name = NULL;
131134f3e4f2Sliubo 	int namelen;
131234f3e4f2Sliubo 	int ret;
1313c622ae60Sliubo 	int search_done = 0;
1314f186373fSMark Fasheh 	int log_ref_ver = 0;
1315f186373fSMark Fasheh 	u64 parent_objectid;
1316f186373fSMark Fasheh 	u64 inode_objectid;
1317f46dbe3dSChris Mason 	u64 ref_index = 0;
1318f186373fSMark Fasheh 	int ref_struct_size;
1319f186373fSMark Fasheh 
1320f186373fSMark Fasheh 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1321f186373fSMark Fasheh 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1322f186373fSMark Fasheh 
1323f186373fSMark Fasheh 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1324f186373fSMark Fasheh 		struct btrfs_inode_extref *r;
1325f186373fSMark Fasheh 
1326f186373fSMark Fasheh 		ref_struct_size = sizeof(struct btrfs_inode_extref);
1327f186373fSMark Fasheh 		log_ref_ver = 1;
1328f186373fSMark Fasheh 		r = (struct btrfs_inode_extref *)ref_ptr;
1329f186373fSMark Fasheh 		parent_objectid = btrfs_inode_extref_parent(eb, r);
1330f186373fSMark Fasheh 	} else {
1331f186373fSMark Fasheh 		ref_struct_size = sizeof(struct btrfs_inode_ref);
1332f186373fSMark Fasheh 		parent_objectid = key->offset;
1333f186373fSMark Fasheh 	}
1334f186373fSMark Fasheh 	inode_objectid = key->objectid;
1335e02119d5SChris Mason 
1336e02119d5SChris Mason 	/*
1337e02119d5SChris Mason 	 * it is possible that we didn't log all the parent directories
1338e02119d5SChris Mason 	 * for a given inode.  If we don't find the dir, just don't
1339e02119d5SChris Mason 	 * copy the back ref in.  The link count fixup code will take
1340e02119d5SChris Mason 	 * care of the rest
1341e02119d5SChris Mason 	 */
1342f186373fSMark Fasheh 	dir = read_one_inode(root, parent_objectid);
134303b2f08bSGeyslan G. Bem 	if (!dir) {
134403b2f08bSGeyslan G. Bem 		ret = -ENOENT;
134503b2f08bSGeyslan G. Bem 		goto out;
134603b2f08bSGeyslan G. Bem 	}
1347e02119d5SChris Mason 
1348f186373fSMark Fasheh 	inode = read_one_inode(root, inode_objectid);
1349c00e9493STsutomu Itoh 	if (!inode) {
135003b2f08bSGeyslan G. Bem 		ret = -EIO;
135103b2f08bSGeyslan G. Bem 		goto out;
1352c00e9493STsutomu Itoh 	}
1353e02119d5SChris Mason 
13545a1d7843SJan Schmidt 	while (ref_ptr < ref_end) {
1355f186373fSMark Fasheh 		if (log_ref_ver) {
1356bae15d95SQu Wenruo 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1357bae15d95SQu Wenruo 						&ref_index, &parent_objectid);
1358f186373fSMark Fasheh 			/*
1359f186373fSMark Fasheh 			 * parent object can change from one array
1360f186373fSMark Fasheh 			 * item to another.
1361f186373fSMark Fasheh 			 */
1362f186373fSMark Fasheh 			if (!dir)
1363f186373fSMark Fasheh 				dir = read_one_inode(root, parent_objectid);
136403b2f08bSGeyslan G. Bem 			if (!dir) {
136503b2f08bSGeyslan G. Bem 				ret = -ENOENT;
136603b2f08bSGeyslan G. Bem 				goto out;
136703b2f08bSGeyslan G. Bem 			}
1368f186373fSMark Fasheh 		} else {
1369bae15d95SQu Wenruo 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1370bae15d95SQu Wenruo 					     &ref_index);
1371f186373fSMark Fasheh 		}
1372f186373fSMark Fasheh 		if (ret)
137303b2f08bSGeyslan G. Bem 			goto out;
1374e02119d5SChris Mason 
1375e02119d5SChris Mason 		/* if we already have a perfect match, we're done */
1376f85b7379SDavid Sterba 		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1377f85b7379SDavid Sterba 					btrfs_ino(BTRFS_I(inode)), ref_index,
1378f85b7379SDavid Sterba 					name, namelen)) {
13795a1d7843SJan Schmidt 			/*
13805a1d7843SJan Schmidt 			 * look for a conflicting back reference in the
13815a1d7843SJan Schmidt 			 * metadata. if we find one we have to unlink that name
13825a1d7843SJan Schmidt 			 * of the file before we add our new link.  Later on, we
13835a1d7843SJan Schmidt 			 * overwrite any existing back reference, and we don't
13845a1d7843SJan Schmidt 			 * want to create dangling pointers in the directory.
13855a1d7843SJan Schmidt 			 */
13865a1d7843SJan Schmidt 
13875a1d7843SJan Schmidt 			if (!search_done) {
13885a1d7843SJan Schmidt 				ret = __add_inode_ref(trans, root, path, log,
138994c91a1fSNikolay Borisov 						      BTRFS_I(dir),
1390d75eefdfSDavid Sterba 						      BTRFS_I(inode),
1391f186373fSMark Fasheh 						      inode_objectid,
1392f186373fSMark Fasheh 						      parent_objectid,
1393f186373fSMark Fasheh 						      ref_index, name, namelen,
13945a1d7843SJan Schmidt 						      &search_done);
139503b2f08bSGeyslan G. Bem 				if (ret) {
139603b2f08bSGeyslan G. Bem 					if (ret == 1)
13973650860bSJosef Bacik 						ret = 0;
1398e02119d5SChris Mason 					goto out;
13993650860bSJosef Bacik 				}
140034f3e4f2Sliubo 			}
140134f3e4f2Sliubo 
1402e02119d5SChris Mason 			/* insert our name */
1403db0a669fSNikolay Borisov 			ret = btrfs_add_link(trans, BTRFS_I(dir),
1404db0a669fSNikolay Borisov 					BTRFS_I(inode),
1405db0a669fSNikolay Borisov 					name, namelen, 0, ref_index);
14063650860bSJosef Bacik 			if (ret)
14073650860bSJosef Bacik 				goto out;
1408e02119d5SChris Mason 
1409e02119d5SChris Mason 			btrfs_update_inode(trans, root, inode);
14105a1d7843SJan Schmidt 		}
1411e02119d5SChris Mason 
1412f186373fSMark Fasheh 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1413e02119d5SChris Mason 		kfree(name);
141403b2f08bSGeyslan G. Bem 		name = NULL;
1415f186373fSMark Fasheh 		if (log_ref_ver) {
1416f186373fSMark Fasheh 			iput(dir);
1417f186373fSMark Fasheh 			dir = NULL;
1418f186373fSMark Fasheh 		}
14195a1d7843SJan Schmidt 	}
1420e02119d5SChris Mason 
14211f250e92SFilipe Manana 	/*
14221f250e92SFilipe Manana 	 * Before we overwrite the inode reference item in the subvolume tree
14231f250e92SFilipe Manana 	 * with the item from the log tree, we must unlink all names from the
14241f250e92SFilipe Manana 	 * parent directory that are in the subvolume's tree inode reference
14251f250e92SFilipe Manana 	 * item, otherwise we end up with an inconsistent subvolume tree where
14261f250e92SFilipe Manana 	 * dir index entries exist for a name but there is no inode reference
14271f250e92SFilipe Manana 	 * item with the same name.
14281f250e92SFilipe Manana 	 */
14291f250e92SFilipe Manana 	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
14301f250e92SFilipe Manana 				    key);
14311f250e92SFilipe Manana 	if (ret)
14321f250e92SFilipe Manana 		goto out;
14331f250e92SFilipe Manana 
1434e02119d5SChris Mason 	/* finally write the back reference in the inode */
1435e02119d5SChris Mason 	ret = overwrite_item(trans, root, path, eb, slot, key);
14365a1d7843SJan Schmidt out:
1437b3b4aa74SDavid Sterba 	btrfs_release_path(path);
143803b2f08bSGeyslan G. Bem 	kfree(name);
1439e02119d5SChris Mason 	iput(dir);
1440e02119d5SChris Mason 	iput(inode);
14413650860bSJosef Bacik 	return ret;
1442e02119d5SChris Mason }
1443e02119d5SChris Mason 
1444c71bf099SYan, Zheng static int insert_orphan_item(struct btrfs_trans_handle *trans,
14459c4f61f0SDavid Sterba 			      struct btrfs_root *root, u64 ino)
1446c71bf099SYan, Zheng {
1447c71bf099SYan, Zheng 	int ret;
1448381cf658SDavid Sterba 
14499c4f61f0SDavid Sterba 	ret = btrfs_insert_orphan_item(trans, root, ino);
14509c4f61f0SDavid Sterba 	if (ret == -EEXIST)
14519c4f61f0SDavid Sterba 		ret = 0;
1452381cf658SDavid Sterba 
1453c71bf099SYan, Zheng 	return ret;
1454c71bf099SYan, Zheng }
1455c71bf099SYan, Zheng 
1456f186373fSMark Fasheh static int count_inode_extrefs(struct btrfs_root *root,
145736283658SNikolay Borisov 		struct btrfs_inode *inode, struct btrfs_path *path)
1458e02119d5SChris Mason {
1459f186373fSMark Fasheh 	int ret = 0;
1460f186373fSMark Fasheh 	int name_len;
1461f186373fSMark Fasheh 	unsigned int nlink = 0;
1462f186373fSMark Fasheh 	u32 item_size;
1463f186373fSMark Fasheh 	u32 cur_offset = 0;
146436283658SNikolay Borisov 	u64 inode_objectid = btrfs_ino(inode);
1465f186373fSMark Fasheh 	u64 offset = 0;
1466f186373fSMark Fasheh 	unsigned long ptr;
1467f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1468f186373fSMark Fasheh 	struct extent_buffer *leaf;
1469f186373fSMark Fasheh 
1470f186373fSMark Fasheh 	while (1) {
1471f186373fSMark Fasheh 		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1472f186373fSMark Fasheh 					    &extref, &offset);
1473f186373fSMark Fasheh 		if (ret)
1474f186373fSMark Fasheh 			break;
1475f186373fSMark Fasheh 
1476f186373fSMark Fasheh 		leaf = path->nodes[0];
1477f186373fSMark Fasheh 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1478f186373fSMark Fasheh 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
14792c2c452bSFilipe Manana 		cur_offset = 0;
1480f186373fSMark Fasheh 
1481f186373fSMark Fasheh 		while (cur_offset < item_size) {
1482f186373fSMark Fasheh 			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1483f186373fSMark Fasheh 			name_len = btrfs_inode_extref_name_len(leaf, extref);
1484f186373fSMark Fasheh 
1485f186373fSMark Fasheh 			nlink++;
1486f186373fSMark Fasheh 
1487f186373fSMark Fasheh 			cur_offset += name_len + sizeof(*extref);
1488f186373fSMark Fasheh 		}
1489f186373fSMark Fasheh 
1490f186373fSMark Fasheh 		offset++;
1491f186373fSMark Fasheh 		btrfs_release_path(path);
1492f186373fSMark Fasheh 	}
1493f186373fSMark Fasheh 	btrfs_release_path(path);
1494f186373fSMark Fasheh 
14952c2c452bSFilipe Manana 	if (ret < 0 && ret != -ENOENT)
1496f186373fSMark Fasheh 		return ret;
1497f186373fSMark Fasheh 	return nlink;
1498f186373fSMark Fasheh }
1499f186373fSMark Fasheh 
1500f186373fSMark Fasheh static int count_inode_refs(struct btrfs_root *root,
1501f329e319SNikolay Borisov 			struct btrfs_inode *inode, struct btrfs_path *path)
1502f186373fSMark Fasheh {
1503e02119d5SChris Mason 	int ret;
1504e02119d5SChris Mason 	struct btrfs_key key;
1505f186373fSMark Fasheh 	unsigned int nlink = 0;
1506e02119d5SChris Mason 	unsigned long ptr;
1507e02119d5SChris Mason 	unsigned long ptr_end;
1508e02119d5SChris Mason 	int name_len;
1509f329e319SNikolay Borisov 	u64 ino = btrfs_ino(inode);
1510e02119d5SChris Mason 
151133345d01SLi Zefan 	key.objectid = ino;
1512e02119d5SChris Mason 	key.type = BTRFS_INODE_REF_KEY;
1513e02119d5SChris Mason 	key.offset = (u64)-1;
1514e02119d5SChris Mason 
1515e02119d5SChris Mason 	while (1) {
1516e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1517e02119d5SChris Mason 		if (ret < 0)
1518e02119d5SChris Mason 			break;
1519e02119d5SChris Mason 		if (ret > 0) {
1520e02119d5SChris Mason 			if (path->slots[0] == 0)
1521e02119d5SChris Mason 				break;
1522e02119d5SChris Mason 			path->slots[0]--;
1523e02119d5SChris Mason 		}
1524e93ae26fSFilipe David Borba Manana process_slot:
1525e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key,
1526e02119d5SChris Mason 				      path->slots[0]);
152733345d01SLi Zefan 		if (key.objectid != ino ||
1528e02119d5SChris Mason 		    key.type != BTRFS_INODE_REF_KEY)
1529e02119d5SChris Mason 			break;
1530e02119d5SChris Mason 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1531e02119d5SChris Mason 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1532e02119d5SChris Mason 						   path->slots[0]);
1533e02119d5SChris Mason 		while (ptr < ptr_end) {
1534e02119d5SChris Mason 			struct btrfs_inode_ref *ref;
1535e02119d5SChris Mason 
1536e02119d5SChris Mason 			ref = (struct btrfs_inode_ref *)ptr;
1537e02119d5SChris Mason 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1538e02119d5SChris Mason 							    ref);
1539e02119d5SChris Mason 			ptr = (unsigned long)(ref + 1) + name_len;
1540e02119d5SChris Mason 			nlink++;
1541e02119d5SChris Mason 		}
1542e02119d5SChris Mason 
1543e02119d5SChris Mason 		if (key.offset == 0)
1544e02119d5SChris Mason 			break;
1545e93ae26fSFilipe David Borba Manana 		if (path->slots[0] > 0) {
1546e93ae26fSFilipe David Borba Manana 			path->slots[0]--;
1547e93ae26fSFilipe David Borba Manana 			goto process_slot;
1548e93ae26fSFilipe David Borba Manana 		}
1549e02119d5SChris Mason 		key.offset--;
1550b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1551e02119d5SChris Mason 	}
1552b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1553f186373fSMark Fasheh 
1554f186373fSMark Fasheh 	return nlink;
1555f186373fSMark Fasheh }
1556f186373fSMark Fasheh 
1557f186373fSMark Fasheh /*
1558f186373fSMark Fasheh  * There are a few corners where the link count of the file can't
1559f186373fSMark Fasheh  * be properly maintained during replay.  So, instead of adding
1560f186373fSMark Fasheh  * lots of complexity to the log code, we just scan the backrefs
1561f186373fSMark Fasheh  * for any file that has been through replay.
1562f186373fSMark Fasheh  *
1563f186373fSMark Fasheh  * The scan will update the link count on the inode to reflect the
1564f186373fSMark Fasheh  * number of back refs found.  If it goes down to zero, the iput
1565f186373fSMark Fasheh  * will free the inode.
1566f186373fSMark Fasheh  */
1567f186373fSMark Fasheh static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1568f186373fSMark Fasheh 					   struct btrfs_root *root,
1569f186373fSMark Fasheh 					   struct inode *inode)
1570f186373fSMark Fasheh {
1571f186373fSMark Fasheh 	struct btrfs_path *path;
1572f186373fSMark Fasheh 	int ret;
1573f186373fSMark Fasheh 	u64 nlink = 0;
15744a0cc7caSNikolay Borisov 	u64 ino = btrfs_ino(BTRFS_I(inode));
1575f186373fSMark Fasheh 
1576f186373fSMark Fasheh 	path = btrfs_alloc_path();
1577f186373fSMark Fasheh 	if (!path)
1578f186373fSMark Fasheh 		return -ENOMEM;
1579f186373fSMark Fasheh 
1580f329e319SNikolay Borisov 	ret = count_inode_refs(root, BTRFS_I(inode), path);
1581f186373fSMark Fasheh 	if (ret < 0)
1582f186373fSMark Fasheh 		goto out;
1583f186373fSMark Fasheh 
1584f186373fSMark Fasheh 	nlink = ret;
1585f186373fSMark Fasheh 
158636283658SNikolay Borisov 	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1587f186373fSMark Fasheh 	if (ret < 0)
1588f186373fSMark Fasheh 		goto out;
1589f186373fSMark Fasheh 
1590f186373fSMark Fasheh 	nlink += ret;
1591f186373fSMark Fasheh 
1592f186373fSMark Fasheh 	ret = 0;
1593f186373fSMark Fasheh 
1594e02119d5SChris Mason 	if (nlink != inode->i_nlink) {
1595bfe86848SMiklos Szeredi 		set_nlink(inode, nlink);
1596e02119d5SChris Mason 		btrfs_update_inode(trans, root, inode);
1597e02119d5SChris Mason 	}
15988d5bf1cbSChris Mason 	BTRFS_I(inode)->index_cnt = (u64)-1;
1599e02119d5SChris Mason 
1600c71bf099SYan, Zheng 	if (inode->i_nlink == 0) {
1601c71bf099SYan, Zheng 		if (S_ISDIR(inode->i_mode)) {
160212fcfd22SChris Mason 			ret = replay_dir_deletes(trans, root, NULL, path,
160333345d01SLi Zefan 						 ino, 1);
16043650860bSJosef Bacik 			if (ret)
16053650860bSJosef Bacik 				goto out;
160612fcfd22SChris Mason 		}
160733345d01SLi Zefan 		ret = insert_orphan_item(trans, root, ino);
1608c71bf099SYan, Zheng 	}
160912fcfd22SChris Mason 
1610f186373fSMark Fasheh out:
1611f186373fSMark Fasheh 	btrfs_free_path(path);
1612f186373fSMark Fasheh 	return ret;
1613e02119d5SChris Mason }
1614e02119d5SChris Mason 
1615e02119d5SChris Mason static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1616e02119d5SChris Mason 					    struct btrfs_root *root,
1617e02119d5SChris Mason 					    struct btrfs_path *path)
1618e02119d5SChris Mason {
1619e02119d5SChris Mason 	int ret;
1620e02119d5SChris Mason 	struct btrfs_key key;
1621e02119d5SChris Mason 	struct inode *inode;
1622e02119d5SChris Mason 
1623e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1624e02119d5SChris Mason 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1625e02119d5SChris Mason 	key.offset = (u64)-1;
1626e02119d5SChris Mason 	while (1) {
1627e02119d5SChris Mason 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1628e02119d5SChris Mason 		if (ret < 0)
1629e02119d5SChris Mason 			break;
1630e02119d5SChris Mason 
1631e02119d5SChris Mason 		if (ret == 1) {
1632e02119d5SChris Mason 			if (path->slots[0] == 0)
1633e02119d5SChris Mason 				break;
1634e02119d5SChris Mason 			path->slots[0]--;
1635e02119d5SChris Mason 		}
1636e02119d5SChris Mason 
1637e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1638e02119d5SChris Mason 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1639e02119d5SChris Mason 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1640e02119d5SChris Mason 			break;
1641e02119d5SChris Mason 
1642e02119d5SChris Mason 		ret = btrfs_del_item(trans, root, path);
164365a246c5STsutomu Itoh 		if (ret)
164465a246c5STsutomu Itoh 			goto out;
1645e02119d5SChris Mason 
1646b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1647e02119d5SChris Mason 		inode = read_one_inode(root, key.offset);
1648c00e9493STsutomu Itoh 		if (!inode)
1649c00e9493STsutomu Itoh 			return -EIO;
1650e02119d5SChris Mason 
1651e02119d5SChris Mason 		ret = fixup_inode_link_count(trans, root, inode);
1652e02119d5SChris Mason 		iput(inode);
16533650860bSJosef Bacik 		if (ret)
16543650860bSJosef Bacik 			goto out;
1655e02119d5SChris Mason 
165612fcfd22SChris Mason 		/*
165712fcfd22SChris Mason 		 * fixup on a directory may create new entries,
165812fcfd22SChris Mason 		 * make sure we always look for the highset possible
165912fcfd22SChris Mason 		 * offset
166012fcfd22SChris Mason 		 */
166112fcfd22SChris Mason 		key.offset = (u64)-1;
1662e02119d5SChris Mason 	}
166365a246c5STsutomu Itoh 	ret = 0;
166465a246c5STsutomu Itoh out:
1665b3b4aa74SDavid Sterba 	btrfs_release_path(path);
166665a246c5STsutomu Itoh 	return ret;
1667e02119d5SChris Mason }
1668e02119d5SChris Mason 
1669e02119d5SChris Mason 
1670e02119d5SChris Mason /*
1671e02119d5SChris Mason  * record a given inode in the fixup dir so we can check its link
1672e02119d5SChris Mason  * count when replay is done.  The link count is incremented here
1673e02119d5SChris Mason  * so the inode won't go away until we check it
1674e02119d5SChris Mason  */
1675e02119d5SChris Mason static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1676e02119d5SChris Mason 				      struct btrfs_root *root,
1677e02119d5SChris Mason 				      struct btrfs_path *path,
1678e02119d5SChris Mason 				      u64 objectid)
1679e02119d5SChris Mason {
1680e02119d5SChris Mason 	struct btrfs_key key;
1681e02119d5SChris Mason 	int ret = 0;
1682e02119d5SChris Mason 	struct inode *inode;
1683e02119d5SChris Mason 
1684e02119d5SChris Mason 	inode = read_one_inode(root, objectid);
1685c00e9493STsutomu Itoh 	if (!inode)
1686c00e9493STsutomu Itoh 		return -EIO;
1687e02119d5SChris Mason 
1688e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1689962a298fSDavid Sterba 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1690e02119d5SChris Mason 	key.offset = objectid;
1691e02119d5SChris Mason 
1692e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1693e02119d5SChris Mason 
1694b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1695e02119d5SChris Mason 	if (ret == 0) {
16969bf7a489SJosef Bacik 		if (!inode->i_nlink)
16979bf7a489SJosef Bacik 			set_nlink(inode, 1);
16989bf7a489SJosef Bacik 		else
16998b558c5fSZach Brown 			inc_nlink(inode);
1700b9959295STsutomu Itoh 		ret = btrfs_update_inode(trans, root, inode);
1701e02119d5SChris Mason 	} else if (ret == -EEXIST) {
1702e02119d5SChris Mason 		ret = 0;
1703e02119d5SChris Mason 	} else {
17043650860bSJosef Bacik 		BUG(); /* Logic Error */
1705e02119d5SChris Mason 	}
1706e02119d5SChris Mason 	iput(inode);
1707e02119d5SChris Mason 
1708e02119d5SChris Mason 	return ret;
1709e02119d5SChris Mason }
1710e02119d5SChris Mason 
1711e02119d5SChris Mason /*
1712e02119d5SChris Mason  * when replaying the log for a directory, we only insert names
1713e02119d5SChris Mason  * for inodes that actually exist.  This means an fsync on a directory
1714e02119d5SChris Mason  * does not implicitly fsync all the new files in it
1715e02119d5SChris Mason  */
1716e02119d5SChris Mason static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1717e02119d5SChris Mason 				    struct btrfs_root *root,
1718e02119d5SChris Mason 				    u64 dirid, u64 index,
171960d53eb3SZhaolei 				    char *name, int name_len,
1720e02119d5SChris Mason 				    struct btrfs_key *location)
1721e02119d5SChris Mason {
1722e02119d5SChris Mason 	struct inode *inode;
1723e02119d5SChris Mason 	struct inode *dir;
1724e02119d5SChris Mason 	int ret;
1725e02119d5SChris Mason 
1726e02119d5SChris Mason 	inode = read_one_inode(root, location->objectid);
1727e02119d5SChris Mason 	if (!inode)
1728e02119d5SChris Mason 		return -ENOENT;
1729e02119d5SChris Mason 
1730e02119d5SChris Mason 	dir = read_one_inode(root, dirid);
1731e02119d5SChris Mason 	if (!dir) {
1732e02119d5SChris Mason 		iput(inode);
1733e02119d5SChris Mason 		return -EIO;
1734e02119d5SChris Mason 	}
1735d555438bSJosef Bacik 
1736db0a669fSNikolay Borisov 	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1737db0a669fSNikolay Borisov 			name_len, 1, index);
1738e02119d5SChris Mason 
1739e02119d5SChris Mason 	/* FIXME, put inode into FIXUP list */
1740e02119d5SChris Mason 
1741e02119d5SChris Mason 	iput(inode);
1742e02119d5SChris Mason 	iput(dir);
1743e02119d5SChris Mason 	return ret;
1744e02119d5SChris Mason }
1745e02119d5SChris Mason 
1746e02119d5SChris Mason /*
1747df8d116fSFilipe Manana  * Return true if an inode reference exists in the log for the given name,
1748df8d116fSFilipe Manana  * inode and parent inode.
1749df8d116fSFilipe Manana  */
1750df8d116fSFilipe Manana static bool name_in_log_ref(struct btrfs_root *log_root,
1751df8d116fSFilipe Manana 			    const char *name, const int name_len,
1752df8d116fSFilipe Manana 			    const u64 dirid, const u64 ino)
1753df8d116fSFilipe Manana {
1754df8d116fSFilipe Manana 	struct btrfs_key search_key;
1755df8d116fSFilipe Manana 
1756df8d116fSFilipe Manana 	search_key.objectid = ino;
1757df8d116fSFilipe Manana 	search_key.type = BTRFS_INODE_REF_KEY;
1758df8d116fSFilipe Manana 	search_key.offset = dirid;
1759df8d116fSFilipe Manana 	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1760df8d116fSFilipe Manana 		return true;
1761df8d116fSFilipe Manana 
1762df8d116fSFilipe Manana 	search_key.type = BTRFS_INODE_EXTREF_KEY;
1763df8d116fSFilipe Manana 	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1764df8d116fSFilipe Manana 	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1765df8d116fSFilipe Manana 		return true;
1766df8d116fSFilipe Manana 
1767df8d116fSFilipe Manana 	return false;
1768df8d116fSFilipe Manana }
1769df8d116fSFilipe Manana 
1770df8d116fSFilipe Manana /*
1771e02119d5SChris Mason  * take a single entry in a log directory item and replay it into
1772e02119d5SChris Mason  * the subvolume.
1773e02119d5SChris Mason  *
1774e02119d5SChris Mason  * if a conflicting item exists in the subdirectory already,
1775e02119d5SChris Mason  * the inode it points to is unlinked and put into the link count
1776e02119d5SChris Mason  * fix up tree.
1777e02119d5SChris Mason  *
1778e02119d5SChris Mason  * If a name from the log points to a file or directory that does
1779e02119d5SChris Mason  * not exist in the FS, it is skipped.  fsyncs on directories
1780e02119d5SChris Mason  * do not force down inodes inside that directory, just changes to the
1781e02119d5SChris Mason  * names or unlinks in a directory.
1782bb53eda9SFilipe Manana  *
1783bb53eda9SFilipe Manana  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1784bb53eda9SFilipe Manana  * non-existing inode) and 1 if the name was replayed.
1785e02119d5SChris Mason  */
1786e02119d5SChris Mason static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1787e02119d5SChris Mason 				    struct btrfs_root *root,
1788e02119d5SChris Mason 				    struct btrfs_path *path,
1789e02119d5SChris Mason 				    struct extent_buffer *eb,
1790e02119d5SChris Mason 				    struct btrfs_dir_item *di,
1791e02119d5SChris Mason 				    struct btrfs_key *key)
1792e02119d5SChris Mason {
1793e02119d5SChris Mason 	char *name;
1794e02119d5SChris Mason 	int name_len;
1795e02119d5SChris Mason 	struct btrfs_dir_item *dst_di;
1796e02119d5SChris Mason 	struct btrfs_key found_key;
1797e02119d5SChris Mason 	struct btrfs_key log_key;
1798e02119d5SChris Mason 	struct inode *dir;
1799e02119d5SChris Mason 	u8 log_type;
18004bef0848SChris Mason 	int exists;
18013650860bSJosef Bacik 	int ret = 0;
1802d555438bSJosef Bacik 	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1803bb53eda9SFilipe Manana 	bool name_added = false;
1804e02119d5SChris Mason 
1805e02119d5SChris Mason 	dir = read_one_inode(root, key->objectid);
1806c00e9493STsutomu Itoh 	if (!dir)
1807c00e9493STsutomu Itoh 		return -EIO;
1808e02119d5SChris Mason 
1809e02119d5SChris Mason 	name_len = btrfs_dir_name_len(eb, di);
1810e02119d5SChris Mason 	name = kmalloc(name_len, GFP_NOFS);
18112bac325eSFilipe David Borba Manana 	if (!name) {
18122bac325eSFilipe David Borba Manana 		ret = -ENOMEM;
18132bac325eSFilipe David Borba Manana 		goto out;
18142bac325eSFilipe David Borba Manana 	}
18152a29edc6Sliubo 
1816e02119d5SChris Mason 	log_type = btrfs_dir_type(eb, di);
1817e02119d5SChris Mason 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1818e02119d5SChris Mason 		   name_len);
1819e02119d5SChris Mason 
1820e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
18214bef0848SChris Mason 	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
18224bef0848SChris Mason 	if (exists == 0)
18234bef0848SChris Mason 		exists = 1;
18244bef0848SChris Mason 	else
18254bef0848SChris Mason 		exists = 0;
1826b3b4aa74SDavid Sterba 	btrfs_release_path(path);
18274bef0848SChris Mason 
1828e02119d5SChris Mason 	if (key->type == BTRFS_DIR_ITEM_KEY) {
1829e02119d5SChris Mason 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1830e02119d5SChris Mason 				       name, name_len, 1);
1831d397712bSChris Mason 	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1832e02119d5SChris Mason 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1833e02119d5SChris Mason 						     key->objectid,
1834e02119d5SChris Mason 						     key->offset, name,
1835e02119d5SChris Mason 						     name_len, 1);
1836e02119d5SChris Mason 	} else {
18373650860bSJosef Bacik 		/* Corruption */
18383650860bSJosef Bacik 		ret = -EINVAL;
18393650860bSJosef Bacik 		goto out;
1840e02119d5SChris Mason 	}
1841c704005dSDavid Sterba 	if (IS_ERR_OR_NULL(dst_di)) {
1842e02119d5SChris Mason 		/* we need a sequence number to insert, so we only
1843e02119d5SChris Mason 		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1844e02119d5SChris Mason 		 */
1845e02119d5SChris Mason 		if (key->type != BTRFS_DIR_INDEX_KEY)
1846e02119d5SChris Mason 			goto out;
1847e02119d5SChris Mason 		goto insert;
1848e02119d5SChris Mason 	}
1849e02119d5SChris Mason 
1850e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1851e02119d5SChris Mason 	/* the existing item matches the logged item */
1852e02119d5SChris Mason 	if (found_key.objectid == log_key.objectid &&
1853e02119d5SChris Mason 	    found_key.type == log_key.type &&
1854e02119d5SChris Mason 	    found_key.offset == log_key.offset &&
1855e02119d5SChris Mason 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1856a2cc11dbSFilipe Manana 		update_size = false;
1857e02119d5SChris Mason 		goto out;
1858e02119d5SChris Mason 	}
1859e02119d5SChris Mason 
1860e02119d5SChris Mason 	/*
1861e02119d5SChris Mason 	 * don't drop the conflicting directory entry if the inode
1862e02119d5SChris Mason 	 * for the new entry doesn't exist
1863e02119d5SChris Mason 	 */
18644bef0848SChris Mason 	if (!exists)
1865e02119d5SChris Mason 		goto out;
1866e02119d5SChris Mason 
1867207e7d92SNikolay Borisov 	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
18683650860bSJosef Bacik 	if (ret)
18693650860bSJosef Bacik 		goto out;
1870e02119d5SChris Mason 
1871e02119d5SChris Mason 	if (key->type == BTRFS_DIR_INDEX_KEY)
1872e02119d5SChris Mason 		goto insert;
1873e02119d5SChris Mason out:
1874b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1875d555438bSJosef Bacik 	if (!ret && update_size) {
18766ef06d27SNikolay Borisov 		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
1877d555438bSJosef Bacik 		ret = btrfs_update_inode(trans, root, dir);
1878d555438bSJosef Bacik 	}
1879e02119d5SChris Mason 	kfree(name);
1880e02119d5SChris Mason 	iput(dir);
1881bb53eda9SFilipe Manana 	if (!ret && name_added)
1882bb53eda9SFilipe Manana 		ret = 1;
18833650860bSJosef Bacik 	return ret;
1884e02119d5SChris Mason 
1885e02119d5SChris Mason insert:
1886df8d116fSFilipe Manana 	if (name_in_log_ref(root->log_root, name, name_len,
1887df8d116fSFilipe Manana 			    key->objectid, log_key.objectid)) {
1888df8d116fSFilipe Manana 		/* The dentry will be added later. */
1889df8d116fSFilipe Manana 		ret = 0;
1890df8d116fSFilipe Manana 		update_size = false;
1891df8d116fSFilipe Manana 		goto out;
1892df8d116fSFilipe Manana 	}
1893b3b4aa74SDavid Sterba 	btrfs_release_path(path);
189460d53eb3SZhaolei 	ret = insert_one_name(trans, root, key->objectid, key->offset,
189560d53eb3SZhaolei 			      name, name_len, &log_key);
1896df8d116fSFilipe Manana 	if (ret && ret != -ENOENT && ret != -EEXIST)
18973650860bSJosef Bacik 		goto out;
1898bb53eda9SFilipe Manana 	if (!ret)
1899bb53eda9SFilipe Manana 		name_added = true;
1900d555438bSJosef Bacik 	update_size = false;
19013650860bSJosef Bacik 	ret = 0;
1902e02119d5SChris Mason 	goto out;
1903e02119d5SChris Mason }
1904e02119d5SChris Mason 
1905e02119d5SChris Mason /*
1906e02119d5SChris Mason  * find all the names in a directory item and reconcile them into
1907e02119d5SChris Mason  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1908e02119d5SChris Mason  * one name in a directory item, but the same code gets used for
1909e02119d5SChris Mason  * both directory index types
1910e02119d5SChris Mason  */
1911e02119d5SChris Mason static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1912e02119d5SChris Mason 					struct btrfs_root *root,
1913e02119d5SChris Mason 					struct btrfs_path *path,
1914e02119d5SChris Mason 					struct extent_buffer *eb, int slot,
1915e02119d5SChris Mason 					struct btrfs_key *key)
1916e02119d5SChris Mason {
1917bb53eda9SFilipe Manana 	int ret = 0;
1918e02119d5SChris Mason 	u32 item_size = btrfs_item_size_nr(eb, slot);
1919e02119d5SChris Mason 	struct btrfs_dir_item *di;
1920e02119d5SChris Mason 	int name_len;
1921e02119d5SChris Mason 	unsigned long ptr;
1922e02119d5SChris Mason 	unsigned long ptr_end;
1923bb53eda9SFilipe Manana 	struct btrfs_path *fixup_path = NULL;
1924e02119d5SChris Mason 
1925e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(eb, slot);
1926e02119d5SChris Mason 	ptr_end = ptr + item_size;
1927e02119d5SChris Mason 	while (ptr < ptr_end) {
1928e02119d5SChris Mason 		di = (struct btrfs_dir_item *)ptr;
1929e02119d5SChris Mason 		name_len = btrfs_dir_name_len(eb, di);
1930e02119d5SChris Mason 		ret = replay_one_name(trans, root, path, eb, di, key);
1931bb53eda9SFilipe Manana 		if (ret < 0)
1932bb53eda9SFilipe Manana 			break;
1933e02119d5SChris Mason 		ptr = (unsigned long)(di + 1);
1934e02119d5SChris Mason 		ptr += name_len;
1935bb53eda9SFilipe Manana 
1936bb53eda9SFilipe Manana 		/*
1937bb53eda9SFilipe Manana 		 * If this entry refers to a non-directory (directories can not
1938bb53eda9SFilipe Manana 		 * have a link count > 1) and it was added in the transaction
1939bb53eda9SFilipe Manana 		 * that was not committed, make sure we fixup the link count of
1940bb53eda9SFilipe Manana 		 * the inode it the entry points to. Otherwise something like
1941bb53eda9SFilipe Manana 		 * the following would result in a directory pointing to an
1942bb53eda9SFilipe Manana 		 * inode with a wrong link that does not account for this dir
1943bb53eda9SFilipe Manana 		 * entry:
1944bb53eda9SFilipe Manana 		 *
1945bb53eda9SFilipe Manana 		 * mkdir testdir
1946bb53eda9SFilipe Manana 		 * touch testdir/foo
1947bb53eda9SFilipe Manana 		 * touch testdir/bar
1948bb53eda9SFilipe Manana 		 * sync
1949bb53eda9SFilipe Manana 		 *
1950bb53eda9SFilipe Manana 		 * ln testdir/bar testdir/bar_link
1951bb53eda9SFilipe Manana 		 * ln testdir/foo testdir/foo_link
1952bb53eda9SFilipe Manana 		 * xfs_io -c "fsync" testdir/bar
1953bb53eda9SFilipe Manana 		 *
1954bb53eda9SFilipe Manana 		 * <power failure>
1955bb53eda9SFilipe Manana 		 *
1956bb53eda9SFilipe Manana 		 * mount fs, log replay happens
1957bb53eda9SFilipe Manana 		 *
1958bb53eda9SFilipe Manana 		 * File foo would remain with a link count of 1 when it has two
1959bb53eda9SFilipe Manana 		 * entries pointing to it in the directory testdir. This would
1960bb53eda9SFilipe Manana 		 * make it impossible to ever delete the parent directory has
1961bb53eda9SFilipe Manana 		 * it would result in stale dentries that can never be deleted.
1962bb53eda9SFilipe Manana 		 */
1963bb53eda9SFilipe Manana 		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1964bb53eda9SFilipe Manana 			struct btrfs_key di_key;
1965bb53eda9SFilipe Manana 
1966bb53eda9SFilipe Manana 			if (!fixup_path) {
1967bb53eda9SFilipe Manana 				fixup_path = btrfs_alloc_path();
1968bb53eda9SFilipe Manana 				if (!fixup_path) {
1969bb53eda9SFilipe Manana 					ret = -ENOMEM;
1970bb53eda9SFilipe Manana 					break;
1971e02119d5SChris Mason 				}
1972bb53eda9SFilipe Manana 			}
1973bb53eda9SFilipe Manana 
1974bb53eda9SFilipe Manana 			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1975bb53eda9SFilipe Manana 			ret = link_to_fixup_dir(trans, root, fixup_path,
1976bb53eda9SFilipe Manana 						di_key.objectid);
1977bb53eda9SFilipe Manana 			if (ret)
1978bb53eda9SFilipe Manana 				break;
1979bb53eda9SFilipe Manana 		}
1980bb53eda9SFilipe Manana 		ret = 0;
1981bb53eda9SFilipe Manana 	}
1982bb53eda9SFilipe Manana 	btrfs_free_path(fixup_path);
1983bb53eda9SFilipe Manana 	return ret;
1984e02119d5SChris Mason }
1985e02119d5SChris Mason 
1986e02119d5SChris Mason /*
1987e02119d5SChris Mason  * directory replay has two parts.  There are the standard directory
1988e02119d5SChris Mason  * items in the log copied from the subvolume, and range items
1989e02119d5SChris Mason  * created in the log while the subvolume was logged.
1990e02119d5SChris Mason  *
1991e02119d5SChris Mason  * The range items tell us which parts of the key space the log
1992e02119d5SChris Mason  * is authoritative for.  During replay, if a key in the subvolume
1993e02119d5SChris Mason  * directory is in a logged range item, but not actually in the log
1994e02119d5SChris Mason  * that means it was deleted from the directory before the fsync
1995e02119d5SChris Mason  * and should be removed.
1996e02119d5SChris Mason  */
1997e02119d5SChris Mason static noinline int find_dir_range(struct btrfs_root *root,
1998e02119d5SChris Mason 				   struct btrfs_path *path,
1999e02119d5SChris Mason 				   u64 dirid, int key_type,
2000e02119d5SChris Mason 				   u64 *start_ret, u64 *end_ret)
2001e02119d5SChris Mason {
2002e02119d5SChris Mason 	struct btrfs_key key;
2003e02119d5SChris Mason 	u64 found_end;
2004e02119d5SChris Mason 	struct btrfs_dir_log_item *item;
2005e02119d5SChris Mason 	int ret;
2006e02119d5SChris Mason 	int nritems;
2007e02119d5SChris Mason 
2008e02119d5SChris Mason 	if (*start_ret == (u64)-1)
2009e02119d5SChris Mason 		return 1;
2010e02119d5SChris Mason 
2011e02119d5SChris Mason 	key.objectid = dirid;
2012e02119d5SChris Mason 	key.type = key_type;
2013e02119d5SChris Mason 	key.offset = *start_ret;
2014e02119d5SChris Mason 
2015e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2016e02119d5SChris Mason 	if (ret < 0)
2017e02119d5SChris Mason 		goto out;
2018e02119d5SChris Mason 	if (ret > 0) {
2019e02119d5SChris Mason 		if (path->slots[0] == 0)
2020e02119d5SChris Mason 			goto out;
2021e02119d5SChris Mason 		path->slots[0]--;
2022e02119d5SChris Mason 	}
2023e02119d5SChris Mason 	if (ret != 0)
2024e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2025e02119d5SChris Mason 
2026e02119d5SChris Mason 	if (key.type != key_type || key.objectid != dirid) {
2027e02119d5SChris Mason 		ret = 1;
2028e02119d5SChris Mason 		goto next;
2029e02119d5SChris Mason 	}
2030e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2031e02119d5SChris Mason 			      struct btrfs_dir_log_item);
2032e02119d5SChris Mason 	found_end = btrfs_dir_log_end(path->nodes[0], item);
2033e02119d5SChris Mason 
2034e02119d5SChris Mason 	if (*start_ret >= key.offset && *start_ret <= found_end) {
2035e02119d5SChris Mason 		ret = 0;
2036e02119d5SChris Mason 		*start_ret = key.offset;
2037e02119d5SChris Mason 		*end_ret = found_end;
2038e02119d5SChris Mason 		goto out;
2039e02119d5SChris Mason 	}
2040e02119d5SChris Mason 	ret = 1;
2041e02119d5SChris Mason next:
2042e02119d5SChris Mason 	/* check the next slot in the tree to see if it is a valid item */
2043e02119d5SChris Mason 	nritems = btrfs_header_nritems(path->nodes[0]);
20442a7bf53fSRobbie Ko 	path->slots[0]++;
2045e02119d5SChris Mason 	if (path->slots[0] >= nritems) {
2046e02119d5SChris Mason 		ret = btrfs_next_leaf(root, path);
2047e02119d5SChris Mason 		if (ret)
2048e02119d5SChris Mason 			goto out;
2049e02119d5SChris Mason 	}
2050e02119d5SChris Mason 
2051e02119d5SChris Mason 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2052e02119d5SChris Mason 
2053e02119d5SChris Mason 	if (key.type != key_type || key.objectid != dirid) {
2054e02119d5SChris Mason 		ret = 1;
2055e02119d5SChris Mason 		goto out;
2056e02119d5SChris Mason 	}
2057e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2058e02119d5SChris Mason 			      struct btrfs_dir_log_item);
2059e02119d5SChris Mason 	found_end = btrfs_dir_log_end(path->nodes[0], item);
2060e02119d5SChris Mason 	*start_ret = key.offset;
2061e02119d5SChris Mason 	*end_ret = found_end;
2062e02119d5SChris Mason 	ret = 0;
2063e02119d5SChris Mason out:
2064b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2065e02119d5SChris Mason 	return ret;
2066e02119d5SChris Mason }
2067e02119d5SChris Mason 
2068e02119d5SChris Mason /*
2069e02119d5SChris Mason  * this looks for a given directory item in the log.  If the directory
2070e02119d5SChris Mason  * item is not in the log, the item is removed and the inode it points
2071e02119d5SChris Mason  * to is unlinked
2072e02119d5SChris Mason  */
2073e02119d5SChris Mason static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2074e02119d5SChris Mason 				      struct btrfs_root *root,
2075e02119d5SChris Mason 				      struct btrfs_root *log,
2076e02119d5SChris Mason 				      struct btrfs_path *path,
2077e02119d5SChris Mason 				      struct btrfs_path *log_path,
2078e02119d5SChris Mason 				      struct inode *dir,
2079e02119d5SChris Mason 				      struct btrfs_key *dir_key)
2080e02119d5SChris Mason {
2081e02119d5SChris Mason 	int ret;
2082e02119d5SChris Mason 	struct extent_buffer *eb;
2083e02119d5SChris Mason 	int slot;
2084e02119d5SChris Mason 	u32 item_size;
2085e02119d5SChris Mason 	struct btrfs_dir_item *di;
2086e02119d5SChris Mason 	struct btrfs_dir_item *log_di;
2087e02119d5SChris Mason 	int name_len;
2088e02119d5SChris Mason 	unsigned long ptr;
2089e02119d5SChris Mason 	unsigned long ptr_end;
2090e02119d5SChris Mason 	char *name;
2091e02119d5SChris Mason 	struct inode *inode;
2092e02119d5SChris Mason 	struct btrfs_key location;
2093e02119d5SChris Mason 
2094e02119d5SChris Mason again:
2095e02119d5SChris Mason 	eb = path->nodes[0];
2096e02119d5SChris Mason 	slot = path->slots[0];
2097e02119d5SChris Mason 	item_size = btrfs_item_size_nr(eb, slot);
2098e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(eb, slot);
2099e02119d5SChris Mason 	ptr_end = ptr + item_size;
2100e02119d5SChris Mason 	while (ptr < ptr_end) {
2101e02119d5SChris Mason 		di = (struct btrfs_dir_item *)ptr;
2102e02119d5SChris Mason 		name_len = btrfs_dir_name_len(eb, di);
2103e02119d5SChris Mason 		name = kmalloc(name_len, GFP_NOFS);
2104e02119d5SChris Mason 		if (!name) {
2105e02119d5SChris Mason 			ret = -ENOMEM;
2106e02119d5SChris Mason 			goto out;
2107e02119d5SChris Mason 		}
2108e02119d5SChris Mason 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
2109e02119d5SChris Mason 				  name_len);
2110e02119d5SChris Mason 		log_di = NULL;
211112fcfd22SChris Mason 		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2112e02119d5SChris Mason 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
2113e02119d5SChris Mason 						       dir_key->objectid,
2114e02119d5SChris Mason 						       name, name_len, 0);
211512fcfd22SChris Mason 		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2116e02119d5SChris Mason 			log_di = btrfs_lookup_dir_index_item(trans, log,
2117e02119d5SChris Mason 						     log_path,
2118e02119d5SChris Mason 						     dir_key->objectid,
2119e02119d5SChris Mason 						     dir_key->offset,
2120e02119d5SChris Mason 						     name, name_len, 0);
2121e02119d5SChris Mason 		}
2122269d040fSFilipe David Borba Manana 		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
2123e02119d5SChris Mason 			btrfs_dir_item_key_to_cpu(eb, di, &location);
2124b3b4aa74SDavid Sterba 			btrfs_release_path(path);
2125b3b4aa74SDavid Sterba 			btrfs_release_path(log_path);
2126e02119d5SChris Mason 			inode = read_one_inode(root, location.objectid);
2127c00e9493STsutomu Itoh 			if (!inode) {
2128c00e9493STsutomu Itoh 				kfree(name);
2129c00e9493STsutomu Itoh 				return -EIO;
2130c00e9493STsutomu Itoh 			}
2131e02119d5SChris Mason 
2132e02119d5SChris Mason 			ret = link_to_fixup_dir(trans, root,
2133e02119d5SChris Mason 						path, location.objectid);
21343650860bSJosef Bacik 			if (ret) {
21353650860bSJosef Bacik 				kfree(name);
21363650860bSJosef Bacik 				iput(inode);
21373650860bSJosef Bacik 				goto out;
21383650860bSJosef Bacik 			}
21393650860bSJosef Bacik 
21408b558c5fSZach Brown 			inc_nlink(inode);
21414ec5934eSNikolay Borisov 			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
21424ec5934eSNikolay Borisov 					BTRFS_I(inode), name, name_len);
21433650860bSJosef Bacik 			if (!ret)
2144e5c304e6SNikolay Borisov 				ret = btrfs_run_delayed_items(trans);
2145e02119d5SChris Mason 			kfree(name);
2146e02119d5SChris Mason 			iput(inode);
21473650860bSJosef Bacik 			if (ret)
21483650860bSJosef Bacik 				goto out;
2149e02119d5SChris Mason 
2150e02119d5SChris Mason 			/* there might still be more names under this key
2151e02119d5SChris Mason 			 * check and repeat if required
2152e02119d5SChris Mason 			 */
2153e02119d5SChris Mason 			ret = btrfs_search_slot(NULL, root, dir_key, path,
2154e02119d5SChris Mason 						0, 0);
2155e02119d5SChris Mason 			if (ret == 0)
2156e02119d5SChris Mason 				goto again;
2157e02119d5SChris Mason 			ret = 0;
2158e02119d5SChris Mason 			goto out;
2159269d040fSFilipe David Borba Manana 		} else if (IS_ERR(log_di)) {
2160269d040fSFilipe David Borba Manana 			kfree(name);
2161269d040fSFilipe David Borba Manana 			return PTR_ERR(log_di);
2162e02119d5SChris Mason 		}
2163b3b4aa74SDavid Sterba 		btrfs_release_path(log_path);
2164e02119d5SChris Mason 		kfree(name);
2165e02119d5SChris Mason 
2166e02119d5SChris Mason 		ptr = (unsigned long)(di + 1);
2167e02119d5SChris Mason 		ptr += name_len;
2168e02119d5SChris Mason 	}
2169e02119d5SChris Mason 	ret = 0;
2170e02119d5SChris Mason out:
2171b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2172b3b4aa74SDavid Sterba 	btrfs_release_path(log_path);
2173e02119d5SChris Mason 	return ret;
2174e02119d5SChris Mason }
2175e02119d5SChris Mason 
21764f764e51SFilipe Manana static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
21774f764e51SFilipe Manana 			      struct btrfs_root *root,
21784f764e51SFilipe Manana 			      struct btrfs_root *log,
21794f764e51SFilipe Manana 			      struct btrfs_path *path,
21804f764e51SFilipe Manana 			      const u64 ino)
21814f764e51SFilipe Manana {
21824f764e51SFilipe Manana 	struct btrfs_key search_key;
21834f764e51SFilipe Manana 	struct btrfs_path *log_path;
21844f764e51SFilipe Manana 	int i;
21854f764e51SFilipe Manana 	int nritems;
21864f764e51SFilipe Manana 	int ret;
21874f764e51SFilipe Manana 
21884f764e51SFilipe Manana 	log_path = btrfs_alloc_path();
21894f764e51SFilipe Manana 	if (!log_path)
21904f764e51SFilipe Manana 		return -ENOMEM;
21914f764e51SFilipe Manana 
21924f764e51SFilipe Manana 	search_key.objectid = ino;
21934f764e51SFilipe Manana 	search_key.type = BTRFS_XATTR_ITEM_KEY;
21944f764e51SFilipe Manana 	search_key.offset = 0;
21954f764e51SFilipe Manana again:
21964f764e51SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
21974f764e51SFilipe Manana 	if (ret < 0)
21984f764e51SFilipe Manana 		goto out;
21994f764e51SFilipe Manana process_leaf:
22004f764e51SFilipe Manana 	nritems = btrfs_header_nritems(path->nodes[0]);
22014f764e51SFilipe Manana 	for (i = path->slots[0]; i < nritems; i++) {
22024f764e51SFilipe Manana 		struct btrfs_key key;
22034f764e51SFilipe Manana 		struct btrfs_dir_item *di;
22044f764e51SFilipe Manana 		struct btrfs_dir_item *log_di;
22054f764e51SFilipe Manana 		u32 total_size;
22064f764e51SFilipe Manana 		u32 cur;
22074f764e51SFilipe Manana 
22084f764e51SFilipe Manana 		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
22094f764e51SFilipe Manana 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
22104f764e51SFilipe Manana 			ret = 0;
22114f764e51SFilipe Manana 			goto out;
22124f764e51SFilipe Manana 		}
22134f764e51SFilipe Manana 
22144f764e51SFilipe Manana 		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
22154f764e51SFilipe Manana 		total_size = btrfs_item_size_nr(path->nodes[0], i);
22164f764e51SFilipe Manana 		cur = 0;
22174f764e51SFilipe Manana 		while (cur < total_size) {
22184f764e51SFilipe Manana 			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
22194f764e51SFilipe Manana 			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
22204f764e51SFilipe Manana 			u32 this_len = sizeof(*di) + name_len + data_len;
22214f764e51SFilipe Manana 			char *name;
22224f764e51SFilipe Manana 
22234f764e51SFilipe Manana 			name = kmalloc(name_len, GFP_NOFS);
22244f764e51SFilipe Manana 			if (!name) {
22254f764e51SFilipe Manana 				ret = -ENOMEM;
22264f764e51SFilipe Manana 				goto out;
22274f764e51SFilipe Manana 			}
22284f764e51SFilipe Manana 			read_extent_buffer(path->nodes[0], name,
22294f764e51SFilipe Manana 					   (unsigned long)(di + 1), name_len);
22304f764e51SFilipe Manana 
22314f764e51SFilipe Manana 			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
22324f764e51SFilipe Manana 						    name, name_len, 0);
22334f764e51SFilipe Manana 			btrfs_release_path(log_path);
22344f764e51SFilipe Manana 			if (!log_di) {
22354f764e51SFilipe Manana 				/* Doesn't exist in log tree, so delete it. */
22364f764e51SFilipe Manana 				btrfs_release_path(path);
22374f764e51SFilipe Manana 				di = btrfs_lookup_xattr(trans, root, path, ino,
22384f764e51SFilipe Manana 							name, name_len, -1);
22394f764e51SFilipe Manana 				kfree(name);
22404f764e51SFilipe Manana 				if (IS_ERR(di)) {
22414f764e51SFilipe Manana 					ret = PTR_ERR(di);
22424f764e51SFilipe Manana 					goto out;
22434f764e51SFilipe Manana 				}
22444f764e51SFilipe Manana 				ASSERT(di);
22454f764e51SFilipe Manana 				ret = btrfs_delete_one_dir_name(trans, root,
22464f764e51SFilipe Manana 								path, di);
22474f764e51SFilipe Manana 				if (ret)
22484f764e51SFilipe Manana 					goto out;
22494f764e51SFilipe Manana 				btrfs_release_path(path);
22504f764e51SFilipe Manana 				search_key = key;
22514f764e51SFilipe Manana 				goto again;
22524f764e51SFilipe Manana 			}
22534f764e51SFilipe Manana 			kfree(name);
22544f764e51SFilipe Manana 			if (IS_ERR(log_di)) {
22554f764e51SFilipe Manana 				ret = PTR_ERR(log_di);
22564f764e51SFilipe Manana 				goto out;
22574f764e51SFilipe Manana 			}
22584f764e51SFilipe Manana 			cur += this_len;
22594f764e51SFilipe Manana 			di = (struct btrfs_dir_item *)((char *)di + this_len);
22604f764e51SFilipe Manana 		}
22614f764e51SFilipe Manana 	}
22624f764e51SFilipe Manana 	ret = btrfs_next_leaf(root, path);
22634f764e51SFilipe Manana 	if (ret > 0)
22644f764e51SFilipe Manana 		ret = 0;
22654f764e51SFilipe Manana 	else if (ret == 0)
22664f764e51SFilipe Manana 		goto process_leaf;
22674f764e51SFilipe Manana out:
22684f764e51SFilipe Manana 	btrfs_free_path(log_path);
22694f764e51SFilipe Manana 	btrfs_release_path(path);
22704f764e51SFilipe Manana 	return ret;
22714f764e51SFilipe Manana }
22724f764e51SFilipe Manana 
22734f764e51SFilipe Manana 
2274e02119d5SChris Mason /*
2275e02119d5SChris Mason  * deletion replay happens before we copy any new directory items
2276e02119d5SChris Mason  * out of the log or out of backreferences from inodes.  It
2277e02119d5SChris Mason  * scans the log to find ranges of keys that log is authoritative for,
2278e02119d5SChris Mason  * and then scans the directory to find items in those ranges that are
2279e02119d5SChris Mason  * not present in the log.
2280e02119d5SChris Mason  *
2281e02119d5SChris Mason  * Anything we don't find in the log is unlinked and removed from the
2282e02119d5SChris Mason  * directory.
2283e02119d5SChris Mason  */
2284e02119d5SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2285e02119d5SChris Mason 				       struct btrfs_root *root,
2286e02119d5SChris Mason 				       struct btrfs_root *log,
2287e02119d5SChris Mason 				       struct btrfs_path *path,
228812fcfd22SChris Mason 				       u64 dirid, int del_all)
2289e02119d5SChris Mason {
2290e02119d5SChris Mason 	u64 range_start;
2291e02119d5SChris Mason 	u64 range_end;
2292e02119d5SChris Mason 	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2293e02119d5SChris Mason 	int ret = 0;
2294e02119d5SChris Mason 	struct btrfs_key dir_key;
2295e02119d5SChris Mason 	struct btrfs_key found_key;
2296e02119d5SChris Mason 	struct btrfs_path *log_path;
2297e02119d5SChris Mason 	struct inode *dir;
2298e02119d5SChris Mason 
2299e02119d5SChris Mason 	dir_key.objectid = dirid;
2300e02119d5SChris Mason 	dir_key.type = BTRFS_DIR_ITEM_KEY;
2301e02119d5SChris Mason 	log_path = btrfs_alloc_path();
2302e02119d5SChris Mason 	if (!log_path)
2303e02119d5SChris Mason 		return -ENOMEM;
2304e02119d5SChris Mason 
2305e02119d5SChris Mason 	dir = read_one_inode(root, dirid);
2306e02119d5SChris Mason 	/* it isn't an error if the inode isn't there, that can happen
2307e02119d5SChris Mason 	 * because we replay the deletes before we copy in the inode item
2308e02119d5SChris Mason 	 * from the log
2309e02119d5SChris Mason 	 */
2310e02119d5SChris Mason 	if (!dir) {
2311e02119d5SChris Mason 		btrfs_free_path(log_path);
2312e02119d5SChris Mason 		return 0;
2313e02119d5SChris Mason 	}
2314e02119d5SChris Mason again:
2315e02119d5SChris Mason 	range_start = 0;
2316e02119d5SChris Mason 	range_end = 0;
2317e02119d5SChris Mason 	while (1) {
231812fcfd22SChris Mason 		if (del_all)
231912fcfd22SChris Mason 			range_end = (u64)-1;
232012fcfd22SChris Mason 		else {
2321e02119d5SChris Mason 			ret = find_dir_range(log, path, dirid, key_type,
2322e02119d5SChris Mason 					     &range_start, &range_end);
2323e02119d5SChris Mason 			if (ret != 0)
2324e02119d5SChris Mason 				break;
232512fcfd22SChris Mason 		}
2326e02119d5SChris Mason 
2327e02119d5SChris Mason 		dir_key.offset = range_start;
2328e02119d5SChris Mason 		while (1) {
2329e02119d5SChris Mason 			int nritems;
2330e02119d5SChris Mason 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2331e02119d5SChris Mason 						0, 0);
2332e02119d5SChris Mason 			if (ret < 0)
2333e02119d5SChris Mason 				goto out;
2334e02119d5SChris Mason 
2335e02119d5SChris Mason 			nritems = btrfs_header_nritems(path->nodes[0]);
2336e02119d5SChris Mason 			if (path->slots[0] >= nritems) {
2337e02119d5SChris Mason 				ret = btrfs_next_leaf(root, path);
2338b98def7cSLiu Bo 				if (ret == 1)
2339e02119d5SChris Mason 					break;
2340b98def7cSLiu Bo 				else if (ret < 0)
2341b98def7cSLiu Bo 					goto out;
2342e02119d5SChris Mason 			}
2343e02119d5SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2344e02119d5SChris Mason 					      path->slots[0]);
2345e02119d5SChris Mason 			if (found_key.objectid != dirid ||
2346e02119d5SChris Mason 			    found_key.type != dir_key.type)
2347e02119d5SChris Mason 				goto next_type;
2348e02119d5SChris Mason 
2349e02119d5SChris Mason 			if (found_key.offset > range_end)
2350e02119d5SChris Mason 				break;
2351e02119d5SChris Mason 
2352e02119d5SChris Mason 			ret = check_item_in_log(trans, root, log, path,
235312fcfd22SChris Mason 						log_path, dir,
235412fcfd22SChris Mason 						&found_key);
23553650860bSJosef Bacik 			if (ret)
23563650860bSJosef Bacik 				goto out;
2357e02119d5SChris Mason 			if (found_key.offset == (u64)-1)
2358e02119d5SChris Mason 				break;
2359e02119d5SChris Mason 			dir_key.offset = found_key.offset + 1;
2360e02119d5SChris Mason 		}
2361b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2362e02119d5SChris Mason 		if (range_end == (u64)-1)
2363e02119d5SChris Mason 			break;
2364e02119d5SChris Mason 		range_start = range_end + 1;
2365e02119d5SChris Mason 	}
2366e02119d5SChris Mason 
2367e02119d5SChris Mason next_type:
2368e02119d5SChris Mason 	ret = 0;
2369e02119d5SChris Mason 	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2370e02119d5SChris Mason 		key_type = BTRFS_DIR_LOG_INDEX_KEY;
2371e02119d5SChris Mason 		dir_key.type = BTRFS_DIR_INDEX_KEY;
2372b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2373e02119d5SChris Mason 		goto again;
2374e02119d5SChris Mason 	}
2375e02119d5SChris Mason out:
2376b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2377e02119d5SChris Mason 	btrfs_free_path(log_path);
2378e02119d5SChris Mason 	iput(dir);
2379e02119d5SChris Mason 	return ret;
2380e02119d5SChris Mason }
2381e02119d5SChris Mason 
2382e02119d5SChris Mason /*
2383e02119d5SChris Mason  * the process_func used to replay items from the log tree.  This
2384e02119d5SChris Mason  * gets called in two different stages.  The first stage just looks
2385e02119d5SChris Mason  * for inodes and makes sure they are all copied into the subvolume.
2386e02119d5SChris Mason  *
2387e02119d5SChris Mason  * The second stage copies all the other item types from the log into
2388e02119d5SChris Mason  * the subvolume.  The two stage approach is slower, but gets rid of
2389e02119d5SChris Mason  * lots of complexity around inodes referencing other inodes that exist
2390e02119d5SChris Mason  * only in the log (references come from either directory items or inode
2391e02119d5SChris Mason  * back refs).
2392e02119d5SChris Mason  */
2393e02119d5SChris Mason static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2394581c1760SQu Wenruo 			     struct walk_control *wc, u64 gen, int level)
2395e02119d5SChris Mason {
2396e02119d5SChris Mason 	int nritems;
2397e02119d5SChris Mason 	struct btrfs_path *path;
2398e02119d5SChris Mason 	struct btrfs_root *root = wc->replay_dest;
2399e02119d5SChris Mason 	struct btrfs_key key;
2400e02119d5SChris Mason 	int i;
2401e02119d5SChris Mason 	int ret;
2402e02119d5SChris Mason 
2403581c1760SQu Wenruo 	ret = btrfs_read_buffer(eb, gen, level, NULL);
2404018642a1STsutomu Itoh 	if (ret)
2405018642a1STsutomu Itoh 		return ret;
2406e02119d5SChris Mason 
2407e02119d5SChris Mason 	level = btrfs_header_level(eb);
2408e02119d5SChris Mason 
2409e02119d5SChris Mason 	if (level != 0)
2410e02119d5SChris Mason 		return 0;
2411e02119d5SChris Mason 
2412e02119d5SChris Mason 	path = btrfs_alloc_path();
24131e5063d0SMark Fasheh 	if (!path)
24141e5063d0SMark Fasheh 		return -ENOMEM;
2415e02119d5SChris Mason 
2416e02119d5SChris Mason 	nritems = btrfs_header_nritems(eb);
2417e02119d5SChris Mason 	for (i = 0; i < nritems; i++) {
2418e02119d5SChris Mason 		btrfs_item_key_to_cpu(eb, &key, i);
2419e02119d5SChris Mason 
2420e02119d5SChris Mason 		/* inode keys are done during the first stage */
2421e02119d5SChris Mason 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2422e02119d5SChris Mason 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2423e02119d5SChris Mason 			struct btrfs_inode_item *inode_item;
2424e02119d5SChris Mason 			u32 mode;
2425e02119d5SChris Mason 
2426e02119d5SChris Mason 			inode_item = btrfs_item_ptr(eb, i,
2427e02119d5SChris Mason 					    struct btrfs_inode_item);
24284f764e51SFilipe Manana 			ret = replay_xattr_deletes(wc->trans, root, log,
24294f764e51SFilipe Manana 						   path, key.objectid);
24304f764e51SFilipe Manana 			if (ret)
24314f764e51SFilipe Manana 				break;
2432e02119d5SChris Mason 			mode = btrfs_inode_mode(eb, inode_item);
2433e02119d5SChris Mason 			if (S_ISDIR(mode)) {
2434e02119d5SChris Mason 				ret = replay_dir_deletes(wc->trans,
243512fcfd22SChris Mason 					 root, log, path, key.objectid, 0);
2436b50c6e25SJosef Bacik 				if (ret)
2437b50c6e25SJosef Bacik 					break;
2438e02119d5SChris Mason 			}
2439e02119d5SChris Mason 			ret = overwrite_item(wc->trans, root, path,
2440e02119d5SChris Mason 					     eb, i, &key);
2441b50c6e25SJosef Bacik 			if (ret)
2442b50c6e25SJosef Bacik 				break;
2443e02119d5SChris Mason 
2444471d557aSFilipe Manana 			/*
2445471d557aSFilipe Manana 			 * Before replaying extents, truncate the inode to its
2446471d557aSFilipe Manana 			 * size. We need to do it now and not after log replay
2447471d557aSFilipe Manana 			 * because before an fsync we can have prealloc extents
2448471d557aSFilipe Manana 			 * added beyond the inode's i_size. If we did it after,
2449471d557aSFilipe Manana 			 * through orphan cleanup for example, we would drop
2450471d557aSFilipe Manana 			 * those prealloc extents just after replaying them.
2451e02119d5SChris Mason 			 */
2452e02119d5SChris Mason 			if (S_ISREG(mode)) {
2453471d557aSFilipe Manana 				struct inode *inode;
2454471d557aSFilipe Manana 				u64 from;
2455471d557aSFilipe Manana 
2456471d557aSFilipe Manana 				inode = read_one_inode(root, key.objectid);
2457471d557aSFilipe Manana 				if (!inode) {
2458471d557aSFilipe Manana 					ret = -EIO;
2459471d557aSFilipe Manana 					break;
2460471d557aSFilipe Manana 				}
2461471d557aSFilipe Manana 				from = ALIGN(i_size_read(inode),
2462471d557aSFilipe Manana 					     root->fs_info->sectorsize);
2463471d557aSFilipe Manana 				ret = btrfs_drop_extents(wc->trans, root, inode,
2464471d557aSFilipe Manana 							 from, (u64)-1, 1);
2465471d557aSFilipe Manana 				/*
2466471d557aSFilipe Manana 				 * If the nlink count is zero here, the iput
2467471d557aSFilipe Manana 				 * will free the inode.  We bump it to make
2468471d557aSFilipe Manana 				 * sure it doesn't get freed until the link
2469471d557aSFilipe Manana 				 * count fixup is done.
2470471d557aSFilipe Manana 				 */
2471471d557aSFilipe Manana 				if (!ret) {
2472471d557aSFilipe Manana 					if (inode->i_nlink == 0)
2473471d557aSFilipe Manana 						inc_nlink(inode);
2474471d557aSFilipe Manana 					/* Update link count and nbytes. */
2475471d557aSFilipe Manana 					ret = btrfs_update_inode(wc->trans,
2476471d557aSFilipe Manana 								 root, inode);
2477471d557aSFilipe Manana 				}
2478471d557aSFilipe Manana 				iput(inode);
2479b50c6e25SJosef Bacik 				if (ret)
2480b50c6e25SJosef Bacik 					break;
2481c71bf099SYan, Zheng 			}
2482a74ac322SChris Mason 
2483e02119d5SChris Mason 			ret = link_to_fixup_dir(wc->trans, root,
2484e02119d5SChris Mason 						path, key.objectid);
2485b50c6e25SJosef Bacik 			if (ret)
2486b50c6e25SJosef Bacik 				break;
2487e02119d5SChris Mason 		}
2488dd8e7217SJosef Bacik 
2489dd8e7217SJosef Bacik 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2490dd8e7217SJosef Bacik 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2491dd8e7217SJosef Bacik 			ret = replay_one_dir_item(wc->trans, root, path,
2492dd8e7217SJosef Bacik 						  eb, i, &key);
2493dd8e7217SJosef Bacik 			if (ret)
2494dd8e7217SJosef Bacik 				break;
2495dd8e7217SJosef Bacik 		}
2496dd8e7217SJosef Bacik 
2497e02119d5SChris Mason 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2498e02119d5SChris Mason 			continue;
2499e02119d5SChris Mason 
2500e02119d5SChris Mason 		/* these keys are simply copied */
2501e02119d5SChris Mason 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2502e02119d5SChris Mason 			ret = overwrite_item(wc->trans, root, path,
2503e02119d5SChris Mason 					     eb, i, &key);
2504b50c6e25SJosef Bacik 			if (ret)
2505b50c6e25SJosef Bacik 				break;
25062da1c669SLiu Bo 		} else if (key.type == BTRFS_INODE_REF_KEY ||
25072da1c669SLiu Bo 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2508f186373fSMark Fasheh 			ret = add_inode_ref(wc->trans, root, log, path,
2509f186373fSMark Fasheh 					    eb, i, &key);
2510b50c6e25SJosef Bacik 			if (ret && ret != -ENOENT)
2511b50c6e25SJosef Bacik 				break;
2512b50c6e25SJosef Bacik 			ret = 0;
2513e02119d5SChris Mason 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2514e02119d5SChris Mason 			ret = replay_one_extent(wc->trans, root, path,
2515e02119d5SChris Mason 						eb, i, &key);
2516b50c6e25SJosef Bacik 			if (ret)
2517b50c6e25SJosef Bacik 				break;
2518dd8e7217SJosef Bacik 		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2519e02119d5SChris Mason 			ret = replay_one_dir_item(wc->trans, root, path,
2520e02119d5SChris Mason 						  eb, i, &key);
2521b50c6e25SJosef Bacik 			if (ret)
2522b50c6e25SJosef Bacik 				break;
2523e02119d5SChris Mason 		}
2524e02119d5SChris Mason 	}
2525e02119d5SChris Mason 	btrfs_free_path(path);
2526b50c6e25SJosef Bacik 	return ret;
2527e02119d5SChris Mason }
2528e02119d5SChris Mason 
2529d397712bSChris Mason static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2530e02119d5SChris Mason 				   struct btrfs_root *root,
2531e02119d5SChris Mason 				   struct btrfs_path *path, int *level,
2532e02119d5SChris Mason 				   struct walk_control *wc)
2533e02119d5SChris Mason {
25340b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2535e02119d5SChris Mason 	u64 root_owner;
2536e02119d5SChris Mason 	u64 bytenr;
2537e02119d5SChris Mason 	u64 ptr_gen;
2538e02119d5SChris Mason 	struct extent_buffer *next;
2539e02119d5SChris Mason 	struct extent_buffer *cur;
2540e02119d5SChris Mason 	struct extent_buffer *parent;
2541e02119d5SChris Mason 	u32 blocksize;
2542e02119d5SChris Mason 	int ret = 0;
2543e02119d5SChris Mason 
2544e02119d5SChris Mason 	WARN_ON(*level < 0);
2545e02119d5SChris Mason 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2546e02119d5SChris Mason 
2547e02119d5SChris Mason 	while (*level > 0) {
2548581c1760SQu Wenruo 		struct btrfs_key first_key;
2549581c1760SQu Wenruo 
2550e02119d5SChris Mason 		WARN_ON(*level < 0);
2551e02119d5SChris Mason 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
2552e02119d5SChris Mason 		cur = path->nodes[*level];
2553e02119d5SChris Mason 
2554fae7f21cSDulshani Gunawardhana 		WARN_ON(btrfs_header_level(cur) != *level);
2555e02119d5SChris Mason 
2556e02119d5SChris Mason 		if (path->slots[*level] >=
2557e02119d5SChris Mason 		    btrfs_header_nritems(cur))
2558e02119d5SChris Mason 			break;
2559e02119d5SChris Mason 
2560e02119d5SChris Mason 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2561e02119d5SChris Mason 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2562581c1760SQu Wenruo 		btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
25630b246afaSJeff Mahoney 		blocksize = fs_info->nodesize;
2564e02119d5SChris Mason 
2565e02119d5SChris Mason 		parent = path->nodes[*level];
2566e02119d5SChris Mason 		root_owner = btrfs_header_owner(parent);
2567e02119d5SChris Mason 
25682ff7e61eSJeff Mahoney 		next = btrfs_find_create_tree_block(fs_info, bytenr);
2569c871b0f2SLiu Bo 		if (IS_ERR(next))
2570c871b0f2SLiu Bo 			return PTR_ERR(next);
2571e02119d5SChris Mason 
25724a500fd1SYan, Zheng 		if (*level == 1) {
2573581c1760SQu Wenruo 			ret = wc->process_func(root, next, wc, ptr_gen,
2574581c1760SQu Wenruo 					       *level - 1);
2575b50c6e25SJosef Bacik 			if (ret) {
2576b50c6e25SJosef Bacik 				free_extent_buffer(next);
25771e5063d0SMark Fasheh 				return ret;
2578b50c6e25SJosef Bacik 			}
2579e02119d5SChris Mason 
2580e02119d5SChris Mason 			path->slots[*level]++;
2581e02119d5SChris Mason 			if (wc->free) {
2582581c1760SQu Wenruo 				ret = btrfs_read_buffer(next, ptr_gen,
2583581c1760SQu Wenruo 							*level - 1, &first_key);
2584018642a1STsutomu Itoh 				if (ret) {
2585018642a1STsutomu Itoh 					free_extent_buffer(next);
2586018642a1STsutomu Itoh 					return ret;
2587018642a1STsutomu Itoh 				}
2588e02119d5SChris Mason 
2589681ae509SJosef Bacik 				if (trans) {
2590e02119d5SChris Mason 					btrfs_tree_lock(next);
2591b4ce94deSChris Mason 					btrfs_set_lock_blocking(next);
25927c302b49SDavid Sterba 					clean_tree_block(fs_info, next);
2593e02119d5SChris Mason 					btrfs_wait_tree_block_writeback(next);
2594e02119d5SChris Mason 					btrfs_tree_unlock(next);
25951846430cSLiu Bo 				} else {
25961846430cSLiu Bo 					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
25971846430cSLiu Bo 						clear_extent_buffer_dirty(next);
2598681ae509SJosef Bacik 				}
2599e02119d5SChris Mason 
2600e02119d5SChris Mason 				WARN_ON(root_owner !=
2601e02119d5SChris Mason 					BTRFS_TREE_LOG_OBJECTID);
26022ff7e61eSJeff Mahoney 				ret = btrfs_free_and_pin_reserved_extent(
26032ff7e61eSJeff Mahoney 							fs_info, bytenr,
26042ff7e61eSJeff Mahoney 							blocksize);
26053650860bSJosef Bacik 				if (ret) {
26063650860bSJosef Bacik 					free_extent_buffer(next);
26073650860bSJosef Bacik 					return ret;
26083650860bSJosef Bacik 				}
2609e02119d5SChris Mason 			}
2610e02119d5SChris Mason 			free_extent_buffer(next);
2611e02119d5SChris Mason 			continue;
2612e02119d5SChris Mason 		}
2613581c1760SQu Wenruo 		ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2614018642a1STsutomu Itoh 		if (ret) {
2615018642a1STsutomu Itoh 			free_extent_buffer(next);
2616018642a1STsutomu Itoh 			return ret;
2617018642a1STsutomu Itoh 		}
2618e02119d5SChris Mason 
2619e02119d5SChris Mason 		WARN_ON(*level <= 0);
2620e02119d5SChris Mason 		if (path->nodes[*level-1])
2621e02119d5SChris Mason 			free_extent_buffer(path->nodes[*level-1]);
2622e02119d5SChris Mason 		path->nodes[*level-1] = next;
2623e02119d5SChris Mason 		*level = btrfs_header_level(next);
2624e02119d5SChris Mason 		path->slots[*level] = 0;
2625e02119d5SChris Mason 		cond_resched();
2626e02119d5SChris Mason 	}
2627e02119d5SChris Mason 	WARN_ON(*level < 0);
2628e02119d5SChris Mason 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2629e02119d5SChris Mason 
26304a500fd1SYan, Zheng 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2631e02119d5SChris Mason 
2632e02119d5SChris Mason 	cond_resched();
2633e02119d5SChris Mason 	return 0;
2634e02119d5SChris Mason }
2635e02119d5SChris Mason 
2636d397712bSChris Mason static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2637e02119d5SChris Mason 				 struct btrfs_root *root,
2638e02119d5SChris Mason 				 struct btrfs_path *path, int *level,
2639e02119d5SChris Mason 				 struct walk_control *wc)
2640e02119d5SChris Mason {
26410b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2642e02119d5SChris Mason 	u64 root_owner;
2643e02119d5SChris Mason 	int i;
2644e02119d5SChris Mason 	int slot;
2645e02119d5SChris Mason 	int ret;
2646e02119d5SChris Mason 
2647e02119d5SChris Mason 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2648e02119d5SChris Mason 		slot = path->slots[i];
26494a500fd1SYan, Zheng 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2650e02119d5SChris Mason 			path->slots[i]++;
2651e02119d5SChris Mason 			*level = i;
2652e02119d5SChris Mason 			WARN_ON(*level == 0);
2653e02119d5SChris Mason 			return 0;
2654e02119d5SChris Mason 		} else {
265531840ae1SZheng Yan 			struct extent_buffer *parent;
265631840ae1SZheng Yan 			if (path->nodes[*level] == root->node)
265731840ae1SZheng Yan 				parent = path->nodes[*level];
265831840ae1SZheng Yan 			else
265931840ae1SZheng Yan 				parent = path->nodes[*level + 1];
266031840ae1SZheng Yan 
266131840ae1SZheng Yan 			root_owner = btrfs_header_owner(parent);
26621e5063d0SMark Fasheh 			ret = wc->process_func(root, path->nodes[*level], wc,
2663581c1760SQu Wenruo 				 btrfs_header_generation(path->nodes[*level]),
2664581c1760SQu Wenruo 				 *level);
26651e5063d0SMark Fasheh 			if (ret)
26661e5063d0SMark Fasheh 				return ret;
26671e5063d0SMark Fasheh 
2668e02119d5SChris Mason 			if (wc->free) {
2669e02119d5SChris Mason 				struct extent_buffer *next;
2670e02119d5SChris Mason 
2671e02119d5SChris Mason 				next = path->nodes[*level];
2672e02119d5SChris Mason 
2673681ae509SJosef Bacik 				if (trans) {
2674e02119d5SChris Mason 					btrfs_tree_lock(next);
2675b4ce94deSChris Mason 					btrfs_set_lock_blocking(next);
26767c302b49SDavid Sterba 					clean_tree_block(fs_info, next);
2677e02119d5SChris Mason 					btrfs_wait_tree_block_writeback(next);
2678e02119d5SChris Mason 					btrfs_tree_unlock(next);
26791846430cSLiu Bo 				} else {
26801846430cSLiu Bo 					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
26811846430cSLiu Bo 						clear_extent_buffer_dirty(next);
2682681ae509SJosef Bacik 				}
2683e02119d5SChris Mason 
2684e02119d5SChris Mason 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
26852ff7e61eSJeff Mahoney 				ret = btrfs_free_and_pin_reserved_extent(
26862ff7e61eSJeff Mahoney 						fs_info,
2687e02119d5SChris Mason 						path->nodes[*level]->start,
2688d00aff00SChris Mason 						path->nodes[*level]->len);
26893650860bSJosef Bacik 				if (ret)
26903650860bSJosef Bacik 					return ret;
2691e02119d5SChris Mason 			}
2692e02119d5SChris Mason 			free_extent_buffer(path->nodes[*level]);
2693e02119d5SChris Mason 			path->nodes[*level] = NULL;
2694e02119d5SChris Mason 			*level = i + 1;
2695e02119d5SChris Mason 		}
2696e02119d5SChris Mason 	}
2697e02119d5SChris Mason 	return 1;
2698e02119d5SChris Mason }
2699e02119d5SChris Mason 
2700e02119d5SChris Mason /*
2701e02119d5SChris Mason  * drop the reference count on the tree rooted at 'snap'.  This traverses
2702e02119d5SChris Mason  * the tree freeing any blocks that have a ref count of zero after being
2703e02119d5SChris Mason  * decremented.
2704e02119d5SChris Mason  */
2705e02119d5SChris Mason static int walk_log_tree(struct btrfs_trans_handle *trans,
2706e02119d5SChris Mason 			 struct btrfs_root *log, struct walk_control *wc)
2707e02119d5SChris Mason {
27082ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
2709e02119d5SChris Mason 	int ret = 0;
2710e02119d5SChris Mason 	int wret;
2711e02119d5SChris Mason 	int level;
2712e02119d5SChris Mason 	struct btrfs_path *path;
2713e02119d5SChris Mason 	int orig_level;
2714e02119d5SChris Mason 
2715e02119d5SChris Mason 	path = btrfs_alloc_path();
2716db5b493aSTsutomu Itoh 	if (!path)
2717db5b493aSTsutomu Itoh 		return -ENOMEM;
2718e02119d5SChris Mason 
2719e02119d5SChris Mason 	level = btrfs_header_level(log->node);
2720e02119d5SChris Mason 	orig_level = level;
2721e02119d5SChris Mason 	path->nodes[level] = log->node;
2722e02119d5SChris Mason 	extent_buffer_get(log->node);
2723e02119d5SChris Mason 	path->slots[level] = 0;
2724e02119d5SChris Mason 
2725e02119d5SChris Mason 	while (1) {
2726e02119d5SChris Mason 		wret = walk_down_log_tree(trans, log, path, &level, wc);
2727e02119d5SChris Mason 		if (wret > 0)
2728e02119d5SChris Mason 			break;
272979787eaaSJeff Mahoney 		if (wret < 0) {
2730e02119d5SChris Mason 			ret = wret;
273179787eaaSJeff Mahoney 			goto out;
273279787eaaSJeff Mahoney 		}
2733e02119d5SChris Mason 
2734e02119d5SChris Mason 		wret = walk_up_log_tree(trans, log, path, &level, wc);
2735e02119d5SChris Mason 		if (wret > 0)
2736e02119d5SChris Mason 			break;
273779787eaaSJeff Mahoney 		if (wret < 0) {
2738e02119d5SChris Mason 			ret = wret;
273979787eaaSJeff Mahoney 			goto out;
274079787eaaSJeff Mahoney 		}
2741e02119d5SChris Mason 	}
2742e02119d5SChris Mason 
2743e02119d5SChris Mason 	/* was the root node processed? if not, catch it here */
2744e02119d5SChris Mason 	if (path->nodes[orig_level]) {
274579787eaaSJeff Mahoney 		ret = wc->process_func(log, path->nodes[orig_level], wc,
2746581c1760SQu Wenruo 			 btrfs_header_generation(path->nodes[orig_level]),
2747581c1760SQu Wenruo 			 orig_level);
274879787eaaSJeff Mahoney 		if (ret)
274979787eaaSJeff Mahoney 			goto out;
2750e02119d5SChris Mason 		if (wc->free) {
2751e02119d5SChris Mason 			struct extent_buffer *next;
2752e02119d5SChris Mason 
2753e02119d5SChris Mason 			next = path->nodes[orig_level];
2754e02119d5SChris Mason 
2755681ae509SJosef Bacik 			if (trans) {
2756e02119d5SChris Mason 				btrfs_tree_lock(next);
2757b4ce94deSChris Mason 				btrfs_set_lock_blocking(next);
27587c302b49SDavid Sterba 				clean_tree_block(fs_info, next);
2759e02119d5SChris Mason 				btrfs_wait_tree_block_writeback(next);
2760e02119d5SChris Mason 				btrfs_tree_unlock(next);
27611846430cSLiu Bo 			} else {
27621846430cSLiu Bo 				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
27631846430cSLiu Bo 					clear_extent_buffer_dirty(next);
2764681ae509SJosef Bacik 			}
2765e02119d5SChris Mason 
2766e02119d5SChris Mason 			WARN_ON(log->root_key.objectid !=
2767e02119d5SChris Mason 				BTRFS_TREE_LOG_OBJECTID);
27682ff7e61eSJeff Mahoney 			ret = btrfs_free_and_pin_reserved_extent(fs_info,
27692ff7e61eSJeff Mahoney 							next->start, next->len);
27703650860bSJosef Bacik 			if (ret)
27713650860bSJosef Bacik 				goto out;
2772e02119d5SChris Mason 		}
2773e02119d5SChris Mason 	}
2774e02119d5SChris Mason 
277579787eaaSJeff Mahoney out:
2776e02119d5SChris Mason 	btrfs_free_path(path);
2777e02119d5SChris Mason 	return ret;
2778e02119d5SChris Mason }
2779e02119d5SChris Mason 
27807237f183SYan Zheng /*
27817237f183SYan Zheng  * helper function to update the item for a given subvolumes log root
27827237f183SYan Zheng  * in the tree of log roots
27837237f183SYan Zheng  */
27847237f183SYan Zheng static int update_log_root(struct btrfs_trans_handle *trans,
27857237f183SYan Zheng 			   struct btrfs_root *log)
27867237f183SYan Zheng {
27870b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
27887237f183SYan Zheng 	int ret;
27897237f183SYan Zheng 
27907237f183SYan Zheng 	if (log->log_transid == 1) {
27917237f183SYan Zheng 		/* insert root item on the first sync */
27920b246afaSJeff Mahoney 		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
27937237f183SYan Zheng 				&log->root_key, &log->root_item);
27947237f183SYan Zheng 	} else {
27950b246afaSJeff Mahoney 		ret = btrfs_update_root(trans, fs_info->log_root_tree,
27967237f183SYan Zheng 				&log->root_key, &log->root_item);
27977237f183SYan Zheng 	}
27987237f183SYan Zheng 	return ret;
27997237f183SYan Zheng }
28007237f183SYan Zheng 
280160d53eb3SZhaolei static void wait_log_commit(struct btrfs_root *root, int transid)
2802e02119d5SChris Mason {
2803e02119d5SChris Mason 	DEFINE_WAIT(wait);
28047237f183SYan Zheng 	int index = transid % 2;
2805e02119d5SChris Mason 
28067237f183SYan Zheng 	/*
28077237f183SYan Zheng 	 * we only allow two pending log transactions at a time,
28087237f183SYan Zheng 	 * so we know that if ours is more than 2 older than the
28097237f183SYan Zheng 	 * current transaction, we're done
28107237f183SYan Zheng 	 */
281149e83f57SLiu Bo 	for (;;) {
28127237f183SYan Zheng 		prepare_to_wait(&root->log_commit_wait[index],
28137237f183SYan Zheng 				&wait, TASK_UNINTERRUPTIBLE);
281449e83f57SLiu Bo 
281549e83f57SLiu Bo 		if (!(root->log_transid_committed < transid &&
281649e83f57SLiu Bo 		      atomic_read(&root->log_commit[index])))
281749e83f57SLiu Bo 			break;
281849e83f57SLiu Bo 
28197237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
2820e02119d5SChris Mason 		schedule();
28217237f183SYan Zheng 		mutex_lock(&root->log_mutex);
282249e83f57SLiu Bo 	}
282349e83f57SLiu Bo 	finish_wait(&root->log_commit_wait[index], &wait);
28247237f183SYan Zheng }
28257237f183SYan Zheng 
282660d53eb3SZhaolei static void wait_for_writer(struct btrfs_root *root)
28277237f183SYan Zheng {
28287237f183SYan Zheng 	DEFINE_WAIT(wait);
28298b050d35SMiao Xie 
283049e83f57SLiu Bo 	for (;;) {
283149e83f57SLiu Bo 		prepare_to_wait(&root->log_writer_wait, &wait,
283249e83f57SLiu Bo 				TASK_UNINTERRUPTIBLE);
283349e83f57SLiu Bo 		if (!atomic_read(&root->log_writers))
283449e83f57SLiu Bo 			break;
283549e83f57SLiu Bo 
28367237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
28377237f183SYan Zheng 		schedule();
2838575849ecSFilipe Manana 		mutex_lock(&root->log_mutex);
28397237f183SYan Zheng 	}
284049e83f57SLiu Bo 	finish_wait(&root->log_writer_wait, &wait);
2841e02119d5SChris Mason }
2842e02119d5SChris Mason 
28438b050d35SMiao Xie static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
28448b050d35SMiao Xie 					struct btrfs_log_ctx *ctx)
28458b050d35SMiao Xie {
28468b050d35SMiao Xie 	if (!ctx)
28478b050d35SMiao Xie 		return;
28488b050d35SMiao Xie 
28498b050d35SMiao Xie 	mutex_lock(&root->log_mutex);
28508b050d35SMiao Xie 	list_del_init(&ctx->list);
28518b050d35SMiao Xie 	mutex_unlock(&root->log_mutex);
28528b050d35SMiao Xie }
28538b050d35SMiao Xie 
28548b050d35SMiao Xie /*
28558b050d35SMiao Xie  * Invoked in log mutex context, or be sure there is no other task which
28568b050d35SMiao Xie  * can access the list.
28578b050d35SMiao Xie  */
28588b050d35SMiao Xie static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
28598b050d35SMiao Xie 					     int index, int error)
28608b050d35SMiao Xie {
28618b050d35SMiao Xie 	struct btrfs_log_ctx *ctx;
2862570dd450SChris Mason 	struct btrfs_log_ctx *safe;
28638b050d35SMiao Xie 
2864570dd450SChris Mason 	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2865570dd450SChris Mason 		list_del_init(&ctx->list);
28668b050d35SMiao Xie 		ctx->log_ret = error;
2867570dd450SChris Mason 	}
28688b050d35SMiao Xie 
28698b050d35SMiao Xie 	INIT_LIST_HEAD(&root->log_ctxs[index]);
28708b050d35SMiao Xie }
28718b050d35SMiao Xie 
2872e02119d5SChris Mason /*
2873e02119d5SChris Mason  * btrfs_sync_log does sends a given tree log down to the disk and
2874e02119d5SChris Mason  * updates the super blocks to record it.  When this call is done,
287512fcfd22SChris Mason  * you know that any inodes previously logged are safely on disk only
287612fcfd22SChris Mason  * if it returns 0.
287712fcfd22SChris Mason  *
287812fcfd22SChris Mason  * Any other return value means you need to call btrfs_commit_transaction.
287912fcfd22SChris Mason  * Some of the edge cases for fsyncing directories that have had unlinks
288012fcfd22SChris Mason  * or renames done in the past mean that sometimes the only safe
288112fcfd22SChris Mason  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
288212fcfd22SChris Mason  * that has happened.
2883e02119d5SChris Mason  */
2884e02119d5SChris Mason int btrfs_sync_log(struct btrfs_trans_handle *trans,
28858b050d35SMiao Xie 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2886e02119d5SChris Mason {
28877237f183SYan Zheng 	int index1;
28887237f183SYan Zheng 	int index2;
28898cef4e16SYan, Zheng 	int mark;
2890e02119d5SChris Mason 	int ret;
28910b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2892e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
28930b246afaSJeff Mahoney 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2894bb14a59bSMiao Xie 	int log_transid = 0;
28958b050d35SMiao Xie 	struct btrfs_log_ctx root_log_ctx;
2896c6adc9ccSMiao Xie 	struct blk_plug plug;
2897e02119d5SChris Mason 
28987237f183SYan Zheng 	mutex_lock(&root->log_mutex);
2899d1433debSMiao Xie 	log_transid = ctx->log_transid;
2900d1433debSMiao Xie 	if (root->log_transid_committed >= log_transid) {
29017237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
29028b050d35SMiao Xie 		return ctx->log_ret;
2903e02119d5SChris Mason 	}
2904d1433debSMiao Xie 
2905d1433debSMiao Xie 	index1 = log_transid % 2;
2906d1433debSMiao Xie 	if (atomic_read(&root->log_commit[index1])) {
290760d53eb3SZhaolei 		wait_log_commit(root, log_transid);
2908d1433debSMiao Xie 		mutex_unlock(&root->log_mutex);
2909d1433debSMiao Xie 		return ctx->log_ret;
2910d1433debSMiao Xie 	}
2911d1433debSMiao Xie 	ASSERT(log_transid == root->log_transid);
29127237f183SYan Zheng 	atomic_set(&root->log_commit[index1], 1);
29137237f183SYan Zheng 
29147237f183SYan Zheng 	/* wait for previous tree log sync to complete */
29157237f183SYan Zheng 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
291660d53eb3SZhaolei 		wait_log_commit(root, log_transid - 1);
291748cab2e0SMiao Xie 
291886df7eb9SYan, Zheng 	while (1) {
29192ecb7923SMiao Xie 		int batch = atomic_read(&root->log_batch);
2920cd354ad6SChris Mason 		/* when we're on an ssd, just kick the log commit out */
29210b246afaSJeff Mahoney 		if (!btrfs_test_opt(fs_info, SSD) &&
292227cdeb70SMiao Xie 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
29237237f183SYan Zheng 			mutex_unlock(&root->log_mutex);
2924e02119d5SChris Mason 			schedule_timeout_uninterruptible(1);
29257237f183SYan Zheng 			mutex_lock(&root->log_mutex);
292686df7eb9SYan, Zheng 		}
292760d53eb3SZhaolei 		wait_for_writer(root);
29282ecb7923SMiao Xie 		if (batch == atomic_read(&root->log_batch))
2929e02119d5SChris Mason 			break;
2930e02119d5SChris Mason 	}
2931d0c803c4SChris Mason 
293212fcfd22SChris Mason 	/* bail out if we need to do a full commit */
29330b246afaSJeff Mahoney 	if (btrfs_need_log_full_commit(fs_info, trans)) {
293412fcfd22SChris Mason 		ret = -EAGAIN;
293512fcfd22SChris Mason 		mutex_unlock(&root->log_mutex);
293612fcfd22SChris Mason 		goto out;
293712fcfd22SChris Mason 	}
293812fcfd22SChris Mason 
29398cef4e16SYan, Zheng 	if (log_transid % 2 == 0)
29408cef4e16SYan, Zheng 		mark = EXTENT_DIRTY;
29418cef4e16SYan, Zheng 	else
29428cef4e16SYan, Zheng 		mark = EXTENT_NEW;
29438cef4e16SYan, Zheng 
2944690587d1SChris Mason 	/* we start IO on  all the marked extents here, but we don't actually
2945690587d1SChris Mason 	 * wait for them until later.
2946690587d1SChris Mason 	 */
2947c6adc9ccSMiao Xie 	blk_start_plug(&plug);
29482ff7e61eSJeff Mahoney 	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
294979787eaaSJeff Mahoney 	if (ret) {
2950c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
295166642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
29520b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
295379787eaaSJeff Mahoney 		mutex_unlock(&root->log_mutex);
295479787eaaSJeff Mahoney 		goto out;
295579787eaaSJeff Mahoney 	}
29567237f183SYan Zheng 
29575d4f98a2SYan Zheng 	btrfs_set_root_node(&log->root_item, log->node);
29587237f183SYan Zheng 
29597237f183SYan Zheng 	root->log_transid++;
29607237f183SYan Zheng 	log->log_transid = root->log_transid;
2961ff782e0aSJosef Bacik 	root->log_start_pid = 0;
29627237f183SYan Zheng 	/*
29638cef4e16SYan, Zheng 	 * IO has been started, blocks of the log tree have WRITTEN flag set
29648cef4e16SYan, Zheng 	 * in their headers. new modifications of the log will be written to
29658cef4e16SYan, Zheng 	 * new positions. so it's safe to allow log writers to go in.
29667237f183SYan Zheng 	 */
29677237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
29687237f183SYan Zheng 
296928a23593SFilipe Manana 	btrfs_init_log_ctx(&root_log_ctx, NULL);
2970d1433debSMiao Xie 
29717237f183SYan Zheng 	mutex_lock(&log_root_tree->log_mutex);
29722ecb7923SMiao Xie 	atomic_inc(&log_root_tree->log_batch);
29737237f183SYan Zheng 	atomic_inc(&log_root_tree->log_writers);
2974d1433debSMiao Xie 
2975d1433debSMiao Xie 	index2 = log_root_tree->log_transid % 2;
2976d1433debSMiao Xie 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2977d1433debSMiao Xie 	root_log_ctx.log_transid = log_root_tree->log_transid;
2978d1433debSMiao Xie 
29797237f183SYan Zheng 	mutex_unlock(&log_root_tree->log_mutex);
29807237f183SYan Zheng 
29817237f183SYan Zheng 	ret = update_log_root(trans, log);
29827237f183SYan Zheng 
29837237f183SYan Zheng 	mutex_lock(&log_root_tree->log_mutex);
29847237f183SYan Zheng 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2985093258e6SDavid Sterba 		/* atomic_dec_and_test implies a barrier */
2986093258e6SDavid Sterba 		cond_wake_up_nomb(&log_root_tree->log_writer_wait);
29877237f183SYan Zheng 	}
29887237f183SYan Zheng 
29894a500fd1SYan, Zheng 	if (ret) {
2990d1433debSMiao Xie 		if (!list_empty(&root_log_ctx.list))
2991d1433debSMiao Xie 			list_del_init(&root_log_ctx.list);
2992d1433debSMiao Xie 
2993c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
29940b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
2995995946ddSMiao Xie 
299679787eaaSJeff Mahoney 		if (ret != -ENOSPC) {
299766642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
299879787eaaSJeff Mahoney 			mutex_unlock(&log_root_tree->log_mutex);
299979787eaaSJeff Mahoney 			goto out;
300079787eaaSJeff Mahoney 		}
3001bf89d38fSJeff Mahoney 		btrfs_wait_tree_log_extents(log, mark);
30024a500fd1SYan, Zheng 		mutex_unlock(&log_root_tree->log_mutex);
30034a500fd1SYan, Zheng 		ret = -EAGAIN;
30044a500fd1SYan, Zheng 		goto out;
30054a500fd1SYan, Zheng 	}
30064a500fd1SYan, Zheng 
3007d1433debSMiao Xie 	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
30083da5ab56SForrest Liu 		blk_finish_plug(&plug);
3009cbd60aa7SChris Mason 		list_del_init(&root_log_ctx.list);
3010d1433debSMiao Xie 		mutex_unlock(&log_root_tree->log_mutex);
3011d1433debSMiao Xie 		ret = root_log_ctx.log_ret;
3012d1433debSMiao Xie 		goto out;
3013d1433debSMiao Xie 	}
30148b050d35SMiao Xie 
3015d1433debSMiao Xie 	index2 = root_log_ctx.log_transid % 2;
30167237f183SYan Zheng 	if (atomic_read(&log_root_tree->log_commit[index2])) {
3017c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
3018bf89d38fSJeff Mahoney 		ret = btrfs_wait_tree_log_extents(log, mark);
301960d53eb3SZhaolei 		wait_log_commit(log_root_tree,
3020d1433debSMiao Xie 				root_log_ctx.log_transid);
30217237f183SYan Zheng 		mutex_unlock(&log_root_tree->log_mutex);
30225ab5e44aSFilipe Manana 		if (!ret)
30238b050d35SMiao Xie 			ret = root_log_ctx.log_ret;
30247237f183SYan Zheng 		goto out;
30257237f183SYan Zheng 	}
3026d1433debSMiao Xie 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
30277237f183SYan Zheng 	atomic_set(&log_root_tree->log_commit[index2], 1);
30287237f183SYan Zheng 
302912fcfd22SChris Mason 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
303060d53eb3SZhaolei 		wait_log_commit(log_root_tree,
3031d1433debSMiao Xie 				root_log_ctx.log_transid - 1);
303212fcfd22SChris Mason 	}
30337237f183SYan Zheng 
303460d53eb3SZhaolei 	wait_for_writer(log_root_tree);
303512fcfd22SChris Mason 
303612fcfd22SChris Mason 	/*
303712fcfd22SChris Mason 	 * now that we've moved on to the tree of log tree roots,
303812fcfd22SChris Mason 	 * check the full commit flag again
303912fcfd22SChris Mason 	 */
30400b246afaSJeff Mahoney 	if (btrfs_need_log_full_commit(fs_info, trans)) {
3041c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
3042bf89d38fSJeff Mahoney 		btrfs_wait_tree_log_extents(log, mark);
304312fcfd22SChris Mason 		mutex_unlock(&log_root_tree->log_mutex);
304412fcfd22SChris Mason 		ret = -EAGAIN;
304512fcfd22SChris Mason 		goto out_wake_log_root;
304612fcfd22SChris Mason 	}
30477237f183SYan Zheng 
30482ff7e61eSJeff Mahoney 	ret = btrfs_write_marked_extents(fs_info,
30498cef4e16SYan, Zheng 					 &log_root_tree->dirty_log_pages,
30508cef4e16SYan, Zheng 					 EXTENT_DIRTY | EXTENT_NEW);
3051c6adc9ccSMiao Xie 	blk_finish_plug(&plug);
305279787eaaSJeff Mahoney 	if (ret) {
30530b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
305466642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
305579787eaaSJeff Mahoney 		mutex_unlock(&log_root_tree->log_mutex);
305679787eaaSJeff Mahoney 		goto out_wake_log_root;
305779787eaaSJeff Mahoney 	}
3058bf89d38fSJeff Mahoney 	ret = btrfs_wait_tree_log_extents(log, mark);
30595ab5e44aSFilipe Manana 	if (!ret)
3060bf89d38fSJeff Mahoney 		ret = btrfs_wait_tree_log_extents(log_root_tree,
3061c6adc9ccSMiao Xie 						  EXTENT_NEW | EXTENT_DIRTY);
30625ab5e44aSFilipe Manana 	if (ret) {
30630b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
30645ab5e44aSFilipe Manana 		mutex_unlock(&log_root_tree->log_mutex);
30655ab5e44aSFilipe Manana 		goto out_wake_log_root;
30665ab5e44aSFilipe Manana 	}
3067e02119d5SChris Mason 
30680b246afaSJeff Mahoney 	btrfs_set_super_log_root(fs_info->super_for_commit,
30697237f183SYan Zheng 				 log_root_tree->node->start);
30700b246afaSJeff Mahoney 	btrfs_set_super_log_root_level(fs_info->super_for_commit,
30717237f183SYan Zheng 				       btrfs_header_level(log_root_tree->node));
3072e02119d5SChris Mason 
30737237f183SYan Zheng 	log_root_tree->log_transid++;
30747237f183SYan Zheng 	mutex_unlock(&log_root_tree->log_mutex);
30757237f183SYan Zheng 
30767237f183SYan Zheng 	/*
30777237f183SYan Zheng 	 * nobody else is going to jump in and write the the ctree
30787237f183SYan Zheng 	 * super here because the log_commit atomic below is protecting
30797237f183SYan Zheng 	 * us.  We must be called with a transaction handle pinning
30807237f183SYan Zheng 	 * the running transaction open, so a full commit can't hop
30817237f183SYan Zheng 	 * in and cause problems either.
30827237f183SYan Zheng 	 */
3083eece6a9cSDavid Sterba 	ret = write_all_supers(fs_info, 1);
30845af3e8ccSStefan Behrens 	if (ret) {
30850b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
308666642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
30875af3e8ccSStefan Behrens 		goto out_wake_log_root;
30885af3e8ccSStefan Behrens 	}
30897237f183SYan Zheng 
3090257c62e1SChris Mason 	mutex_lock(&root->log_mutex);
3091257c62e1SChris Mason 	if (root->last_log_commit < log_transid)
3092257c62e1SChris Mason 		root->last_log_commit = log_transid;
3093257c62e1SChris Mason 	mutex_unlock(&root->log_mutex);
3094257c62e1SChris Mason 
309512fcfd22SChris Mason out_wake_log_root:
3096570dd450SChris Mason 	mutex_lock(&log_root_tree->log_mutex);
30978b050d35SMiao Xie 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
30988b050d35SMiao Xie 
3099d1433debSMiao Xie 	log_root_tree->log_transid_committed++;
31007237f183SYan Zheng 	atomic_set(&log_root_tree->log_commit[index2], 0);
3101d1433debSMiao Xie 	mutex_unlock(&log_root_tree->log_mutex);
3102d1433debSMiao Xie 
310333a9eca7SDavid Sterba 	/*
3104093258e6SDavid Sterba 	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3105093258e6SDavid Sterba 	 * all the updates above are seen by the woken threads. It might not be
3106093258e6SDavid Sterba 	 * necessary, but proving that seems to be hard.
310733a9eca7SDavid Sterba 	 */
3108093258e6SDavid Sterba 	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3109e02119d5SChris Mason out:
3110d1433debSMiao Xie 	mutex_lock(&root->log_mutex);
3111570dd450SChris Mason 	btrfs_remove_all_log_ctxs(root, index1, ret);
3112d1433debSMiao Xie 	root->log_transid_committed++;
31137237f183SYan Zheng 	atomic_set(&root->log_commit[index1], 0);
3114d1433debSMiao Xie 	mutex_unlock(&root->log_mutex);
31158b050d35SMiao Xie 
311633a9eca7SDavid Sterba 	/*
3117093258e6SDavid Sterba 	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3118093258e6SDavid Sterba 	 * all the updates above are seen by the woken threads. It might not be
3119093258e6SDavid Sterba 	 * necessary, but proving that seems to be hard.
312033a9eca7SDavid Sterba 	 */
3121093258e6SDavid Sterba 	cond_wake_up(&root->log_commit_wait[index1]);
3122b31eabd8SChris Mason 	return ret;
3123e02119d5SChris Mason }
3124e02119d5SChris Mason 
31254a500fd1SYan, Zheng static void free_log_tree(struct btrfs_trans_handle *trans,
31264a500fd1SYan, Zheng 			  struct btrfs_root *log)
3127e02119d5SChris Mason {
3128e02119d5SChris Mason 	int ret;
3129d0c803c4SChris Mason 	u64 start;
3130d0c803c4SChris Mason 	u64 end;
3131e02119d5SChris Mason 	struct walk_control wc = {
3132e02119d5SChris Mason 		.free = 1,
3133e02119d5SChris Mason 		.process_func = process_one_buffer
3134e02119d5SChris Mason 	};
3135e02119d5SChris Mason 
3136e02119d5SChris Mason 	ret = walk_log_tree(trans, log, &wc);
31373650860bSJosef Bacik 	/* I don't think this can happen but just in case */
31383650860bSJosef Bacik 	if (ret)
313966642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
3140e02119d5SChris Mason 
3141d0c803c4SChris Mason 	while (1) {
3142d0c803c4SChris Mason 		ret = find_first_extent_bit(&log->dirty_log_pages,
314355237a5fSLiu Bo 				0, &start, &end,
314455237a5fSLiu Bo 				EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
3145e6138876SJosef Bacik 				NULL);
3146d0c803c4SChris Mason 		if (ret)
3147d0c803c4SChris Mason 			break;
3148d0c803c4SChris Mason 
31498cef4e16SYan, Zheng 		clear_extent_bits(&log->dirty_log_pages, start, end,
315055237a5fSLiu Bo 				  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3151d0c803c4SChris Mason 	}
3152d0c803c4SChris Mason 
31537237f183SYan Zheng 	free_extent_buffer(log->node);
31547237f183SYan Zheng 	kfree(log);
31554a500fd1SYan, Zheng }
31564a500fd1SYan, Zheng 
31574a500fd1SYan, Zheng /*
31584a500fd1SYan, Zheng  * free all the extents used by the tree log.  This should be called
31594a500fd1SYan, Zheng  * at commit time of the full transaction
31604a500fd1SYan, Zheng  */
31614a500fd1SYan, Zheng int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
31624a500fd1SYan, Zheng {
31634a500fd1SYan, Zheng 	if (root->log_root) {
31644a500fd1SYan, Zheng 		free_log_tree(trans, root->log_root);
31654a500fd1SYan, Zheng 		root->log_root = NULL;
31664a500fd1SYan, Zheng 	}
31674a500fd1SYan, Zheng 	return 0;
31684a500fd1SYan, Zheng }
31694a500fd1SYan, Zheng 
31704a500fd1SYan, Zheng int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
31714a500fd1SYan, Zheng 			     struct btrfs_fs_info *fs_info)
31724a500fd1SYan, Zheng {
31734a500fd1SYan, Zheng 	if (fs_info->log_root_tree) {
31744a500fd1SYan, Zheng 		free_log_tree(trans, fs_info->log_root_tree);
31754a500fd1SYan, Zheng 		fs_info->log_root_tree = NULL;
31764a500fd1SYan, Zheng 	}
3177e02119d5SChris Mason 	return 0;
3178e02119d5SChris Mason }
3179e02119d5SChris Mason 
3180e02119d5SChris Mason /*
3181e02119d5SChris Mason  * If both a file and directory are logged, and unlinks or renames are
3182e02119d5SChris Mason  * mixed in, we have a few interesting corners:
3183e02119d5SChris Mason  *
3184e02119d5SChris Mason  * create file X in dir Y
3185e02119d5SChris Mason  * link file X to X.link in dir Y
3186e02119d5SChris Mason  * fsync file X
3187e02119d5SChris Mason  * unlink file X but leave X.link
3188e02119d5SChris Mason  * fsync dir Y
3189e02119d5SChris Mason  *
3190e02119d5SChris Mason  * After a crash we would expect only X.link to exist.  But file X
3191e02119d5SChris Mason  * didn't get fsync'd again so the log has back refs for X and X.link.
3192e02119d5SChris Mason  *
3193e02119d5SChris Mason  * We solve this by removing directory entries and inode backrefs from the
3194e02119d5SChris Mason  * log when a file that was logged in the current transaction is
3195e02119d5SChris Mason  * unlinked.  Any later fsync will include the updated log entries, and
3196e02119d5SChris Mason  * we'll be able to reconstruct the proper directory items from backrefs.
3197e02119d5SChris Mason  *
3198e02119d5SChris Mason  * This optimizations allows us to avoid relogging the entire inode
3199e02119d5SChris Mason  * or the entire directory.
3200e02119d5SChris Mason  */
3201e02119d5SChris Mason int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3202e02119d5SChris Mason 				 struct btrfs_root *root,
3203e02119d5SChris Mason 				 const char *name, int name_len,
320449f34d1fSNikolay Borisov 				 struct btrfs_inode *dir, u64 index)
3205e02119d5SChris Mason {
3206e02119d5SChris Mason 	struct btrfs_root *log;
3207e02119d5SChris Mason 	struct btrfs_dir_item *di;
3208e02119d5SChris Mason 	struct btrfs_path *path;
3209e02119d5SChris Mason 	int ret;
32104a500fd1SYan, Zheng 	int err = 0;
3211e02119d5SChris Mason 	int bytes_del = 0;
321249f34d1fSNikolay Borisov 	u64 dir_ino = btrfs_ino(dir);
3213e02119d5SChris Mason 
321449f34d1fSNikolay Borisov 	if (dir->logged_trans < trans->transid)
32153a5f1d45SChris Mason 		return 0;
32163a5f1d45SChris Mason 
3217e02119d5SChris Mason 	ret = join_running_log_trans(root);
3218e02119d5SChris Mason 	if (ret)
3219e02119d5SChris Mason 		return 0;
3220e02119d5SChris Mason 
322149f34d1fSNikolay Borisov 	mutex_lock(&dir->log_mutex);
3222e02119d5SChris Mason 
3223e02119d5SChris Mason 	log = root->log_root;
3224e02119d5SChris Mason 	path = btrfs_alloc_path();
3225a62f44a5STsutomu Itoh 	if (!path) {
3226a62f44a5STsutomu Itoh 		err = -ENOMEM;
3227a62f44a5STsutomu Itoh 		goto out_unlock;
3228a62f44a5STsutomu Itoh 	}
32292a29edc6Sliubo 
323033345d01SLi Zefan 	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3231e02119d5SChris Mason 				   name, name_len, -1);
32324a500fd1SYan, Zheng 	if (IS_ERR(di)) {
32334a500fd1SYan, Zheng 		err = PTR_ERR(di);
32344a500fd1SYan, Zheng 		goto fail;
32354a500fd1SYan, Zheng 	}
32364a500fd1SYan, Zheng 	if (di) {
3237e02119d5SChris Mason 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3238e02119d5SChris Mason 		bytes_del += name_len;
32393650860bSJosef Bacik 		if (ret) {
32403650860bSJosef Bacik 			err = ret;
32413650860bSJosef Bacik 			goto fail;
32423650860bSJosef Bacik 		}
3243e02119d5SChris Mason 	}
3244b3b4aa74SDavid Sterba 	btrfs_release_path(path);
324533345d01SLi Zefan 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3246e02119d5SChris Mason 					 index, name, name_len, -1);
32474a500fd1SYan, Zheng 	if (IS_ERR(di)) {
32484a500fd1SYan, Zheng 		err = PTR_ERR(di);
32494a500fd1SYan, Zheng 		goto fail;
32504a500fd1SYan, Zheng 	}
32514a500fd1SYan, Zheng 	if (di) {
3252e02119d5SChris Mason 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3253e02119d5SChris Mason 		bytes_del += name_len;
32543650860bSJosef Bacik 		if (ret) {
32553650860bSJosef Bacik 			err = ret;
32563650860bSJosef Bacik 			goto fail;
32573650860bSJosef Bacik 		}
3258e02119d5SChris Mason 	}
3259e02119d5SChris Mason 
3260e02119d5SChris Mason 	/* update the directory size in the log to reflect the names
3261e02119d5SChris Mason 	 * we have removed
3262e02119d5SChris Mason 	 */
3263e02119d5SChris Mason 	if (bytes_del) {
3264e02119d5SChris Mason 		struct btrfs_key key;
3265e02119d5SChris Mason 
326633345d01SLi Zefan 		key.objectid = dir_ino;
3267e02119d5SChris Mason 		key.offset = 0;
3268e02119d5SChris Mason 		key.type = BTRFS_INODE_ITEM_KEY;
3269b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3270e02119d5SChris Mason 
3271e02119d5SChris Mason 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
32724a500fd1SYan, Zheng 		if (ret < 0) {
32734a500fd1SYan, Zheng 			err = ret;
32744a500fd1SYan, Zheng 			goto fail;
32754a500fd1SYan, Zheng 		}
3276e02119d5SChris Mason 		if (ret == 0) {
3277e02119d5SChris Mason 			struct btrfs_inode_item *item;
3278e02119d5SChris Mason 			u64 i_size;
3279e02119d5SChris Mason 
3280e02119d5SChris Mason 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3281e02119d5SChris Mason 					      struct btrfs_inode_item);
3282e02119d5SChris Mason 			i_size = btrfs_inode_size(path->nodes[0], item);
3283e02119d5SChris Mason 			if (i_size > bytes_del)
3284e02119d5SChris Mason 				i_size -= bytes_del;
3285e02119d5SChris Mason 			else
3286e02119d5SChris Mason 				i_size = 0;
3287e02119d5SChris Mason 			btrfs_set_inode_size(path->nodes[0], item, i_size);
3288e02119d5SChris Mason 			btrfs_mark_buffer_dirty(path->nodes[0]);
3289e02119d5SChris Mason 		} else
3290e02119d5SChris Mason 			ret = 0;
3291b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3292e02119d5SChris Mason 	}
32934a500fd1SYan, Zheng fail:
3294e02119d5SChris Mason 	btrfs_free_path(path);
3295a62f44a5STsutomu Itoh out_unlock:
329649f34d1fSNikolay Borisov 	mutex_unlock(&dir->log_mutex);
32974a500fd1SYan, Zheng 	if (ret == -ENOSPC) {
3298995946ddSMiao Xie 		btrfs_set_log_full_commit(root->fs_info, trans);
32994a500fd1SYan, Zheng 		ret = 0;
330079787eaaSJeff Mahoney 	} else if (ret < 0)
330166642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
330279787eaaSJeff Mahoney 
330312fcfd22SChris Mason 	btrfs_end_log_trans(root);
3304e02119d5SChris Mason 
3305411fc6bcSAndi Kleen 	return err;
3306e02119d5SChris Mason }
3307e02119d5SChris Mason 
3308e02119d5SChris Mason /* see comments for btrfs_del_dir_entries_in_log */
3309e02119d5SChris Mason int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3310e02119d5SChris Mason 			       struct btrfs_root *root,
3311e02119d5SChris Mason 			       const char *name, int name_len,
3312a491abb2SNikolay Borisov 			       struct btrfs_inode *inode, u64 dirid)
3313e02119d5SChris Mason {
33140b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
3315e02119d5SChris Mason 	struct btrfs_root *log;
3316e02119d5SChris Mason 	u64 index;
3317e02119d5SChris Mason 	int ret;
3318e02119d5SChris Mason 
3319a491abb2SNikolay Borisov 	if (inode->logged_trans < trans->transid)
33203a5f1d45SChris Mason 		return 0;
33213a5f1d45SChris Mason 
3322e02119d5SChris Mason 	ret = join_running_log_trans(root);
3323e02119d5SChris Mason 	if (ret)
3324e02119d5SChris Mason 		return 0;
3325e02119d5SChris Mason 	log = root->log_root;
3326a491abb2SNikolay Borisov 	mutex_lock(&inode->log_mutex);
3327e02119d5SChris Mason 
3328a491abb2SNikolay Borisov 	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3329e02119d5SChris Mason 				  dirid, &index);
3330a491abb2SNikolay Borisov 	mutex_unlock(&inode->log_mutex);
33314a500fd1SYan, Zheng 	if (ret == -ENOSPC) {
33320b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
33334a500fd1SYan, Zheng 		ret = 0;
333479787eaaSJeff Mahoney 	} else if (ret < 0 && ret != -ENOENT)
333566642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
333612fcfd22SChris Mason 	btrfs_end_log_trans(root);
3337e02119d5SChris Mason 
3338e02119d5SChris Mason 	return ret;
3339e02119d5SChris Mason }
3340e02119d5SChris Mason 
3341e02119d5SChris Mason /*
3342e02119d5SChris Mason  * creates a range item in the log for 'dirid'.  first_offset and
3343e02119d5SChris Mason  * last_offset tell us which parts of the key space the log should
3344e02119d5SChris Mason  * be considered authoritative for.
3345e02119d5SChris Mason  */
3346e02119d5SChris Mason static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3347e02119d5SChris Mason 				       struct btrfs_root *log,
3348e02119d5SChris Mason 				       struct btrfs_path *path,
3349e02119d5SChris Mason 				       int key_type, u64 dirid,
3350e02119d5SChris Mason 				       u64 first_offset, u64 last_offset)
3351e02119d5SChris Mason {
3352e02119d5SChris Mason 	int ret;
3353e02119d5SChris Mason 	struct btrfs_key key;
3354e02119d5SChris Mason 	struct btrfs_dir_log_item *item;
3355e02119d5SChris Mason 
3356e02119d5SChris Mason 	key.objectid = dirid;
3357e02119d5SChris Mason 	key.offset = first_offset;
3358e02119d5SChris Mason 	if (key_type == BTRFS_DIR_ITEM_KEY)
3359e02119d5SChris Mason 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
3360e02119d5SChris Mason 	else
3361e02119d5SChris Mason 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
3362e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
33634a500fd1SYan, Zheng 	if (ret)
33644a500fd1SYan, Zheng 		return ret;
3365e02119d5SChris Mason 
3366e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3367e02119d5SChris Mason 			      struct btrfs_dir_log_item);
3368e02119d5SChris Mason 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3369e02119d5SChris Mason 	btrfs_mark_buffer_dirty(path->nodes[0]);
3370b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3371e02119d5SChris Mason 	return 0;
3372e02119d5SChris Mason }
3373e02119d5SChris Mason 
3374e02119d5SChris Mason /*
3375e02119d5SChris Mason  * log all the items included in the current transaction for a given
3376e02119d5SChris Mason  * directory.  This also creates the range items in the log tree required
3377e02119d5SChris Mason  * to replay anything deleted before the fsync
3378e02119d5SChris Mason  */
3379e02119d5SChris Mason static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3380684a5773SNikolay Borisov 			  struct btrfs_root *root, struct btrfs_inode *inode,
3381e02119d5SChris Mason 			  struct btrfs_path *path,
3382e02119d5SChris Mason 			  struct btrfs_path *dst_path, int key_type,
33832f2ff0eeSFilipe Manana 			  struct btrfs_log_ctx *ctx,
3384e02119d5SChris Mason 			  u64 min_offset, u64 *last_offset_ret)
3385e02119d5SChris Mason {
3386e02119d5SChris Mason 	struct btrfs_key min_key;
3387e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
3388e02119d5SChris Mason 	struct extent_buffer *src;
33894a500fd1SYan, Zheng 	int err = 0;
3390e02119d5SChris Mason 	int ret;
3391e02119d5SChris Mason 	int i;
3392e02119d5SChris Mason 	int nritems;
3393e02119d5SChris Mason 	u64 first_offset = min_offset;
3394e02119d5SChris Mason 	u64 last_offset = (u64)-1;
3395684a5773SNikolay Borisov 	u64 ino = btrfs_ino(inode);
3396e02119d5SChris Mason 
3397e02119d5SChris Mason 	log = root->log_root;
3398e02119d5SChris Mason 
339933345d01SLi Zefan 	min_key.objectid = ino;
3400e02119d5SChris Mason 	min_key.type = key_type;
3401e02119d5SChris Mason 	min_key.offset = min_offset;
3402e02119d5SChris Mason 
34036174d3cbSFilipe David Borba Manana 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3404e02119d5SChris Mason 
3405e02119d5SChris Mason 	/*
3406e02119d5SChris Mason 	 * we didn't find anything from this transaction, see if there
3407e02119d5SChris Mason 	 * is anything at all
3408e02119d5SChris Mason 	 */
340933345d01SLi Zefan 	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
341033345d01SLi Zefan 		min_key.objectid = ino;
3411e02119d5SChris Mason 		min_key.type = key_type;
3412e02119d5SChris Mason 		min_key.offset = (u64)-1;
3413b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3414e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3415e02119d5SChris Mason 		if (ret < 0) {
3416b3b4aa74SDavid Sterba 			btrfs_release_path(path);
3417e02119d5SChris Mason 			return ret;
3418e02119d5SChris Mason 		}
341933345d01SLi Zefan 		ret = btrfs_previous_item(root, path, ino, key_type);
3420e02119d5SChris Mason 
3421e02119d5SChris Mason 		/* if ret == 0 there are items for this type,
3422e02119d5SChris Mason 		 * create a range to tell us the last key of this type.
3423e02119d5SChris Mason 		 * otherwise, there are no items in this directory after
3424e02119d5SChris Mason 		 * *min_offset, and we create a range to indicate that.
3425e02119d5SChris Mason 		 */
3426e02119d5SChris Mason 		if (ret == 0) {
3427e02119d5SChris Mason 			struct btrfs_key tmp;
3428e02119d5SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3429e02119d5SChris Mason 					      path->slots[0]);
3430d397712bSChris Mason 			if (key_type == tmp.type)
3431e02119d5SChris Mason 				first_offset = max(min_offset, tmp.offset) + 1;
3432e02119d5SChris Mason 		}
3433e02119d5SChris Mason 		goto done;
3434e02119d5SChris Mason 	}
3435e02119d5SChris Mason 
3436e02119d5SChris Mason 	/* go backward to find any previous key */
343733345d01SLi Zefan 	ret = btrfs_previous_item(root, path, ino, key_type);
3438e02119d5SChris Mason 	if (ret == 0) {
3439e02119d5SChris Mason 		struct btrfs_key tmp;
3440e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3441e02119d5SChris Mason 		if (key_type == tmp.type) {
3442e02119d5SChris Mason 			first_offset = tmp.offset;
3443e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path,
3444e02119d5SChris Mason 					     path->nodes[0], path->slots[0],
3445e02119d5SChris Mason 					     &tmp);
34464a500fd1SYan, Zheng 			if (ret) {
34474a500fd1SYan, Zheng 				err = ret;
34484a500fd1SYan, Zheng 				goto done;
34494a500fd1SYan, Zheng 			}
3450e02119d5SChris Mason 		}
3451e02119d5SChris Mason 	}
3452b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3453e02119d5SChris Mason 
3454e02119d5SChris Mason 	/* find the first key from this transaction again */
3455e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3456fae7f21cSDulshani Gunawardhana 	if (WARN_ON(ret != 0))
3457e02119d5SChris Mason 		goto done;
3458e02119d5SChris Mason 
3459e02119d5SChris Mason 	/*
3460e02119d5SChris Mason 	 * we have a block from this transaction, log every item in it
3461e02119d5SChris Mason 	 * from our directory
3462e02119d5SChris Mason 	 */
3463e02119d5SChris Mason 	while (1) {
3464e02119d5SChris Mason 		struct btrfs_key tmp;
3465e02119d5SChris Mason 		src = path->nodes[0];
3466e02119d5SChris Mason 		nritems = btrfs_header_nritems(src);
3467e02119d5SChris Mason 		for (i = path->slots[0]; i < nritems; i++) {
34682f2ff0eeSFilipe Manana 			struct btrfs_dir_item *di;
34692f2ff0eeSFilipe Manana 
3470e02119d5SChris Mason 			btrfs_item_key_to_cpu(src, &min_key, i);
3471e02119d5SChris Mason 
347233345d01SLi Zefan 			if (min_key.objectid != ino || min_key.type != key_type)
3473e02119d5SChris Mason 				goto done;
3474e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path, src, i,
3475e02119d5SChris Mason 					     &min_key);
34764a500fd1SYan, Zheng 			if (ret) {
34774a500fd1SYan, Zheng 				err = ret;
34784a500fd1SYan, Zheng 				goto done;
34794a500fd1SYan, Zheng 			}
34802f2ff0eeSFilipe Manana 
34812f2ff0eeSFilipe Manana 			/*
34822f2ff0eeSFilipe Manana 			 * We must make sure that when we log a directory entry,
34832f2ff0eeSFilipe Manana 			 * the corresponding inode, after log replay, has a
34842f2ff0eeSFilipe Manana 			 * matching link count. For example:
34852f2ff0eeSFilipe Manana 			 *
34862f2ff0eeSFilipe Manana 			 * touch foo
34872f2ff0eeSFilipe Manana 			 * mkdir mydir
34882f2ff0eeSFilipe Manana 			 * sync
34892f2ff0eeSFilipe Manana 			 * ln foo mydir/bar
34902f2ff0eeSFilipe Manana 			 * xfs_io -c "fsync" mydir
34912f2ff0eeSFilipe Manana 			 * <crash>
34922f2ff0eeSFilipe Manana 			 * <mount fs and log replay>
34932f2ff0eeSFilipe Manana 			 *
34942f2ff0eeSFilipe Manana 			 * Would result in a fsync log that when replayed, our
34952f2ff0eeSFilipe Manana 			 * file inode would have a link count of 1, but we get
34962f2ff0eeSFilipe Manana 			 * two directory entries pointing to the same inode.
34972f2ff0eeSFilipe Manana 			 * After removing one of the names, it would not be
34982f2ff0eeSFilipe Manana 			 * possible to remove the other name, which resulted
34992f2ff0eeSFilipe Manana 			 * always in stale file handle errors, and would not
35002f2ff0eeSFilipe Manana 			 * be possible to rmdir the parent directory, since
35012f2ff0eeSFilipe Manana 			 * its i_size could never decrement to the value
35022f2ff0eeSFilipe Manana 			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
35032f2ff0eeSFilipe Manana 			 */
35042f2ff0eeSFilipe Manana 			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
35052f2ff0eeSFilipe Manana 			btrfs_dir_item_key_to_cpu(src, di, &tmp);
35062f2ff0eeSFilipe Manana 			if (ctx &&
35072f2ff0eeSFilipe Manana 			    (btrfs_dir_transid(src, di) == trans->transid ||
35082f2ff0eeSFilipe Manana 			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
35092f2ff0eeSFilipe Manana 			    tmp.type != BTRFS_ROOT_ITEM_KEY)
35102f2ff0eeSFilipe Manana 				ctx->log_new_dentries = true;
3511e02119d5SChris Mason 		}
3512e02119d5SChris Mason 		path->slots[0] = nritems;
3513e02119d5SChris Mason 
3514e02119d5SChris Mason 		/*
3515e02119d5SChris Mason 		 * look ahead to the next item and see if it is also
3516e02119d5SChris Mason 		 * from this directory and from this transaction
3517e02119d5SChris Mason 		 */
3518e02119d5SChris Mason 		ret = btrfs_next_leaf(root, path);
351980c0b421SLiu Bo 		if (ret) {
352080c0b421SLiu Bo 			if (ret == 1)
3521e02119d5SChris Mason 				last_offset = (u64)-1;
352280c0b421SLiu Bo 			else
352380c0b421SLiu Bo 				err = ret;
3524e02119d5SChris Mason 			goto done;
3525e02119d5SChris Mason 		}
3526e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
352733345d01SLi Zefan 		if (tmp.objectid != ino || tmp.type != key_type) {
3528e02119d5SChris Mason 			last_offset = (u64)-1;
3529e02119d5SChris Mason 			goto done;
3530e02119d5SChris Mason 		}
3531e02119d5SChris Mason 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3532e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path,
3533e02119d5SChris Mason 					     path->nodes[0], path->slots[0],
3534e02119d5SChris Mason 					     &tmp);
35354a500fd1SYan, Zheng 			if (ret)
35364a500fd1SYan, Zheng 				err = ret;
35374a500fd1SYan, Zheng 			else
3538e02119d5SChris Mason 				last_offset = tmp.offset;
3539e02119d5SChris Mason 			goto done;
3540e02119d5SChris Mason 		}
3541e02119d5SChris Mason 	}
3542e02119d5SChris Mason done:
3543b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3544b3b4aa74SDavid Sterba 	btrfs_release_path(dst_path);
3545e02119d5SChris Mason 
35464a500fd1SYan, Zheng 	if (err == 0) {
35474a500fd1SYan, Zheng 		*last_offset_ret = last_offset;
35484a500fd1SYan, Zheng 		/*
35494a500fd1SYan, Zheng 		 * insert the log range keys to indicate where the log
35504a500fd1SYan, Zheng 		 * is valid
35514a500fd1SYan, Zheng 		 */
35524a500fd1SYan, Zheng 		ret = insert_dir_log_key(trans, log, path, key_type,
355333345d01SLi Zefan 					 ino, first_offset, last_offset);
35544a500fd1SYan, Zheng 		if (ret)
35554a500fd1SYan, Zheng 			err = ret;
35564a500fd1SYan, Zheng 	}
35574a500fd1SYan, Zheng 	return err;
3558e02119d5SChris Mason }
3559e02119d5SChris Mason 
3560e02119d5SChris Mason /*
3561e02119d5SChris Mason  * logging directories is very similar to logging inodes, We find all the items
3562e02119d5SChris Mason  * from the current transaction and write them to the log.
3563e02119d5SChris Mason  *
3564e02119d5SChris Mason  * The recovery code scans the directory in the subvolume, and if it finds a
3565e02119d5SChris Mason  * key in the range logged that is not present in the log tree, then it means
3566e02119d5SChris Mason  * that dir entry was unlinked during the transaction.
3567e02119d5SChris Mason  *
3568e02119d5SChris Mason  * In order for that scan to work, we must include one key smaller than
3569e02119d5SChris Mason  * the smallest logged by this transaction and one key larger than the largest
3570e02119d5SChris Mason  * key logged by this transaction.
3571e02119d5SChris Mason  */
3572e02119d5SChris Mason static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3573dbf39ea4SNikolay Borisov 			  struct btrfs_root *root, struct btrfs_inode *inode,
3574e02119d5SChris Mason 			  struct btrfs_path *path,
35752f2ff0eeSFilipe Manana 			  struct btrfs_path *dst_path,
35762f2ff0eeSFilipe Manana 			  struct btrfs_log_ctx *ctx)
3577e02119d5SChris Mason {
3578e02119d5SChris Mason 	u64 min_key;
3579e02119d5SChris Mason 	u64 max_key;
3580e02119d5SChris Mason 	int ret;
3581e02119d5SChris Mason 	int key_type = BTRFS_DIR_ITEM_KEY;
3582e02119d5SChris Mason 
3583e02119d5SChris Mason again:
3584e02119d5SChris Mason 	min_key = 0;
3585e02119d5SChris Mason 	max_key = 0;
3586e02119d5SChris Mason 	while (1) {
3587dbf39ea4SNikolay Borisov 		ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3588dbf39ea4SNikolay Borisov 				ctx, min_key, &max_key);
35894a500fd1SYan, Zheng 		if (ret)
35904a500fd1SYan, Zheng 			return ret;
3591e02119d5SChris Mason 		if (max_key == (u64)-1)
3592e02119d5SChris Mason 			break;
3593e02119d5SChris Mason 		min_key = max_key + 1;
3594e02119d5SChris Mason 	}
3595e02119d5SChris Mason 
3596e02119d5SChris Mason 	if (key_type == BTRFS_DIR_ITEM_KEY) {
3597e02119d5SChris Mason 		key_type = BTRFS_DIR_INDEX_KEY;
3598e02119d5SChris Mason 		goto again;
3599e02119d5SChris Mason 	}
3600e02119d5SChris Mason 	return 0;
3601e02119d5SChris Mason }
3602e02119d5SChris Mason 
3603e02119d5SChris Mason /*
3604e02119d5SChris Mason  * a helper function to drop items from the log before we relog an
3605e02119d5SChris Mason  * inode.  max_key_type indicates the highest item type to remove.
3606e02119d5SChris Mason  * This cannot be run for file data extents because it does not
3607e02119d5SChris Mason  * free the extents they point to.
3608e02119d5SChris Mason  */
3609e02119d5SChris Mason static int drop_objectid_items(struct btrfs_trans_handle *trans,
3610e02119d5SChris Mason 				  struct btrfs_root *log,
3611e02119d5SChris Mason 				  struct btrfs_path *path,
3612e02119d5SChris Mason 				  u64 objectid, int max_key_type)
3613e02119d5SChris Mason {
3614e02119d5SChris Mason 	int ret;
3615e02119d5SChris Mason 	struct btrfs_key key;
3616e02119d5SChris Mason 	struct btrfs_key found_key;
361718ec90d6SJosef Bacik 	int start_slot;
3618e02119d5SChris Mason 
3619e02119d5SChris Mason 	key.objectid = objectid;
3620e02119d5SChris Mason 	key.type = max_key_type;
3621e02119d5SChris Mason 	key.offset = (u64)-1;
3622e02119d5SChris Mason 
3623e02119d5SChris Mason 	while (1) {
3624e02119d5SChris Mason 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
36253650860bSJosef Bacik 		BUG_ON(ret == 0); /* Logic error */
36264a500fd1SYan, Zheng 		if (ret < 0)
3627e02119d5SChris Mason 			break;
3628e02119d5SChris Mason 
3629e02119d5SChris Mason 		if (path->slots[0] == 0)
3630e02119d5SChris Mason 			break;
3631e02119d5SChris Mason 
3632e02119d5SChris Mason 		path->slots[0]--;
3633e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3634e02119d5SChris Mason 				      path->slots[0]);
3635e02119d5SChris Mason 
3636e02119d5SChris Mason 		if (found_key.objectid != objectid)
3637e02119d5SChris Mason 			break;
3638e02119d5SChris Mason 
363918ec90d6SJosef Bacik 		found_key.offset = 0;
364018ec90d6SJosef Bacik 		found_key.type = 0;
364118ec90d6SJosef Bacik 		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
364218ec90d6SJosef Bacik 				       &start_slot);
364318ec90d6SJosef Bacik 
364418ec90d6SJosef Bacik 		ret = btrfs_del_items(trans, log, path, start_slot,
364518ec90d6SJosef Bacik 				      path->slots[0] - start_slot + 1);
364618ec90d6SJosef Bacik 		/*
364718ec90d6SJosef Bacik 		 * If start slot isn't 0 then we don't need to re-search, we've
364818ec90d6SJosef Bacik 		 * found the last guy with the objectid in this tree.
364918ec90d6SJosef Bacik 		 */
365018ec90d6SJosef Bacik 		if (ret || start_slot != 0)
365165a246c5STsutomu Itoh 			break;
3652b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3653e02119d5SChris Mason 	}
3654b3b4aa74SDavid Sterba 	btrfs_release_path(path);
36555bdbeb21SJosef Bacik 	if (ret > 0)
36565bdbeb21SJosef Bacik 		ret = 0;
36574a500fd1SYan, Zheng 	return ret;
3658e02119d5SChris Mason }
3659e02119d5SChris Mason 
366094edf4aeSJosef Bacik static void fill_inode_item(struct btrfs_trans_handle *trans,
366194edf4aeSJosef Bacik 			    struct extent_buffer *leaf,
366294edf4aeSJosef Bacik 			    struct btrfs_inode_item *item,
36631a4bcf47SFilipe Manana 			    struct inode *inode, int log_inode_only,
36641a4bcf47SFilipe Manana 			    u64 logged_isize)
366594edf4aeSJosef Bacik {
36660b1c6ccaSJosef Bacik 	struct btrfs_map_token token;
366794edf4aeSJosef Bacik 
36680b1c6ccaSJosef Bacik 	btrfs_init_map_token(&token);
366994edf4aeSJosef Bacik 
367094edf4aeSJosef Bacik 	if (log_inode_only) {
367194edf4aeSJosef Bacik 		/* set the generation to zero so the recover code
367294edf4aeSJosef Bacik 		 * can tell the difference between an logging
367394edf4aeSJosef Bacik 		 * just to say 'this inode exists' and a logging
367494edf4aeSJosef Bacik 		 * to say 'update this inode with these values'
367594edf4aeSJosef Bacik 		 */
36760b1c6ccaSJosef Bacik 		btrfs_set_token_inode_generation(leaf, item, 0, &token);
36771a4bcf47SFilipe Manana 		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
367894edf4aeSJosef Bacik 	} else {
36790b1c6ccaSJosef Bacik 		btrfs_set_token_inode_generation(leaf, item,
36800b1c6ccaSJosef Bacik 						 BTRFS_I(inode)->generation,
36810b1c6ccaSJosef Bacik 						 &token);
36820b1c6ccaSJosef Bacik 		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
368394edf4aeSJosef Bacik 	}
368494edf4aeSJosef Bacik 
36850b1c6ccaSJosef Bacik 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
36860b1c6ccaSJosef Bacik 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
36870b1c6ccaSJosef Bacik 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
36880b1c6ccaSJosef Bacik 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
36890b1c6ccaSJosef Bacik 
3690a937b979SDavid Sterba 	btrfs_set_token_timespec_sec(leaf, &item->atime,
36910b1c6ccaSJosef Bacik 				     inode->i_atime.tv_sec, &token);
3692a937b979SDavid Sterba 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
36930b1c6ccaSJosef Bacik 				      inode->i_atime.tv_nsec, &token);
36940b1c6ccaSJosef Bacik 
3695a937b979SDavid Sterba 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
36960b1c6ccaSJosef Bacik 				     inode->i_mtime.tv_sec, &token);
3697a937b979SDavid Sterba 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
36980b1c6ccaSJosef Bacik 				      inode->i_mtime.tv_nsec, &token);
36990b1c6ccaSJosef Bacik 
3700a937b979SDavid Sterba 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
37010b1c6ccaSJosef Bacik 				     inode->i_ctime.tv_sec, &token);
3702a937b979SDavid Sterba 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
37030b1c6ccaSJosef Bacik 				      inode->i_ctime.tv_nsec, &token);
37040b1c6ccaSJosef Bacik 
37050b1c6ccaSJosef Bacik 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
37060b1c6ccaSJosef Bacik 				     &token);
37070b1c6ccaSJosef Bacik 
3708c7f88c4eSJeff Layton 	btrfs_set_token_inode_sequence(leaf, item,
3709c7f88c4eSJeff Layton 				       inode_peek_iversion(inode), &token);
37100b1c6ccaSJosef Bacik 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
37110b1c6ccaSJosef Bacik 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
37120b1c6ccaSJosef Bacik 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
37130b1c6ccaSJosef Bacik 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
371494edf4aeSJosef Bacik }
371594edf4aeSJosef Bacik 
3716a95249b3SJosef Bacik static int log_inode_item(struct btrfs_trans_handle *trans,
3717a95249b3SJosef Bacik 			  struct btrfs_root *log, struct btrfs_path *path,
37186d889a3bSNikolay Borisov 			  struct btrfs_inode *inode)
3719a95249b3SJosef Bacik {
3720a95249b3SJosef Bacik 	struct btrfs_inode_item *inode_item;
3721a95249b3SJosef Bacik 	int ret;
3722a95249b3SJosef Bacik 
3723efd0c405SFilipe David Borba Manana 	ret = btrfs_insert_empty_item(trans, log, path,
37246d889a3bSNikolay Borisov 				      &inode->location, sizeof(*inode_item));
3725a95249b3SJosef Bacik 	if (ret && ret != -EEXIST)
3726a95249b3SJosef Bacik 		return ret;
3727a95249b3SJosef Bacik 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3728a95249b3SJosef Bacik 				    struct btrfs_inode_item);
37296d889a3bSNikolay Borisov 	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
37306d889a3bSNikolay Borisov 			0, 0);
3731a95249b3SJosef Bacik 	btrfs_release_path(path);
3732a95249b3SJosef Bacik 	return 0;
3733a95249b3SJosef Bacik }
3734a95249b3SJosef Bacik 
373531ff1cd2SChris Mason static noinline int copy_items(struct btrfs_trans_handle *trans,
373644d70e19SNikolay Borisov 			       struct btrfs_inode *inode,
373731ff1cd2SChris Mason 			       struct btrfs_path *dst_path,
373816e7549fSJosef Bacik 			       struct btrfs_path *src_path, u64 *last_extent,
37391a4bcf47SFilipe Manana 			       int start_slot, int nr, int inode_only,
37401a4bcf47SFilipe Manana 			       u64 logged_isize)
374131ff1cd2SChris Mason {
37423ffbd68cSDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
374331ff1cd2SChris Mason 	unsigned long src_offset;
374431ff1cd2SChris Mason 	unsigned long dst_offset;
374544d70e19SNikolay Borisov 	struct btrfs_root *log = inode->root->log_root;
374631ff1cd2SChris Mason 	struct btrfs_file_extent_item *extent;
374731ff1cd2SChris Mason 	struct btrfs_inode_item *inode_item;
374816e7549fSJosef Bacik 	struct extent_buffer *src = src_path->nodes[0];
374916e7549fSJosef Bacik 	struct btrfs_key first_key, last_key, key;
375031ff1cd2SChris Mason 	int ret;
375131ff1cd2SChris Mason 	struct btrfs_key *ins_keys;
375231ff1cd2SChris Mason 	u32 *ins_sizes;
375331ff1cd2SChris Mason 	char *ins_data;
375431ff1cd2SChris Mason 	int i;
3755d20f7043SChris Mason 	struct list_head ordered_sums;
375644d70e19SNikolay Borisov 	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
375716e7549fSJosef Bacik 	bool has_extents = false;
375874121f7cSFilipe Manana 	bool need_find_last_extent = true;
375916e7549fSJosef Bacik 	bool done = false;
3760d20f7043SChris Mason 
3761d20f7043SChris Mason 	INIT_LIST_HEAD(&ordered_sums);
376231ff1cd2SChris Mason 
376331ff1cd2SChris Mason 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
376431ff1cd2SChris Mason 			   nr * sizeof(u32), GFP_NOFS);
37652a29edc6Sliubo 	if (!ins_data)
37662a29edc6Sliubo 		return -ENOMEM;
37672a29edc6Sliubo 
376816e7549fSJosef Bacik 	first_key.objectid = (u64)-1;
376916e7549fSJosef Bacik 
377031ff1cd2SChris Mason 	ins_sizes = (u32 *)ins_data;
377131ff1cd2SChris Mason 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
377231ff1cd2SChris Mason 
377331ff1cd2SChris Mason 	for (i = 0; i < nr; i++) {
377431ff1cd2SChris Mason 		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
377531ff1cd2SChris Mason 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
377631ff1cd2SChris Mason 	}
377731ff1cd2SChris Mason 	ret = btrfs_insert_empty_items(trans, log, dst_path,
377831ff1cd2SChris Mason 				       ins_keys, ins_sizes, nr);
37794a500fd1SYan, Zheng 	if (ret) {
37804a500fd1SYan, Zheng 		kfree(ins_data);
37814a500fd1SYan, Zheng 		return ret;
37824a500fd1SYan, Zheng 	}
378331ff1cd2SChris Mason 
37845d4f98a2SYan Zheng 	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
378531ff1cd2SChris Mason 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
378631ff1cd2SChris Mason 						   dst_path->slots[0]);
378731ff1cd2SChris Mason 
378831ff1cd2SChris Mason 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
378931ff1cd2SChris Mason 
37900dde10beSMatthias Kaehlcke 		if (i == nr - 1)
379116e7549fSJosef Bacik 			last_key = ins_keys[i];
379216e7549fSJosef Bacik 
379394edf4aeSJosef Bacik 		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
379431ff1cd2SChris Mason 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
379531ff1cd2SChris Mason 						    dst_path->slots[0],
379631ff1cd2SChris Mason 						    struct btrfs_inode_item);
379794edf4aeSJosef Bacik 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
3798f85b7379SDavid Sterba 					&inode->vfs_inode,
3799f85b7379SDavid Sterba 					inode_only == LOG_INODE_EXISTS,
38001a4bcf47SFilipe Manana 					logged_isize);
380194edf4aeSJosef Bacik 		} else {
380294edf4aeSJosef Bacik 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
380394edf4aeSJosef Bacik 					   src_offset, ins_sizes[i]);
380431ff1cd2SChris Mason 		}
380594edf4aeSJosef Bacik 
380616e7549fSJosef Bacik 		/*
380716e7549fSJosef Bacik 		 * We set need_find_last_extent here in case we know we were
380816e7549fSJosef Bacik 		 * processing other items and then walk into the first extent in
380916e7549fSJosef Bacik 		 * the inode.  If we don't hit an extent then nothing changes,
381016e7549fSJosef Bacik 		 * we'll do the last search the next time around.
381116e7549fSJosef Bacik 		 */
381216e7549fSJosef Bacik 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
381316e7549fSJosef Bacik 			has_extents = true;
381474121f7cSFilipe Manana 			if (first_key.objectid == (u64)-1)
381516e7549fSJosef Bacik 				first_key = ins_keys[i];
381616e7549fSJosef Bacik 		} else {
381716e7549fSJosef Bacik 			need_find_last_extent = false;
381816e7549fSJosef Bacik 		}
381916e7549fSJosef Bacik 
382031ff1cd2SChris Mason 		/* take a reference on file data extents so that truncates
382131ff1cd2SChris Mason 		 * or deletes of this inode don't have to relog the inode
382231ff1cd2SChris Mason 		 * again
382331ff1cd2SChris Mason 		 */
3824962a298fSDavid Sterba 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3825d2794405SLiu Bo 		    !skip_csum) {
382631ff1cd2SChris Mason 			int found_type;
382731ff1cd2SChris Mason 			extent = btrfs_item_ptr(src, start_slot + i,
382831ff1cd2SChris Mason 						struct btrfs_file_extent_item);
382931ff1cd2SChris Mason 
38308e531cdfSliubo 			if (btrfs_file_extent_generation(src, extent) < trans->transid)
38318e531cdfSliubo 				continue;
38328e531cdfSliubo 
383331ff1cd2SChris Mason 			found_type = btrfs_file_extent_type(src, extent);
38346f1fed77SJosef Bacik 			if (found_type == BTRFS_FILE_EXTENT_REG) {
38355d4f98a2SYan Zheng 				u64 ds, dl, cs, cl;
38365d4f98a2SYan Zheng 				ds = btrfs_file_extent_disk_bytenr(src,
383731ff1cd2SChris Mason 								extent);
38385d4f98a2SYan Zheng 				/* ds == 0 is a hole */
38395d4f98a2SYan Zheng 				if (ds == 0)
38405d4f98a2SYan Zheng 					continue;
38415d4f98a2SYan Zheng 
38425d4f98a2SYan Zheng 				dl = btrfs_file_extent_disk_num_bytes(src,
384331ff1cd2SChris Mason 								extent);
38445d4f98a2SYan Zheng 				cs = btrfs_file_extent_offset(src, extent);
38455d4f98a2SYan Zheng 				cl = btrfs_file_extent_num_bytes(src,
3846a419aef8SJoe Perches 								extent);
3847580afd76SChris Mason 				if (btrfs_file_extent_compression(src,
3848580afd76SChris Mason 								  extent)) {
3849580afd76SChris Mason 					cs = 0;
3850580afd76SChris Mason 					cl = dl;
3851580afd76SChris Mason 				}
38525d4f98a2SYan Zheng 
385307d400a6SYan Zheng 				ret = btrfs_lookup_csums_range(
38540b246afaSJeff Mahoney 						fs_info->csum_root,
385507d400a6SYan Zheng 						ds + cs, ds + cs + cl - 1,
3856a2de733cSArne Jansen 						&ordered_sums, 0);
38573650860bSJosef Bacik 				if (ret) {
38583650860bSJosef Bacik 					btrfs_release_path(dst_path);
38593650860bSJosef Bacik 					kfree(ins_data);
38603650860bSJosef Bacik 					return ret;
38613650860bSJosef Bacik 				}
386231ff1cd2SChris Mason 			}
386331ff1cd2SChris Mason 		}
386431ff1cd2SChris Mason 	}
386531ff1cd2SChris Mason 
386631ff1cd2SChris Mason 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3867b3b4aa74SDavid Sterba 	btrfs_release_path(dst_path);
386831ff1cd2SChris Mason 	kfree(ins_data);
3869d20f7043SChris Mason 
3870d20f7043SChris Mason 	/*
3871d20f7043SChris Mason 	 * we have to do this after the loop above to avoid changing the
3872d20f7043SChris Mason 	 * log tree while trying to change the log tree.
3873d20f7043SChris Mason 	 */
38744a500fd1SYan, Zheng 	ret = 0;
3875d20f7043SChris Mason 	while (!list_empty(&ordered_sums)) {
3876d20f7043SChris Mason 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3877d20f7043SChris Mason 						   struct btrfs_ordered_sum,
3878d20f7043SChris Mason 						   list);
38794a500fd1SYan, Zheng 		if (!ret)
3880d20f7043SChris Mason 			ret = btrfs_csum_file_blocks(trans, log, sums);
3881d20f7043SChris Mason 		list_del(&sums->list);
3882d20f7043SChris Mason 		kfree(sums);
3883d20f7043SChris Mason 	}
388416e7549fSJosef Bacik 
388516e7549fSJosef Bacik 	if (!has_extents)
388616e7549fSJosef Bacik 		return ret;
388716e7549fSJosef Bacik 
388874121f7cSFilipe Manana 	if (need_find_last_extent && *last_extent == first_key.offset) {
388974121f7cSFilipe Manana 		/*
389074121f7cSFilipe Manana 		 * We don't have any leafs between our current one and the one
389174121f7cSFilipe Manana 		 * we processed before that can have file extent items for our
389274121f7cSFilipe Manana 		 * inode (and have a generation number smaller than our current
389374121f7cSFilipe Manana 		 * transaction id).
389474121f7cSFilipe Manana 		 */
389574121f7cSFilipe Manana 		need_find_last_extent = false;
389674121f7cSFilipe Manana 	}
389774121f7cSFilipe Manana 
389816e7549fSJosef Bacik 	/*
389916e7549fSJosef Bacik 	 * Because we use btrfs_search_forward we could skip leaves that were
390016e7549fSJosef Bacik 	 * not modified and then assume *last_extent is valid when it really
390116e7549fSJosef Bacik 	 * isn't.  So back up to the previous leaf and read the end of the last
390216e7549fSJosef Bacik 	 * extent before we go and fill in holes.
390316e7549fSJosef Bacik 	 */
390416e7549fSJosef Bacik 	if (need_find_last_extent) {
390516e7549fSJosef Bacik 		u64 len;
390616e7549fSJosef Bacik 
390744d70e19SNikolay Borisov 		ret = btrfs_prev_leaf(inode->root, src_path);
390816e7549fSJosef Bacik 		if (ret < 0)
390916e7549fSJosef Bacik 			return ret;
391016e7549fSJosef Bacik 		if (ret)
391116e7549fSJosef Bacik 			goto fill_holes;
391216e7549fSJosef Bacik 		if (src_path->slots[0])
391316e7549fSJosef Bacik 			src_path->slots[0]--;
391416e7549fSJosef Bacik 		src = src_path->nodes[0];
391516e7549fSJosef Bacik 		btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
391644d70e19SNikolay Borisov 		if (key.objectid != btrfs_ino(inode) ||
391716e7549fSJosef Bacik 		    key.type != BTRFS_EXTENT_DATA_KEY)
391816e7549fSJosef Bacik 			goto fill_holes;
391916e7549fSJosef Bacik 		extent = btrfs_item_ptr(src, src_path->slots[0],
392016e7549fSJosef Bacik 					struct btrfs_file_extent_item);
392116e7549fSJosef Bacik 		if (btrfs_file_extent_type(src, extent) ==
392216e7549fSJosef Bacik 		    BTRFS_FILE_EXTENT_INLINE) {
3923e41ca589SQu Wenruo 			len = btrfs_file_extent_ram_bytes(src, extent);
392416e7549fSJosef Bacik 			*last_extent = ALIGN(key.offset + len,
39250b246afaSJeff Mahoney 					     fs_info->sectorsize);
392616e7549fSJosef Bacik 		} else {
392716e7549fSJosef Bacik 			len = btrfs_file_extent_num_bytes(src, extent);
392816e7549fSJosef Bacik 			*last_extent = key.offset + len;
392916e7549fSJosef Bacik 		}
393016e7549fSJosef Bacik 	}
393116e7549fSJosef Bacik fill_holes:
393216e7549fSJosef Bacik 	/* So we did prev_leaf, now we need to move to the next leaf, but a few
393316e7549fSJosef Bacik 	 * things could have happened
393416e7549fSJosef Bacik 	 *
393516e7549fSJosef Bacik 	 * 1) A merge could have happened, so we could currently be on a leaf
393616e7549fSJosef Bacik 	 * that holds what we were copying in the first place.
393716e7549fSJosef Bacik 	 * 2) A split could have happened, and now not all of the items we want
393816e7549fSJosef Bacik 	 * are on the same leaf.
393916e7549fSJosef Bacik 	 *
394016e7549fSJosef Bacik 	 * So we need to adjust how we search for holes, we need to drop the
394116e7549fSJosef Bacik 	 * path and re-search for the first extent key we found, and then walk
394216e7549fSJosef Bacik 	 * forward until we hit the last one we copied.
394316e7549fSJosef Bacik 	 */
394416e7549fSJosef Bacik 	if (need_find_last_extent) {
394516e7549fSJosef Bacik 		/* btrfs_prev_leaf could return 1 without releasing the path */
394616e7549fSJosef Bacik 		btrfs_release_path(src_path);
3947f85b7379SDavid Sterba 		ret = btrfs_search_slot(NULL, inode->root, &first_key,
3948f85b7379SDavid Sterba 				src_path, 0, 0);
394916e7549fSJosef Bacik 		if (ret < 0)
395016e7549fSJosef Bacik 			return ret;
395116e7549fSJosef Bacik 		ASSERT(ret == 0);
395216e7549fSJosef Bacik 		src = src_path->nodes[0];
395316e7549fSJosef Bacik 		i = src_path->slots[0];
395416e7549fSJosef Bacik 	} else {
395516e7549fSJosef Bacik 		i = start_slot;
395616e7549fSJosef Bacik 	}
395716e7549fSJosef Bacik 
395816e7549fSJosef Bacik 	/*
395916e7549fSJosef Bacik 	 * Ok so here we need to go through and fill in any holes we may have
396016e7549fSJosef Bacik 	 * to make sure that holes are punched for those areas in case they had
396116e7549fSJosef Bacik 	 * extents previously.
396216e7549fSJosef Bacik 	 */
396316e7549fSJosef Bacik 	while (!done) {
396416e7549fSJosef Bacik 		u64 offset, len;
396516e7549fSJosef Bacik 		u64 extent_end;
396616e7549fSJosef Bacik 
396716e7549fSJosef Bacik 		if (i >= btrfs_header_nritems(src_path->nodes[0])) {
396844d70e19SNikolay Borisov 			ret = btrfs_next_leaf(inode->root, src_path);
396916e7549fSJosef Bacik 			if (ret < 0)
397016e7549fSJosef Bacik 				return ret;
397116e7549fSJosef Bacik 			ASSERT(ret == 0);
397216e7549fSJosef Bacik 			src = src_path->nodes[0];
397316e7549fSJosef Bacik 			i = 0;
39748434ec46SFilipe Manana 			need_find_last_extent = true;
397516e7549fSJosef Bacik 		}
397616e7549fSJosef Bacik 
397716e7549fSJosef Bacik 		btrfs_item_key_to_cpu(src, &key, i);
397816e7549fSJosef Bacik 		if (!btrfs_comp_cpu_keys(&key, &last_key))
397916e7549fSJosef Bacik 			done = true;
398044d70e19SNikolay Borisov 		if (key.objectid != btrfs_ino(inode) ||
398116e7549fSJosef Bacik 		    key.type != BTRFS_EXTENT_DATA_KEY) {
398216e7549fSJosef Bacik 			i++;
398316e7549fSJosef Bacik 			continue;
398416e7549fSJosef Bacik 		}
398516e7549fSJosef Bacik 		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
398616e7549fSJosef Bacik 		if (btrfs_file_extent_type(src, extent) ==
398716e7549fSJosef Bacik 		    BTRFS_FILE_EXTENT_INLINE) {
3988e41ca589SQu Wenruo 			len = btrfs_file_extent_ram_bytes(src, extent);
3989da17066cSJeff Mahoney 			extent_end = ALIGN(key.offset + len,
39900b246afaSJeff Mahoney 					   fs_info->sectorsize);
399116e7549fSJosef Bacik 		} else {
399216e7549fSJosef Bacik 			len = btrfs_file_extent_num_bytes(src, extent);
399316e7549fSJosef Bacik 			extent_end = key.offset + len;
399416e7549fSJosef Bacik 		}
399516e7549fSJosef Bacik 		i++;
399616e7549fSJosef Bacik 
399716e7549fSJosef Bacik 		if (*last_extent == key.offset) {
399816e7549fSJosef Bacik 			*last_extent = extent_end;
399916e7549fSJosef Bacik 			continue;
400016e7549fSJosef Bacik 		}
400116e7549fSJosef Bacik 		offset = *last_extent;
400216e7549fSJosef Bacik 		len = key.offset - *last_extent;
400344d70e19SNikolay Borisov 		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
400444d70e19SNikolay Borisov 				offset, 0, 0, len, 0, len, 0, 0, 0);
400516e7549fSJosef Bacik 		if (ret)
400616e7549fSJosef Bacik 			break;
400774121f7cSFilipe Manana 		*last_extent = extent_end;
400816e7549fSJosef Bacik 	}
40094ee3fad3SFilipe Manana 
40104ee3fad3SFilipe Manana 	/*
40114ee3fad3SFilipe Manana 	 * Check if there is a hole between the last extent found in our leaf
40124ee3fad3SFilipe Manana 	 * and the first extent in the next leaf. If there is one, we need to
40134ee3fad3SFilipe Manana 	 * log an explicit hole so that at replay time we can punch the hole.
40144ee3fad3SFilipe Manana 	 */
40154ee3fad3SFilipe Manana 	if (ret == 0 &&
40164ee3fad3SFilipe Manana 	    key.objectid == btrfs_ino(inode) &&
40174ee3fad3SFilipe Manana 	    key.type == BTRFS_EXTENT_DATA_KEY &&
40184ee3fad3SFilipe Manana 	    i == btrfs_header_nritems(src_path->nodes[0])) {
40194ee3fad3SFilipe Manana 		ret = btrfs_next_leaf(inode->root, src_path);
40204ee3fad3SFilipe Manana 		need_find_last_extent = true;
40214ee3fad3SFilipe Manana 		if (ret > 0) {
40224ee3fad3SFilipe Manana 			ret = 0;
40234ee3fad3SFilipe Manana 		} else if (ret == 0) {
40244ee3fad3SFilipe Manana 			btrfs_item_key_to_cpu(src_path->nodes[0], &key,
40254ee3fad3SFilipe Manana 					      src_path->slots[0]);
40264ee3fad3SFilipe Manana 			if (key.objectid == btrfs_ino(inode) &&
40274ee3fad3SFilipe Manana 			    key.type == BTRFS_EXTENT_DATA_KEY &&
40284ee3fad3SFilipe Manana 			    *last_extent < key.offset) {
40294ee3fad3SFilipe Manana 				const u64 len = key.offset - *last_extent;
40304ee3fad3SFilipe Manana 
40314ee3fad3SFilipe Manana 				ret = btrfs_insert_file_extent(trans, log,
40324ee3fad3SFilipe Manana 							       btrfs_ino(inode),
40334ee3fad3SFilipe Manana 							       *last_extent, 0,
40344ee3fad3SFilipe Manana 							       0, len, 0, len,
40354ee3fad3SFilipe Manana 							       0, 0, 0);
40364ee3fad3SFilipe Manana 			}
40374ee3fad3SFilipe Manana 		}
40384ee3fad3SFilipe Manana 	}
403916e7549fSJosef Bacik 	/*
404016e7549fSJosef Bacik 	 * Need to let the callers know we dropped the path so they should
404116e7549fSJosef Bacik 	 * re-search.
404216e7549fSJosef Bacik 	 */
404316e7549fSJosef Bacik 	if (!ret && need_find_last_extent)
404416e7549fSJosef Bacik 		ret = 1;
40454a500fd1SYan, Zheng 	return ret;
404631ff1cd2SChris Mason }
404731ff1cd2SChris Mason 
40485dc562c5SJosef Bacik static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
40495dc562c5SJosef Bacik {
40505dc562c5SJosef Bacik 	struct extent_map *em1, *em2;
40515dc562c5SJosef Bacik 
40525dc562c5SJosef Bacik 	em1 = list_entry(a, struct extent_map, list);
40535dc562c5SJosef Bacik 	em2 = list_entry(b, struct extent_map, list);
40545dc562c5SJosef Bacik 
40555dc562c5SJosef Bacik 	if (em1->start < em2->start)
40565dc562c5SJosef Bacik 		return -1;
40575dc562c5SJosef Bacik 	else if (em1->start > em2->start)
40585dc562c5SJosef Bacik 		return 1;
40595dc562c5SJosef Bacik 	return 0;
40605dc562c5SJosef Bacik }
40615dc562c5SJosef Bacik 
4062e7175a69SJosef Bacik static int log_extent_csums(struct btrfs_trans_handle *trans,
4063e7175a69SJosef Bacik 			    struct btrfs_inode *inode,
4064a9ecb653SNikolay Borisov 			    struct btrfs_root *log_root,
4065e7175a69SJosef Bacik 			    const struct extent_map *em)
40665dc562c5SJosef Bacik {
40672ab28f32SJosef Bacik 	u64 csum_offset;
40682ab28f32SJosef Bacik 	u64 csum_len;
40698407f553SFilipe Manana 	LIST_HEAD(ordered_sums);
40708407f553SFilipe Manana 	int ret = 0;
407109a2a8f9SJosef Bacik 
4072e7175a69SJosef Bacik 	if (inode->flags & BTRFS_INODE_NODATASUM ||
4073e7175a69SJosef Bacik 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
40748407f553SFilipe Manana 	    em->block_start == EXTENT_MAP_HOLE)
407570c8a91cSJosef Bacik 		return 0;
407670c8a91cSJosef Bacik 
4077e7175a69SJosef Bacik 	/* If we're compressed we have to save the entire range of csums. */
4078488111aaSFilipe David Borba Manana 	if (em->compress_type) {
4079488111aaSFilipe David Borba Manana 		csum_offset = 0;
40808407f553SFilipe Manana 		csum_len = max(em->block_len, em->orig_block_len);
4081488111aaSFilipe David Borba Manana 	} else {
4082e7175a69SJosef Bacik 		csum_offset = em->mod_start - em->start;
4083e7175a69SJosef Bacik 		csum_len = em->mod_len;
4084488111aaSFilipe David Borba Manana 	}
40852ab28f32SJosef Bacik 
408670c8a91cSJosef Bacik 	/* block start is already adjusted for the file extent offset. */
4087a9ecb653SNikolay Borisov 	ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
408870c8a91cSJosef Bacik 				       em->block_start + csum_offset,
408970c8a91cSJosef Bacik 				       em->block_start + csum_offset +
409070c8a91cSJosef Bacik 				       csum_len - 1, &ordered_sums, 0);
40915dc562c5SJosef Bacik 	if (ret)
40925dc562c5SJosef Bacik 		return ret;
409370c8a91cSJosef Bacik 
409470c8a91cSJosef Bacik 	while (!list_empty(&ordered_sums)) {
409570c8a91cSJosef Bacik 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
409670c8a91cSJosef Bacik 						   struct btrfs_ordered_sum,
409770c8a91cSJosef Bacik 						   list);
409870c8a91cSJosef Bacik 		if (!ret)
4099a9ecb653SNikolay Borisov 			ret = btrfs_csum_file_blocks(trans, log_root, sums);
410070c8a91cSJosef Bacik 		list_del(&sums->list);
410170c8a91cSJosef Bacik 		kfree(sums);
41025dc562c5SJosef Bacik 	}
41035dc562c5SJosef Bacik 
410470c8a91cSJosef Bacik 	return ret;
41055dc562c5SJosef Bacik }
41065dc562c5SJosef Bacik 
41078407f553SFilipe Manana static int log_one_extent(struct btrfs_trans_handle *trans,
41089d122629SNikolay Borisov 			  struct btrfs_inode *inode, struct btrfs_root *root,
41098407f553SFilipe Manana 			  const struct extent_map *em,
41108407f553SFilipe Manana 			  struct btrfs_path *path,
41118407f553SFilipe Manana 			  struct btrfs_log_ctx *ctx)
41128407f553SFilipe Manana {
41138407f553SFilipe Manana 	struct btrfs_root *log = root->log_root;
41148407f553SFilipe Manana 	struct btrfs_file_extent_item *fi;
41158407f553SFilipe Manana 	struct extent_buffer *leaf;
41168407f553SFilipe Manana 	struct btrfs_map_token token;
41178407f553SFilipe Manana 	struct btrfs_key key;
41188407f553SFilipe Manana 	u64 extent_offset = em->start - em->orig_start;
41198407f553SFilipe Manana 	u64 block_len;
41208407f553SFilipe Manana 	int ret;
41218407f553SFilipe Manana 	int extent_inserted = 0;
41228407f553SFilipe Manana 
4123a9ecb653SNikolay Borisov 	ret = log_extent_csums(trans, inode, log, em);
41248407f553SFilipe Manana 	if (ret)
41258407f553SFilipe Manana 		return ret;
41268407f553SFilipe Manana 
41278407f553SFilipe Manana 	btrfs_init_map_token(&token);
41288407f553SFilipe Manana 
41299d122629SNikolay Borisov 	ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
41308407f553SFilipe Manana 				   em->start + em->len, NULL, 0, 1,
41318407f553SFilipe Manana 				   sizeof(*fi), &extent_inserted);
41328407f553SFilipe Manana 	if (ret)
41338407f553SFilipe Manana 		return ret;
41348407f553SFilipe Manana 
41358407f553SFilipe Manana 	if (!extent_inserted) {
41369d122629SNikolay Borisov 		key.objectid = btrfs_ino(inode);
41378407f553SFilipe Manana 		key.type = BTRFS_EXTENT_DATA_KEY;
41388407f553SFilipe Manana 		key.offset = em->start;
41398407f553SFilipe Manana 
41408407f553SFilipe Manana 		ret = btrfs_insert_empty_item(trans, log, path, &key,
41418407f553SFilipe Manana 					      sizeof(*fi));
41428407f553SFilipe Manana 		if (ret)
41438407f553SFilipe Manana 			return ret;
41448407f553SFilipe Manana 	}
41458407f553SFilipe Manana 	leaf = path->nodes[0];
41468407f553SFilipe Manana 	fi = btrfs_item_ptr(leaf, path->slots[0],
41478407f553SFilipe Manana 			    struct btrfs_file_extent_item);
41488407f553SFilipe Manana 
414950d9aa99SJosef Bacik 	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
41508407f553SFilipe Manana 					       &token);
41518407f553SFilipe Manana 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
41528407f553SFilipe Manana 		btrfs_set_token_file_extent_type(leaf, fi,
41538407f553SFilipe Manana 						 BTRFS_FILE_EXTENT_PREALLOC,
41548407f553SFilipe Manana 						 &token);
41558407f553SFilipe Manana 	else
41568407f553SFilipe Manana 		btrfs_set_token_file_extent_type(leaf, fi,
41578407f553SFilipe Manana 						 BTRFS_FILE_EXTENT_REG,
41588407f553SFilipe Manana 						 &token);
41598407f553SFilipe Manana 
41608407f553SFilipe Manana 	block_len = max(em->block_len, em->orig_block_len);
41618407f553SFilipe Manana 	if (em->compress_type != BTRFS_COMPRESS_NONE) {
41628407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
41638407f553SFilipe Manana 							em->block_start,
41648407f553SFilipe Manana 							&token);
41658407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
41668407f553SFilipe Manana 							   &token);
41678407f553SFilipe Manana 	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
41688407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
41698407f553SFilipe Manana 							em->block_start -
41708407f553SFilipe Manana 							extent_offset, &token);
41718407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
41728407f553SFilipe Manana 							   &token);
41738407f553SFilipe Manana 	} else {
41748407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
41758407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
41768407f553SFilipe Manana 							   &token);
41778407f553SFilipe Manana 	}
41788407f553SFilipe Manana 
41798407f553SFilipe Manana 	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
41808407f553SFilipe Manana 	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
41818407f553SFilipe Manana 	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
41828407f553SFilipe Manana 	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
41838407f553SFilipe Manana 						&token);
41848407f553SFilipe Manana 	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
41858407f553SFilipe Manana 	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
41868407f553SFilipe Manana 	btrfs_mark_buffer_dirty(leaf);
41878407f553SFilipe Manana 
41888407f553SFilipe Manana 	btrfs_release_path(path);
41898407f553SFilipe Manana 
41908407f553SFilipe Manana 	return ret;
41918407f553SFilipe Manana }
41928407f553SFilipe Manana 
419331d11b83SFilipe Manana /*
419431d11b83SFilipe Manana  * Log all prealloc extents beyond the inode's i_size to make sure we do not
419531d11b83SFilipe Manana  * lose them after doing a fast fsync and replaying the log. We scan the
419631d11b83SFilipe Manana  * subvolume's root instead of iterating the inode's extent map tree because
419731d11b83SFilipe Manana  * otherwise we can log incorrect extent items based on extent map conversion.
419831d11b83SFilipe Manana  * That can happen due to the fact that extent maps are merged when they
419931d11b83SFilipe Manana  * are not in the extent map tree's list of modified extents.
420031d11b83SFilipe Manana  */
420131d11b83SFilipe Manana static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
420231d11b83SFilipe Manana 				      struct btrfs_inode *inode,
420331d11b83SFilipe Manana 				      struct btrfs_path *path)
420431d11b83SFilipe Manana {
420531d11b83SFilipe Manana 	struct btrfs_root *root = inode->root;
420631d11b83SFilipe Manana 	struct btrfs_key key;
420731d11b83SFilipe Manana 	const u64 i_size = i_size_read(&inode->vfs_inode);
420831d11b83SFilipe Manana 	const u64 ino = btrfs_ino(inode);
420931d11b83SFilipe Manana 	struct btrfs_path *dst_path = NULL;
421031d11b83SFilipe Manana 	u64 last_extent = (u64)-1;
421131d11b83SFilipe Manana 	int ins_nr = 0;
421231d11b83SFilipe Manana 	int start_slot;
421331d11b83SFilipe Manana 	int ret;
421431d11b83SFilipe Manana 
421531d11b83SFilipe Manana 	if (!(inode->flags & BTRFS_INODE_PREALLOC))
421631d11b83SFilipe Manana 		return 0;
421731d11b83SFilipe Manana 
421831d11b83SFilipe Manana 	key.objectid = ino;
421931d11b83SFilipe Manana 	key.type = BTRFS_EXTENT_DATA_KEY;
422031d11b83SFilipe Manana 	key.offset = i_size;
422131d11b83SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
422231d11b83SFilipe Manana 	if (ret < 0)
422331d11b83SFilipe Manana 		goto out;
422431d11b83SFilipe Manana 
422531d11b83SFilipe Manana 	while (true) {
422631d11b83SFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
422731d11b83SFilipe Manana 		int slot = path->slots[0];
422831d11b83SFilipe Manana 
422931d11b83SFilipe Manana 		if (slot >= btrfs_header_nritems(leaf)) {
423031d11b83SFilipe Manana 			if (ins_nr > 0) {
423131d11b83SFilipe Manana 				ret = copy_items(trans, inode, dst_path, path,
423231d11b83SFilipe Manana 						 &last_extent, start_slot,
423331d11b83SFilipe Manana 						 ins_nr, 1, 0);
423431d11b83SFilipe Manana 				if (ret < 0)
423531d11b83SFilipe Manana 					goto out;
423631d11b83SFilipe Manana 				ins_nr = 0;
423731d11b83SFilipe Manana 			}
423831d11b83SFilipe Manana 			ret = btrfs_next_leaf(root, path);
423931d11b83SFilipe Manana 			if (ret < 0)
424031d11b83SFilipe Manana 				goto out;
424131d11b83SFilipe Manana 			if (ret > 0) {
424231d11b83SFilipe Manana 				ret = 0;
424331d11b83SFilipe Manana 				break;
424431d11b83SFilipe Manana 			}
424531d11b83SFilipe Manana 			continue;
424631d11b83SFilipe Manana 		}
424731d11b83SFilipe Manana 
424831d11b83SFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, slot);
424931d11b83SFilipe Manana 		if (key.objectid > ino)
425031d11b83SFilipe Manana 			break;
425131d11b83SFilipe Manana 		if (WARN_ON_ONCE(key.objectid < ino) ||
425231d11b83SFilipe Manana 		    key.type < BTRFS_EXTENT_DATA_KEY ||
425331d11b83SFilipe Manana 		    key.offset < i_size) {
425431d11b83SFilipe Manana 			path->slots[0]++;
425531d11b83SFilipe Manana 			continue;
425631d11b83SFilipe Manana 		}
425731d11b83SFilipe Manana 		if (last_extent == (u64)-1) {
425831d11b83SFilipe Manana 			last_extent = key.offset;
425931d11b83SFilipe Manana 			/*
426031d11b83SFilipe Manana 			 * Avoid logging extent items logged in past fsync calls
426131d11b83SFilipe Manana 			 * and leading to duplicate keys in the log tree.
426231d11b83SFilipe Manana 			 */
426331d11b83SFilipe Manana 			do {
426431d11b83SFilipe Manana 				ret = btrfs_truncate_inode_items(trans,
426531d11b83SFilipe Manana 							 root->log_root,
426631d11b83SFilipe Manana 							 &inode->vfs_inode,
426731d11b83SFilipe Manana 							 i_size,
426831d11b83SFilipe Manana 							 BTRFS_EXTENT_DATA_KEY);
426931d11b83SFilipe Manana 			} while (ret == -EAGAIN);
427031d11b83SFilipe Manana 			if (ret)
427131d11b83SFilipe Manana 				goto out;
427231d11b83SFilipe Manana 		}
427331d11b83SFilipe Manana 		if (ins_nr == 0)
427431d11b83SFilipe Manana 			start_slot = slot;
427531d11b83SFilipe Manana 		ins_nr++;
427631d11b83SFilipe Manana 		path->slots[0]++;
427731d11b83SFilipe Manana 		if (!dst_path) {
427831d11b83SFilipe Manana 			dst_path = btrfs_alloc_path();
427931d11b83SFilipe Manana 			if (!dst_path) {
428031d11b83SFilipe Manana 				ret = -ENOMEM;
428131d11b83SFilipe Manana 				goto out;
428231d11b83SFilipe Manana 			}
428331d11b83SFilipe Manana 		}
428431d11b83SFilipe Manana 	}
428531d11b83SFilipe Manana 	if (ins_nr > 0) {
428631d11b83SFilipe Manana 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
428731d11b83SFilipe Manana 				 start_slot, ins_nr, 1, 0);
428831d11b83SFilipe Manana 		if (ret > 0)
428931d11b83SFilipe Manana 			ret = 0;
429031d11b83SFilipe Manana 	}
429131d11b83SFilipe Manana out:
429231d11b83SFilipe Manana 	btrfs_release_path(path);
429331d11b83SFilipe Manana 	btrfs_free_path(dst_path);
429431d11b83SFilipe Manana 	return ret;
429531d11b83SFilipe Manana }
429631d11b83SFilipe Manana 
42975dc562c5SJosef Bacik static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
42985dc562c5SJosef Bacik 				     struct btrfs_root *root,
42999d122629SNikolay Borisov 				     struct btrfs_inode *inode,
4300827463c4SMiao Xie 				     struct btrfs_path *path,
4301de0ee0edSFilipe Manana 				     struct btrfs_log_ctx *ctx,
4302de0ee0edSFilipe Manana 				     const u64 start,
4303de0ee0edSFilipe Manana 				     const u64 end)
43045dc562c5SJosef Bacik {
43055dc562c5SJosef Bacik 	struct extent_map *em, *n;
43065dc562c5SJosef Bacik 	struct list_head extents;
43079d122629SNikolay Borisov 	struct extent_map_tree *tree = &inode->extent_tree;
43088c6c5928SJosef Bacik 	u64 logged_start, logged_end;
43095dc562c5SJosef Bacik 	u64 test_gen;
43105dc562c5SJosef Bacik 	int ret = 0;
43112ab28f32SJosef Bacik 	int num = 0;
43125dc562c5SJosef Bacik 
43135dc562c5SJosef Bacik 	INIT_LIST_HEAD(&extents);
43145dc562c5SJosef Bacik 
43159d122629SNikolay Borisov 	down_write(&inode->dio_sem);
43165dc562c5SJosef Bacik 	write_lock(&tree->lock);
43175dc562c5SJosef Bacik 	test_gen = root->fs_info->last_trans_committed;
43188c6c5928SJosef Bacik 	logged_start = start;
43198c6c5928SJosef Bacik 	logged_end = end;
43205dc562c5SJosef Bacik 
43215dc562c5SJosef Bacik 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
43225dc562c5SJosef Bacik 		list_del_init(&em->list);
43232ab28f32SJosef Bacik 		/*
43242ab28f32SJosef Bacik 		 * Just an arbitrary number, this can be really CPU intensive
43252ab28f32SJosef Bacik 		 * once we start getting a lot of extents, and really once we
43262ab28f32SJosef Bacik 		 * have a bunch of extents we just want to commit since it will
43272ab28f32SJosef Bacik 		 * be faster.
43282ab28f32SJosef Bacik 		 */
43292ab28f32SJosef Bacik 		if (++num > 32768) {
43302ab28f32SJosef Bacik 			list_del_init(&tree->modified_extents);
43312ab28f32SJosef Bacik 			ret = -EFBIG;
43322ab28f32SJosef Bacik 			goto process;
43332ab28f32SJosef Bacik 		}
43342ab28f32SJosef Bacik 
43355dc562c5SJosef Bacik 		if (em->generation <= test_gen)
43365dc562c5SJosef Bacik 			continue;
43378c6c5928SJosef Bacik 
433831d11b83SFilipe Manana 		/* We log prealloc extents beyond eof later. */
433931d11b83SFilipe Manana 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
434031d11b83SFilipe Manana 		    em->start >= i_size_read(&inode->vfs_inode))
434131d11b83SFilipe Manana 			continue;
434231d11b83SFilipe Manana 
43438c6c5928SJosef Bacik 		if (em->start < logged_start)
43448c6c5928SJosef Bacik 			logged_start = em->start;
43458c6c5928SJosef Bacik 		if ((em->start + em->len - 1) > logged_end)
43468c6c5928SJosef Bacik 			logged_end = em->start + em->len - 1;
43478c6c5928SJosef Bacik 
4348ff44c6e3SJosef Bacik 		/* Need a ref to keep it from getting evicted from cache */
4349490b54d6SElena Reshetova 		refcount_inc(&em->refs);
4350ff44c6e3SJosef Bacik 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
43515dc562c5SJosef Bacik 		list_add_tail(&em->list, &extents);
43522ab28f32SJosef Bacik 		num++;
43535dc562c5SJosef Bacik 	}
43545dc562c5SJosef Bacik 
43555dc562c5SJosef Bacik 	list_sort(NULL, &extents, extent_cmp);
43562ab28f32SJosef Bacik process:
43575dc562c5SJosef Bacik 	while (!list_empty(&extents)) {
43585dc562c5SJosef Bacik 		em = list_entry(extents.next, struct extent_map, list);
43595dc562c5SJosef Bacik 
43605dc562c5SJosef Bacik 		list_del_init(&em->list);
43615dc562c5SJosef Bacik 
43625dc562c5SJosef Bacik 		/*
43635dc562c5SJosef Bacik 		 * If we had an error we just need to delete everybody from our
43645dc562c5SJosef Bacik 		 * private list.
43655dc562c5SJosef Bacik 		 */
4366ff44c6e3SJosef Bacik 		if (ret) {
4367201a9038SJosef Bacik 			clear_em_logging(tree, em);
4368ff44c6e3SJosef Bacik 			free_extent_map(em);
43695dc562c5SJosef Bacik 			continue;
4370ff44c6e3SJosef Bacik 		}
4371ff44c6e3SJosef Bacik 
4372ff44c6e3SJosef Bacik 		write_unlock(&tree->lock);
43735dc562c5SJosef Bacik 
4374a2120a47SJosef Bacik 		ret = log_one_extent(trans, inode, root, em, path, ctx);
4375ff44c6e3SJosef Bacik 		write_lock(&tree->lock);
4376201a9038SJosef Bacik 		clear_em_logging(tree, em);
4377201a9038SJosef Bacik 		free_extent_map(em);
43785dc562c5SJosef Bacik 	}
4379ff44c6e3SJosef Bacik 	WARN_ON(!list_empty(&extents));
4380ff44c6e3SJosef Bacik 	write_unlock(&tree->lock);
43819d122629SNikolay Borisov 	up_write(&inode->dio_sem);
43825dc562c5SJosef Bacik 
43835dc562c5SJosef Bacik 	btrfs_release_path(path);
438431d11b83SFilipe Manana 	if (!ret)
438531d11b83SFilipe Manana 		ret = btrfs_log_prealloc_extents(trans, inode, path);
438631d11b83SFilipe Manana 
43875dc562c5SJosef Bacik 	return ret;
43885dc562c5SJosef Bacik }
43895dc562c5SJosef Bacik 
4390481b01c0SNikolay Borisov static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
43911a4bcf47SFilipe Manana 			     struct btrfs_path *path, u64 *size_ret)
43921a4bcf47SFilipe Manana {
43931a4bcf47SFilipe Manana 	struct btrfs_key key;
43941a4bcf47SFilipe Manana 	int ret;
43951a4bcf47SFilipe Manana 
4396481b01c0SNikolay Borisov 	key.objectid = btrfs_ino(inode);
43971a4bcf47SFilipe Manana 	key.type = BTRFS_INODE_ITEM_KEY;
43981a4bcf47SFilipe Manana 	key.offset = 0;
43991a4bcf47SFilipe Manana 
44001a4bcf47SFilipe Manana 	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
44011a4bcf47SFilipe Manana 	if (ret < 0) {
44021a4bcf47SFilipe Manana 		return ret;
44031a4bcf47SFilipe Manana 	} else if (ret > 0) {
44042f2ff0eeSFilipe Manana 		*size_ret = 0;
44051a4bcf47SFilipe Manana 	} else {
44061a4bcf47SFilipe Manana 		struct btrfs_inode_item *item;
44071a4bcf47SFilipe Manana 
44081a4bcf47SFilipe Manana 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
44091a4bcf47SFilipe Manana 				      struct btrfs_inode_item);
44101a4bcf47SFilipe Manana 		*size_ret = btrfs_inode_size(path->nodes[0], item);
44111a4bcf47SFilipe Manana 	}
44121a4bcf47SFilipe Manana 
44131a4bcf47SFilipe Manana 	btrfs_release_path(path);
44141a4bcf47SFilipe Manana 	return 0;
44151a4bcf47SFilipe Manana }
44161a4bcf47SFilipe Manana 
441736283bf7SFilipe Manana /*
441836283bf7SFilipe Manana  * At the moment we always log all xattrs. This is to figure out at log replay
441936283bf7SFilipe Manana  * time which xattrs must have their deletion replayed. If a xattr is missing
442036283bf7SFilipe Manana  * in the log tree and exists in the fs/subvol tree, we delete it. This is
442136283bf7SFilipe Manana  * because if a xattr is deleted, the inode is fsynced and a power failure
442236283bf7SFilipe Manana  * happens, causing the log to be replayed the next time the fs is mounted,
442336283bf7SFilipe Manana  * we want the xattr to not exist anymore (same behaviour as other filesystems
442436283bf7SFilipe Manana  * with a journal, ext3/4, xfs, f2fs, etc).
442536283bf7SFilipe Manana  */
442636283bf7SFilipe Manana static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
442736283bf7SFilipe Manana 				struct btrfs_root *root,
44281a93c36aSNikolay Borisov 				struct btrfs_inode *inode,
442936283bf7SFilipe Manana 				struct btrfs_path *path,
443036283bf7SFilipe Manana 				struct btrfs_path *dst_path)
443136283bf7SFilipe Manana {
443236283bf7SFilipe Manana 	int ret;
443336283bf7SFilipe Manana 	struct btrfs_key key;
44341a93c36aSNikolay Borisov 	const u64 ino = btrfs_ino(inode);
443536283bf7SFilipe Manana 	int ins_nr = 0;
443636283bf7SFilipe Manana 	int start_slot = 0;
443736283bf7SFilipe Manana 
443836283bf7SFilipe Manana 	key.objectid = ino;
443936283bf7SFilipe Manana 	key.type = BTRFS_XATTR_ITEM_KEY;
444036283bf7SFilipe Manana 	key.offset = 0;
444136283bf7SFilipe Manana 
444236283bf7SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
444336283bf7SFilipe Manana 	if (ret < 0)
444436283bf7SFilipe Manana 		return ret;
444536283bf7SFilipe Manana 
444636283bf7SFilipe Manana 	while (true) {
444736283bf7SFilipe Manana 		int slot = path->slots[0];
444836283bf7SFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
444936283bf7SFilipe Manana 		int nritems = btrfs_header_nritems(leaf);
445036283bf7SFilipe Manana 
445136283bf7SFilipe Manana 		if (slot >= nritems) {
445236283bf7SFilipe Manana 			if (ins_nr > 0) {
445336283bf7SFilipe Manana 				u64 last_extent = 0;
445436283bf7SFilipe Manana 
44551a93c36aSNikolay Borisov 				ret = copy_items(trans, inode, dst_path, path,
445636283bf7SFilipe Manana 						 &last_extent, start_slot,
445736283bf7SFilipe Manana 						 ins_nr, 1, 0);
445836283bf7SFilipe Manana 				/* can't be 1, extent items aren't processed */
445936283bf7SFilipe Manana 				ASSERT(ret <= 0);
446036283bf7SFilipe Manana 				if (ret < 0)
446136283bf7SFilipe Manana 					return ret;
446236283bf7SFilipe Manana 				ins_nr = 0;
446336283bf7SFilipe Manana 			}
446436283bf7SFilipe Manana 			ret = btrfs_next_leaf(root, path);
446536283bf7SFilipe Manana 			if (ret < 0)
446636283bf7SFilipe Manana 				return ret;
446736283bf7SFilipe Manana 			else if (ret > 0)
446836283bf7SFilipe Manana 				break;
446936283bf7SFilipe Manana 			continue;
447036283bf7SFilipe Manana 		}
447136283bf7SFilipe Manana 
447236283bf7SFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, slot);
447336283bf7SFilipe Manana 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
447436283bf7SFilipe Manana 			break;
447536283bf7SFilipe Manana 
447636283bf7SFilipe Manana 		if (ins_nr == 0)
447736283bf7SFilipe Manana 			start_slot = slot;
447836283bf7SFilipe Manana 		ins_nr++;
447936283bf7SFilipe Manana 		path->slots[0]++;
448036283bf7SFilipe Manana 		cond_resched();
448136283bf7SFilipe Manana 	}
448236283bf7SFilipe Manana 	if (ins_nr > 0) {
448336283bf7SFilipe Manana 		u64 last_extent = 0;
448436283bf7SFilipe Manana 
44851a93c36aSNikolay Borisov 		ret = copy_items(trans, inode, dst_path, path,
448636283bf7SFilipe Manana 				 &last_extent, start_slot,
448736283bf7SFilipe Manana 				 ins_nr, 1, 0);
448836283bf7SFilipe Manana 		/* can't be 1, extent items aren't processed */
448936283bf7SFilipe Manana 		ASSERT(ret <= 0);
449036283bf7SFilipe Manana 		if (ret < 0)
449136283bf7SFilipe Manana 			return ret;
449236283bf7SFilipe Manana 	}
449336283bf7SFilipe Manana 
449436283bf7SFilipe Manana 	return 0;
449536283bf7SFilipe Manana }
449636283bf7SFilipe Manana 
4497a89ca6f2SFilipe Manana /*
4498a89ca6f2SFilipe Manana  * If the no holes feature is enabled we need to make sure any hole between the
4499a89ca6f2SFilipe Manana  * last extent and the i_size of our inode is explicitly marked in the log. This
4500a89ca6f2SFilipe Manana  * is to make sure that doing something like:
4501a89ca6f2SFilipe Manana  *
4502a89ca6f2SFilipe Manana  *      1) create file with 128Kb of data
4503a89ca6f2SFilipe Manana  *      2) truncate file to 64Kb
4504a89ca6f2SFilipe Manana  *      3) truncate file to 256Kb
4505a89ca6f2SFilipe Manana  *      4) fsync file
4506a89ca6f2SFilipe Manana  *      5) <crash/power failure>
4507a89ca6f2SFilipe Manana  *      6) mount fs and trigger log replay
4508a89ca6f2SFilipe Manana  *
4509a89ca6f2SFilipe Manana  * Will give us a file with a size of 256Kb, the first 64Kb of data match what
4510a89ca6f2SFilipe Manana  * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4511a89ca6f2SFilipe Manana  * file correspond to a hole. The presence of explicit holes in a log tree is
4512a89ca6f2SFilipe Manana  * what guarantees that log replay will remove/adjust file extent items in the
4513a89ca6f2SFilipe Manana  * fs/subvol tree.
4514a89ca6f2SFilipe Manana  *
4515a89ca6f2SFilipe Manana  * Here we do not need to care about holes between extents, that is already done
4516a89ca6f2SFilipe Manana  * by copy_items(). We also only need to do this in the full sync path, where we
4517a89ca6f2SFilipe Manana  * lookup for extents from the fs/subvol tree only. In the fast path case, we
4518a89ca6f2SFilipe Manana  * lookup the list of modified extent maps and if any represents a hole, we
4519a89ca6f2SFilipe Manana  * insert a corresponding extent representing a hole in the log tree.
4520a89ca6f2SFilipe Manana  */
4521a89ca6f2SFilipe Manana static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4522a89ca6f2SFilipe Manana 				   struct btrfs_root *root,
4523a0308dd7SNikolay Borisov 				   struct btrfs_inode *inode,
4524a89ca6f2SFilipe Manana 				   struct btrfs_path *path)
4525a89ca6f2SFilipe Manana {
45260b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
4527a89ca6f2SFilipe Manana 	int ret;
4528a89ca6f2SFilipe Manana 	struct btrfs_key key;
4529a89ca6f2SFilipe Manana 	u64 hole_start;
4530a89ca6f2SFilipe Manana 	u64 hole_size;
4531a89ca6f2SFilipe Manana 	struct extent_buffer *leaf;
4532a89ca6f2SFilipe Manana 	struct btrfs_root *log = root->log_root;
4533a0308dd7SNikolay Borisov 	const u64 ino = btrfs_ino(inode);
4534a0308dd7SNikolay Borisov 	const u64 i_size = i_size_read(&inode->vfs_inode);
4535a89ca6f2SFilipe Manana 
45360b246afaSJeff Mahoney 	if (!btrfs_fs_incompat(fs_info, NO_HOLES))
4537a89ca6f2SFilipe Manana 		return 0;
4538a89ca6f2SFilipe Manana 
4539a89ca6f2SFilipe Manana 	key.objectid = ino;
4540a89ca6f2SFilipe Manana 	key.type = BTRFS_EXTENT_DATA_KEY;
4541a89ca6f2SFilipe Manana 	key.offset = (u64)-1;
4542a89ca6f2SFilipe Manana 
4543a89ca6f2SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4544a89ca6f2SFilipe Manana 	ASSERT(ret != 0);
4545a89ca6f2SFilipe Manana 	if (ret < 0)
4546a89ca6f2SFilipe Manana 		return ret;
4547a89ca6f2SFilipe Manana 
4548a89ca6f2SFilipe Manana 	ASSERT(path->slots[0] > 0);
4549a89ca6f2SFilipe Manana 	path->slots[0]--;
4550a89ca6f2SFilipe Manana 	leaf = path->nodes[0];
4551a89ca6f2SFilipe Manana 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4552a89ca6f2SFilipe Manana 
4553a89ca6f2SFilipe Manana 	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4554a89ca6f2SFilipe Manana 		/* inode does not have any extents */
4555a89ca6f2SFilipe Manana 		hole_start = 0;
4556a89ca6f2SFilipe Manana 		hole_size = i_size;
4557a89ca6f2SFilipe Manana 	} else {
4558a89ca6f2SFilipe Manana 		struct btrfs_file_extent_item *extent;
4559a89ca6f2SFilipe Manana 		u64 len;
4560a89ca6f2SFilipe Manana 
4561a89ca6f2SFilipe Manana 		/*
4562a89ca6f2SFilipe Manana 		 * If there's an extent beyond i_size, an explicit hole was
4563a89ca6f2SFilipe Manana 		 * already inserted by copy_items().
4564a89ca6f2SFilipe Manana 		 */
4565a89ca6f2SFilipe Manana 		if (key.offset >= i_size)
4566a89ca6f2SFilipe Manana 			return 0;
4567a89ca6f2SFilipe Manana 
4568a89ca6f2SFilipe Manana 		extent = btrfs_item_ptr(leaf, path->slots[0],
4569a89ca6f2SFilipe Manana 					struct btrfs_file_extent_item);
4570a89ca6f2SFilipe Manana 
4571a89ca6f2SFilipe Manana 		if (btrfs_file_extent_type(leaf, extent) ==
4572a89ca6f2SFilipe Manana 		    BTRFS_FILE_EXTENT_INLINE) {
4573e41ca589SQu Wenruo 			len = btrfs_file_extent_ram_bytes(leaf, extent);
45746399fb5aSFilipe Manana 			ASSERT(len == i_size ||
45756399fb5aSFilipe Manana 			       (len == fs_info->sectorsize &&
45766399fb5aSFilipe Manana 				btrfs_file_extent_compression(leaf, extent) !=
45776399fb5aSFilipe Manana 				BTRFS_COMPRESS_NONE));
4578a89ca6f2SFilipe Manana 			return 0;
4579a89ca6f2SFilipe Manana 		}
4580a89ca6f2SFilipe Manana 
4581a89ca6f2SFilipe Manana 		len = btrfs_file_extent_num_bytes(leaf, extent);
4582a89ca6f2SFilipe Manana 		/* Last extent goes beyond i_size, no need to log a hole. */
4583a89ca6f2SFilipe Manana 		if (key.offset + len > i_size)
4584a89ca6f2SFilipe Manana 			return 0;
4585a89ca6f2SFilipe Manana 		hole_start = key.offset + len;
4586a89ca6f2SFilipe Manana 		hole_size = i_size - hole_start;
4587a89ca6f2SFilipe Manana 	}
4588a89ca6f2SFilipe Manana 	btrfs_release_path(path);
4589a89ca6f2SFilipe Manana 
4590a89ca6f2SFilipe Manana 	/* Last extent ends at i_size. */
4591a89ca6f2SFilipe Manana 	if (hole_size == 0)
4592a89ca6f2SFilipe Manana 		return 0;
4593a89ca6f2SFilipe Manana 
45940b246afaSJeff Mahoney 	hole_size = ALIGN(hole_size, fs_info->sectorsize);
4595a89ca6f2SFilipe Manana 	ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4596a89ca6f2SFilipe Manana 				       hole_size, 0, hole_size, 0, 0, 0);
4597a89ca6f2SFilipe Manana 	return ret;
4598a89ca6f2SFilipe Manana }
4599a89ca6f2SFilipe Manana 
460056f23fdbSFilipe Manana /*
460156f23fdbSFilipe Manana  * When we are logging a new inode X, check if it doesn't have a reference that
460256f23fdbSFilipe Manana  * matches the reference from some other inode Y created in a past transaction
460356f23fdbSFilipe Manana  * and that was renamed in the current transaction. If we don't do this, then at
460456f23fdbSFilipe Manana  * log replay time we can lose inode Y (and all its files if it's a directory):
460556f23fdbSFilipe Manana  *
460656f23fdbSFilipe Manana  * mkdir /mnt/x
460756f23fdbSFilipe Manana  * echo "hello world" > /mnt/x/foobar
460856f23fdbSFilipe Manana  * sync
460956f23fdbSFilipe Manana  * mv /mnt/x /mnt/y
461056f23fdbSFilipe Manana  * mkdir /mnt/x                 # or touch /mnt/x
461156f23fdbSFilipe Manana  * xfs_io -c fsync /mnt/x
461256f23fdbSFilipe Manana  * <power fail>
461356f23fdbSFilipe Manana  * mount fs, trigger log replay
461456f23fdbSFilipe Manana  *
461556f23fdbSFilipe Manana  * After the log replay procedure, we would lose the first directory and all its
461656f23fdbSFilipe Manana  * files (file foobar).
461756f23fdbSFilipe Manana  * For the case where inode Y is not a directory we simply end up losing it:
461856f23fdbSFilipe Manana  *
461956f23fdbSFilipe Manana  * echo "123" > /mnt/foo
462056f23fdbSFilipe Manana  * sync
462156f23fdbSFilipe Manana  * mv /mnt/foo /mnt/bar
462256f23fdbSFilipe Manana  * echo "abc" > /mnt/foo
462356f23fdbSFilipe Manana  * xfs_io -c fsync /mnt/foo
462456f23fdbSFilipe Manana  * <power fail>
462556f23fdbSFilipe Manana  *
462656f23fdbSFilipe Manana  * We also need this for cases where a snapshot entry is replaced by some other
462756f23fdbSFilipe Manana  * entry (file or directory) otherwise we end up with an unreplayable log due to
462856f23fdbSFilipe Manana  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
462956f23fdbSFilipe Manana  * if it were a regular entry:
463056f23fdbSFilipe Manana  *
463156f23fdbSFilipe Manana  * mkdir /mnt/x
463256f23fdbSFilipe Manana  * btrfs subvolume snapshot /mnt /mnt/x/snap
463356f23fdbSFilipe Manana  * btrfs subvolume delete /mnt/x/snap
463456f23fdbSFilipe Manana  * rmdir /mnt/x
463556f23fdbSFilipe Manana  * mkdir /mnt/x
463656f23fdbSFilipe Manana  * fsync /mnt/x or fsync some new file inside it
463756f23fdbSFilipe Manana  * <power fail>
463856f23fdbSFilipe Manana  *
463956f23fdbSFilipe Manana  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
464056f23fdbSFilipe Manana  * the same transaction.
464156f23fdbSFilipe Manana  */
464256f23fdbSFilipe Manana static int btrfs_check_ref_name_override(struct extent_buffer *eb,
464356f23fdbSFilipe Manana 					 const int slot,
464456f23fdbSFilipe Manana 					 const struct btrfs_key *key,
46454791c8f1SNikolay Borisov 					 struct btrfs_inode *inode,
464644f714daSFilipe Manana 					 u64 *other_ino)
464756f23fdbSFilipe Manana {
464856f23fdbSFilipe Manana 	int ret;
464956f23fdbSFilipe Manana 	struct btrfs_path *search_path;
465056f23fdbSFilipe Manana 	char *name = NULL;
465156f23fdbSFilipe Manana 	u32 name_len = 0;
465256f23fdbSFilipe Manana 	u32 item_size = btrfs_item_size_nr(eb, slot);
465356f23fdbSFilipe Manana 	u32 cur_offset = 0;
465456f23fdbSFilipe Manana 	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
465556f23fdbSFilipe Manana 
465656f23fdbSFilipe Manana 	search_path = btrfs_alloc_path();
465756f23fdbSFilipe Manana 	if (!search_path)
465856f23fdbSFilipe Manana 		return -ENOMEM;
465956f23fdbSFilipe Manana 	search_path->search_commit_root = 1;
466056f23fdbSFilipe Manana 	search_path->skip_locking = 1;
466156f23fdbSFilipe Manana 
466256f23fdbSFilipe Manana 	while (cur_offset < item_size) {
466356f23fdbSFilipe Manana 		u64 parent;
466456f23fdbSFilipe Manana 		u32 this_name_len;
466556f23fdbSFilipe Manana 		u32 this_len;
466656f23fdbSFilipe Manana 		unsigned long name_ptr;
466756f23fdbSFilipe Manana 		struct btrfs_dir_item *di;
466856f23fdbSFilipe Manana 
466956f23fdbSFilipe Manana 		if (key->type == BTRFS_INODE_REF_KEY) {
467056f23fdbSFilipe Manana 			struct btrfs_inode_ref *iref;
467156f23fdbSFilipe Manana 
467256f23fdbSFilipe Manana 			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
467356f23fdbSFilipe Manana 			parent = key->offset;
467456f23fdbSFilipe Manana 			this_name_len = btrfs_inode_ref_name_len(eb, iref);
467556f23fdbSFilipe Manana 			name_ptr = (unsigned long)(iref + 1);
467656f23fdbSFilipe Manana 			this_len = sizeof(*iref) + this_name_len;
467756f23fdbSFilipe Manana 		} else {
467856f23fdbSFilipe Manana 			struct btrfs_inode_extref *extref;
467956f23fdbSFilipe Manana 
468056f23fdbSFilipe Manana 			extref = (struct btrfs_inode_extref *)(ptr +
468156f23fdbSFilipe Manana 							       cur_offset);
468256f23fdbSFilipe Manana 			parent = btrfs_inode_extref_parent(eb, extref);
468356f23fdbSFilipe Manana 			this_name_len = btrfs_inode_extref_name_len(eb, extref);
468456f23fdbSFilipe Manana 			name_ptr = (unsigned long)&extref->name;
468556f23fdbSFilipe Manana 			this_len = sizeof(*extref) + this_name_len;
468656f23fdbSFilipe Manana 		}
468756f23fdbSFilipe Manana 
468856f23fdbSFilipe Manana 		if (this_name_len > name_len) {
468956f23fdbSFilipe Manana 			char *new_name;
469056f23fdbSFilipe Manana 
469156f23fdbSFilipe Manana 			new_name = krealloc(name, this_name_len, GFP_NOFS);
469256f23fdbSFilipe Manana 			if (!new_name) {
469356f23fdbSFilipe Manana 				ret = -ENOMEM;
469456f23fdbSFilipe Manana 				goto out;
469556f23fdbSFilipe Manana 			}
469656f23fdbSFilipe Manana 			name_len = this_name_len;
469756f23fdbSFilipe Manana 			name = new_name;
469856f23fdbSFilipe Manana 		}
469956f23fdbSFilipe Manana 
470056f23fdbSFilipe Manana 		read_extent_buffer(eb, name, name_ptr, this_name_len);
47014791c8f1SNikolay Borisov 		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
47024791c8f1SNikolay Borisov 				parent, name, this_name_len, 0);
470356f23fdbSFilipe Manana 		if (di && !IS_ERR(di)) {
470444f714daSFilipe Manana 			struct btrfs_key di_key;
470544f714daSFilipe Manana 
470644f714daSFilipe Manana 			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
470744f714daSFilipe Manana 						  di, &di_key);
470844f714daSFilipe Manana 			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
470956f23fdbSFilipe Manana 				ret = 1;
471044f714daSFilipe Manana 				*other_ino = di_key.objectid;
471144f714daSFilipe Manana 			} else {
471244f714daSFilipe Manana 				ret = -EAGAIN;
471344f714daSFilipe Manana 			}
471456f23fdbSFilipe Manana 			goto out;
471556f23fdbSFilipe Manana 		} else if (IS_ERR(di)) {
471656f23fdbSFilipe Manana 			ret = PTR_ERR(di);
471756f23fdbSFilipe Manana 			goto out;
471856f23fdbSFilipe Manana 		}
471956f23fdbSFilipe Manana 		btrfs_release_path(search_path);
472056f23fdbSFilipe Manana 
472156f23fdbSFilipe Manana 		cur_offset += this_len;
472256f23fdbSFilipe Manana 	}
472356f23fdbSFilipe Manana 	ret = 0;
472456f23fdbSFilipe Manana out:
472556f23fdbSFilipe Manana 	btrfs_free_path(search_path);
472656f23fdbSFilipe Manana 	kfree(name);
472756f23fdbSFilipe Manana 	return ret;
472856f23fdbSFilipe Manana }
472956f23fdbSFilipe Manana 
4730e02119d5SChris Mason /* log a single inode in the tree log.
4731e02119d5SChris Mason  * At least one parent directory for this inode must exist in the tree
4732e02119d5SChris Mason  * or be logged already.
4733e02119d5SChris Mason  *
4734e02119d5SChris Mason  * Any items from this inode changed by the current transaction are copied
4735e02119d5SChris Mason  * to the log tree.  An extra reference is taken on any extents in this
4736e02119d5SChris Mason  * file, allowing us to avoid a whole pile of corner cases around logging
4737e02119d5SChris Mason  * blocks that have been removed from the tree.
4738e02119d5SChris Mason  *
4739e02119d5SChris Mason  * See LOG_INODE_ALL and related defines for a description of what inode_only
4740e02119d5SChris Mason  * does.
4741e02119d5SChris Mason  *
4742e02119d5SChris Mason  * This handles both files and directories.
4743e02119d5SChris Mason  */
474412fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4745a59108a7SNikolay Borisov 			   struct btrfs_root *root, struct btrfs_inode *inode,
474649dae1bcSFilipe Manana 			   int inode_only,
474749dae1bcSFilipe Manana 			   const loff_t start,
47488407f553SFilipe Manana 			   const loff_t end,
47498407f553SFilipe Manana 			   struct btrfs_log_ctx *ctx)
4750e02119d5SChris Mason {
47510b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
4752e02119d5SChris Mason 	struct btrfs_path *path;
4753e02119d5SChris Mason 	struct btrfs_path *dst_path;
4754e02119d5SChris Mason 	struct btrfs_key min_key;
4755e02119d5SChris Mason 	struct btrfs_key max_key;
4756e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
475716e7549fSJosef Bacik 	u64 last_extent = 0;
47584a500fd1SYan, Zheng 	int err = 0;
4759e02119d5SChris Mason 	int ret;
47603a5f1d45SChris Mason 	int nritems;
476131ff1cd2SChris Mason 	int ins_start_slot = 0;
476231ff1cd2SChris Mason 	int ins_nr;
47635dc562c5SJosef Bacik 	bool fast_search = false;
4764a59108a7SNikolay Borisov 	u64 ino = btrfs_ino(inode);
4765a59108a7SNikolay Borisov 	struct extent_map_tree *em_tree = &inode->extent_tree;
47661a4bcf47SFilipe Manana 	u64 logged_isize = 0;
4767e4545de5SFilipe Manana 	bool need_log_inode_item = true;
47689a8fca62SFilipe Manana 	bool xattrs_logged = false;
4769e02119d5SChris Mason 
4770e02119d5SChris Mason 	path = btrfs_alloc_path();
47715df67083STsutomu Itoh 	if (!path)
47725df67083STsutomu Itoh 		return -ENOMEM;
4773e02119d5SChris Mason 	dst_path = btrfs_alloc_path();
47745df67083STsutomu Itoh 	if (!dst_path) {
47755df67083STsutomu Itoh 		btrfs_free_path(path);
47765df67083STsutomu Itoh 		return -ENOMEM;
47775df67083STsutomu Itoh 	}
4778e02119d5SChris Mason 
477933345d01SLi Zefan 	min_key.objectid = ino;
4780e02119d5SChris Mason 	min_key.type = BTRFS_INODE_ITEM_KEY;
4781e02119d5SChris Mason 	min_key.offset = 0;
4782e02119d5SChris Mason 
478333345d01SLi Zefan 	max_key.objectid = ino;
478412fcfd22SChris Mason 
478512fcfd22SChris Mason 
47865dc562c5SJosef Bacik 	/* today the code can only do partial logging of directories */
4787a59108a7SNikolay Borisov 	if (S_ISDIR(inode->vfs_inode.i_mode) ||
47885269b67eSMiao Xie 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4789a59108a7SNikolay Borisov 		       &inode->runtime_flags) &&
4790781feef7SLiu Bo 	     inode_only >= LOG_INODE_EXISTS))
4791e02119d5SChris Mason 		max_key.type = BTRFS_XATTR_ITEM_KEY;
4792e02119d5SChris Mason 	else
4793e02119d5SChris Mason 		max_key.type = (u8)-1;
4794e02119d5SChris Mason 	max_key.offset = (u64)-1;
4795e02119d5SChris Mason 
47962c2c452bSFilipe Manana 	/*
47972c2c452bSFilipe Manana 	 * Only run delayed items if we are a dir or a new file.
47982c2c452bSFilipe Manana 	 * Otherwise commit the delayed inode only, which is needed in
47992c2c452bSFilipe Manana 	 * order for the log replay code to mark inodes for link count
48002c2c452bSFilipe Manana 	 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
48012c2c452bSFilipe Manana 	 */
4802a59108a7SNikolay Borisov 	if (S_ISDIR(inode->vfs_inode.i_mode) ||
4803a59108a7SNikolay Borisov 	    inode->generation > fs_info->last_trans_committed)
4804a59108a7SNikolay Borisov 		ret = btrfs_commit_inode_delayed_items(trans, inode);
48052c2c452bSFilipe Manana 	else
4806a59108a7SNikolay Borisov 		ret = btrfs_commit_inode_delayed_inode(inode);
48072c2c452bSFilipe Manana 
480816cdcec7SMiao Xie 	if (ret) {
480916cdcec7SMiao Xie 		btrfs_free_path(path);
481016cdcec7SMiao Xie 		btrfs_free_path(dst_path);
481116cdcec7SMiao Xie 		return ret;
481216cdcec7SMiao Xie 	}
481316cdcec7SMiao Xie 
4814781feef7SLiu Bo 	if (inode_only == LOG_OTHER_INODE) {
4815781feef7SLiu Bo 		inode_only = LOG_INODE_EXISTS;
4816a59108a7SNikolay Borisov 		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
4817781feef7SLiu Bo 	} else {
4818a59108a7SNikolay Borisov 		mutex_lock(&inode->log_mutex);
4819781feef7SLiu Bo 	}
4820e02119d5SChris Mason 
48215e33a2bdSFilipe Manana 	/*
4822e02119d5SChris Mason 	 * a brute force approach to making sure we get the most uptodate
4823e02119d5SChris Mason 	 * copies of everything.
4824e02119d5SChris Mason 	 */
4825a59108a7SNikolay Borisov 	if (S_ISDIR(inode->vfs_inode.i_mode)) {
4826e02119d5SChris Mason 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
4827e02119d5SChris Mason 
48284f764e51SFilipe Manana 		if (inode_only == LOG_INODE_EXISTS)
48294f764e51SFilipe Manana 			max_key_type = BTRFS_XATTR_ITEM_KEY;
483033345d01SLi Zefan 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4831e02119d5SChris Mason 	} else {
48321a4bcf47SFilipe Manana 		if (inode_only == LOG_INODE_EXISTS) {
48331a4bcf47SFilipe Manana 			/*
48341a4bcf47SFilipe Manana 			 * Make sure the new inode item we write to the log has
48351a4bcf47SFilipe Manana 			 * the same isize as the current one (if it exists).
48361a4bcf47SFilipe Manana 			 * This is necessary to prevent data loss after log
48371a4bcf47SFilipe Manana 			 * replay, and also to prevent doing a wrong expanding
48381a4bcf47SFilipe Manana 			 * truncate - for e.g. create file, write 4K into offset
48391a4bcf47SFilipe Manana 			 * 0, fsync, write 4K into offset 4096, add hard link,
48401a4bcf47SFilipe Manana 			 * fsync some other file (to sync log), power fail - if
48411a4bcf47SFilipe Manana 			 * we use the inode's current i_size, after log replay
48421a4bcf47SFilipe Manana 			 * we get a 8Kb file, with the last 4Kb extent as a hole
48431a4bcf47SFilipe Manana 			 * (zeroes), as if an expanding truncate happened,
48441a4bcf47SFilipe Manana 			 * instead of getting a file of 4Kb only.
48451a4bcf47SFilipe Manana 			 */
4846a59108a7SNikolay Borisov 			err = logged_inode_size(log, inode, path, &logged_isize);
48471a4bcf47SFilipe Manana 			if (err)
48481a4bcf47SFilipe Manana 				goto out_unlock;
48491a4bcf47SFilipe Manana 		}
4850a742994aSFilipe Manana 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4851a59108a7SNikolay Borisov 			     &inode->runtime_flags)) {
4852a742994aSFilipe Manana 			if (inode_only == LOG_INODE_EXISTS) {
48534f764e51SFilipe Manana 				max_key.type = BTRFS_XATTR_ITEM_KEY;
4854a742994aSFilipe Manana 				ret = drop_objectid_items(trans, log, path, ino,
4855a742994aSFilipe Manana 							  max_key.type);
4856a742994aSFilipe Manana 			} else {
4857a742994aSFilipe Manana 				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4858a59108a7SNikolay Borisov 					  &inode->runtime_flags);
4859e9976151SJosef Bacik 				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4860a59108a7SNikolay Borisov 					  &inode->runtime_flags);
486128ed1345SChris Mason 				while(1) {
486228ed1345SChris Mason 					ret = btrfs_truncate_inode_items(trans,
4863a59108a7SNikolay Borisov 						log, &inode->vfs_inode, 0, 0);
486428ed1345SChris Mason 					if (ret != -EAGAIN)
486528ed1345SChris Mason 						break;
486628ed1345SChris Mason 				}
4867a742994aSFilipe Manana 			}
48684f764e51SFilipe Manana 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4869a59108a7SNikolay Borisov 					      &inode->runtime_flags) ||
48706cfab851SJosef Bacik 			   inode_only == LOG_INODE_EXISTS) {
48714f764e51SFilipe Manana 			if (inode_only == LOG_INODE_ALL)
4872a95249b3SJosef Bacik 				fast_search = true;
4873a95249b3SJosef Bacik 			max_key.type = BTRFS_XATTR_ITEM_KEY;
4874a95249b3SJosef Bacik 			ret = drop_objectid_items(trans, log, path, ino,
4875a95249b3SJosef Bacik 						  max_key.type);
48765dc562c5SJosef Bacik 		} else {
4877183f37faSLiu Bo 			if (inode_only == LOG_INODE_ALL)
48785dc562c5SJosef Bacik 				fast_search = true;
4879a95249b3SJosef Bacik 			goto log_extents;
4880a95249b3SJosef Bacik 		}
4881a95249b3SJosef Bacik 
4882e02119d5SChris Mason 	}
48834a500fd1SYan, Zheng 	if (ret) {
48844a500fd1SYan, Zheng 		err = ret;
48854a500fd1SYan, Zheng 		goto out_unlock;
48864a500fd1SYan, Zheng 	}
4887e02119d5SChris Mason 
4888e02119d5SChris Mason 	while (1) {
488931ff1cd2SChris Mason 		ins_nr = 0;
48906174d3cbSFilipe David Borba Manana 		ret = btrfs_search_forward(root, &min_key,
4891de78b51aSEric Sandeen 					   path, trans->transid);
4892fb770ae4SLiu Bo 		if (ret < 0) {
4893fb770ae4SLiu Bo 			err = ret;
4894fb770ae4SLiu Bo 			goto out_unlock;
4895fb770ae4SLiu Bo 		}
4896e02119d5SChris Mason 		if (ret != 0)
4897e02119d5SChris Mason 			break;
48983a5f1d45SChris Mason again:
489931ff1cd2SChris Mason 		/* note, ins_nr might be > 0 here, cleanup outside the loop */
490033345d01SLi Zefan 		if (min_key.objectid != ino)
4901e02119d5SChris Mason 			break;
4902e02119d5SChris Mason 		if (min_key.type > max_key.type)
4903e02119d5SChris Mason 			break;
490431ff1cd2SChris Mason 
4905e4545de5SFilipe Manana 		if (min_key.type == BTRFS_INODE_ITEM_KEY)
4906e4545de5SFilipe Manana 			need_log_inode_item = false;
4907e4545de5SFilipe Manana 
490856f23fdbSFilipe Manana 		if ((min_key.type == BTRFS_INODE_REF_KEY ||
490956f23fdbSFilipe Manana 		     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
4910a59108a7SNikolay Borisov 		    inode->generation == trans->transid) {
491144f714daSFilipe Manana 			u64 other_ino = 0;
491244f714daSFilipe Manana 
491356f23fdbSFilipe Manana 			ret = btrfs_check_ref_name_override(path->nodes[0],
4914a59108a7SNikolay Borisov 					path->slots[0], &min_key, inode,
491544f714daSFilipe Manana 					&other_ino);
491656f23fdbSFilipe Manana 			if (ret < 0) {
491756f23fdbSFilipe Manana 				err = ret;
491856f23fdbSFilipe Manana 				goto out_unlock;
491928a23593SFilipe Manana 			} else if (ret > 0 && ctx &&
49204a0cc7caSNikolay Borisov 				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
492144f714daSFilipe Manana 				struct btrfs_key inode_key;
492244f714daSFilipe Manana 				struct inode *other_inode;
492344f714daSFilipe Manana 
492444f714daSFilipe Manana 				if (ins_nr > 0) {
492544f714daSFilipe Manana 					ins_nr++;
492644f714daSFilipe Manana 				} else {
492744f714daSFilipe Manana 					ins_nr = 1;
492844f714daSFilipe Manana 					ins_start_slot = path->slots[0];
492944f714daSFilipe Manana 				}
4930a59108a7SNikolay Borisov 				ret = copy_items(trans, inode, dst_path, path,
493144f714daSFilipe Manana 						 &last_extent, ins_start_slot,
493244f714daSFilipe Manana 						 ins_nr, inode_only,
493344f714daSFilipe Manana 						 logged_isize);
493444f714daSFilipe Manana 				if (ret < 0) {
493544f714daSFilipe Manana 					err = ret;
493656f23fdbSFilipe Manana 					goto out_unlock;
493756f23fdbSFilipe Manana 				}
493844f714daSFilipe Manana 				ins_nr = 0;
493944f714daSFilipe Manana 				btrfs_release_path(path);
494044f714daSFilipe Manana 				inode_key.objectid = other_ino;
494144f714daSFilipe Manana 				inode_key.type = BTRFS_INODE_ITEM_KEY;
494244f714daSFilipe Manana 				inode_key.offset = 0;
49430b246afaSJeff Mahoney 				other_inode = btrfs_iget(fs_info->sb,
494444f714daSFilipe Manana 							 &inode_key, root,
494544f714daSFilipe Manana 							 NULL);
494644f714daSFilipe Manana 				/*
494744f714daSFilipe Manana 				 * If the other inode that had a conflicting dir
494844f714daSFilipe Manana 				 * entry was deleted in the current transaction,
494944f714daSFilipe Manana 				 * we don't need to do more work nor fallback to
495044f714daSFilipe Manana 				 * a transaction commit.
495144f714daSFilipe Manana 				 */
495244f714daSFilipe Manana 				if (IS_ERR(other_inode) &&
495344f714daSFilipe Manana 				    PTR_ERR(other_inode) == -ENOENT) {
495444f714daSFilipe Manana 					goto next_key;
495544f714daSFilipe Manana 				} else if (IS_ERR(other_inode)) {
495644f714daSFilipe Manana 					err = PTR_ERR(other_inode);
495744f714daSFilipe Manana 					goto out_unlock;
495844f714daSFilipe Manana 				}
495944f714daSFilipe Manana 				/*
496044f714daSFilipe Manana 				 * We are safe logging the other inode without
496144f714daSFilipe Manana 				 * acquiring its i_mutex as long as we log with
496244f714daSFilipe Manana 				 * the LOG_INODE_EXISTS mode. We're safe against
496344f714daSFilipe Manana 				 * concurrent renames of the other inode as well
496444f714daSFilipe Manana 				 * because during a rename we pin the log and
496544f714daSFilipe Manana 				 * update the log with the new name before we
496644f714daSFilipe Manana 				 * unpin it.
496744f714daSFilipe Manana 				 */
4968a59108a7SNikolay Borisov 				err = btrfs_log_inode(trans, root,
4969a59108a7SNikolay Borisov 						BTRFS_I(other_inode),
4970a59108a7SNikolay Borisov 						LOG_OTHER_INODE, 0, LLONG_MAX,
4971a59108a7SNikolay Borisov 						ctx);
497244f714daSFilipe Manana 				iput(other_inode);
497344f714daSFilipe Manana 				if (err)
497444f714daSFilipe Manana 					goto out_unlock;
497544f714daSFilipe Manana 				else
497644f714daSFilipe Manana 					goto next_key;
497744f714daSFilipe Manana 			}
497856f23fdbSFilipe Manana 		}
497956f23fdbSFilipe Manana 
498036283bf7SFilipe Manana 		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
498136283bf7SFilipe Manana 		if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
498236283bf7SFilipe Manana 			if (ins_nr == 0)
498336283bf7SFilipe Manana 				goto next_slot;
4984a59108a7SNikolay Borisov 			ret = copy_items(trans, inode, dst_path, path,
498536283bf7SFilipe Manana 					 &last_extent, ins_start_slot,
498636283bf7SFilipe Manana 					 ins_nr, inode_only, logged_isize);
498736283bf7SFilipe Manana 			if (ret < 0) {
498836283bf7SFilipe Manana 				err = ret;
498936283bf7SFilipe Manana 				goto out_unlock;
499036283bf7SFilipe Manana 			}
499136283bf7SFilipe Manana 			ins_nr = 0;
499236283bf7SFilipe Manana 			if (ret) {
499336283bf7SFilipe Manana 				btrfs_release_path(path);
499436283bf7SFilipe Manana 				continue;
499536283bf7SFilipe Manana 			}
499636283bf7SFilipe Manana 			goto next_slot;
499736283bf7SFilipe Manana 		}
499836283bf7SFilipe Manana 
499931ff1cd2SChris Mason 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
500031ff1cd2SChris Mason 			ins_nr++;
500131ff1cd2SChris Mason 			goto next_slot;
500231ff1cd2SChris Mason 		} else if (!ins_nr) {
500331ff1cd2SChris Mason 			ins_start_slot = path->slots[0];
500431ff1cd2SChris Mason 			ins_nr = 1;
500531ff1cd2SChris Mason 			goto next_slot;
5006e02119d5SChris Mason 		}
5007e02119d5SChris Mason 
5008a59108a7SNikolay Borisov 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
50091a4bcf47SFilipe Manana 				 ins_start_slot, ins_nr, inode_only,
50101a4bcf47SFilipe Manana 				 logged_isize);
501116e7549fSJosef Bacik 		if (ret < 0) {
50124a500fd1SYan, Zheng 			err = ret;
50134a500fd1SYan, Zheng 			goto out_unlock;
5014a71db86eSRasmus Villemoes 		}
5015a71db86eSRasmus Villemoes 		if (ret) {
501616e7549fSJosef Bacik 			ins_nr = 0;
501716e7549fSJosef Bacik 			btrfs_release_path(path);
501816e7549fSJosef Bacik 			continue;
50194a500fd1SYan, Zheng 		}
502031ff1cd2SChris Mason 		ins_nr = 1;
502131ff1cd2SChris Mason 		ins_start_slot = path->slots[0];
502231ff1cd2SChris Mason next_slot:
5023e02119d5SChris Mason 
50243a5f1d45SChris Mason 		nritems = btrfs_header_nritems(path->nodes[0]);
50253a5f1d45SChris Mason 		path->slots[0]++;
50263a5f1d45SChris Mason 		if (path->slots[0] < nritems) {
50273a5f1d45SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
50283a5f1d45SChris Mason 					      path->slots[0]);
50293a5f1d45SChris Mason 			goto again;
50303a5f1d45SChris Mason 		}
503131ff1cd2SChris Mason 		if (ins_nr) {
5032a59108a7SNikolay Borisov 			ret = copy_items(trans, inode, dst_path, path,
503316e7549fSJosef Bacik 					 &last_extent, ins_start_slot,
50341a4bcf47SFilipe Manana 					 ins_nr, inode_only, logged_isize);
503516e7549fSJosef Bacik 			if (ret < 0) {
50364a500fd1SYan, Zheng 				err = ret;
50374a500fd1SYan, Zheng 				goto out_unlock;
50384a500fd1SYan, Zheng 			}
503916e7549fSJosef Bacik 			ret = 0;
504031ff1cd2SChris Mason 			ins_nr = 0;
504131ff1cd2SChris Mason 		}
5042b3b4aa74SDavid Sterba 		btrfs_release_path(path);
504344f714daSFilipe Manana next_key:
50443d41d702SFilipe David Borba Manana 		if (min_key.offset < (u64)-1) {
5045e02119d5SChris Mason 			min_key.offset++;
50463d41d702SFilipe David Borba Manana 		} else if (min_key.type < max_key.type) {
5047e02119d5SChris Mason 			min_key.type++;
50483d41d702SFilipe David Borba Manana 			min_key.offset = 0;
50493d41d702SFilipe David Borba Manana 		} else {
5050e02119d5SChris Mason 			break;
5051e02119d5SChris Mason 		}
50523d41d702SFilipe David Borba Manana 	}
505331ff1cd2SChris Mason 	if (ins_nr) {
5054a59108a7SNikolay Borisov 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
50551a4bcf47SFilipe Manana 				 ins_start_slot, ins_nr, inode_only,
50561a4bcf47SFilipe Manana 				 logged_isize);
505716e7549fSJosef Bacik 		if (ret < 0) {
50584a500fd1SYan, Zheng 			err = ret;
50594a500fd1SYan, Zheng 			goto out_unlock;
50604a500fd1SYan, Zheng 		}
506116e7549fSJosef Bacik 		ret = 0;
506231ff1cd2SChris Mason 		ins_nr = 0;
506331ff1cd2SChris Mason 	}
50645dc562c5SJosef Bacik 
506536283bf7SFilipe Manana 	btrfs_release_path(path);
506636283bf7SFilipe Manana 	btrfs_release_path(dst_path);
5067a59108a7SNikolay Borisov 	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
506836283bf7SFilipe Manana 	if (err)
506936283bf7SFilipe Manana 		goto out_unlock;
50709a8fca62SFilipe Manana 	xattrs_logged = true;
5071a89ca6f2SFilipe Manana 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5072a89ca6f2SFilipe Manana 		btrfs_release_path(path);
5073a89ca6f2SFilipe Manana 		btrfs_release_path(dst_path);
5074a59108a7SNikolay Borisov 		err = btrfs_log_trailing_hole(trans, root, inode, path);
5075a89ca6f2SFilipe Manana 		if (err)
5076a89ca6f2SFilipe Manana 			goto out_unlock;
5077a89ca6f2SFilipe Manana 	}
5078a95249b3SJosef Bacik log_extents:
5079f3b15ccdSJosef Bacik 	btrfs_release_path(path);
50805dc562c5SJosef Bacik 	btrfs_release_path(dst_path);
5081e4545de5SFilipe Manana 	if (need_log_inode_item) {
5082a59108a7SNikolay Borisov 		err = log_inode_item(trans, log, dst_path, inode);
50839a8fca62SFilipe Manana 		if (!err && !xattrs_logged) {
50849a8fca62SFilipe Manana 			err = btrfs_log_all_xattrs(trans, root, inode, path,
50859a8fca62SFilipe Manana 						   dst_path);
50869a8fca62SFilipe Manana 			btrfs_release_path(path);
50879a8fca62SFilipe Manana 		}
5088e4545de5SFilipe Manana 		if (err)
5089e4545de5SFilipe Manana 			goto out_unlock;
5090e4545de5SFilipe Manana 	}
5091f3b15ccdSJosef Bacik 	if (fast_search) {
5092a59108a7SNikolay Borisov 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
5093a2120a47SJosef Bacik 						ctx, start, end);
50945dc562c5SJosef Bacik 		if (ret) {
50955dc562c5SJosef Bacik 			err = ret;
50965dc562c5SJosef Bacik 			goto out_unlock;
50975dc562c5SJosef Bacik 		}
5098d006a048SJosef Bacik 	} else if (inode_only == LOG_INODE_ALL) {
509906d3d22bSLiu Bo 		struct extent_map *em, *n;
510006d3d22bSLiu Bo 
510149dae1bcSFilipe Manana 		write_lock(&em_tree->lock);
510249dae1bcSFilipe Manana 		/*
510349dae1bcSFilipe Manana 		 * We can't just remove every em if we're called for a ranged
510449dae1bcSFilipe Manana 		 * fsync - that is, one that doesn't cover the whole possible
510549dae1bcSFilipe Manana 		 * file range (0 to LLONG_MAX). This is because we can have
510649dae1bcSFilipe Manana 		 * em's that fall outside the range we're logging and therefore
510749dae1bcSFilipe Manana 		 * their ordered operations haven't completed yet
510849dae1bcSFilipe Manana 		 * (btrfs_finish_ordered_io() not invoked yet). This means we
510949dae1bcSFilipe Manana 		 * didn't get their respective file extent item in the fs/subvol
511049dae1bcSFilipe Manana 		 * tree yet, and need to let the next fast fsync (one which
511149dae1bcSFilipe Manana 		 * consults the list of modified extent maps) find the em so
511249dae1bcSFilipe Manana 		 * that it logs a matching file extent item and waits for the
511349dae1bcSFilipe Manana 		 * respective ordered operation to complete (if it's still
511449dae1bcSFilipe Manana 		 * running).
511549dae1bcSFilipe Manana 		 *
511649dae1bcSFilipe Manana 		 * Removing every em outside the range we're logging would make
511749dae1bcSFilipe Manana 		 * the next fast fsync not log their matching file extent items,
511849dae1bcSFilipe Manana 		 * therefore making us lose data after a log replay.
511949dae1bcSFilipe Manana 		 */
512049dae1bcSFilipe Manana 		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
512149dae1bcSFilipe Manana 					 list) {
512249dae1bcSFilipe Manana 			const u64 mod_end = em->mod_start + em->mod_len - 1;
512349dae1bcSFilipe Manana 
512449dae1bcSFilipe Manana 			if (em->mod_start >= start && mod_end <= end)
512506d3d22bSLiu Bo 				list_del_init(&em->list);
512649dae1bcSFilipe Manana 		}
512749dae1bcSFilipe Manana 		write_unlock(&em_tree->lock);
51285dc562c5SJosef Bacik 	}
51295dc562c5SJosef Bacik 
5130a59108a7SNikolay Borisov 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5131a59108a7SNikolay Borisov 		ret = log_directory_changes(trans, root, inode, path, dst_path,
5132a59108a7SNikolay Borisov 					ctx);
51334a500fd1SYan, Zheng 		if (ret) {
51344a500fd1SYan, Zheng 			err = ret;
51354a500fd1SYan, Zheng 			goto out_unlock;
51364a500fd1SYan, Zheng 		}
5137e02119d5SChris Mason 	}
513849dae1bcSFilipe Manana 
5139a59108a7SNikolay Borisov 	spin_lock(&inode->lock);
5140a59108a7SNikolay Borisov 	inode->logged_trans = trans->transid;
5141a59108a7SNikolay Borisov 	inode->last_log_commit = inode->last_sub_trans;
5142a59108a7SNikolay Borisov 	spin_unlock(&inode->lock);
51434a500fd1SYan, Zheng out_unlock:
5144a59108a7SNikolay Borisov 	mutex_unlock(&inode->log_mutex);
5145e02119d5SChris Mason 
5146e02119d5SChris Mason 	btrfs_free_path(path);
5147e02119d5SChris Mason 	btrfs_free_path(dst_path);
51484a500fd1SYan, Zheng 	return err;
5149e02119d5SChris Mason }
5150e02119d5SChris Mason 
515112fcfd22SChris Mason /*
51522be63d5cSFilipe Manana  * Check if we must fallback to a transaction commit when logging an inode.
51532be63d5cSFilipe Manana  * This must be called after logging the inode and is used only in the context
51542be63d5cSFilipe Manana  * when fsyncing an inode requires the need to log some other inode - in which
51552be63d5cSFilipe Manana  * case we can't lock the i_mutex of each other inode we need to log as that
51562be63d5cSFilipe Manana  * can lead to deadlocks with concurrent fsync against other inodes (as we can
51572be63d5cSFilipe Manana  * log inodes up or down in the hierarchy) or rename operations for example. So
51582be63d5cSFilipe Manana  * we take the log_mutex of the inode after we have logged it and then check for
51592be63d5cSFilipe Manana  * its last_unlink_trans value - this is safe because any task setting
51602be63d5cSFilipe Manana  * last_unlink_trans must take the log_mutex and it must do this before it does
51612be63d5cSFilipe Manana  * the actual unlink operation, so if we do this check before a concurrent task
51622be63d5cSFilipe Manana  * sets last_unlink_trans it means we've logged a consistent version/state of
51632be63d5cSFilipe Manana  * all the inode items, otherwise we are not sure and must do a transaction
516401327610SNicholas D Steeves  * commit (the concurrent task might have only updated last_unlink_trans before
51652be63d5cSFilipe Manana  * we logged the inode or it might have also done the unlink).
51662be63d5cSFilipe Manana  */
51672be63d5cSFilipe Manana static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5168ab1717b2SNikolay Borisov 					  struct btrfs_inode *inode)
51692be63d5cSFilipe Manana {
5170ab1717b2SNikolay Borisov 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
51712be63d5cSFilipe Manana 	bool ret = false;
51722be63d5cSFilipe Manana 
5173ab1717b2SNikolay Borisov 	mutex_lock(&inode->log_mutex);
5174ab1717b2SNikolay Borisov 	if (inode->last_unlink_trans > fs_info->last_trans_committed) {
51752be63d5cSFilipe Manana 		/*
51762be63d5cSFilipe Manana 		 * Make sure any commits to the log are forced to be full
51772be63d5cSFilipe Manana 		 * commits.
51782be63d5cSFilipe Manana 		 */
51792be63d5cSFilipe Manana 		btrfs_set_log_full_commit(fs_info, trans);
51802be63d5cSFilipe Manana 		ret = true;
51812be63d5cSFilipe Manana 	}
5182ab1717b2SNikolay Borisov 	mutex_unlock(&inode->log_mutex);
51832be63d5cSFilipe Manana 
51842be63d5cSFilipe Manana 	return ret;
51852be63d5cSFilipe Manana }
51862be63d5cSFilipe Manana 
51872be63d5cSFilipe Manana /*
518812fcfd22SChris Mason  * follow the dentry parent pointers up the chain and see if any
518912fcfd22SChris Mason  * of the directories in it require a full commit before they can
519012fcfd22SChris Mason  * be logged.  Returns zero if nothing special needs to be done or 1 if
519112fcfd22SChris Mason  * a full commit is required.
519212fcfd22SChris Mason  */
519312fcfd22SChris Mason static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
5194aefa6115SNikolay Borisov 					       struct btrfs_inode *inode,
519512fcfd22SChris Mason 					       struct dentry *parent,
519612fcfd22SChris Mason 					       struct super_block *sb,
519712fcfd22SChris Mason 					       u64 last_committed)
5198e02119d5SChris Mason {
519912fcfd22SChris Mason 	int ret = 0;
52006a912213SJosef Bacik 	struct dentry *old_parent = NULL;
5201aefa6115SNikolay Borisov 	struct btrfs_inode *orig_inode = inode;
5202e02119d5SChris Mason 
5203af4176b4SChris Mason 	/*
5204af4176b4SChris Mason 	 * for regular files, if its inode is already on disk, we don't
5205af4176b4SChris Mason 	 * have to worry about the parents at all.  This is because
5206af4176b4SChris Mason 	 * we can use the last_unlink_trans field to record renames
5207af4176b4SChris Mason 	 * and other fun in this file.
5208af4176b4SChris Mason 	 */
5209aefa6115SNikolay Borisov 	if (S_ISREG(inode->vfs_inode.i_mode) &&
5210aefa6115SNikolay Borisov 	    inode->generation <= last_committed &&
5211aefa6115SNikolay Borisov 	    inode->last_unlink_trans <= last_committed)
5212af4176b4SChris Mason 		goto out;
5213af4176b4SChris Mason 
5214aefa6115SNikolay Borisov 	if (!S_ISDIR(inode->vfs_inode.i_mode)) {
5215fc64005cSAl Viro 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
521612fcfd22SChris Mason 			goto out;
5217aefa6115SNikolay Borisov 		inode = BTRFS_I(d_inode(parent));
521812fcfd22SChris Mason 	}
521912fcfd22SChris Mason 
522012fcfd22SChris Mason 	while (1) {
5221de2b530bSJosef Bacik 		/*
5222de2b530bSJosef Bacik 		 * If we are logging a directory then we start with our inode,
522301327610SNicholas D Steeves 		 * not our parent's inode, so we need to skip setting the
5224de2b530bSJosef Bacik 		 * logged_trans so that further down in the log code we don't
5225de2b530bSJosef Bacik 		 * think this inode has already been logged.
5226de2b530bSJosef Bacik 		 */
5227de2b530bSJosef Bacik 		if (inode != orig_inode)
5228aefa6115SNikolay Borisov 			inode->logged_trans = trans->transid;
522912fcfd22SChris Mason 		smp_mb();
523012fcfd22SChris Mason 
5231aefa6115SNikolay Borisov 		if (btrfs_must_commit_transaction(trans, inode)) {
523212fcfd22SChris Mason 			ret = 1;
523312fcfd22SChris Mason 			break;
523412fcfd22SChris Mason 		}
523512fcfd22SChris Mason 
5236fc64005cSAl Viro 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
523712fcfd22SChris Mason 			break;
523812fcfd22SChris Mason 
523944f714daSFilipe Manana 		if (IS_ROOT(parent)) {
5240aefa6115SNikolay Borisov 			inode = BTRFS_I(d_inode(parent));
5241aefa6115SNikolay Borisov 			if (btrfs_must_commit_transaction(trans, inode))
524244f714daSFilipe Manana 				ret = 1;
524312fcfd22SChris Mason 			break;
524444f714daSFilipe Manana 		}
524512fcfd22SChris Mason 
52466a912213SJosef Bacik 		parent = dget_parent(parent);
52476a912213SJosef Bacik 		dput(old_parent);
52486a912213SJosef Bacik 		old_parent = parent;
5249aefa6115SNikolay Borisov 		inode = BTRFS_I(d_inode(parent));
525012fcfd22SChris Mason 
525112fcfd22SChris Mason 	}
52526a912213SJosef Bacik 	dput(old_parent);
525312fcfd22SChris Mason out:
5254e02119d5SChris Mason 	return ret;
5255e02119d5SChris Mason }
5256e02119d5SChris Mason 
52572f2ff0eeSFilipe Manana struct btrfs_dir_list {
52582f2ff0eeSFilipe Manana 	u64 ino;
52592f2ff0eeSFilipe Manana 	struct list_head list;
52602f2ff0eeSFilipe Manana };
52612f2ff0eeSFilipe Manana 
52622f2ff0eeSFilipe Manana /*
52632f2ff0eeSFilipe Manana  * Log the inodes of the new dentries of a directory. See log_dir_items() for
52642f2ff0eeSFilipe Manana  * details about the why it is needed.
52652f2ff0eeSFilipe Manana  * This is a recursive operation - if an existing dentry corresponds to a
52662f2ff0eeSFilipe Manana  * directory, that directory's new entries are logged too (same behaviour as
52672f2ff0eeSFilipe Manana  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
52682f2ff0eeSFilipe Manana  * the dentries point to we do not lock their i_mutex, otherwise lockdep
52692f2ff0eeSFilipe Manana  * complains about the following circular lock dependency / possible deadlock:
52702f2ff0eeSFilipe Manana  *
52712f2ff0eeSFilipe Manana  *        CPU0                                        CPU1
52722f2ff0eeSFilipe Manana  *        ----                                        ----
52732f2ff0eeSFilipe Manana  * lock(&type->i_mutex_dir_key#3/2);
52742f2ff0eeSFilipe Manana  *                                            lock(sb_internal#2);
52752f2ff0eeSFilipe Manana  *                                            lock(&type->i_mutex_dir_key#3/2);
52762f2ff0eeSFilipe Manana  * lock(&sb->s_type->i_mutex_key#14);
52772f2ff0eeSFilipe Manana  *
52782f2ff0eeSFilipe Manana  * Where sb_internal is the lock (a counter that works as a lock) acquired by
52792f2ff0eeSFilipe Manana  * sb_start_intwrite() in btrfs_start_transaction().
52802f2ff0eeSFilipe Manana  * Not locking i_mutex of the inodes is still safe because:
52812f2ff0eeSFilipe Manana  *
52822f2ff0eeSFilipe Manana  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
52832f2ff0eeSFilipe Manana  *    that while logging the inode new references (names) are added or removed
52842f2ff0eeSFilipe Manana  *    from the inode, leaving the logged inode item with a link count that does
52852f2ff0eeSFilipe Manana  *    not match the number of logged inode reference items. This is fine because
52862f2ff0eeSFilipe Manana  *    at log replay time we compute the real number of links and correct the
52872f2ff0eeSFilipe Manana  *    link count in the inode item (see replay_one_buffer() and
52882f2ff0eeSFilipe Manana  *    link_to_fixup_dir());
52892f2ff0eeSFilipe Manana  *
52902f2ff0eeSFilipe Manana  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
52912f2ff0eeSFilipe Manana  *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
52922f2ff0eeSFilipe Manana  *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
52932f2ff0eeSFilipe Manana  *    has a size that doesn't match the sum of the lengths of all the logged
52942f2ff0eeSFilipe Manana  *    names. This does not result in a problem because if a dir_item key is
52952f2ff0eeSFilipe Manana  *    logged but its matching dir_index key is not logged, at log replay time we
52962f2ff0eeSFilipe Manana  *    don't use it to replay the respective name (see replay_one_name()). On the
52972f2ff0eeSFilipe Manana  *    other hand if only the dir_index key ends up being logged, the respective
52982f2ff0eeSFilipe Manana  *    name is added to the fs/subvol tree with both the dir_item and dir_index
52992f2ff0eeSFilipe Manana  *    keys created (see replay_one_name()).
53002f2ff0eeSFilipe Manana  *    The directory's inode item with a wrong i_size is not a problem as well,
53012f2ff0eeSFilipe Manana  *    since we don't use it at log replay time to set the i_size in the inode
53022f2ff0eeSFilipe Manana  *    item of the fs/subvol tree (see overwrite_item()).
53032f2ff0eeSFilipe Manana  */
53042f2ff0eeSFilipe Manana static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
53052f2ff0eeSFilipe Manana 				struct btrfs_root *root,
530651cc0d32SNikolay Borisov 				struct btrfs_inode *start_inode,
53072f2ff0eeSFilipe Manana 				struct btrfs_log_ctx *ctx)
53082f2ff0eeSFilipe Manana {
53090b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
53102f2ff0eeSFilipe Manana 	struct btrfs_root *log = root->log_root;
53112f2ff0eeSFilipe Manana 	struct btrfs_path *path;
53122f2ff0eeSFilipe Manana 	LIST_HEAD(dir_list);
53132f2ff0eeSFilipe Manana 	struct btrfs_dir_list *dir_elem;
53142f2ff0eeSFilipe Manana 	int ret = 0;
53152f2ff0eeSFilipe Manana 
53162f2ff0eeSFilipe Manana 	path = btrfs_alloc_path();
53172f2ff0eeSFilipe Manana 	if (!path)
53182f2ff0eeSFilipe Manana 		return -ENOMEM;
53192f2ff0eeSFilipe Manana 
53202f2ff0eeSFilipe Manana 	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
53212f2ff0eeSFilipe Manana 	if (!dir_elem) {
53222f2ff0eeSFilipe Manana 		btrfs_free_path(path);
53232f2ff0eeSFilipe Manana 		return -ENOMEM;
53242f2ff0eeSFilipe Manana 	}
532551cc0d32SNikolay Borisov 	dir_elem->ino = btrfs_ino(start_inode);
53262f2ff0eeSFilipe Manana 	list_add_tail(&dir_elem->list, &dir_list);
53272f2ff0eeSFilipe Manana 
53282f2ff0eeSFilipe Manana 	while (!list_empty(&dir_list)) {
53292f2ff0eeSFilipe Manana 		struct extent_buffer *leaf;
53302f2ff0eeSFilipe Manana 		struct btrfs_key min_key;
53312f2ff0eeSFilipe Manana 		int nritems;
53322f2ff0eeSFilipe Manana 		int i;
53332f2ff0eeSFilipe Manana 
53342f2ff0eeSFilipe Manana 		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
53352f2ff0eeSFilipe Manana 					    list);
53362f2ff0eeSFilipe Manana 		if (ret)
53372f2ff0eeSFilipe Manana 			goto next_dir_inode;
53382f2ff0eeSFilipe Manana 
53392f2ff0eeSFilipe Manana 		min_key.objectid = dir_elem->ino;
53402f2ff0eeSFilipe Manana 		min_key.type = BTRFS_DIR_ITEM_KEY;
53412f2ff0eeSFilipe Manana 		min_key.offset = 0;
53422f2ff0eeSFilipe Manana again:
53432f2ff0eeSFilipe Manana 		btrfs_release_path(path);
53442f2ff0eeSFilipe Manana 		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
53452f2ff0eeSFilipe Manana 		if (ret < 0) {
53462f2ff0eeSFilipe Manana 			goto next_dir_inode;
53472f2ff0eeSFilipe Manana 		} else if (ret > 0) {
53482f2ff0eeSFilipe Manana 			ret = 0;
53492f2ff0eeSFilipe Manana 			goto next_dir_inode;
53502f2ff0eeSFilipe Manana 		}
53512f2ff0eeSFilipe Manana 
53522f2ff0eeSFilipe Manana process_leaf:
53532f2ff0eeSFilipe Manana 		leaf = path->nodes[0];
53542f2ff0eeSFilipe Manana 		nritems = btrfs_header_nritems(leaf);
53552f2ff0eeSFilipe Manana 		for (i = path->slots[0]; i < nritems; i++) {
53562f2ff0eeSFilipe Manana 			struct btrfs_dir_item *di;
53572f2ff0eeSFilipe Manana 			struct btrfs_key di_key;
53582f2ff0eeSFilipe Manana 			struct inode *di_inode;
53592f2ff0eeSFilipe Manana 			struct btrfs_dir_list *new_dir_elem;
53602f2ff0eeSFilipe Manana 			int log_mode = LOG_INODE_EXISTS;
53612f2ff0eeSFilipe Manana 			int type;
53622f2ff0eeSFilipe Manana 
53632f2ff0eeSFilipe Manana 			btrfs_item_key_to_cpu(leaf, &min_key, i);
53642f2ff0eeSFilipe Manana 			if (min_key.objectid != dir_elem->ino ||
53652f2ff0eeSFilipe Manana 			    min_key.type != BTRFS_DIR_ITEM_KEY)
53662f2ff0eeSFilipe Manana 				goto next_dir_inode;
53672f2ff0eeSFilipe Manana 
53682f2ff0eeSFilipe Manana 			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
53692f2ff0eeSFilipe Manana 			type = btrfs_dir_type(leaf, di);
53702f2ff0eeSFilipe Manana 			if (btrfs_dir_transid(leaf, di) < trans->transid &&
53712f2ff0eeSFilipe Manana 			    type != BTRFS_FT_DIR)
53722f2ff0eeSFilipe Manana 				continue;
53732f2ff0eeSFilipe Manana 			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
53742f2ff0eeSFilipe Manana 			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
53752f2ff0eeSFilipe Manana 				continue;
53762f2ff0eeSFilipe Manana 
5377ec125cfbSRobbie Ko 			btrfs_release_path(path);
53780b246afaSJeff Mahoney 			di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
53792f2ff0eeSFilipe Manana 			if (IS_ERR(di_inode)) {
53802f2ff0eeSFilipe Manana 				ret = PTR_ERR(di_inode);
53812f2ff0eeSFilipe Manana 				goto next_dir_inode;
53822f2ff0eeSFilipe Manana 			}
53832f2ff0eeSFilipe Manana 
53840f8939b8SNikolay Borisov 			if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
53852f2ff0eeSFilipe Manana 				iput(di_inode);
5386ec125cfbSRobbie Ko 				break;
53872f2ff0eeSFilipe Manana 			}
53882f2ff0eeSFilipe Manana 
53892f2ff0eeSFilipe Manana 			ctx->log_new_dentries = false;
53903f9749f6SFilipe Manana 			if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
53912f2ff0eeSFilipe Manana 				log_mode = LOG_INODE_ALL;
5392a59108a7SNikolay Borisov 			ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
53932f2ff0eeSFilipe Manana 					      log_mode, 0, LLONG_MAX, ctx);
53942be63d5cSFilipe Manana 			if (!ret &&
5395ab1717b2SNikolay Borisov 			    btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
53962be63d5cSFilipe Manana 				ret = 1;
53972f2ff0eeSFilipe Manana 			iput(di_inode);
53982f2ff0eeSFilipe Manana 			if (ret)
53992f2ff0eeSFilipe Manana 				goto next_dir_inode;
54002f2ff0eeSFilipe Manana 			if (ctx->log_new_dentries) {
54012f2ff0eeSFilipe Manana 				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
54022f2ff0eeSFilipe Manana 						       GFP_NOFS);
54032f2ff0eeSFilipe Manana 				if (!new_dir_elem) {
54042f2ff0eeSFilipe Manana 					ret = -ENOMEM;
54052f2ff0eeSFilipe Manana 					goto next_dir_inode;
54062f2ff0eeSFilipe Manana 				}
54072f2ff0eeSFilipe Manana 				new_dir_elem->ino = di_key.objectid;
54082f2ff0eeSFilipe Manana 				list_add_tail(&new_dir_elem->list, &dir_list);
54092f2ff0eeSFilipe Manana 			}
54102f2ff0eeSFilipe Manana 			break;
54112f2ff0eeSFilipe Manana 		}
54122f2ff0eeSFilipe Manana 		if (i == nritems) {
54132f2ff0eeSFilipe Manana 			ret = btrfs_next_leaf(log, path);
54142f2ff0eeSFilipe Manana 			if (ret < 0) {
54152f2ff0eeSFilipe Manana 				goto next_dir_inode;
54162f2ff0eeSFilipe Manana 			} else if (ret > 0) {
54172f2ff0eeSFilipe Manana 				ret = 0;
54182f2ff0eeSFilipe Manana 				goto next_dir_inode;
54192f2ff0eeSFilipe Manana 			}
54202f2ff0eeSFilipe Manana 			goto process_leaf;
54212f2ff0eeSFilipe Manana 		}
54222f2ff0eeSFilipe Manana 		if (min_key.offset < (u64)-1) {
54232f2ff0eeSFilipe Manana 			min_key.offset++;
54242f2ff0eeSFilipe Manana 			goto again;
54252f2ff0eeSFilipe Manana 		}
54262f2ff0eeSFilipe Manana next_dir_inode:
54272f2ff0eeSFilipe Manana 		list_del(&dir_elem->list);
54282f2ff0eeSFilipe Manana 		kfree(dir_elem);
54292f2ff0eeSFilipe Manana 	}
54302f2ff0eeSFilipe Manana 
54312f2ff0eeSFilipe Manana 	btrfs_free_path(path);
54322f2ff0eeSFilipe Manana 	return ret;
54332f2ff0eeSFilipe Manana }
54342f2ff0eeSFilipe Manana 
543518aa0922SFilipe Manana static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5436d0a0b78dSNikolay Borisov 				 struct btrfs_inode *inode,
543718aa0922SFilipe Manana 				 struct btrfs_log_ctx *ctx)
543818aa0922SFilipe Manana {
54393ffbd68cSDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
544018aa0922SFilipe Manana 	int ret;
544118aa0922SFilipe Manana 	struct btrfs_path *path;
544218aa0922SFilipe Manana 	struct btrfs_key key;
5443d0a0b78dSNikolay Borisov 	struct btrfs_root *root = inode->root;
5444d0a0b78dSNikolay Borisov 	const u64 ino = btrfs_ino(inode);
544518aa0922SFilipe Manana 
544618aa0922SFilipe Manana 	path = btrfs_alloc_path();
544718aa0922SFilipe Manana 	if (!path)
544818aa0922SFilipe Manana 		return -ENOMEM;
544918aa0922SFilipe Manana 	path->skip_locking = 1;
545018aa0922SFilipe Manana 	path->search_commit_root = 1;
545118aa0922SFilipe Manana 
545218aa0922SFilipe Manana 	key.objectid = ino;
545318aa0922SFilipe Manana 	key.type = BTRFS_INODE_REF_KEY;
545418aa0922SFilipe Manana 	key.offset = 0;
545518aa0922SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
545618aa0922SFilipe Manana 	if (ret < 0)
545718aa0922SFilipe Manana 		goto out;
545818aa0922SFilipe Manana 
545918aa0922SFilipe Manana 	while (true) {
546018aa0922SFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
546118aa0922SFilipe Manana 		int slot = path->slots[0];
546218aa0922SFilipe Manana 		u32 cur_offset = 0;
546318aa0922SFilipe Manana 		u32 item_size;
546418aa0922SFilipe Manana 		unsigned long ptr;
546518aa0922SFilipe Manana 
546618aa0922SFilipe Manana 		if (slot >= btrfs_header_nritems(leaf)) {
546718aa0922SFilipe Manana 			ret = btrfs_next_leaf(root, path);
546818aa0922SFilipe Manana 			if (ret < 0)
546918aa0922SFilipe Manana 				goto out;
547018aa0922SFilipe Manana 			else if (ret > 0)
547118aa0922SFilipe Manana 				break;
547218aa0922SFilipe Manana 			continue;
547318aa0922SFilipe Manana 		}
547418aa0922SFilipe Manana 
547518aa0922SFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, slot);
547618aa0922SFilipe Manana 		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
547718aa0922SFilipe Manana 		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
547818aa0922SFilipe Manana 			break;
547918aa0922SFilipe Manana 
548018aa0922SFilipe Manana 		item_size = btrfs_item_size_nr(leaf, slot);
548118aa0922SFilipe Manana 		ptr = btrfs_item_ptr_offset(leaf, slot);
548218aa0922SFilipe Manana 		while (cur_offset < item_size) {
548318aa0922SFilipe Manana 			struct btrfs_key inode_key;
548418aa0922SFilipe Manana 			struct inode *dir_inode;
548518aa0922SFilipe Manana 
548618aa0922SFilipe Manana 			inode_key.type = BTRFS_INODE_ITEM_KEY;
548718aa0922SFilipe Manana 			inode_key.offset = 0;
548818aa0922SFilipe Manana 
548918aa0922SFilipe Manana 			if (key.type == BTRFS_INODE_EXTREF_KEY) {
549018aa0922SFilipe Manana 				struct btrfs_inode_extref *extref;
549118aa0922SFilipe Manana 
549218aa0922SFilipe Manana 				extref = (struct btrfs_inode_extref *)
549318aa0922SFilipe Manana 					(ptr + cur_offset);
549418aa0922SFilipe Manana 				inode_key.objectid = btrfs_inode_extref_parent(
549518aa0922SFilipe Manana 					leaf, extref);
549618aa0922SFilipe Manana 				cur_offset += sizeof(*extref);
549718aa0922SFilipe Manana 				cur_offset += btrfs_inode_extref_name_len(leaf,
549818aa0922SFilipe Manana 					extref);
549918aa0922SFilipe Manana 			} else {
550018aa0922SFilipe Manana 				inode_key.objectid = key.offset;
550118aa0922SFilipe Manana 				cur_offset = item_size;
550218aa0922SFilipe Manana 			}
550318aa0922SFilipe Manana 
55040b246afaSJeff Mahoney 			dir_inode = btrfs_iget(fs_info->sb, &inode_key,
550518aa0922SFilipe Manana 					       root, NULL);
550618aa0922SFilipe Manana 			/* If parent inode was deleted, skip it. */
550718aa0922SFilipe Manana 			if (IS_ERR(dir_inode))
550818aa0922SFilipe Manana 				continue;
550918aa0922SFilipe Manana 
5510657ed1aaSFilipe Manana 			if (ctx)
5511657ed1aaSFilipe Manana 				ctx->log_new_dentries = false;
5512a59108a7SNikolay Borisov 			ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
551318aa0922SFilipe Manana 					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
55142be63d5cSFilipe Manana 			if (!ret &&
5515ab1717b2SNikolay Borisov 			    btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
55162be63d5cSFilipe Manana 				ret = 1;
5517657ed1aaSFilipe Manana 			if (!ret && ctx && ctx->log_new_dentries)
5518657ed1aaSFilipe Manana 				ret = log_new_dir_dentries(trans, root,
551951cc0d32SNikolay Borisov 						   BTRFS_I(dir_inode), ctx);
552018aa0922SFilipe Manana 			iput(dir_inode);
552118aa0922SFilipe Manana 			if (ret)
552218aa0922SFilipe Manana 				goto out;
552318aa0922SFilipe Manana 		}
552418aa0922SFilipe Manana 		path->slots[0]++;
552518aa0922SFilipe Manana 	}
552618aa0922SFilipe Manana 	ret = 0;
552718aa0922SFilipe Manana out:
552818aa0922SFilipe Manana 	btrfs_free_path(path);
552918aa0922SFilipe Manana 	return ret;
553018aa0922SFilipe Manana }
553118aa0922SFilipe Manana 
5532e02119d5SChris Mason /*
5533e02119d5SChris Mason  * helper function around btrfs_log_inode to make sure newly created
5534e02119d5SChris Mason  * parent directories also end up in the log.  A minimal inode and backref
5535e02119d5SChris Mason  * only logging is done of any parent directories that are older than
5536e02119d5SChris Mason  * the last committed transaction
5537e02119d5SChris Mason  */
553848a3b636SEric Sandeen static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
553919df27a9SNikolay Borisov 				  struct btrfs_inode *inode,
554049dae1bcSFilipe Manana 				  struct dentry *parent,
554149dae1bcSFilipe Manana 				  const loff_t start,
554249dae1bcSFilipe Manana 				  const loff_t end,
554341a1eadaSEdmund Nadolski 				  int inode_only,
55448b050d35SMiao Xie 				  struct btrfs_log_ctx *ctx)
5545e02119d5SChris Mason {
5546f882274bSNikolay Borisov 	struct btrfs_root *root = inode->root;
55470b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
5548e02119d5SChris Mason 	struct super_block *sb;
55496a912213SJosef Bacik 	struct dentry *old_parent = NULL;
555012fcfd22SChris Mason 	int ret = 0;
55510b246afaSJeff Mahoney 	u64 last_committed = fs_info->last_trans_committed;
55522f2ff0eeSFilipe Manana 	bool log_dentries = false;
555319df27a9SNikolay Borisov 	struct btrfs_inode *orig_inode = inode;
555412fcfd22SChris Mason 
555519df27a9SNikolay Borisov 	sb = inode->vfs_inode.i_sb;
555612fcfd22SChris Mason 
55570b246afaSJeff Mahoney 	if (btrfs_test_opt(fs_info, NOTREELOG)) {
55583a5e1404SSage Weil 		ret = 1;
55593a5e1404SSage Weil 		goto end_no_trans;
55603a5e1404SSage Weil 	}
55613a5e1404SSage Weil 
5562995946ddSMiao Xie 	/*
5563995946ddSMiao Xie 	 * The prev transaction commit doesn't complete, we need do
5564995946ddSMiao Xie 	 * full commit by ourselves.
5565995946ddSMiao Xie 	 */
55660b246afaSJeff Mahoney 	if (fs_info->last_trans_log_full_commit >
55670b246afaSJeff Mahoney 	    fs_info->last_trans_committed) {
556812fcfd22SChris Mason 		ret = 1;
556912fcfd22SChris Mason 		goto end_no_trans;
557012fcfd22SChris Mason 	}
557112fcfd22SChris Mason 
5572f882274bSNikolay Borisov 	if (btrfs_root_refs(&root->root_item) == 0) {
557376dda93cSYan, Zheng 		ret = 1;
557476dda93cSYan, Zheng 		goto end_no_trans;
557576dda93cSYan, Zheng 	}
557676dda93cSYan, Zheng 
557719df27a9SNikolay Borisov 	ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
557819df27a9SNikolay Borisov 			last_committed);
557912fcfd22SChris Mason 	if (ret)
558012fcfd22SChris Mason 		goto end_no_trans;
5581e02119d5SChris Mason 
558219df27a9SNikolay Borisov 	if (btrfs_inode_in_log(inode, trans->transid)) {
5583257c62e1SChris Mason 		ret = BTRFS_NO_LOG_SYNC;
5584257c62e1SChris Mason 		goto end_no_trans;
5585257c62e1SChris Mason 	}
5586257c62e1SChris Mason 
55878b050d35SMiao Xie 	ret = start_log_trans(trans, root, ctx);
55884a500fd1SYan, Zheng 	if (ret)
5589e87ac136SMiao Xie 		goto end_no_trans;
559012fcfd22SChris Mason 
559119df27a9SNikolay Borisov 	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
55924a500fd1SYan, Zheng 	if (ret)
55934a500fd1SYan, Zheng 		goto end_trans;
5594e02119d5SChris Mason 
5595af4176b4SChris Mason 	/*
5596af4176b4SChris Mason 	 * for regular files, if its inode is already on disk, we don't
5597af4176b4SChris Mason 	 * have to worry about the parents at all.  This is because
5598af4176b4SChris Mason 	 * we can use the last_unlink_trans field to record renames
5599af4176b4SChris Mason 	 * and other fun in this file.
5600af4176b4SChris Mason 	 */
560119df27a9SNikolay Borisov 	if (S_ISREG(inode->vfs_inode.i_mode) &&
560219df27a9SNikolay Borisov 	    inode->generation <= last_committed &&
560319df27a9SNikolay Borisov 	    inode->last_unlink_trans <= last_committed) {
56044a500fd1SYan, Zheng 		ret = 0;
56054a500fd1SYan, Zheng 		goto end_trans;
56064a500fd1SYan, Zheng 	}
5607af4176b4SChris Mason 
560819df27a9SNikolay Borisov 	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
56092f2ff0eeSFilipe Manana 		log_dentries = true;
56102f2ff0eeSFilipe Manana 
561118aa0922SFilipe Manana 	/*
561201327610SNicholas D Steeves 	 * On unlink we must make sure all our current and old parent directory
561318aa0922SFilipe Manana 	 * inodes are fully logged. This is to prevent leaving dangling
561418aa0922SFilipe Manana 	 * directory index entries in directories that were our parents but are
561518aa0922SFilipe Manana 	 * not anymore. Not doing this results in old parent directory being
561618aa0922SFilipe Manana 	 * impossible to delete after log replay (rmdir will always fail with
561718aa0922SFilipe Manana 	 * error -ENOTEMPTY).
561818aa0922SFilipe Manana 	 *
561918aa0922SFilipe Manana 	 * Example 1:
562018aa0922SFilipe Manana 	 *
562118aa0922SFilipe Manana 	 * mkdir testdir
562218aa0922SFilipe Manana 	 * touch testdir/foo
562318aa0922SFilipe Manana 	 * ln testdir/foo testdir/bar
562418aa0922SFilipe Manana 	 * sync
562518aa0922SFilipe Manana 	 * unlink testdir/bar
562618aa0922SFilipe Manana 	 * xfs_io -c fsync testdir/foo
562718aa0922SFilipe Manana 	 * <power failure>
562818aa0922SFilipe Manana 	 * mount fs, triggers log replay
562918aa0922SFilipe Manana 	 *
563018aa0922SFilipe Manana 	 * If we don't log the parent directory (testdir), after log replay the
563118aa0922SFilipe Manana 	 * directory still has an entry pointing to the file inode using the bar
563218aa0922SFilipe Manana 	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
563318aa0922SFilipe Manana 	 * the file inode has a link count of 1.
563418aa0922SFilipe Manana 	 *
563518aa0922SFilipe Manana 	 * Example 2:
563618aa0922SFilipe Manana 	 *
563718aa0922SFilipe Manana 	 * mkdir testdir
563818aa0922SFilipe Manana 	 * touch foo
563918aa0922SFilipe Manana 	 * ln foo testdir/foo2
564018aa0922SFilipe Manana 	 * ln foo testdir/foo3
564118aa0922SFilipe Manana 	 * sync
564218aa0922SFilipe Manana 	 * unlink testdir/foo3
564318aa0922SFilipe Manana 	 * xfs_io -c fsync foo
564418aa0922SFilipe Manana 	 * <power failure>
564518aa0922SFilipe Manana 	 * mount fs, triggers log replay
564618aa0922SFilipe Manana 	 *
564718aa0922SFilipe Manana 	 * Similar as the first example, after log replay the parent directory
564818aa0922SFilipe Manana 	 * testdir still has an entry pointing to the inode file with name foo3
564918aa0922SFilipe Manana 	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
565018aa0922SFilipe Manana 	 * and has a link count of 2.
565118aa0922SFilipe Manana 	 */
565219df27a9SNikolay Borisov 	if (inode->last_unlink_trans > last_committed) {
565318aa0922SFilipe Manana 		ret = btrfs_log_all_parents(trans, orig_inode, ctx);
565418aa0922SFilipe Manana 		if (ret)
565518aa0922SFilipe Manana 			goto end_trans;
565618aa0922SFilipe Manana 	}
565718aa0922SFilipe Manana 
565812fcfd22SChris Mason 	while (1) {
5659fc64005cSAl Viro 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5660e02119d5SChris Mason 			break;
5661e02119d5SChris Mason 
566219df27a9SNikolay Borisov 		inode = BTRFS_I(d_inode(parent));
566319df27a9SNikolay Borisov 		if (root != inode->root)
566476dda93cSYan, Zheng 			break;
566576dda93cSYan, Zheng 
566619df27a9SNikolay Borisov 		if (inode->generation > last_committed) {
566719df27a9SNikolay Borisov 			ret = btrfs_log_inode(trans, root, inode,
566819df27a9SNikolay Borisov 					LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
56694a500fd1SYan, Zheng 			if (ret)
56704a500fd1SYan, Zheng 				goto end_trans;
5671e02119d5SChris Mason 		}
567276dda93cSYan, Zheng 		if (IS_ROOT(parent))
567312fcfd22SChris Mason 			break;
567412fcfd22SChris Mason 
56756a912213SJosef Bacik 		parent = dget_parent(parent);
56766a912213SJosef Bacik 		dput(old_parent);
56776a912213SJosef Bacik 		old_parent = parent;
567812fcfd22SChris Mason 	}
56792f2ff0eeSFilipe Manana 	if (log_dentries)
568019df27a9SNikolay Borisov 		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
56812f2ff0eeSFilipe Manana 	else
568212fcfd22SChris Mason 		ret = 0;
56834a500fd1SYan, Zheng end_trans:
56846a912213SJosef Bacik 	dput(old_parent);
56854a500fd1SYan, Zheng 	if (ret < 0) {
56860b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
56874a500fd1SYan, Zheng 		ret = 1;
56884a500fd1SYan, Zheng 	}
56898b050d35SMiao Xie 
56908b050d35SMiao Xie 	if (ret)
56918b050d35SMiao Xie 		btrfs_remove_log_ctx(root, ctx);
569212fcfd22SChris Mason 	btrfs_end_log_trans(root);
569312fcfd22SChris Mason end_no_trans:
569412fcfd22SChris Mason 	return ret;
5695e02119d5SChris Mason }
5696e02119d5SChris Mason 
5697e02119d5SChris Mason /*
5698e02119d5SChris Mason  * it is not safe to log dentry if the chunk root has added new
5699e02119d5SChris Mason  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
5700e02119d5SChris Mason  * If this returns 1, you must commit the transaction to safely get your
5701e02119d5SChris Mason  * data on disk.
5702e02119d5SChris Mason  */
5703e02119d5SChris Mason int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
5704e5b84f7aSNikolay Borisov 			  struct dentry *dentry,
570549dae1bcSFilipe Manana 			  const loff_t start,
570649dae1bcSFilipe Manana 			  const loff_t end,
57078b050d35SMiao Xie 			  struct btrfs_log_ctx *ctx)
5708e02119d5SChris Mason {
57096a912213SJosef Bacik 	struct dentry *parent = dget_parent(dentry);
57106a912213SJosef Bacik 	int ret;
57116a912213SJosef Bacik 
5712f882274bSNikolay Borisov 	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
5713f882274bSNikolay Borisov 				     start, end, LOG_INODE_ALL, ctx);
57146a912213SJosef Bacik 	dput(parent);
57156a912213SJosef Bacik 
57166a912213SJosef Bacik 	return ret;
5717e02119d5SChris Mason }
5718e02119d5SChris Mason 
5719e02119d5SChris Mason /*
5720e02119d5SChris Mason  * should be called during mount to recover any replay any log trees
5721e02119d5SChris Mason  * from the FS
5722e02119d5SChris Mason  */
5723e02119d5SChris Mason int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5724e02119d5SChris Mason {
5725e02119d5SChris Mason 	int ret;
5726e02119d5SChris Mason 	struct btrfs_path *path;
5727e02119d5SChris Mason 	struct btrfs_trans_handle *trans;
5728e02119d5SChris Mason 	struct btrfs_key key;
5729e02119d5SChris Mason 	struct btrfs_key found_key;
5730e02119d5SChris Mason 	struct btrfs_key tmp_key;
5731e02119d5SChris Mason 	struct btrfs_root *log;
5732e02119d5SChris Mason 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
5733e02119d5SChris Mason 	struct walk_control wc = {
5734e02119d5SChris Mason 		.process_func = process_one_buffer,
5735e02119d5SChris Mason 		.stage = 0,
5736e02119d5SChris Mason 	};
5737e02119d5SChris Mason 
5738e02119d5SChris Mason 	path = btrfs_alloc_path();
5739db5b493aSTsutomu Itoh 	if (!path)
5740db5b493aSTsutomu Itoh 		return -ENOMEM;
5741db5b493aSTsutomu Itoh 
5742afcdd129SJosef Bacik 	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5743e02119d5SChris Mason 
57444a500fd1SYan, Zheng 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
574579787eaaSJeff Mahoney 	if (IS_ERR(trans)) {
574679787eaaSJeff Mahoney 		ret = PTR_ERR(trans);
574779787eaaSJeff Mahoney 		goto error;
574879787eaaSJeff Mahoney 	}
5749e02119d5SChris Mason 
5750e02119d5SChris Mason 	wc.trans = trans;
5751e02119d5SChris Mason 	wc.pin = 1;
5752e02119d5SChris Mason 
5753db5b493aSTsutomu Itoh 	ret = walk_log_tree(trans, log_root_tree, &wc);
575479787eaaSJeff Mahoney 	if (ret) {
57555d163e0eSJeff Mahoney 		btrfs_handle_fs_error(fs_info, ret,
57565d163e0eSJeff Mahoney 			"Failed to pin buffers while recovering log root tree.");
575779787eaaSJeff Mahoney 		goto error;
575879787eaaSJeff Mahoney 	}
5759e02119d5SChris Mason 
5760e02119d5SChris Mason again:
5761e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
5762e02119d5SChris Mason 	key.offset = (u64)-1;
5763962a298fSDavid Sterba 	key.type = BTRFS_ROOT_ITEM_KEY;
5764e02119d5SChris Mason 
5765e02119d5SChris Mason 	while (1) {
5766e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
576779787eaaSJeff Mahoney 
576879787eaaSJeff Mahoney 		if (ret < 0) {
576934d97007SAnand Jain 			btrfs_handle_fs_error(fs_info, ret,
577079787eaaSJeff Mahoney 				    "Couldn't find tree log root.");
577179787eaaSJeff Mahoney 			goto error;
577279787eaaSJeff Mahoney 		}
5773e02119d5SChris Mason 		if (ret > 0) {
5774e02119d5SChris Mason 			if (path->slots[0] == 0)
5775e02119d5SChris Mason 				break;
5776e02119d5SChris Mason 			path->slots[0]--;
5777e02119d5SChris Mason 		}
5778e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
5779e02119d5SChris Mason 				      path->slots[0]);
5780b3b4aa74SDavid Sterba 		btrfs_release_path(path);
5781e02119d5SChris Mason 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
5782e02119d5SChris Mason 			break;
5783e02119d5SChris Mason 
5784cb517eabSMiao Xie 		log = btrfs_read_fs_root(log_root_tree, &found_key);
578579787eaaSJeff Mahoney 		if (IS_ERR(log)) {
578679787eaaSJeff Mahoney 			ret = PTR_ERR(log);
578734d97007SAnand Jain 			btrfs_handle_fs_error(fs_info, ret,
578879787eaaSJeff Mahoney 				    "Couldn't read tree log root.");
578979787eaaSJeff Mahoney 			goto error;
579079787eaaSJeff Mahoney 		}
5791e02119d5SChris Mason 
5792e02119d5SChris Mason 		tmp_key.objectid = found_key.offset;
5793e02119d5SChris Mason 		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
5794e02119d5SChris Mason 		tmp_key.offset = (u64)-1;
5795e02119d5SChris Mason 
5796e02119d5SChris Mason 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
579779787eaaSJeff Mahoney 		if (IS_ERR(wc.replay_dest)) {
579879787eaaSJeff Mahoney 			ret = PTR_ERR(wc.replay_dest);
5799b50c6e25SJosef Bacik 			free_extent_buffer(log->node);
5800b50c6e25SJosef Bacik 			free_extent_buffer(log->commit_root);
5801b50c6e25SJosef Bacik 			kfree(log);
58025d163e0eSJeff Mahoney 			btrfs_handle_fs_error(fs_info, ret,
58035d163e0eSJeff Mahoney 				"Couldn't read target root for tree log recovery.");
580479787eaaSJeff Mahoney 			goto error;
580579787eaaSJeff Mahoney 		}
5806e02119d5SChris Mason 
580707d400a6SYan Zheng 		wc.replay_dest->log_root = log;
58085d4f98a2SYan Zheng 		btrfs_record_root_in_trans(trans, wc.replay_dest);
5809e02119d5SChris Mason 		ret = walk_log_tree(trans, log, &wc);
5810e02119d5SChris Mason 
5811b50c6e25SJosef Bacik 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5812e02119d5SChris Mason 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
5813e02119d5SChris Mason 						      path);
5814e02119d5SChris Mason 		}
5815e02119d5SChris Mason 
5816900c9981SLiu Bo 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5817900c9981SLiu Bo 			struct btrfs_root *root = wc.replay_dest;
5818900c9981SLiu Bo 
5819900c9981SLiu Bo 			btrfs_release_path(path);
5820900c9981SLiu Bo 
5821900c9981SLiu Bo 			/*
5822900c9981SLiu Bo 			 * We have just replayed everything, and the highest
5823900c9981SLiu Bo 			 * objectid of fs roots probably has changed in case
5824900c9981SLiu Bo 			 * some inode_item's got replayed.
5825900c9981SLiu Bo 			 *
5826900c9981SLiu Bo 			 * root->objectid_mutex is not acquired as log replay
5827900c9981SLiu Bo 			 * could only happen during mount.
5828900c9981SLiu Bo 			 */
5829900c9981SLiu Bo 			ret = btrfs_find_highest_objectid(root,
5830900c9981SLiu Bo 						  &root->highest_objectid);
5831900c9981SLiu Bo 		}
5832900c9981SLiu Bo 
5833e02119d5SChris Mason 		key.offset = found_key.offset - 1;
583407d400a6SYan Zheng 		wc.replay_dest->log_root = NULL;
5835e02119d5SChris Mason 		free_extent_buffer(log->node);
5836b263c2c8SChris Mason 		free_extent_buffer(log->commit_root);
5837e02119d5SChris Mason 		kfree(log);
5838e02119d5SChris Mason 
5839b50c6e25SJosef Bacik 		if (ret)
5840b50c6e25SJosef Bacik 			goto error;
5841b50c6e25SJosef Bacik 
5842e02119d5SChris Mason 		if (found_key.offset == 0)
5843e02119d5SChris Mason 			break;
5844e02119d5SChris Mason 	}
5845b3b4aa74SDavid Sterba 	btrfs_release_path(path);
5846e02119d5SChris Mason 
5847e02119d5SChris Mason 	/* step one is to pin it all, step two is to replay just inodes */
5848e02119d5SChris Mason 	if (wc.pin) {
5849e02119d5SChris Mason 		wc.pin = 0;
5850e02119d5SChris Mason 		wc.process_func = replay_one_buffer;
5851e02119d5SChris Mason 		wc.stage = LOG_WALK_REPLAY_INODES;
5852e02119d5SChris Mason 		goto again;
5853e02119d5SChris Mason 	}
5854e02119d5SChris Mason 	/* step three is to replay everything */
5855e02119d5SChris Mason 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
5856e02119d5SChris Mason 		wc.stage++;
5857e02119d5SChris Mason 		goto again;
5858e02119d5SChris Mason 	}
5859e02119d5SChris Mason 
5860e02119d5SChris Mason 	btrfs_free_path(path);
5861e02119d5SChris Mason 
5862abefa55aSJosef Bacik 	/* step 4: commit the transaction, which also unpins the blocks */
58633a45bb20SJeff Mahoney 	ret = btrfs_commit_transaction(trans);
5864abefa55aSJosef Bacik 	if (ret)
5865abefa55aSJosef Bacik 		return ret;
5866abefa55aSJosef Bacik 
5867e02119d5SChris Mason 	free_extent_buffer(log_root_tree->node);
5868e02119d5SChris Mason 	log_root_tree->log_root = NULL;
5869afcdd129SJosef Bacik 	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5870e02119d5SChris Mason 	kfree(log_root_tree);
587179787eaaSJeff Mahoney 
5872abefa55aSJosef Bacik 	return 0;
587379787eaaSJeff Mahoney error:
5874b50c6e25SJosef Bacik 	if (wc.trans)
58753a45bb20SJeff Mahoney 		btrfs_end_transaction(wc.trans);
587679787eaaSJeff Mahoney 	btrfs_free_path(path);
587779787eaaSJeff Mahoney 	return ret;
5878e02119d5SChris Mason }
587912fcfd22SChris Mason 
588012fcfd22SChris Mason /*
588112fcfd22SChris Mason  * there are some corner cases where we want to force a full
588212fcfd22SChris Mason  * commit instead of allowing a directory to be logged.
588312fcfd22SChris Mason  *
588412fcfd22SChris Mason  * They revolve around files there were unlinked from the directory, and
588512fcfd22SChris Mason  * this function updates the parent directory so that a full commit is
588612fcfd22SChris Mason  * properly done if it is fsync'd later after the unlinks are done.
58872be63d5cSFilipe Manana  *
58882be63d5cSFilipe Manana  * Must be called before the unlink operations (updates to the subvolume tree,
58892be63d5cSFilipe Manana  * inodes, etc) are done.
589012fcfd22SChris Mason  */
589112fcfd22SChris Mason void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
58924176bdbfSNikolay Borisov 			     struct btrfs_inode *dir, struct btrfs_inode *inode,
589312fcfd22SChris Mason 			     int for_rename)
589412fcfd22SChris Mason {
589512fcfd22SChris Mason 	/*
5896af4176b4SChris Mason 	 * when we're logging a file, if it hasn't been renamed
5897af4176b4SChris Mason 	 * or unlinked, and its inode is fully committed on disk,
5898af4176b4SChris Mason 	 * we don't have to worry about walking up the directory chain
5899af4176b4SChris Mason 	 * to log its parents.
5900af4176b4SChris Mason 	 *
5901af4176b4SChris Mason 	 * So, we use the last_unlink_trans field to put this transid
5902af4176b4SChris Mason 	 * into the file.  When the file is logged we check it and
5903af4176b4SChris Mason 	 * don't log the parents if the file is fully on disk.
5904af4176b4SChris Mason 	 */
59054176bdbfSNikolay Borisov 	mutex_lock(&inode->log_mutex);
59064176bdbfSNikolay Borisov 	inode->last_unlink_trans = trans->transid;
59074176bdbfSNikolay Borisov 	mutex_unlock(&inode->log_mutex);
5908af4176b4SChris Mason 
5909af4176b4SChris Mason 	/*
591012fcfd22SChris Mason 	 * if this directory was already logged any new
591112fcfd22SChris Mason 	 * names for this file/dir will get recorded
591212fcfd22SChris Mason 	 */
591312fcfd22SChris Mason 	smp_mb();
59144176bdbfSNikolay Borisov 	if (dir->logged_trans == trans->transid)
591512fcfd22SChris Mason 		return;
591612fcfd22SChris Mason 
591712fcfd22SChris Mason 	/*
591812fcfd22SChris Mason 	 * if the inode we're about to unlink was logged,
591912fcfd22SChris Mason 	 * the log will be properly updated for any new names
592012fcfd22SChris Mason 	 */
59214176bdbfSNikolay Borisov 	if (inode->logged_trans == trans->transid)
592212fcfd22SChris Mason 		return;
592312fcfd22SChris Mason 
592412fcfd22SChris Mason 	/*
592512fcfd22SChris Mason 	 * when renaming files across directories, if the directory
592612fcfd22SChris Mason 	 * there we're unlinking from gets fsync'd later on, there's
592712fcfd22SChris Mason 	 * no way to find the destination directory later and fsync it
592812fcfd22SChris Mason 	 * properly.  So, we have to be conservative and force commits
592912fcfd22SChris Mason 	 * so the new name gets discovered.
593012fcfd22SChris Mason 	 */
593112fcfd22SChris Mason 	if (for_rename)
593212fcfd22SChris Mason 		goto record;
593312fcfd22SChris Mason 
593412fcfd22SChris Mason 	/* we can safely do the unlink without any special recording */
593512fcfd22SChris Mason 	return;
593612fcfd22SChris Mason 
593712fcfd22SChris Mason record:
59384176bdbfSNikolay Borisov 	mutex_lock(&dir->log_mutex);
59394176bdbfSNikolay Borisov 	dir->last_unlink_trans = trans->transid;
59404176bdbfSNikolay Borisov 	mutex_unlock(&dir->log_mutex);
594112fcfd22SChris Mason }
594212fcfd22SChris Mason 
594312fcfd22SChris Mason /*
59441ec9a1aeSFilipe Manana  * Make sure that if someone attempts to fsync the parent directory of a deleted
59451ec9a1aeSFilipe Manana  * snapshot, it ends up triggering a transaction commit. This is to guarantee
59461ec9a1aeSFilipe Manana  * that after replaying the log tree of the parent directory's root we will not
59471ec9a1aeSFilipe Manana  * see the snapshot anymore and at log replay time we will not see any log tree
59481ec9a1aeSFilipe Manana  * corresponding to the deleted snapshot's root, which could lead to replaying
59491ec9a1aeSFilipe Manana  * it after replaying the log tree of the parent directory (which would replay
59501ec9a1aeSFilipe Manana  * the snapshot delete operation).
59512be63d5cSFilipe Manana  *
59522be63d5cSFilipe Manana  * Must be called before the actual snapshot destroy operation (updates to the
59532be63d5cSFilipe Manana  * parent root and tree of tree roots trees, etc) are done.
59541ec9a1aeSFilipe Manana  */
59551ec9a1aeSFilipe Manana void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
595643663557SNikolay Borisov 				   struct btrfs_inode *dir)
59571ec9a1aeSFilipe Manana {
595843663557SNikolay Borisov 	mutex_lock(&dir->log_mutex);
595943663557SNikolay Borisov 	dir->last_unlink_trans = trans->transid;
596043663557SNikolay Borisov 	mutex_unlock(&dir->log_mutex);
59611ec9a1aeSFilipe Manana }
59621ec9a1aeSFilipe Manana 
59631ec9a1aeSFilipe Manana /*
596412fcfd22SChris Mason  * Call this after adding a new name for a file and it will properly
596512fcfd22SChris Mason  * update the log to reflect the new name.
596612fcfd22SChris Mason  *
596712fcfd22SChris Mason  * It will return zero if all goes well, and it will return 1 if a
596812fcfd22SChris Mason  * full transaction commit is required.
596912fcfd22SChris Mason  */
597012fcfd22SChris Mason int btrfs_log_new_name(struct btrfs_trans_handle *trans,
59719ca5fbfbSNikolay Borisov 			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
597212fcfd22SChris Mason 			struct dentry *parent)
597312fcfd22SChris Mason {
59743ffbd68cSDavid Sterba 	struct btrfs_fs_info *fs_info = trans->fs_info;
597512fcfd22SChris Mason 
597612fcfd22SChris Mason 	/*
5977af4176b4SChris Mason 	 * this will force the logging code to walk the dentry chain
5978af4176b4SChris Mason 	 * up for the file
5979af4176b4SChris Mason 	 */
59809a6509c4SFilipe Manana 	if (!S_ISDIR(inode->vfs_inode.i_mode))
59819ca5fbfbSNikolay Borisov 		inode->last_unlink_trans = trans->transid;
5982af4176b4SChris Mason 
5983af4176b4SChris Mason 	/*
598412fcfd22SChris Mason 	 * if this inode hasn't been logged and directory we're renaming it
598512fcfd22SChris Mason 	 * from hasn't been logged, we don't need to log it
598612fcfd22SChris Mason 	 */
59879ca5fbfbSNikolay Borisov 	if (inode->logged_trans <= fs_info->last_trans_committed &&
59889ca5fbfbSNikolay Borisov 	    (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
598912fcfd22SChris Mason 		return 0;
599012fcfd22SChris Mason 
5991f882274bSNikolay Borisov 	return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
5992f882274bSNikolay Borisov 				      LOG_INODE_EXISTS, NULL);
599312fcfd22SChris Mason }
599412fcfd22SChris Mason 
5995