xref: /openbmc/linux/fs/btrfs/tree-log.c (revision 1a93c36acdef62ef1f5dccf058328d4d5d6c0e70)
1e02119d5SChris Mason /*
2e02119d5SChris Mason  * Copyright (C) 2008 Oracle.  All rights reserved.
3e02119d5SChris Mason  *
4e02119d5SChris Mason  * This program is free software; you can redistribute it and/or
5e02119d5SChris Mason  * modify it under the terms of the GNU General Public
6e02119d5SChris Mason  * License v2 as published by the Free Software Foundation.
7e02119d5SChris Mason  *
8e02119d5SChris Mason  * This program is distributed in the hope that it will be useful,
9e02119d5SChris Mason  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10e02119d5SChris Mason  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11e02119d5SChris Mason  * General Public License for more details.
12e02119d5SChris Mason  *
13e02119d5SChris Mason  * You should have received a copy of the GNU General Public
14e02119d5SChris Mason  * License along with this program; if not, write to the
15e02119d5SChris Mason  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16e02119d5SChris Mason  * Boston, MA 021110-1307, USA.
17e02119d5SChris Mason  */
18e02119d5SChris Mason 
19e02119d5SChris Mason #include <linux/sched.h>
205a0e3ad6STejun Heo #include <linux/slab.h>
21c6adc9ccSMiao Xie #include <linux/blkdev.h>
225dc562c5SJosef Bacik #include <linux/list_sort.h>
23995946ddSMiao Xie #include "tree-log.h"
24e02119d5SChris Mason #include "disk-io.h"
25e02119d5SChris Mason #include "locking.h"
26e02119d5SChris Mason #include "print-tree.h"
27f186373fSMark Fasheh #include "backref.h"
28f186373fSMark Fasheh #include "hash.h"
29ebb8765bSAnand Jain #include "compression.h"
30df2c95f3SQu Wenruo #include "qgroup.h"
31e02119d5SChris Mason 
/*
 * Magic values for the inode_only argument of btrfs_log_inode():
 *
 * LOG_INODE_ALL    - log everything
 * LOG_INODE_EXISTS - log just enough to recreate the inode during log replay
 * LOG_OTHER_INODE  - not described in this part of the file
 *                    (NOTE(review): confirm meaning against btrfs_log_inode())
 */
#define LOG_INODE_ALL		0
#define LOG_INODE_EXISTS	1
#define LOG_OTHER_INODE		2
41e02119d5SChris Mason 
/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on f1/foo, only on its parent dir f1.  After a crash the rm -rf
 * must be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */
8412fcfd22SChris Mason 
/*
 * Stages for the log tree walking, stored in walk_control::stage:
 *
 * LOG_WALK_PIN_ONLY (0)         - only pin down the blocks we find
 * LOG_WALK_REPLAY_INODES (1)    - make sure that all the inodes we find in
 *                                 the log are created in the subvolume
 * LOG_WALK_REPLAY_DIR_INDEX (2) - not described by the original comment;
 *                                 presumably replays directory index items
 *                                 (NOTE(review): confirm against the replay
 *                                 code)
 * LOG_WALK_REPLAY_ALL (3)       - the last stage: deal with directories and
 *                                 links and extents and all the other fun
 *                                 semantics
 */
#define LOG_WALK_PIN_ONLY		0
#define LOG_WALK_REPLAY_INODES		1
#define LOG_WALK_REPLAY_DIR_INDEX	2
#define LOG_WALK_REPLAY_ALL		3
98e02119d5SChris Mason 
9912fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans,
100e02119d5SChris Mason 			   struct btrfs_root *root, struct inode *inode,
10149dae1bcSFilipe Manana 			   int inode_only,
10249dae1bcSFilipe Manana 			   const loff_t start,
1038407f553SFilipe Manana 			   const loff_t end,
1048407f553SFilipe Manana 			   struct btrfs_log_ctx *ctx);
105ec051c0fSYan Zheng static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
106ec051c0fSYan Zheng 			     struct btrfs_root *root,
107ec051c0fSYan Zheng 			     struct btrfs_path *path, u64 objectid);
10812fcfd22SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
10912fcfd22SChris Mason 				       struct btrfs_root *root,
11012fcfd22SChris Mason 				       struct btrfs_root *log,
11112fcfd22SChris Mason 				       struct btrfs_path *path,
11212fcfd22SChris Mason 				       u64 dirid, int del_all);
113e02119d5SChris Mason 
/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
 */
136e02119d5SChris Mason 
137e02119d5SChris Mason /*
138e02119d5SChris Mason  * start a sub transaction and setup the log tree
139e02119d5SChris Mason  * this increments the log tree writer count to make the people
140e02119d5SChris Mason  * syncing the tree wait for us to finish
141e02119d5SChris Mason  */
142e02119d5SChris Mason static int start_log_trans(struct btrfs_trans_handle *trans,
1438b050d35SMiao Xie 			   struct btrfs_root *root,
1448b050d35SMiao Xie 			   struct btrfs_log_ctx *ctx)
145e02119d5SChris Mason {
1460b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
14734eb2a52SZhaolei 	int ret = 0;
1487237f183SYan Zheng 
1497237f183SYan Zheng 	mutex_lock(&root->log_mutex);
15034eb2a52SZhaolei 
1517237f183SYan Zheng 	if (root->log_root) {
1520b246afaSJeff Mahoney 		if (btrfs_need_log_full_commit(fs_info, trans)) {
15350471a38SMiao Xie 			ret = -EAGAIN;
15450471a38SMiao Xie 			goto out;
15550471a38SMiao Xie 		}
15634eb2a52SZhaolei 
157ff782e0aSJosef Bacik 		if (!root->log_start_pid) {
15827cdeb70SMiao Xie 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
15934eb2a52SZhaolei 			root->log_start_pid = current->pid;
160ff782e0aSJosef Bacik 		} else if (root->log_start_pid != current->pid) {
16127cdeb70SMiao Xie 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
162ff782e0aSJosef Bacik 		}
16334eb2a52SZhaolei 	} else {
1640b246afaSJeff Mahoney 		mutex_lock(&fs_info->tree_log_mutex);
1650b246afaSJeff Mahoney 		if (!fs_info->log_root_tree)
1660b246afaSJeff Mahoney 			ret = btrfs_init_log_root_tree(trans, fs_info);
1670b246afaSJeff Mahoney 		mutex_unlock(&fs_info->tree_log_mutex);
1684a500fd1SYan, Zheng 		if (ret)
169e87ac136SMiao Xie 			goto out;
170e87ac136SMiao Xie 
171e02119d5SChris Mason 		ret = btrfs_add_log_tree(trans, root);
1724a500fd1SYan, Zheng 		if (ret)
173e87ac136SMiao Xie 			goto out;
17434eb2a52SZhaolei 
17527cdeb70SMiao Xie 		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
176e87ac136SMiao Xie 		root->log_start_pid = current->pid;
17734eb2a52SZhaolei 	}
17834eb2a52SZhaolei 
1792ecb7923SMiao Xie 	atomic_inc(&root->log_batch);
1807237f183SYan Zheng 	atomic_inc(&root->log_writers);
1818b050d35SMiao Xie 	if (ctx) {
18234eb2a52SZhaolei 		int index = root->log_transid % 2;
1838b050d35SMiao Xie 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
184d1433debSMiao Xie 		ctx->log_transid = root->log_transid;
1858b050d35SMiao Xie 	}
18634eb2a52SZhaolei 
187e87ac136SMiao Xie out:
1887237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
189e87ac136SMiao Xie 	return ret;
190e02119d5SChris Mason }
191e02119d5SChris Mason 
192e02119d5SChris Mason /*
193e02119d5SChris Mason  * returns 0 if there was a log transaction running and we were able
194e02119d5SChris Mason  * to join, or returns -ENOENT if there were not transactions
195e02119d5SChris Mason  * in progress
196e02119d5SChris Mason  */
197e02119d5SChris Mason static int join_running_log_trans(struct btrfs_root *root)
198e02119d5SChris Mason {
199e02119d5SChris Mason 	int ret = -ENOENT;
200e02119d5SChris Mason 
201e02119d5SChris Mason 	smp_mb();
202e02119d5SChris Mason 	if (!root->log_root)
203e02119d5SChris Mason 		return -ENOENT;
204e02119d5SChris Mason 
2057237f183SYan Zheng 	mutex_lock(&root->log_mutex);
206e02119d5SChris Mason 	if (root->log_root) {
207e02119d5SChris Mason 		ret = 0;
2087237f183SYan Zheng 		atomic_inc(&root->log_writers);
209e02119d5SChris Mason 	}
2107237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
211e02119d5SChris Mason 	return ret;
212e02119d5SChris Mason }
213e02119d5SChris Mason 
214e02119d5SChris Mason /*
21512fcfd22SChris Mason  * This either makes the current running log transaction wait
21612fcfd22SChris Mason  * until you call btrfs_end_log_trans() or it makes any future
21712fcfd22SChris Mason  * log transactions wait until you call btrfs_end_log_trans()
21812fcfd22SChris Mason  */
21912fcfd22SChris Mason int btrfs_pin_log_trans(struct btrfs_root *root)
22012fcfd22SChris Mason {
22112fcfd22SChris Mason 	int ret = -ENOENT;
22212fcfd22SChris Mason 
22312fcfd22SChris Mason 	mutex_lock(&root->log_mutex);
22412fcfd22SChris Mason 	atomic_inc(&root->log_writers);
22512fcfd22SChris Mason 	mutex_unlock(&root->log_mutex);
22612fcfd22SChris Mason 	return ret;
22712fcfd22SChris Mason }
22812fcfd22SChris Mason 
22912fcfd22SChris Mason /*
230e02119d5SChris Mason  * indicate we're done making changes to the log tree
231e02119d5SChris Mason  * and wake up anyone waiting to do a sync
232e02119d5SChris Mason  */
233143bede5SJeff Mahoney void btrfs_end_log_trans(struct btrfs_root *root)
234e02119d5SChris Mason {
2357237f183SYan Zheng 	if (atomic_dec_and_test(&root->log_writers)) {
236779adf0fSDavid Sterba 		/*
237779adf0fSDavid Sterba 		 * Implicit memory barrier after atomic_dec_and_test
238779adf0fSDavid Sterba 		 */
2397237f183SYan Zheng 		if (waitqueue_active(&root->log_writer_wait))
2407237f183SYan Zheng 			wake_up(&root->log_writer_wait);
2417237f183SYan Zheng 	}
242e02119d5SChris Mason }
243e02119d5SChris Mason 
244e02119d5SChris Mason 
245e02119d5SChris Mason /*
246e02119d5SChris Mason  * the walk control struct is used to pass state down the chain when
247e02119d5SChris Mason  * processing the log tree.  The stage field tells us which part
248e02119d5SChris Mason  * of the log tree processing we are currently doing.  The others
249e02119d5SChris Mason  * are state fields used for that specific part
250e02119d5SChris Mason  */
251e02119d5SChris Mason struct walk_control {
252e02119d5SChris Mason 	/* should we free the extent on disk when done?  This is used
253e02119d5SChris Mason 	 * at transaction commit time while freeing a log tree
254e02119d5SChris Mason 	 */
255e02119d5SChris Mason 	int free;
256e02119d5SChris Mason 
257e02119d5SChris Mason 	/* should we write out the extent buffer?  This is used
258e02119d5SChris Mason 	 * while flushing the log tree to disk during a sync
259e02119d5SChris Mason 	 */
260e02119d5SChris Mason 	int write;
261e02119d5SChris Mason 
262e02119d5SChris Mason 	/* should we wait for the extent buffer io to finish?  Also used
263e02119d5SChris Mason 	 * while flushing the log tree to disk for a sync
264e02119d5SChris Mason 	 */
265e02119d5SChris Mason 	int wait;
266e02119d5SChris Mason 
267e02119d5SChris Mason 	/* pin only walk, we record which extents on disk belong to the
268e02119d5SChris Mason 	 * log trees
269e02119d5SChris Mason 	 */
270e02119d5SChris Mason 	int pin;
271e02119d5SChris Mason 
272e02119d5SChris Mason 	/* what stage of the replay code we're currently in */
273e02119d5SChris Mason 	int stage;
274e02119d5SChris Mason 
275e02119d5SChris Mason 	/* the root we are currently replaying */
276e02119d5SChris Mason 	struct btrfs_root *replay_dest;
277e02119d5SChris Mason 
278e02119d5SChris Mason 	/* the trans handle for the current replay */
279e02119d5SChris Mason 	struct btrfs_trans_handle *trans;
280e02119d5SChris Mason 
281e02119d5SChris Mason 	/* the function that gets used to process blocks we find in the
282e02119d5SChris Mason 	 * tree.  Note the extent_buffer might not be up to date when it is
283e02119d5SChris Mason 	 * passed in, and it must be checked or read if you need the data
284e02119d5SChris Mason 	 * inside it
285e02119d5SChris Mason 	 */
286e02119d5SChris Mason 	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
287e02119d5SChris Mason 			    struct walk_control *wc, u64 gen);
288e02119d5SChris Mason };
289e02119d5SChris Mason 
290e02119d5SChris Mason /*
291e02119d5SChris Mason  * process_func used to pin down extents, write them or wait on them
292e02119d5SChris Mason  */
293e02119d5SChris Mason static int process_one_buffer(struct btrfs_root *log,
294e02119d5SChris Mason 			      struct extent_buffer *eb,
295e02119d5SChris Mason 			      struct walk_control *wc, u64 gen)
296e02119d5SChris Mason {
2970b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
298b50c6e25SJosef Bacik 	int ret = 0;
299b50c6e25SJosef Bacik 
3008c2a1a30SJosef Bacik 	/*
3018c2a1a30SJosef Bacik 	 * If this fs is mixed then we need to be able to process the leaves to
3028c2a1a30SJosef Bacik 	 * pin down any logged extents, so we have to read the block.
3038c2a1a30SJosef Bacik 	 */
3040b246afaSJeff Mahoney 	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
3058c2a1a30SJosef Bacik 		ret = btrfs_read_buffer(eb, gen);
3068c2a1a30SJosef Bacik 		if (ret)
3078c2a1a30SJosef Bacik 			return ret;
3088c2a1a30SJosef Bacik 	}
3098c2a1a30SJosef Bacik 
31004018de5SJosef Bacik 	if (wc->pin)
3112ff7e61eSJeff Mahoney 		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
3122ff7e61eSJeff Mahoney 						      eb->len);
313e02119d5SChris Mason 
314b50c6e25SJosef Bacik 	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
3158c2a1a30SJosef Bacik 		if (wc->pin && btrfs_header_level(eb) == 0)
3162ff7e61eSJeff Mahoney 			ret = btrfs_exclude_logged_extents(fs_info, eb);
317e02119d5SChris Mason 		if (wc->write)
318e02119d5SChris Mason 			btrfs_write_tree_block(eb);
319e02119d5SChris Mason 		if (wc->wait)
320e02119d5SChris Mason 			btrfs_wait_tree_block_writeback(eb);
321e02119d5SChris Mason 	}
322b50c6e25SJosef Bacik 	return ret;
323e02119d5SChris Mason }
324e02119d5SChris Mason 
325e02119d5SChris Mason /*
326e02119d5SChris Mason  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
327e02119d5SChris Mason  * to the src data we are copying out.
328e02119d5SChris Mason  *
329e02119d5SChris Mason  * root is the tree we are copying into, and path is a scratch
330e02119d5SChris Mason  * path for use in this function (it should be released on entry and
331e02119d5SChris Mason  * will be released on exit).
332e02119d5SChris Mason  *
333e02119d5SChris Mason  * If the key is already in the destination tree the existing item is
334e02119d5SChris Mason  * overwritten.  If the existing item isn't big enough, it is extended.
335e02119d5SChris Mason  * If it is too large, it is truncated.
336e02119d5SChris Mason  *
337e02119d5SChris Mason  * If the key isn't in the destination yet, a new item is inserted.
338e02119d5SChris Mason  */
339e02119d5SChris Mason static noinline int overwrite_item(struct btrfs_trans_handle *trans,
340e02119d5SChris Mason 				   struct btrfs_root *root,
341e02119d5SChris Mason 				   struct btrfs_path *path,
342e02119d5SChris Mason 				   struct extent_buffer *eb, int slot,
343e02119d5SChris Mason 				   struct btrfs_key *key)
344e02119d5SChris Mason {
3452ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
346e02119d5SChris Mason 	int ret;
347e02119d5SChris Mason 	u32 item_size;
348e02119d5SChris Mason 	u64 saved_i_size = 0;
349e02119d5SChris Mason 	int save_old_i_size = 0;
350e02119d5SChris Mason 	unsigned long src_ptr;
351e02119d5SChris Mason 	unsigned long dst_ptr;
352e02119d5SChris Mason 	int overwrite_root = 0;
3534bc4bee4SJosef Bacik 	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
354e02119d5SChris Mason 
355e02119d5SChris Mason 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
356e02119d5SChris Mason 		overwrite_root = 1;
357e02119d5SChris Mason 
358e02119d5SChris Mason 	item_size = btrfs_item_size_nr(eb, slot);
359e02119d5SChris Mason 	src_ptr = btrfs_item_ptr_offset(eb, slot);
360e02119d5SChris Mason 
361e02119d5SChris Mason 	/* look for the key in the destination tree */
362e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
3634bc4bee4SJosef Bacik 	if (ret < 0)
3644bc4bee4SJosef Bacik 		return ret;
3654bc4bee4SJosef Bacik 
366e02119d5SChris Mason 	if (ret == 0) {
367e02119d5SChris Mason 		char *src_copy;
368e02119d5SChris Mason 		char *dst_copy;
369e02119d5SChris Mason 		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
370e02119d5SChris Mason 						  path->slots[0]);
371e02119d5SChris Mason 		if (dst_size != item_size)
372e02119d5SChris Mason 			goto insert;
373e02119d5SChris Mason 
374e02119d5SChris Mason 		if (item_size == 0) {
375b3b4aa74SDavid Sterba 			btrfs_release_path(path);
376e02119d5SChris Mason 			return 0;
377e02119d5SChris Mason 		}
378e02119d5SChris Mason 		dst_copy = kmalloc(item_size, GFP_NOFS);
379e02119d5SChris Mason 		src_copy = kmalloc(item_size, GFP_NOFS);
3802a29edc6Sliubo 		if (!dst_copy || !src_copy) {
381b3b4aa74SDavid Sterba 			btrfs_release_path(path);
3822a29edc6Sliubo 			kfree(dst_copy);
3832a29edc6Sliubo 			kfree(src_copy);
3842a29edc6Sliubo 			return -ENOMEM;
3852a29edc6Sliubo 		}
386e02119d5SChris Mason 
387e02119d5SChris Mason 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
388e02119d5SChris Mason 
389e02119d5SChris Mason 		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
390e02119d5SChris Mason 		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
391e02119d5SChris Mason 				   item_size);
392e02119d5SChris Mason 		ret = memcmp(dst_copy, src_copy, item_size);
393e02119d5SChris Mason 
394e02119d5SChris Mason 		kfree(dst_copy);
395e02119d5SChris Mason 		kfree(src_copy);
396e02119d5SChris Mason 		/*
397e02119d5SChris Mason 		 * they have the same contents, just return, this saves
398e02119d5SChris Mason 		 * us from cowing blocks in the destination tree and doing
399e02119d5SChris Mason 		 * extra writes that may not have been done by a previous
400e02119d5SChris Mason 		 * sync
401e02119d5SChris Mason 		 */
402e02119d5SChris Mason 		if (ret == 0) {
403b3b4aa74SDavid Sterba 			btrfs_release_path(path);
404e02119d5SChris Mason 			return 0;
405e02119d5SChris Mason 		}
406e02119d5SChris Mason 
4074bc4bee4SJosef Bacik 		/*
4084bc4bee4SJosef Bacik 		 * We need to load the old nbytes into the inode so when we
4094bc4bee4SJosef Bacik 		 * replay the extents we've logged we get the right nbytes.
4104bc4bee4SJosef Bacik 		 */
4114bc4bee4SJosef Bacik 		if (inode_item) {
4124bc4bee4SJosef Bacik 			struct btrfs_inode_item *item;
4134bc4bee4SJosef Bacik 			u64 nbytes;
414d555438bSJosef Bacik 			u32 mode;
4154bc4bee4SJosef Bacik 
4164bc4bee4SJosef Bacik 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4174bc4bee4SJosef Bacik 					      struct btrfs_inode_item);
4184bc4bee4SJosef Bacik 			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
4194bc4bee4SJosef Bacik 			item = btrfs_item_ptr(eb, slot,
4204bc4bee4SJosef Bacik 					      struct btrfs_inode_item);
4214bc4bee4SJosef Bacik 			btrfs_set_inode_nbytes(eb, item, nbytes);
422d555438bSJosef Bacik 
423d555438bSJosef Bacik 			/*
424d555438bSJosef Bacik 			 * If this is a directory we need to reset the i_size to
425d555438bSJosef Bacik 			 * 0 so that we can set it up properly when replaying
426d555438bSJosef Bacik 			 * the rest of the items in this log.
427d555438bSJosef Bacik 			 */
428d555438bSJosef Bacik 			mode = btrfs_inode_mode(eb, item);
429d555438bSJosef Bacik 			if (S_ISDIR(mode))
430d555438bSJosef Bacik 				btrfs_set_inode_size(eb, item, 0);
4314bc4bee4SJosef Bacik 		}
4324bc4bee4SJosef Bacik 	} else if (inode_item) {
4334bc4bee4SJosef Bacik 		struct btrfs_inode_item *item;
434d555438bSJosef Bacik 		u32 mode;
4354bc4bee4SJosef Bacik 
4364bc4bee4SJosef Bacik 		/*
4374bc4bee4SJosef Bacik 		 * New inode, set nbytes to 0 so that the nbytes comes out
4384bc4bee4SJosef Bacik 		 * properly when we replay the extents.
4394bc4bee4SJosef Bacik 		 */
4404bc4bee4SJosef Bacik 		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
4414bc4bee4SJosef Bacik 		btrfs_set_inode_nbytes(eb, item, 0);
442d555438bSJosef Bacik 
443d555438bSJosef Bacik 		/*
444d555438bSJosef Bacik 		 * If this is a directory we need to reset the i_size to 0 so
445d555438bSJosef Bacik 		 * that we can set it up properly when replaying the rest of
446d555438bSJosef Bacik 		 * the items in this log.
447d555438bSJosef Bacik 		 */
448d555438bSJosef Bacik 		mode = btrfs_inode_mode(eb, item);
449d555438bSJosef Bacik 		if (S_ISDIR(mode))
450d555438bSJosef Bacik 			btrfs_set_inode_size(eb, item, 0);
451e02119d5SChris Mason 	}
452e02119d5SChris Mason insert:
453b3b4aa74SDavid Sterba 	btrfs_release_path(path);
454e02119d5SChris Mason 	/* try to insert the key into the destination tree */
455df8d116fSFilipe Manana 	path->skip_release_on_error = 1;
456e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path,
457e02119d5SChris Mason 				      key, item_size);
458df8d116fSFilipe Manana 	path->skip_release_on_error = 0;
459e02119d5SChris Mason 
460e02119d5SChris Mason 	/* make sure any existing item is the correct size */
461df8d116fSFilipe Manana 	if (ret == -EEXIST || ret == -EOVERFLOW) {
462e02119d5SChris Mason 		u32 found_size;
463e02119d5SChris Mason 		found_size = btrfs_item_size_nr(path->nodes[0],
464e02119d5SChris Mason 						path->slots[0]);
465143bede5SJeff Mahoney 		if (found_size > item_size)
4662ff7e61eSJeff Mahoney 			btrfs_truncate_item(fs_info, path, item_size, 1);
467143bede5SJeff Mahoney 		else if (found_size < item_size)
4682ff7e61eSJeff Mahoney 			btrfs_extend_item(fs_info, path,
46987b29b20SYan Zheng 					  item_size - found_size);
470e02119d5SChris Mason 	} else if (ret) {
4714a500fd1SYan, Zheng 		return ret;
472e02119d5SChris Mason 	}
473e02119d5SChris Mason 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
474e02119d5SChris Mason 					path->slots[0]);
475e02119d5SChris Mason 
476e02119d5SChris Mason 	/* don't overwrite an existing inode if the generation number
477e02119d5SChris Mason 	 * was logged as zero.  This is done when the tree logging code
478e02119d5SChris Mason 	 * is just logging an inode to make sure it exists after recovery.
479e02119d5SChris Mason 	 *
480e02119d5SChris Mason 	 * Also, don't overwrite i_size on directories during replay.
481e02119d5SChris Mason 	 * log replay inserts and removes directory items based on the
482e02119d5SChris Mason 	 * state of the tree found in the subvolume, and i_size is modified
483e02119d5SChris Mason 	 * as it goes
484e02119d5SChris Mason 	 */
485e02119d5SChris Mason 	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
486e02119d5SChris Mason 		struct btrfs_inode_item *src_item;
487e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
488e02119d5SChris Mason 
489e02119d5SChris Mason 		src_item = (struct btrfs_inode_item *)src_ptr;
490e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
491e02119d5SChris Mason 
4921a4bcf47SFilipe Manana 		if (btrfs_inode_generation(eb, src_item) == 0) {
4931a4bcf47SFilipe Manana 			struct extent_buffer *dst_eb = path->nodes[0];
4942f2ff0eeSFilipe Manana 			const u64 ino_size = btrfs_inode_size(eb, src_item);
4951a4bcf47SFilipe Manana 
4962f2ff0eeSFilipe Manana 			/*
4972f2ff0eeSFilipe Manana 			 * For regular files an ino_size == 0 is used only when
4982f2ff0eeSFilipe Manana 			 * logging that an inode exists, as part of a directory
4992f2ff0eeSFilipe Manana 			 * fsync, and the inode wasn't fsynced before. In this
5002f2ff0eeSFilipe Manana 			 * case don't set the size of the inode in the fs/subvol
5012f2ff0eeSFilipe Manana 			 * tree, otherwise we would be throwing valid data away.
5022f2ff0eeSFilipe Manana 			 */
5031a4bcf47SFilipe Manana 			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
5042f2ff0eeSFilipe Manana 			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
5052f2ff0eeSFilipe Manana 			    ino_size != 0) {
5061a4bcf47SFilipe Manana 				struct btrfs_map_token token;
5071a4bcf47SFilipe Manana 
5081a4bcf47SFilipe Manana 				btrfs_init_map_token(&token);
5091a4bcf47SFilipe Manana 				btrfs_set_token_inode_size(dst_eb, dst_item,
5101a4bcf47SFilipe Manana 							   ino_size, &token);
5111a4bcf47SFilipe Manana 			}
512e02119d5SChris Mason 			goto no_copy;
5131a4bcf47SFilipe Manana 		}
514e02119d5SChris Mason 
515e02119d5SChris Mason 		if (overwrite_root &&
516e02119d5SChris Mason 		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
517e02119d5SChris Mason 		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
518e02119d5SChris Mason 			save_old_i_size = 1;
519e02119d5SChris Mason 			saved_i_size = btrfs_inode_size(path->nodes[0],
520e02119d5SChris Mason 							dst_item);
521e02119d5SChris Mason 		}
522e02119d5SChris Mason 	}
523e02119d5SChris Mason 
524e02119d5SChris Mason 	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
525e02119d5SChris Mason 			   src_ptr, item_size);
526e02119d5SChris Mason 
527e02119d5SChris Mason 	if (save_old_i_size) {
528e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
529e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
530e02119d5SChris Mason 		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
531e02119d5SChris Mason 	}
532e02119d5SChris Mason 
533e02119d5SChris Mason 	/* make sure the generation is filled in */
534e02119d5SChris Mason 	if (key->type == BTRFS_INODE_ITEM_KEY) {
535e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
536e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
537e02119d5SChris Mason 		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
538e02119d5SChris Mason 			btrfs_set_inode_generation(path->nodes[0], dst_item,
539e02119d5SChris Mason 						   trans->transid);
540e02119d5SChris Mason 		}
541e02119d5SChris Mason 	}
542e02119d5SChris Mason no_copy:
543e02119d5SChris Mason 	btrfs_mark_buffer_dirty(path->nodes[0]);
544b3b4aa74SDavid Sterba 	btrfs_release_path(path);
545e02119d5SChris Mason 	return 0;
546e02119d5SChris Mason }
547e02119d5SChris Mason 
548e02119d5SChris Mason /*
549e02119d5SChris Mason  * simple helper to read an inode off the disk from a given root
550e02119d5SChris Mason  * This can only be called for subvolume roots and not for the log
551e02119d5SChris Mason  */
552e02119d5SChris Mason static noinline struct inode *read_one_inode(struct btrfs_root *root,
553e02119d5SChris Mason 					     u64 objectid)
554e02119d5SChris Mason {
5555d4f98a2SYan Zheng 	struct btrfs_key key;
556e02119d5SChris Mason 	struct inode *inode;
557e02119d5SChris Mason 
5585d4f98a2SYan Zheng 	key.objectid = objectid;
5595d4f98a2SYan Zheng 	key.type = BTRFS_INODE_ITEM_KEY;
5605d4f98a2SYan Zheng 	key.offset = 0;
56173f73415SJosef Bacik 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
5625d4f98a2SYan Zheng 	if (IS_ERR(inode)) {
5635d4f98a2SYan Zheng 		inode = NULL;
5645d4f98a2SYan Zheng 	} else if (is_bad_inode(inode)) {
565e02119d5SChris Mason 		iput(inode);
566e02119d5SChris Mason 		inode = NULL;
567e02119d5SChris Mason 	}
568e02119d5SChris Mason 	return inode;
569e02119d5SChris Mason }
570e02119d5SChris Mason 
571e02119d5SChris Mason /* replays a single extent in 'eb' at 'slot' with 'key' into the
572e02119d5SChris Mason  * subvolume 'root'.  path is released on entry and should be released
573e02119d5SChris Mason  * on exit.
574e02119d5SChris Mason  *
575e02119d5SChris Mason  * extents in the log tree have not been allocated out of the extent
576e02119d5SChris Mason  * tree yet.  So, this completes the allocation, taking a reference
577e02119d5SChris Mason  * as required if the extent already exists or creating a new extent
578e02119d5SChris Mason  * if it isn't in the extent allocation tree yet.
579e02119d5SChris Mason  *
580e02119d5SChris Mason  * The extent is inserted into the file, dropping any existing extents
581e02119d5SChris Mason  * from the file that overlap the new one.
582e02119d5SChris Mason  */
583e02119d5SChris Mason static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
584e02119d5SChris Mason 				      struct btrfs_root *root,
585e02119d5SChris Mason 				      struct btrfs_path *path,
586e02119d5SChris Mason 				      struct extent_buffer *eb, int slot,
587e02119d5SChris Mason 				      struct btrfs_key *key)
588e02119d5SChris Mason {
5890b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
590e02119d5SChris Mason 	int found_type;
591e02119d5SChris Mason 	u64 extent_end;
592e02119d5SChris Mason 	u64 start = key->offset;
5934bc4bee4SJosef Bacik 	u64 nbytes = 0;
594e02119d5SChris Mason 	struct btrfs_file_extent_item *item;
595e02119d5SChris Mason 	struct inode *inode = NULL;
596e02119d5SChris Mason 	unsigned long size;
597e02119d5SChris Mason 	int ret = 0;
598e02119d5SChris Mason 
599e02119d5SChris Mason 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
600e02119d5SChris Mason 	found_type = btrfs_file_extent_type(eb, item);
601e02119d5SChris Mason 
602d899e052SYan Zheng 	if (found_type == BTRFS_FILE_EXTENT_REG ||
6034bc4bee4SJosef Bacik 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6044bc4bee4SJosef Bacik 		nbytes = btrfs_file_extent_num_bytes(eb, item);
6054bc4bee4SJosef Bacik 		extent_end = start + nbytes;
6064bc4bee4SJosef Bacik 
6074bc4bee4SJosef Bacik 		/*
6084bc4bee4SJosef Bacik 		 * We don't add to the inodes nbytes if we are prealloc or a
6094bc4bee4SJosef Bacik 		 * hole.
6104bc4bee4SJosef Bacik 		 */
6114bc4bee4SJosef Bacik 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6124bc4bee4SJosef Bacik 			nbytes = 0;
6134bc4bee4SJosef Bacik 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
614514ac8adSChris Mason 		size = btrfs_file_extent_inline_len(eb, slot, item);
6154bc4bee4SJosef Bacik 		nbytes = btrfs_file_extent_ram_bytes(eb, item);
616da17066cSJeff Mahoney 		extent_end = ALIGN(start + size,
6170b246afaSJeff Mahoney 				   fs_info->sectorsize);
618e02119d5SChris Mason 	} else {
619e02119d5SChris Mason 		ret = 0;
620e02119d5SChris Mason 		goto out;
621e02119d5SChris Mason 	}
622e02119d5SChris Mason 
623e02119d5SChris Mason 	inode = read_one_inode(root, key->objectid);
624e02119d5SChris Mason 	if (!inode) {
625e02119d5SChris Mason 		ret = -EIO;
626e02119d5SChris Mason 		goto out;
627e02119d5SChris Mason 	}
628e02119d5SChris Mason 
629e02119d5SChris Mason 	/*
630e02119d5SChris Mason 	 * first check to see if we already have this extent in the
631e02119d5SChris Mason 	 * file.  This must be done before the btrfs_drop_extents run
632e02119d5SChris Mason 	 * so we don't try to drop this extent.
633e02119d5SChris Mason 	 */
6344a0cc7caSNikolay Borisov 	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(BTRFS_I(inode)),
635e02119d5SChris Mason 				       start, 0);
636e02119d5SChris Mason 
637d899e052SYan Zheng 	if (ret == 0 &&
638d899e052SYan Zheng 	    (found_type == BTRFS_FILE_EXTENT_REG ||
639d899e052SYan Zheng 	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
640e02119d5SChris Mason 		struct btrfs_file_extent_item cmp1;
641e02119d5SChris Mason 		struct btrfs_file_extent_item cmp2;
642e02119d5SChris Mason 		struct btrfs_file_extent_item *existing;
643e02119d5SChris Mason 		struct extent_buffer *leaf;
644e02119d5SChris Mason 
645e02119d5SChris Mason 		leaf = path->nodes[0];
646e02119d5SChris Mason 		existing = btrfs_item_ptr(leaf, path->slots[0],
647e02119d5SChris Mason 					  struct btrfs_file_extent_item);
648e02119d5SChris Mason 
649e02119d5SChris Mason 		read_extent_buffer(eb, &cmp1, (unsigned long)item,
650e02119d5SChris Mason 				   sizeof(cmp1));
651e02119d5SChris Mason 		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
652e02119d5SChris Mason 				   sizeof(cmp2));
653e02119d5SChris Mason 
654e02119d5SChris Mason 		/*
655e02119d5SChris Mason 		 * we already have a pointer to this exact extent,
656e02119d5SChris Mason 		 * we don't have to do anything
657e02119d5SChris Mason 		 */
658e02119d5SChris Mason 		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
659b3b4aa74SDavid Sterba 			btrfs_release_path(path);
660e02119d5SChris Mason 			goto out;
661e02119d5SChris Mason 		}
662e02119d5SChris Mason 	}
663b3b4aa74SDavid Sterba 	btrfs_release_path(path);
664e02119d5SChris Mason 
665e02119d5SChris Mason 	/* drop any overlapping extents */
6662671485dSJosef Bacik 	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
6673650860bSJosef Bacik 	if (ret)
6683650860bSJosef Bacik 		goto out;
669e02119d5SChris Mason 
67007d400a6SYan Zheng 	if (found_type == BTRFS_FILE_EXTENT_REG ||
67107d400a6SYan Zheng 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6725d4f98a2SYan Zheng 		u64 offset;
67307d400a6SYan Zheng 		unsigned long dest_offset;
67407d400a6SYan Zheng 		struct btrfs_key ins;
67507d400a6SYan Zheng 
67607d400a6SYan Zheng 		ret = btrfs_insert_empty_item(trans, root, path, key,
67707d400a6SYan Zheng 					      sizeof(*item));
6783650860bSJosef Bacik 		if (ret)
6793650860bSJosef Bacik 			goto out;
68007d400a6SYan Zheng 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
68107d400a6SYan Zheng 						    path->slots[0]);
68207d400a6SYan Zheng 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
68307d400a6SYan Zheng 				(unsigned long)item,  sizeof(*item));
68407d400a6SYan Zheng 
68507d400a6SYan Zheng 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
68607d400a6SYan Zheng 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
68707d400a6SYan Zheng 		ins.type = BTRFS_EXTENT_ITEM_KEY;
6885d4f98a2SYan Zheng 		offset = key->offset - btrfs_file_extent_offset(eb, item);
68907d400a6SYan Zheng 
690df2c95f3SQu Wenruo 		/*
691df2c95f3SQu Wenruo 		 * Manually record dirty extent, as here we did a shallow
692df2c95f3SQu Wenruo 		 * file extent item copy and skip normal backref update,
693df2c95f3SQu Wenruo 		 * but modifying extent tree all by ourselves.
694df2c95f3SQu Wenruo 		 * So need to manually record dirty extent for qgroup,
695df2c95f3SQu Wenruo 		 * as the owner of the file extent changed from log tree
696df2c95f3SQu Wenruo 		 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
697df2c95f3SQu Wenruo 		 */
6980b246afaSJeff Mahoney 		ret = btrfs_qgroup_trace_extent(trans, fs_info,
699df2c95f3SQu Wenruo 				btrfs_file_extent_disk_bytenr(eb, item),
700df2c95f3SQu Wenruo 				btrfs_file_extent_disk_num_bytes(eb, item),
701df2c95f3SQu Wenruo 				GFP_NOFS);
702df2c95f3SQu Wenruo 		if (ret < 0)
703df2c95f3SQu Wenruo 			goto out;
704df2c95f3SQu Wenruo 
70507d400a6SYan Zheng 		if (ins.objectid > 0) {
70607d400a6SYan Zheng 			u64 csum_start;
70707d400a6SYan Zheng 			u64 csum_end;
70807d400a6SYan Zheng 			LIST_HEAD(ordered_sums);
70907d400a6SYan Zheng 			/*
71007d400a6SYan Zheng 			 * is this extent already allocated in the extent
71107d400a6SYan Zheng 			 * allocation tree?  If so, just add a reference
71207d400a6SYan Zheng 			 */
7132ff7e61eSJeff Mahoney 			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
71407d400a6SYan Zheng 						ins.offset);
71507d400a6SYan Zheng 			if (ret == 0) {
7162ff7e61eSJeff Mahoney 				ret = btrfs_inc_extent_ref(trans, fs_info,
71707d400a6SYan Zheng 						ins.objectid, ins.offset,
7185d4f98a2SYan Zheng 						0, root->root_key.objectid,
719b06c4bf5SFilipe Manana 						key->objectid, offset);
720b50c6e25SJosef Bacik 				if (ret)
721b50c6e25SJosef Bacik 					goto out;
72207d400a6SYan Zheng 			} else {
72307d400a6SYan Zheng 				/*
72407d400a6SYan Zheng 				 * insert the extent pointer in the extent
72507d400a6SYan Zheng 				 * allocation tree
72607d400a6SYan Zheng 				 */
7275d4f98a2SYan Zheng 				ret = btrfs_alloc_logged_file_extent(trans,
7282ff7e61eSJeff Mahoney 						fs_info,
7292ff7e61eSJeff Mahoney 						root->root_key.objectid,
7305d4f98a2SYan Zheng 						key->objectid, offset, &ins);
731b50c6e25SJosef Bacik 				if (ret)
732b50c6e25SJosef Bacik 					goto out;
73307d400a6SYan Zheng 			}
734b3b4aa74SDavid Sterba 			btrfs_release_path(path);
73507d400a6SYan Zheng 
73607d400a6SYan Zheng 			if (btrfs_file_extent_compression(eb, item)) {
73707d400a6SYan Zheng 				csum_start = ins.objectid;
73807d400a6SYan Zheng 				csum_end = csum_start + ins.offset;
73907d400a6SYan Zheng 			} else {
74007d400a6SYan Zheng 				csum_start = ins.objectid +
74107d400a6SYan Zheng 					btrfs_file_extent_offset(eb, item);
74207d400a6SYan Zheng 				csum_end = csum_start +
74307d400a6SYan Zheng 					btrfs_file_extent_num_bytes(eb, item);
74407d400a6SYan Zheng 			}
74507d400a6SYan Zheng 
74607d400a6SYan Zheng 			ret = btrfs_lookup_csums_range(root->log_root,
74707d400a6SYan Zheng 						csum_start, csum_end - 1,
748a2de733cSArne Jansen 						&ordered_sums, 0);
7493650860bSJosef Bacik 			if (ret)
7503650860bSJosef Bacik 				goto out;
751b84b8390SFilipe Manana 			/*
752b84b8390SFilipe Manana 			 * Now delete all existing cums in the csum root that
753b84b8390SFilipe Manana 			 * cover our range. We do this because we can have an
754b84b8390SFilipe Manana 			 * extent that is completely referenced by one file
755b84b8390SFilipe Manana 			 * extent item and partially referenced by another
756b84b8390SFilipe Manana 			 * file extent item (like after using the clone or
757b84b8390SFilipe Manana 			 * extent_same ioctls). In this case if we end up doing
758b84b8390SFilipe Manana 			 * the replay of the one that partially references the
759b84b8390SFilipe Manana 			 * extent first, and we do not do the csum deletion
760b84b8390SFilipe Manana 			 * below, we can get 2 csum items in the csum tree that
761b84b8390SFilipe Manana 			 * overlap each other. For example, imagine our log has
762b84b8390SFilipe Manana 			 * the two following file extent items:
763b84b8390SFilipe Manana 			 *
764b84b8390SFilipe Manana 			 * key (257 EXTENT_DATA 409600)
765b84b8390SFilipe Manana 			 *     extent data disk byte 12845056 nr 102400
766b84b8390SFilipe Manana 			 *     extent data offset 20480 nr 20480 ram 102400
767b84b8390SFilipe Manana 			 *
768b84b8390SFilipe Manana 			 * key (257 EXTENT_DATA 819200)
769b84b8390SFilipe Manana 			 *     extent data disk byte 12845056 nr 102400
770b84b8390SFilipe Manana 			 *     extent data offset 0 nr 102400 ram 102400
771b84b8390SFilipe Manana 			 *
772b84b8390SFilipe Manana 			 * Where the second one fully references the 100K extent
773b84b8390SFilipe Manana 			 * that starts at disk byte 12845056, and the log tree
774b84b8390SFilipe Manana 			 * has a single csum item that covers the entire range
775b84b8390SFilipe Manana 			 * of the extent:
776b84b8390SFilipe Manana 			 *
777b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
778b84b8390SFilipe Manana 			 *
779b84b8390SFilipe Manana 			 * After the first file extent item is replayed, the
780b84b8390SFilipe Manana 			 * csum tree gets the following csum item:
781b84b8390SFilipe Manana 			 *
782b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
783b84b8390SFilipe Manana 			 *
784b84b8390SFilipe Manana 			 * Which covers the 20K sub-range starting at offset 20K
785b84b8390SFilipe Manana 			 * of our extent. Now when we replay the second file
786b84b8390SFilipe Manana 			 * extent item, if we do not delete existing csum items
787b84b8390SFilipe Manana 			 * that cover any of its blocks, we end up getting two
788b84b8390SFilipe Manana 			 * csum items in our csum tree that overlap each other:
789b84b8390SFilipe Manana 			 *
790b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
791b84b8390SFilipe Manana 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
792b84b8390SFilipe Manana 			 *
793b84b8390SFilipe Manana 			 * Which is a problem, because after this anyone trying
794b84b8390SFilipe Manana 			 * to lookup up for the checksum of any block of our
795b84b8390SFilipe Manana 			 * extent starting at an offset of 40K or higher, will
796b84b8390SFilipe Manana 			 * end up looking at the second csum item only, which
797b84b8390SFilipe Manana 			 * does not contain the checksum for any block starting
798b84b8390SFilipe Manana 			 * at offset 40K or higher of our extent.
799b84b8390SFilipe Manana 			 */
80007d400a6SYan Zheng 			while (!list_empty(&ordered_sums)) {
80107d400a6SYan Zheng 				struct btrfs_ordered_sum *sums;
80207d400a6SYan Zheng 				sums = list_entry(ordered_sums.next,
80307d400a6SYan Zheng 						struct btrfs_ordered_sum,
80407d400a6SYan Zheng 						list);
8053650860bSJosef Bacik 				if (!ret)
8060b246afaSJeff Mahoney 					ret = btrfs_del_csums(trans, fs_info,
807b84b8390SFilipe Manana 							      sums->bytenr,
808b84b8390SFilipe Manana 							      sums->len);
809b84b8390SFilipe Manana 				if (!ret)
81007d400a6SYan Zheng 					ret = btrfs_csum_file_blocks(trans,
8110b246afaSJeff Mahoney 						fs_info->csum_root, sums);
81207d400a6SYan Zheng 				list_del(&sums->list);
81307d400a6SYan Zheng 				kfree(sums);
81407d400a6SYan Zheng 			}
8153650860bSJosef Bacik 			if (ret)
8163650860bSJosef Bacik 				goto out;
81707d400a6SYan Zheng 		} else {
818b3b4aa74SDavid Sterba 			btrfs_release_path(path);
81907d400a6SYan Zheng 		}
82007d400a6SYan Zheng 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
82107d400a6SYan Zheng 		/* inline extents are easy, we just overwrite them */
822e02119d5SChris Mason 		ret = overwrite_item(trans, root, path, eb, slot, key);
8233650860bSJosef Bacik 		if (ret)
8243650860bSJosef Bacik 			goto out;
82507d400a6SYan Zheng 	}
826e02119d5SChris Mason 
8274bc4bee4SJosef Bacik 	inode_add_bytes(inode, nbytes);
828b9959295STsutomu Itoh 	ret = btrfs_update_inode(trans, root, inode);
829e02119d5SChris Mason out:
830e02119d5SChris Mason 	if (inode)
831e02119d5SChris Mason 		iput(inode);
832e02119d5SChris Mason 	return ret;
833e02119d5SChris Mason }
834e02119d5SChris Mason 
835e02119d5SChris Mason /*
836e02119d5SChris Mason  * when cleaning up conflicts between the directory names in the
837e02119d5SChris Mason  * subvolume, directory names in the log and directory names in the
838e02119d5SChris Mason  * inode back references, we may have to unlink inodes from directories.
839e02119d5SChris Mason  *
840e02119d5SChris Mason  * This is a helper function to do the unlink of a specific directory
841e02119d5SChris Mason  * item
842e02119d5SChris Mason  */
843e02119d5SChris Mason static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
844e02119d5SChris Mason 				      struct btrfs_root *root,
845e02119d5SChris Mason 				      struct btrfs_path *path,
846e02119d5SChris Mason 				      struct inode *dir,
847e02119d5SChris Mason 				      struct btrfs_dir_item *di)
848e02119d5SChris Mason {
8492ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
850e02119d5SChris Mason 	struct inode *inode;
851e02119d5SChris Mason 	char *name;
852e02119d5SChris Mason 	int name_len;
853e02119d5SChris Mason 	struct extent_buffer *leaf;
854e02119d5SChris Mason 	struct btrfs_key location;
855e02119d5SChris Mason 	int ret;
856e02119d5SChris Mason 
857e02119d5SChris Mason 	leaf = path->nodes[0];
858e02119d5SChris Mason 
859e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
860e02119d5SChris Mason 	name_len = btrfs_dir_name_len(leaf, di);
861e02119d5SChris Mason 	name = kmalloc(name_len, GFP_NOFS);
8622a29edc6Sliubo 	if (!name)
8632a29edc6Sliubo 		return -ENOMEM;
8642a29edc6Sliubo 
865e02119d5SChris Mason 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
866b3b4aa74SDavid Sterba 	btrfs_release_path(path);
867e02119d5SChris Mason 
868e02119d5SChris Mason 	inode = read_one_inode(root, location.objectid);
869c00e9493STsutomu Itoh 	if (!inode) {
8703650860bSJosef Bacik 		ret = -EIO;
8713650860bSJosef Bacik 		goto out;
872c00e9493STsutomu Itoh 	}
873e02119d5SChris Mason 
874ec051c0fSYan Zheng 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
8753650860bSJosef Bacik 	if (ret)
8763650860bSJosef Bacik 		goto out;
87712fcfd22SChris Mason 
878e02119d5SChris Mason 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
8793650860bSJosef Bacik 	if (ret)
8803650860bSJosef Bacik 		goto out;
881ada9af21SFilipe David Borba Manana 	else
8822ff7e61eSJeff Mahoney 		ret = btrfs_run_delayed_items(trans, fs_info);
8833650860bSJosef Bacik out:
8843650860bSJosef Bacik 	kfree(name);
8853650860bSJosef Bacik 	iput(inode);
886e02119d5SChris Mason 	return ret;
887e02119d5SChris Mason }
888e02119d5SChris Mason 
889e02119d5SChris Mason /*
890e02119d5SChris Mason  * helper function to see if a given name and sequence number found
891e02119d5SChris Mason  * in an inode back reference are already in a directory and correctly
892e02119d5SChris Mason  * point to this inode
893e02119d5SChris Mason  */
894e02119d5SChris Mason static noinline int inode_in_dir(struct btrfs_root *root,
895e02119d5SChris Mason 				 struct btrfs_path *path,
896e02119d5SChris Mason 				 u64 dirid, u64 objectid, u64 index,
897e02119d5SChris Mason 				 const char *name, int name_len)
898e02119d5SChris Mason {
899e02119d5SChris Mason 	struct btrfs_dir_item *di;
900e02119d5SChris Mason 	struct btrfs_key location;
901e02119d5SChris Mason 	int match = 0;
902e02119d5SChris Mason 
903e02119d5SChris Mason 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
904e02119d5SChris Mason 					 index, name, name_len, 0);
905e02119d5SChris Mason 	if (di && !IS_ERR(di)) {
906e02119d5SChris Mason 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
907e02119d5SChris Mason 		if (location.objectid != objectid)
908e02119d5SChris Mason 			goto out;
909e02119d5SChris Mason 	} else
910e02119d5SChris Mason 		goto out;
911b3b4aa74SDavid Sterba 	btrfs_release_path(path);
912e02119d5SChris Mason 
913e02119d5SChris Mason 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
914e02119d5SChris Mason 	if (di && !IS_ERR(di)) {
915e02119d5SChris Mason 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
916e02119d5SChris Mason 		if (location.objectid != objectid)
917e02119d5SChris Mason 			goto out;
918e02119d5SChris Mason 	} else
919e02119d5SChris Mason 		goto out;
920e02119d5SChris Mason 	match = 1;
921e02119d5SChris Mason out:
922b3b4aa74SDavid Sterba 	btrfs_release_path(path);
923e02119d5SChris Mason 	return match;
924e02119d5SChris Mason }
925e02119d5SChris Mason 
926e02119d5SChris Mason /*
927e02119d5SChris Mason  * helper function to check a log tree for a named back reference in
928e02119d5SChris Mason  * an inode.  This is used to decide if a back reference that is
929e02119d5SChris Mason  * found in the subvolume conflicts with what we find in the log.
930e02119d5SChris Mason  *
931e02119d5SChris Mason  * inode backreferences may have multiple refs in a single item,
932e02119d5SChris Mason  * during replay we process one reference at a time, and we don't
933e02119d5SChris Mason  * want to delete valid links to a file from the subvolume if that
934e02119d5SChris Mason  * link is also in the log.
935e02119d5SChris Mason  */
936e02119d5SChris Mason static noinline int backref_in_log(struct btrfs_root *log,
937e02119d5SChris Mason 				   struct btrfs_key *key,
938f186373fSMark Fasheh 				   u64 ref_objectid,
939df8d116fSFilipe Manana 				   const char *name, int namelen)
940e02119d5SChris Mason {
941e02119d5SChris Mason 	struct btrfs_path *path;
942e02119d5SChris Mason 	struct btrfs_inode_ref *ref;
943e02119d5SChris Mason 	unsigned long ptr;
944e02119d5SChris Mason 	unsigned long ptr_end;
945e02119d5SChris Mason 	unsigned long name_ptr;
946e02119d5SChris Mason 	int found_name_len;
947e02119d5SChris Mason 	int item_size;
948e02119d5SChris Mason 	int ret;
949e02119d5SChris Mason 	int match = 0;
950e02119d5SChris Mason 
951e02119d5SChris Mason 	path = btrfs_alloc_path();
9522a29edc6Sliubo 	if (!path)
9532a29edc6Sliubo 		return -ENOMEM;
9542a29edc6Sliubo 
955e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
956e02119d5SChris Mason 	if (ret != 0)
957e02119d5SChris Mason 		goto out;
958e02119d5SChris Mason 
959e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
960f186373fSMark Fasheh 
961f186373fSMark Fasheh 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
962f186373fSMark Fasheh 		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
963f186373fSMark Fasheh 						   name, namelen, NULL))
964f186373fSMark Fasheh 			match = 1;
965f186373fSMark Fasheh 
966f186373fSMark Fasheh 		goto out;
967f186373fSMark Fasheh 	}
968f186373fSMark Fasheh 
969f186373fSMark Fasheh 	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
970e02119d5SChris Mason 	ptr_end = ptr + item_size;
971e02119d5SChris Mason 	while (ptr < ptr_end) {
972e02119d5SChris Mason 		ref = (struct btrfs_inode_ref *)ptr;
973e02119d5SChris Mason 		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
974e02119d5SChris Mason 		if (found_name_len == namelen) {
975e02119d5SChris Mason 			name_ptr = (unsigned long)(ref + 1);
976e02119d5SChris Mason 			ret = memcmp_extent_buffer(path->nodes[0], name,
977e02119d5SChris Mason 						   name_ptr, namelen);
978e02119d5SChris Mason 			if (ret == 0) {
979e02119d5SChris Mason 				match = 1;
980e02119d5SChris Mason 				goto out;
981e02119d5SChris Mason 			}
982e02119d5SChris Mason 		}
983e02119d5SChris Mason 		ptr = (unsigned long)(ref + 1) + found_name_len;
984e02119d5SChris Mason 	}
985e02119d5SChris Mason out:
986e02119d5SChris Mason 	btrfs_free_path(path);
987e02119d5SChris Mason 	return match;
988e02119d5SChris Mason }
989e02119d5SChris Mason 
9905a1d7843SJan Schmidt static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
9915a1d7843SJan Schmidt 				  struct btrfs_root *root,
9925a1d7843SJan Schmidt 				  struct btrfs_path *path,
9935a1d7843SJan Schmidt 				  struct btrfs_root *log_root,
9945a1d7843SJan Schmidt 				  struct inode *dir, struct inode *inode,
9955a1d7843SJan Schmidt 				  struct extent_buffer *eb,
996f186373fSMark Fasheh 				  u64 inode_objectid, u64 parent_objectid,
997f186373fSMark Fasheh 				  u64 ref_index, char *name, int namelen,
998f186373fSMark Fasheh 				  int *search_done)
9995a1d7843SJan Schmidt {
10002ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
10015a1d7843SJan Schmidt 	int ret;
10025a1d7843SJan Schmidt 	char *victim_name;
10035a1d7843SJan Schmidt 	int victim_name_len;
1004f186373fSMark Fasheh 	struct extent_buffer *leaf;
1005f186373fSMark Fasheh 	struct btrfs_dir_item *di;
1006f186373fSMark Fasheh 	struct btrfs_key search_key;
1007f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1008f186373fSMark Fasheh 
1009f186373fSMark Fasheh again:
1010f186373fSMark Fasheh 	/* Search old style refs */
1011f186373fSMark Fasheh 	search_key.objectid = inode_objectid;
1012f186373fSMark Fasheh 	search_key.type = BTRFS_INODE_REF_KEY;
1013f186373fSMark Fasheh 	search_key.offset = parent_objectid;
1014f186373fSMark Fasheh 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1015f186373fSMark Fasheh 	if (ret == 0) {
10165a1d7843SJan Schmidt 		struct btrfs_inode_ref *victim_ref;
10175a1d7843SJan Schmidt 		unsigned long ptr;
10185a1d7843SJan Schmidt 		unsigned long ptr_end;
1019f186373fSMark Fasheh 
1020f186373fSMark Fasheh 		leaf = path->nodes[0];
10215a1d7843SJan Schmidt 
10225a1d7843SJan Schmidt 		/* are we trying to overwrite a back ref for the root directory
10235a1d7843SJan Schmidt 		 * if so, just jump out, we're done
10245a1d7843SJan Schmidt 		 */
1025f186373fSMark Fasheh 		if (search_key.objectid == search_key.offset)
10265a1d7843SJan Schmidt 			return 1;
10275a1d7843SJan Schmidt 
10285a1d7843SJan Schmidt 		/* check all the names in this back reference to see
10295a1d7843SJan Schmidt 		 * if they are in the log.  if so, we allow them to stay
10305a1d7843SJan Schmidt 		 * otherwise they must be unlinked as a conflict
10315a1d7843SJan Schmidt 		 */
10325a1d7843SJan Schmidt 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
10335a1d7843SJan Schmidt 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
10345a1d7843SJan Schmidt 		while (ptr < ptr_end) {
10355a1d7843SJan Schmidt 			victim_ref = (struct btrfs_inode_ref *)ptr;
10365a1d7843SJan Schmidt 			victim_name_len = btrfs_inode_ref_name_len(leaf,
10375a1d7843SJan Schmidt 								   victim_ref);
10385a1d7843SJan Schmidt 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
10393650860bSJosef Bacik 			if (!victim_name)
10403650860bSJosef Bacik 				return -ENOMEM;
10415a1d7843SJan Schmidt 
10425a1d7843SJan Schmidt 			read_extent_buffer(leaf, victim_name,
10435a1d7843SJan Schmidt 					   (unsigned long)(victim_ref + 1),
10445a1d7843SJan Schmidt 					   victim_name_len);
10455a1d7843SJan Schmidt 
1046f186373fSMark Fasheh 			if (!backref_in_log(log_root, &search_key,
1047f186373fSMark Fasheh 					    parent_objectid,
1048f186373fSMark Fasheh 					    victim_name,
10495a1d7843SJan Schmidt 					    victim_name_len)) {
10508b558c5fSZach Brown 				inc_nlink(inode);
10515a1d7843SJan Schmidt 				btrfs_release_path(path);
10525a1d7843SJan Schmidt 
10535a1d7843SJan Schmidt 				ret = btrfs_unlink_inode(trans, root, dir,
10545a1d7843SJan Schmidt 							 inode, victim_name,
10555a1d7843SJan Schmidt 							 victim_name_len);
1056f186373fSMark Fasheh 				kfree(victim_name);
10573650860bSJosef Bacik 				if (ret)
10583650860bSJosef Bacik 					return ret;
10592ff7e61eSJeff Mahoney 				ret = btrfs_run_delayed_items(trans, fs_info);
1060ada9af21SFilipe David Borba Manana 				if (ret)
1061ada9af21SFilipe David Borba Manana 					return ret;
1062f186373fSMark Fasheh 				*search_done = 1;
1063f186373fSMark Fasheh 				goto again;
10645a1d7843SJan Schmidt 			}
10655a1d7843SJan Schmidt 			kfree(victim_name);
1066f186373fSMark Fasheh 
10675a1d7843SJan Schmidt 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
10685a1d7843SJan Schmidt 		}
10695a1d7843SJan Schmidt 
10705a1d7843SJan Schmidt 		/*
10715a1d7843SJan Schmidt 		 * NOTE: we have searched root tree and checked the
1072bb7ab3b9SAdam Buchbinder 		 * corresponding ref, it does not need to check again.
10735a1d7843SJan Schmidt 		 */
10745a1d7843SJan Schmidt 		*search_done = 1;
10755a1d7843SJan Schmidt 	}
10765a1d7843SJan Schmidt 	btrfs_release_path(path);
10775a1d7843SJan Schmidt 
1078f186373fSMark Fasheh 	/* Same search but for extended refs */
1079f186373fSMark Fasheh 	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1080f186373fSMark Fasheh 					   inode_objectid, parent_objectid, 0,
1081f186373fSMark Fasheh 					   0);
1082f186373fSMark Fasheh 	if (!IS_ERR_OR_NULL(extref)) {
1083f186373fSMark Fasheh 		u32 item_size;
1084f186373fSMark Fasheh 		u32 cur_offset = 0;
1085f186373fSMark Fasheh 		unsigned long base;
1086f186373fSMark Fasheh 		struct inode *victim_parent;
1087f186373fSMark Fasheh 
1088f186373fSMark Fasheh 		leaf = path->nodes[0];
1089f186373fSMark Fasheh 
1090f186373fSMark Fasheh 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1091f186373fSMark Fasheh 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1092f186373fSMark Fasheh 
1093f186373fSMark Fasheh 		while (cur_offset < item_size) {
1094dd9ef135SQuentin Casasnovas 			extref = (struct btrfs_inode_extref *)(base + cur_offset);
1095f186373fSMark Fasheh 
1096f186373fSMark Fasheh 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1097f186373fSMark Fasheh 
1098f186373fSMark Fasheh 			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1099f186373fSMark Fasheh 				goto next;
1100f186373fSMark Fasheh 
1101f186373fSMark Fasheh 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
11023650860bSJosef Bacik 			if (!victim_name)
11033650860bSJosef Bacik 				return -ENOMEM;
1104f186373fSMark Fasheh 			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1105f186373fSMark Fasheh 					   victim_name_len);
1106f186373fSMark Fasheh 
1107f186373fSMark Fasheh 			search_key.objectid = inode_objectid;
1108f186373fSMark Fasheh 			search_key.type = BTRFS_INODE_EXTREF_KEY;
1109f186373fSMark Fasheh 			search_key.offset = btrfs_extref_hash(parent_objectid,
1110f186373fSMark Fasheh 							      victim_name,
1111f186373fSMark Fasheh 							      victim_name_len);
1112f186373fSMark Fasheh 			ret = 0;
1113f186373fSMark Fasheh 			if (!backref_in_log(log_root, &search_key,
1114f186373fSMark Fasheh 					    parent_objectid, victim_name,
1115f186373fSMark Fasheh 					    victim_name_len)) {
1116f186373fSMark Fasheh 				ret = -ENOENT;
1117f186373fSMark Fasheh 				victim_parent = read_one_inode(root,
1118f186373fSMark Fasheh 							       parent_objectid);
1119f186373fSMark Fasheh 				if (victim_parent) {
11208b558c5fSZach Brown 					inc_nlink(inode);
1121f186373fSMark Fasheh 					btrfs_release_path(path);
1122f186373fSMark Fasheh 
1123f186373fSMark Fasheh 					ret = btrfs_unlink_inode(trans, root,
1124f186373fSMark Fasheh 								 victim_parent,
1125f186373fSMark Fasheh 								 inode,
1126f186373fSMark Fasheh 								 victim_name,
1127f186373fSMark Fasheh 								 victim_name_len);
1128ada9af21SFilipe David Borba Manana 					if (!ret)
1129ada9af21SFilipe David Borba Manana 						ret = btrfs_run_delayed_items(
11302ff7e61eSJeff Mahoney 								  trans,
11312ff7e61eSJeff Mahoney 								  fs_info);
1132f186373fSMark Fasheh 				}
1133f186373fSMark Fasheh 				iput(victim_parent);
1134f186373fSMark Fasheh 				kfree(victim_name);
11353650860bSJosef Bacik 				if (ret)
11363650860bSJosef Bacik 					return ret;
1137f186373fSMark Fasheh 				*search_done = 1;
1138f186373fSMark Fasheh 				goto again;
1139f186373fSMark Fasheh 			}
1140f186373fSMark Fasheh 			kfree(victim_name);
11413650860bSJosef Bacik 			if (ret)
11423650860bSJosef Bacik 				return ret;
1143f186373fSMark Fasheh next:
1144f186373fSMark Fasheh 			cur_offset += victim_name_len + sizeof(*extref);
1145f186373fSMark Fasheh 		}
1146f186373fSMark Fasheh 		*search_done = 1;
1147f186373fSMark Fasheh 	}
1148f186373fSMark Fasheh 	btrfs_release_path(path);
1149f186373fSMark Fasheh 
11505a1d7843SJan Schmidt 	/* look for a conflicting sequence number */
11514a0cc7caSNikolay Borisov 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(BTRFS_I(dir)),
1152f186373fSMark Fasheh 					 ref_index, name, namelen, 0);
11535a1d7843SJan Schmidt 	if (di && !IS_ERR(di)) {
11545a1d7843SJan Schmidt 		ret = drop_one_dir_item(trans, root, path, dir, di);
11553650860bSJosef Bacik 		if (ret)
11563650860bSJosef Bacik 			return ret;
11575a1d7843SJan Schmidt 	}
11585a1d7843SJan Schmidt 	btrfs_release_path(path);
11595a1d7843SJan Schmidt 
11605a1d7843SJan Schmidt 	/* look for a conflicing name */
11614a0cc7caSNikolay Borisov 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(BTRFS_I(dir)),
11625a1d7843SJan Schmidt 				   name, namelen, 0);
11635a1d7843SJan Schmidt 	if (di && !IS_ERR(di)) {
11645a1d7843SJan Schmidt 		ret = drop_one_dir_item(trans, root, path, dir, di);
11653650860bSJosef Bacik 		if (ret)
11663650860bSJosef Bacik 			return ret;
11675a1d7843SJan Schmidt 	}
11685a1d7843SJan Schmidt 	btrfs_release_path(path);
11695a1d7843SJan Schmidt 
11705a1d7843SJan Schmidt 	return 0;
11715a1d7843SJan Schmidt }
1172e02119d5SChris Mason 
1173f186373fSMark Fasheh static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1174f186373fSMark Fasheh 			     u32 *namelen, char **name, u64 *index,
1175f186373fSMark Fasheh 			     u64 *parent_objectid)
1176f186373fSMark Fasheh {
1177f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1178f186373fSMark Fasheh 
1179f186373fSMark Fasheh 	extref = (struct btrfs_inode_extref *)ref_ptr;
1180f186373fSMark Fasheh 
1181f186373fSMark Fasheh 	*namelen = btrfs_inode_extref_name_len(eb, extref);
1182f186373fSMark Fasheh 	*name = kmalloc(*namelen, GFP_NOFS);
1183f186373fSMark Fasheh 	if (*name == NULL)
1184f186373fSMark Fasheh 		return -ENOMEM;
1185f186373fSMark Fasheh 
1186f186373fSMark Fasheh 	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1187f186373fSMark Fasheh 			   *namelen);
1188f186373fSMark Fasheh 
1189f186373fSMark Fasheh 	*index = btrfs_inode_extref_index(eb, extref);
1190f186373fSMark Fasheh 	if (parent_objectid)
1191f186373fSMark Fasheh 		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1192f186373fSMark Fasheh 
1193f186373fSMark Fasheh 	return 0;
1194f186373fSMark Fasheh }
1195f186373fSMark Fasheh 
1196f186373fSMark Fasheh static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1197f186373fSMark Fasheh 			  u32 *namelen, char **name, u64 *index)
1198f186373fSMark Fasheh {
1199f186373fSMark Fasheh 	struct btrfs_inode_ref *ref;
1200f186373fSMark Fasheh 
1201f186373fSMark Fasheh 	ref = (struct btrfs_inode_ref *)ref_ptr;
1202f186373fSMark Fasheh 
1203f186373fSMark Fasheh 	*namelen = btrfs_inode_ref_name_len(eb, ref);
1204f186373fSMark Fasheh 	*name = kmalloc(*namelen, GFP_NOFS);
1205f186373fSMark Fasheh 	if (*name == NULL)
1206f186373fSMark Fasheh 		return -ENOMEM;
1207f186373fSMark Fasheh 
1208f186373fSMark Fasheh 	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1209f186373fSMark Fasheh 
1210f186373fSMark Fasheh 	*index = btrfs_inode_ref_index(eb, ref);
1211f186373fSMark Fasheh 
1212f186373fSMark Fasheh 	return 0;
1213f186373fSMark Fasheh }
1214f186373fSMark Fasheh 
1215e02119d5SChris Mason /*
1216e02119d5SChris Mason  * replay one inode back reference item found in the log tree.
1217e02119d5SChris Mason  * eb, slot and key refer to the buffer and key found in the log tree.
1218e02119d5SChris Mason  * root is the destination we are replaying into, and path is for temp
1219e02119d5SChris Mason  * use by this function.  (it should be released on return).
1220e02119d5SChris Mason  */
1221e02119d5SChris Mason static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1222e02119d5SChris Mason 				  struct btrfs_root *root,
1223e02119d5SChris Mason 				  struct btrfs_root *log,
1224e02119d5SChris Mason 				  struct btrfs_path *path,
1225e02119d5SChris Mason 				  struct extent_buffer *eb, int slot,
1226e02119d5SChris Mason 				  struct btrfs_key *key)
1227e02119d5SChris Mason {
122803b2f08bSGeyslan G. Bem 	struct inode *dir = NULL;
122903b2f08bSGeyslan G. Bem 	struct inode *inode = NULL;
1230e02119d5SChris Mason 	unsigned long ref_ptr;
1231e02119d5SChris Mason 	unsigned long ref_end;
123203b2f08bSGeyslan G. Bem 	char *name = NULL;
123334f3e4f2Sliubo 	int namelen;
123434f3e4f2Sliubo 	int ret;
1235c622ae60Sliubo 	int search_done = 0;
1236f186373fSMark Fasheh 	int log_ref_ver = 0;
1237f186373fSMark Fasheh 	u64 parent_objectid;
1238f186373fSMark Fasheh 	u64 inode_objectid;
1239f46dbe3dSChris Mason 	u64 ref_index = 0;
1240f186373fSMark Fasheh 	int ref_struct_size;
1241f186373fSMark Fasheh 
1242f186373fSMark Fasheh 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1243f186373fSMark Fasheh 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1244f186373fSMark Fasheh 
1245f186373fSMark Fasheh 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1246f186373fSMark Fasheh 		struct btrfs_inode_extref *r;
1247f186373fSMark Fasheh 
1248f186373fSMark Fasheh 		ref_struct_size = sizeof(struct btrfs_inode_extref);
1249f186373fSMark Fasheh 		log_ref_ver = 1;
1250f186373fSMark Fasheh 		r = (struct btrfs_inode_extref *)ref_ptr;
1251f186373fSMark Fasheh 		parent_objectid = btrfs_inode_extref_parent(eb, r);
1252f186373fSMark Fasheh 	} else {
1253f186373fSMark Fasheh 		ref_struct_size = sizeof(struct btrfs_inode_ref);
1254f186373fSMark Fasheh 		parent_objectid = key->offset;
1255f186373fSMark Fasheh 	}
1256f186373fSMark Fasheh 	inode_objectid = key->objectid;
1257e02119d5SChris Mason 
1258e02119d5SChris Mason 	/*
1259e02119d5SChris Mason 	 * it is possible that we didn't log all the parent directories
1260e02119d5SChris Mason 	 * for a given inode.  If we don't find the dir, just don't
1261e02119d5SChris Mason 	 * copy the back ref in.  The link count fixup code will take
1262e02119d5SChris Mason 	 * care of the rest
1263e02119d5SChris Mason 	 */
1264f186373fSMark Fasheh 	dir = read_one_inode(root, parent_objectid);
126503b2f08bSGeyslan G. Bem 	if (!dir) {
126603b2f08bSGeyslan G. Bem 		ret = -ENOENT;
126703b2f08bSGeyslan G. Bem 		goto out;
126803b2f08bSGeyslan G. Bem 	}
1269e02119d5SChris Mason 
1270f186373fSMark Fasheh 	inode = read_one_inode(root, inode_objectid);
1271c00e9493STsutomu Itoh 	if (!inode) {
127203b2f08bSGeyslan G. Bem 		ret = -EIO;
127303b2f08bSGeyslan G. Bem 		goto out;
1274c00e9493STsutomu Itoh 	}
1275e02119d5SChris Mason 
12765a1d7843SJan Schmidt 	while (ref_ptr < ref_end) {
1277f186373fSMark Fasheh 		if (log_ref_ver) {
1278f186373fSMark Fasheh 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1279f186373fSMark Fasheh 						&ref_index, &parent_objectid);
1280f186373fSMark Fasheh 			/*
1281f186373fSMark Fasheh 			 * parent object can change from one array
1282f186373fSMark Fasheh 			 * item to another.
1283f186373fSMark Fasheh 			 */
1284f186373fSMark Fasheh 			if (!dir)
1285f186373fSMark Fasheh 				dir = read_one_inode(root, parent_objectid);
128603b2f08bSGeyslan G. Bem 			if (!dir) {
128703b2f08bSGeyslan G. Bem 				ret = -ENOENT;
128803b2f08bSGeyslan G. Bem 				goto out;
128903b2f08bSGeyslan G. Bem 			}
1290f186373fSMark Fasheh 		} else {
1291f186373fSMark Fasheh 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1292f186373fSMark Fasheh 					     &ref_index);
1293f186373fSMark Fasheh 		}
1294f186373fSMark Fasheh 		if (ret)
129503b2f08bSGeyslan G. Bem 			goto out;
1296e02119d5SChris Mason 
1297e02119d5SChris Mason 		/* if we already have a perfect match, we're done */
12984a0cc7caSNikolay Borisov 		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), btrfs_ino(BTRFS_I(inode)),
1299f186373fSMark Fasheh 				  ref_index, name, namelen)) {
13005a1d7843SJan Schmidt 			/*
13015a1d7843SJan Schmidt 			 * look for a conflicting back reference in the
13025a1d7843SJan Schmidt 			 * metadata. if we find one we have to unlink that name
13035a1d7843SJan Schmidt 			 * of the file before we add our new link.  Later on, we
13045a1d7843SJan Schmidt 			 * overwrite any existing back reference, and we don't
13055a1d7843SJan Schmidt 			 * want to create dangling pointers in the directory.
13065a1d7843SJan Schmidt 			 */
13075a1d7843SJan Schmidt 
13085a1d7843SJan Schmidt 			if (!search_done) {
13095a1d7843SJan Schmidt 				ret = __add_inode_ref(trans, root, path, log,
1310f186373fSMark Fasheh 						      dir, inode, eb,
1311f186373fSMark Fasheh 						      inode_objectid,
1312f186373fSMark Fasheh 						      parent_objectid,
1313f186373fSMark Fasheh 						      ref_index, name, namelen,
13145a1d7843SJan Schmidt 						      &search_done);
131503b2f08bSGeyslan G. Bem 				if (ret) {
131603b2f08bSGeyslan G. Bem 					if (ret == 1)
13173650860bSJosef Bacik 						ret = 0;
1318e02119d5SChris Mason 					goto out;
13193650860bSJosef Bacik 				}
132034f3e4f2Sliubo 			}
132134f3e4f2Sliubo 
1322e02119d5SChris Mason 			/* insert our name */
13235a1d7843SJan Schmidt 			ret = btrfs_add_link(trans, dir, inode, name, namelen,
1324f186373fSMark Fasheh 					     0, ref_index);
13253650860bSJosef Bacik 			if (ret)
13263650860bSJosef Bacik 				goto out;
1327e02119d5SChris Mason 
1328e02119d5SChris Mason 			btrfs_update_inode(trans, root, inode);
13295a1d7843SJan Schmidt 		}
1330e02119d5SChris Mason 
1331f186373fSMark Fasheh 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1332e02119d5SChris Mason 		kfree(name);
133303b2f08bSGeyslan G. Bem 		name = NULL;
1334f186373fSMark Fasheh 		if (log_ref_ver) {
1335f186373fSMark Fasheh 			iput(dir);
1336f186373fSMark Fasheh 			dir = NULL;
1337f186373fSMark Fasheh 		}
13385a1d7843SJan Schmidt 	}
1339e02119d5SChris Mason 
1340e02119d5SChris Mason 	/* finally write the back reference in the inode */
1341e02119d5SChris Mason 	ret = overwrite_item(trans, root, path, eb, slot, key);
13425a1d7843SJan Schmidt out:
1343b3b4aa74SDavid Sterba 	btrfs_release_path(path);
134403b2f08bSGeyslan G. Bem 	kfree(name);
1345e02119d5SChris Mason 	iput(dir);
1346e02119d5SChris Mason 	iput(inode);
13473650860bSJosef Bacik 	return ret;
1348e02119d5SChris Mason }
1349e02119d5SChris Mason 
1350c71bf099SYan, Zheng static int insert_orphan_item(struct btrfs_trans_handle *trans,
13519c4f61f0SDavid Sterba 			      struct btrfs_root *root, u64 ino)
1352c71bf099SYan, Zheng {
1353c71bf099SYan, Zheng 	int ret;
1354381cf658SDavid Sterba 
13559c4f61f0SDavid Sterba 	ret = btrfs_insert_orphan_item(trans, root, ino);
13569c4f61f0SDavid Sterba 	if (ret == -EEXIST)
13579c4f61f0SDavid Sterba 		ret = 0;
1358381cf658SDavid Sterba 
1359c71bf099SYan, Zheng 	return ret;
1360c71bf099SYan, Zheng }
1361c71bf099SYan, Zheng 
1362f186373fSMark Fasheh static int count_inode_extrefs(struct btrfs_root *root,
1363f186373fSMark Fasheh 			       struct inode *inode, struct btrfs_path *path)
1364e02119d5SChris Mason {
1365f186373fSMark Fasheh 	int ret = 0;
1366f186373fSMark Fasheh 	int name_len;
1367f186373fSMark Fasheh 	unsigned int nlink = 0;
1368f186373fSMark Fasheh 	u32 item_size;
1369f186373fSMark Fasheh 	u32 cur_offset = 0;
13704a0cc7caSNikolay Borisov 	u64 inode_objectid = btrfs_ino(BTRFS_I(inode));
1371f186373fSMark Fasheh 	u64 offset = 0;
1372f186373fSMark Fasheh 	unsigned long ptr;
1373f186373fSMark Fasheh 	struct btrfs_inode_extref *extref;
1374f186373fSMark Fasheh 	struct extent_buffer *leaf;
1375f186373fSMark Fasheh 
1376f186373fSMark Fasheh 	while (1) {
1377f186373fSMark Fasheh 		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1378f186373fSMark Fasheh 					    &extref, &offset);
1379f186373fSMark Fasheh 		if (ret)
1380f186373fSMark Fasheh 			break;
1381f186373fSMark Fasheh 
1382f186373fSMark Fasheh 		leaf = path->nodes[0];
1383f186373fSMark Fasheh 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1384f186373fSMark Fasheh 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
13852c2c452bSFilipe Manana 		cur_offset = 0;
1386f186373fSMark Fasheh 
1387f186373fSMark Fasheh 		while (cur_offset < item_size) {
1388f186373fSMark Fasheh 			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1389f186373fSMark Fasheh 			name_len = btrfs_inode_extref_name_len(leaf, extref);
1390f186373fSMark Fasheh 
1391f186373fSMark Fasheh 			nlink++;
1392f186373fSMark Fasheh 
1393f186373fSMark Fasheh 			cur_offset += name_len + sizeof(*extref);
1394f186373fSMark Fasheh 		}
1395f186373fSMark Fasheh 
1396f186373fSMark Fasheh 		offset++;
1397f186373fSMark Fasheh 		btrfs_release_path(path);
1398f186373fSMark Fasheh 	}
1399f186373fSMark Fasheh 	btrfs_release_path(path);
1400f186373fSMark Fasheh 
14012c2c452bSFilipe Manana 	if (ret < 0 && ret != -ENOENT)
1402f186373fSMark Fasheh 		return ret;
1403f186373fSMark Fasheh 	return nlink;
1404f186373fSMark Fasheh }
1405f186373fSMark Fasheh 
1406f186373fSMark Fasheh static int count_inode_refs(struct btrfs_root *root,
1407f186373fSMark Fasheh 			       struct inode *inode, struct btrfs_path *path)
1408f186373fSMark Fasheh {
1409e02119d5SChris Mason 	int ret;
1410e02119d5SChris Mason 	struct btrfs_key key;
1411f186373fSMark Fasheh 	unsigned int nlink = 0;
1412e02119d5SChris Mason 	unsigned long ptr;
1413e02119d5SChris Mason 	unsigned long ptr_end;
1414e02119d5SChris Mason 	int name_len;
14154a0cc7caSNikolay Borisov 	u64 ino = btrfs_ino(BTRFS_I(inode));
1416e02119d5SChris Mason 
141733345d01SLi Zefan 	key.objectid = ino;
1418e02119d5SChris Mason 	key.type = BTRFS_INODE_REF_KEY;
1419e02119d5SChris Mason 	key.offset = (u64)-1;
1420e02119d5SChris Mason 
1421e02119d5SChris Mason 	while (1) {
1422e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1423e02119d5SChris Mason 		if (ret < 0)
1424e02119d5SChris Mason 			break;
1425e02119d5SChris Mason 		if (ret > 0) {
1426e02119d5SChris Mason 			if (path->slots[0] == 0)
1427e02119d5SChris Mason 				break;
1428e02119d5SChris Mason 			path->slots[0]--;
1429e02119d5SChris Mason 		}
1430e93ae26fSFilipe David Borba Manana process_slot:
1431e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key,
1432e02119d5SChris Mason 				      path->slots[0]);
143333345d01SLi Zefan 		if (key.objectid != ino ||
1434e02119d5SChris Mason 		    key.type != BTRFS_INODE_REF_KEY)
1435e02119d5SChris Mason 			break;
1436e02119d5SChris Mason 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1437e02119d5SChris Mason 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1438e02119d5SChris Mason 						   path->slots[0]);
1439e02119d5SChris Mason 		while (ptr < ptr_end) {
1440e02119d5SChris Mason 			struct btrfs_inode_ref *ref;
1441e02119d5SChris Mason 
1442e02119d5SChris Mason 			ref = (struct btrfs_inode_ref *)ptr;
1443e02119d5SChris Mason 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1444e02119d5SChris Mason 							    ref);
1445e02119d5SChris Mason 			ptr = (unsigned long)(ref + 1) + name_len;
1446e02119d5SChris Mason 			nlink++;
1447e02119d5SChris Mason 		}
1448e02119d5SChris Mason 
1449e02119d5SChris Mason 		if (key.offset == 0)
1450e02119d5SChris Mason 			break;
1451e93ae26fSFilipe David Borba Manana 		if (path->slots[0] > 0) {
1452e93ae26fSFilipe David Borba Manana 			path->slots[0]--;
1453e93ae26fSFilipe David Borba Manana 			goto process_slot;
1454e93ae26fSFilipe David Borba Manana 		}
1455e02119d5SChris Mason 		key.offset--;
1456b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1457e02119d5SChris Mason 	}
1458b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1459f186373fSMark Fasheh 
1460f186373fSMark Fasheh 	return nlink;
1461f186373fSMark Fasheh }
1462f186373fSMark Fasheh 
1463f186373fSMark Fasheh /*
1464f186373fSMark Fasheh  * There are a few corners where the link count of the file can't
1465f186373fSMark Fasheh  * be properly maintained during replay.  So, instead of adding
1466f186373fSMark Fasheh  * lots of complexity to the log code, we just scan the backrefs
1467f186373fSMark Fasheh  * for any file that has been through replay.
1468f186373fSMark Fasheh  *
1469f186373fSMark Fasheh  * The scan will update the link count on the inode to reflect the
1470f186373fSMark Fasheh  * number of back refs found.  If it goes down to zero, the iput
1471f186373fSMark Fasheh  * will free the inode.
1472f186373fSMark Fasheh  */
1473f186373fSMark Fasheh static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1474f186373fSMark Fasheh 					   struct btrfs_root *root,
1475f186373fSMark Fasheh 					   struct inode *inode)
1476f186373fSMark Fasheh {
1477f186373fSMark Fasheh 	struct btrfs_path *path;
1478f186373fSMark Fasheh 	int ret;
1479f186373fSMark Fasheh 	u64 nlink = 0;
14804a0cc7caSNikolay Borisov 	u64 ino = btrfs_ino(BTRFS_I(inode));
1481f186373fSMark Fasheh 
1482f186373fSMark Fasheh 	path = btrfs_alloc_path();
1483f186373fSMark Fasheh 	if (!path)
1484f186373fSMark Fasheh 		return -ENOMEM;
1485f186373fSMark Fasheh 
1486f186373fSMark Fasheh 	ret = count_inode_refs(root, inode, path);
1487f186373fSMark Fasheh 	if (ret < 0)
1488f186373fSMark Fasheh 		goto out;
1489f186373fSMark Fasheh 
1490f186373fSMark Fasheh 	nlink = ret;
1491f186373fSMark Fasheh 
1492f186373fSMark Fasheh 	ret = count_inode_extrefs(root, inode, path);
1493f186373fSMark Fasheh 	if (ret < 0)
1494f186373fSMark Fasheh 		goto out;
1495f186373fSMark Fasheh 
1496f186373fSMark Fasheh 	nlink += ret;
1497f186373fSMark Fasheh 
1498f186373fSMark Fasheh 	ret = 0;
1499f186373fSMark Fasheh 
1500e02119d5SChris Mason 	if (nlink != inode->i_nlink) {
1501bfe86848SMiklos Szeredi 		set_nlink(inode, nlink);
1502e02119d5SChris Mason 		btrfs_update_inode(trans, root, inode);
1503e02119d5SChris Mason 	}
15048d5bf1cbSChris Mason 	BTRFS_I(inode)->index_cnt = (u64)-1;
1505e02119d5SChris Mason 
1506c71bf099SYan, Zheng 	if (inode->i_nlink == 0) {
1507c71bf099SYan, Zheng 		if (S_ISDIR(inode->i_mode)) {
150812fcfd22SChris Mason 			ret = replay_dir_deletes(trans, root, NULL, path,
150933345d01SLi Zefan 						 ino, 1);
15103650860bSJosef Bacik 			if (ret)
15113650860bSJosef Bacik 				goto out;
151212fcfd22SChris Mason 		}
151333345d01SLi Zefan 		ret = insert_orphan_item(trans, root, ino);
1514c71bf099SYan, Zheng 	}
151512fcfd22SChris Mason 
1516f186373fSMark Fasheh out:
1517f186373fSMark Fasheh 	btrfs_free_path(path);
1518f186373fSMark Fasheh 	return ret;
1519e02119d5SChris Mason }
1520e02119d5SChris Mason 
1521e02119d5SChris Mason static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1522e02119d5SChris Mason 					    struct btrfs_root *root,
1523e02119d5SChris Mason 					    struct btrfs_path *path)
1524e02119d5SChris Mason {
1525e02119d5SChris Mason 	int ret;
1526e02119d5SChris Mason 	struct btrfs_key key;
1527e02119d5SChris Mason 	struct inode *inode;
1528e02119d5SChris Mason 
1529e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1530e02119d5SChris Mason 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1531e02119d5SChris Mason 	key.offset = (u64)-1;
1532e02119d5SChris Mason 	while (1) {
1533e02119d5SChris Mason 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1534e02119d5SChris Mason 		if (ret < 0)
1535e02119d5SChris Mason 			break;
1536e02119d5SChris Mason 
1537e02119d5SChris Mason 		if (ret == 1) {
1538e02119d5SChris Mason 			if (path->slots[0] == 0)
1539e02119d5SChris Mason 				break;
1540e02119d5SChris Mason 			path->slots[0]--;
1541e02119d5SChris Mason 		}
1542e02119d5SChris Mason 
1543e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1544e02119d5SChris Mason 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1545e02119d5SChris Mason 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1546e02119d5SChris Mason 			break;
1547e02119d5SChris Mason 
1548e02119d5SChris Mason 		ret = btrfs_del_item(trans, root, path);
154965a246c5STsutomu Itoh 		if (ret)
155065a246c5STsutomu Itoh 			goto out;
1551e02119d5SChris Mason 
1552b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1553e02119d5SChris Mason 		inode = read_one_inode(root, key.offset);
1554c00e9493STsutomu Itoh 		if (!inode)
1555c00e9493STsutomu Itoh 			return -EIO;
1556e02119d5SChris Mason 
1557e02119d5SChris Mason 		ret = fixup_inode_link_count(trans, root, inode);
1558e02119d5SChris Mason 		iput(inode);
15593650860bSJosef Bacik 		if (ret)
15603650860bSJosef Bacik 			goto out;
1561e02119d5SChris Mason 
156212fcfd22SChris Mason 		/*
156312fcfd22SChris Mason 		 * fixup on a directory may create new entries,
156412fcfd22SChris Mason 		 * make sure we always look for the highset possible
156512fcfd22SChris Mason 		 * offset
156612fcfd22SChris Mason 		 */
156712fcfd22SChris Mason 		key.offset = (u64)-1;
1568e02119d5SChris Mason 	}
156965a246c5STsutomu Itoh 	ret = 0;
157065a246c5STsutomu Itoh out:
1571b3b4aa74SDavid Sterba 	btrfs_release_path(path);
157265a246c5STsutomu Itoh 	return ret;
1573e02119d5SChris Mason }
1574e02119d5SChris Mason 
1575e02119d5SChris Mason 
1576e02119d5SChris Mason /*
1577e02119d5SChris Mason  * record a given inode in the fixup dir so we can check its link
1578e02119d5SChris Mason  * count when replay is done.  The link count is incremented here
1579e02119d5SChris Mason  * so the inode won't go away until we check it
1580e02119d5SChris Mason  */
1581e02119d5SChris Mason static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1582e02119d5SChris Mason 				      struct btrfs_root *root,
1583e02119d5SChris Mason 				      struct btrfs_path *path,
1584e02119d5SChris Mason 				      u64 objectid)
1585e02119d5SChris Mason {
1586e02119d5SChris Mason 	struct btrfs_key key;
1587e02119d5SChris Mason 	int ret = 0;
1588e02119d5SChris Mason 	struct inode *inode;
1589e02119d5SChris Mason 
1590e02119d5SChris Mason 	inode = read_one_inode(root, objectid);
1591c00e9493STsutomu Itoh 	if (!inode)
1592c00e9493STsutomu Itoh 		return -EIO;
1593e02119d5SChris Mason 
1594e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1595962a298fSDavid Sterba 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1596e02119d5SChris Mason 	key.offset = objectid;
1597e02119d5SChris Mason 
1598e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1599e02119d5SChris Mason 
1600b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1601e02119d5SChris Mason 	if (ret == 0) {
16029bf7a489SJosef Bacik 		if (!inode->i_nlink)
16039bf7a489SJosef Bacik 			set_nlink(inode, 1);
16049bf7a489SJosef Bacik 		else
16058b558c5fSZach Brown 			inc_nlink(inode);
1606b9959295STsutomu Itoh 		ret = btrfs_update_inode(trans, root, inode);
1607e02119d5SChris Mason 	} else if (ret == -EEXIST) {
1608e02119d5SChris Mason 		ret = 0;
1609e02119d5SChris Mason 	} else {
16103650860bSJosef Bacik 		BUG(); /* Logic Error */
1611e02119d5SChris Mason 	}
1612e02119d5SChris Mason 	iput(inode);
1613e02119d5SChris Mason 
1614e02119d5SChris Mason 	return ret;
1615e02119d5SChris Mason }
1616e02119d5SChris Mason 
1617e02119d5SChris Mason /*
1618e02119d5SChris Mason  * when replaying the log for a directory, we only insert names
1619e02119d5SChris Mason  * for inodes that actually exist.  This means an fsync on a directory
1620e02119d5SChris Mason  * does not implicitly fsync all the new files in it
1621e02119d5SChris Mason  */
1622e02119d5SChris Mason static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1623e02119d5SChris Mason 				    struct btrfs_root *root,
1624e02119d5SChris Mason 				    u64 dirid, u64 index,
162560d53eb3SZhaolei 				    char *name, int name_len,
1626e02119d5SChris Mason 				    struct btrfs_key *location)
1627e02119d5SChris Mason {
1628e02119d5SChris Mason 	struct inode *inode;
1629e02119d5SChris Mason 	struct inode *dir;
1630e02119d5SChris Mason 	int ret;
1631e02119d5SChris Mason 
1632e02119d5SChris Mason 	inode = read_one_inode(root, location->objectid);
1633e02119d5SChris Mason 	if (!inode)
1634e02119d5SChris Mason 		return -ENOENT;
1635e02119d5SChris Mason 
1636e02119d5SChris Mason 	dir = read_one_inode(root, dirid);
1637e02119d5SChris Mason 	if (!dir) {
1638e02119d5SChris Mason 		iput(inode);
1639e02119d5SChris Mason 		return -EIO;
1640e02119d5SChris Mason 	}
1641d555438bSJosef Bacik 
1642e02119d5SChris Mason 	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1643e02119d5SChris Mason 
1644e02119d5SChris Mason 	/* FIXME, put inode into FIXUP list */
1645e02119d5SChris Mason 
1646e02119d5SChris Mason 	iput(inode);
1647e02119d5SChris Mason 	iput(dir);
1648e02119d5SChris Mason 	return ret;
1649e02119d5SChris Mason }
1650e02119d5SChris Mason 
1651e02119d5SChris Mason /*
1652df8d116fSFilipe Manana  * Return true if an inode reference exists in the log for the given name,
1653df8d116fSFilipe Manana  * inode and parent inode.
1654df8d116fSFilipe Manana  */
1655df8d116fSFilipe Manana static bool name_in_log_ref(struct btrfs_root *log_root,
1656df8d116fSFilipe Manana 			    const char *name, const int name_len,
1657df8d116fSFilipe Manana 			    const u64 dirid, const u64 ino)
1658df8d116fSFilipe Manana {
1659df8d116fSFilipe Manana 	struct btrfs_key search_key;
1660df8d116fSFilipe Manana 
1661df8d116fSFilipe Manana 	search_key.objectid = ino;
1662df8d116fSFilipe Manana 	search_key.type = BTRFS_INODE_REF_KEY;
1663df8d116fSFilipe Manana 	search_key.offset = dirid;
1664df8d116fSFilipe Manana 	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1665df8d116fSFilipe Manana 		return true;
1666df8d116fSFilipe Manana 
1667df8d116fSFilipe Manana 	search_key.type = BTRFS_INODE_EXTREF_KEY;
1668df8d116fSFilipe Manana 	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1669df8d116fSFilipe Manana 	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1670df8d116fSFilipe Manana 		return true;
1671df8d116fSFilipe Manana 
1672df8d116fSFilipe Manana 	return false;
1673df8d116fSFilipe Manana }
1674df8d116fSFilipe Manana 
1675df8d116fSFilipe Manana /*
1676e02119d5SChris Mason  * take a single entry in a log directory item and replay it into
1677e02119d5SChris Mason  * the subvolume.
1678e02119d5SChris Mason  *
1679e02119d5SChris Mason  * if a conflicting item exists in the subdirectory already,
1680e02119d5SChris Mason  * the inode it points to is unlinked and put into the link count
1681e02119d5SChris Mason  * fix up tree.
1682e02119d5SChris Mason  *
1683e02119d5SChris Mason  * If a name from the log points to a file or directory that does
1684e02119d5SChris Mason  * not exist in the FS, it is skipped.  fsyncs on directories
1685e02119d5SChris Mason  * do not force down inodes inside that directory, just changes to the
1686e02119d5SChris Mason  * names or unlinks in a directory.
1687bb53eda9SFilipe Manana  *
1688bb53eda9SFilipe Manana  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1689bb53eda9SFilipe Manana  * non-existing inode) and 1 if the name was replayed.
1690e02119d5SChris Mason  */
1691e02119d5SChris Mason static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1692e02119d5SChris Mason 				    struct btrfs_root *root,
1693e02119d5SChris Mason 				    struct btrfs_path *path,
1694e02119d5SChris Mason 				    struct extent_buffer *eb,
1695e02119d5SChris Mason 				    struct btrfs_dir_item *di,
1696e02119d5SChris Mason 				    struct btrfs_key *key)
1697e02119d5SChris Mason {
1698e02119d5SChris Mason 	char *name;
1699e02119d5SChris Mason 	int name_len;
1700e02119d5SChris Mason 	struct btrfs_dir_item *dst_di;
1701e02119d5SChris Mason 	struct btrfs_key found_key;
1702e02119d5SChris Mason 	struct btrfs_key log_key;
1703e02119d5SChris Mason 	struct inode *dir;
1704e02119d5SChris Mason 	u8 log_type;
17054bef0848SChris Mason 	int exists;
17063650860bSJosef Bacik 	int ret = 0;
1707d555438bSJosef Bacik 	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1708bb53eda9SFilipe Manana 	bool name_added = false;
1709e02119d5SChris Mason 
1710e02119d5SChris Mason 	dir = read_one_inode(root, key->objectid);
1711c00e9493STsutomu Itoh 	if (!dir)
1712c00e9493STsutomu Itoh 		return -EIO;
1713e02119d5SChris Mason 
1714e02119d5SChris Mason 	name_len = btrfs_dir_name_len(eb, di);
1715e02119d5SChris Mason 	name = kmalloc(name_len, GFP_NOFS);
17162bac325eSFilipe David Borba Manana 	if (!name) {
17172bac325eSFilipe David Borba Manana 		ret = -ENOMEM;
17182bac325eSFilipe David Borba Manana 		goto out;
17192bac325eSFilipe David Borba Manana 	}
17202a29edc6Sliubo 
1721e02119d5SChris Mason 	log_type = btrfs_dir_type(eb, di);
1722e02119d5SChris Mason 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1723e02119d5SChris Mason 		   name_len);
1724e02119d5SChris Mason 
1725e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
17264bef0848SChris Mason 	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
17274bef0848SChris Mason 	if (exists == 0)
17284bef0848SChris Mason 		exists = 1;
17294bef0848SChris Mason 	else
17304bef0848SChris Mason 		exists = 0;
1731b3b4aa74SDavid Sterba 	btrfs_release_path(path);
17324bef0848SChris Mason 
1733e02119d5SChris Mason 	if (key->type == BTRFS_DIR_ITEM_KEY) {
1734e02119d5SChris Mason 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1735e02119d5SChris Mason 				       name, name_len, 1);
1736d397712bSChris Mason 	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1737e02119d5SChris Mason 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1738e02119d5SChris Mason 						     key->objectid,
1739e02119d5SChris Mason 						     key->offset, name,
1740e02119d5SChris Mason 						     name_len, 1);
1741e02119d5SChris Mason 	} else {
17423650860bSJosef Bacik 		/* Corruption */
17433650860bSJosef Bacik 		ret = -EINVAL;
17443650860bSJosef Bacik 		goto out;
1745e02119d5SChris Mason 	}
1746c704005dSDavid Sterba 	if (IS_ERR_OR_NULL(dst_di)) {
1747e02119d5SChris Mason 		/* we need a sequence number to insert, so we only
1748e02119d5SChris Mason 		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1749e02119d5SChris Mason 		 */
1750e02119d5SChris Mason 		if (key->type != BTRFS_DIR_INDEX_KEY)
1751e02119d5SChris Mason 			goto out;
1752e02119d5SChris Mason 		goto insert;
1753e02119d5SChris Mason 	}
1754e02119d5SChris Mason 
1755e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1756e02119d5SChris Mason 	/* the existing item matches the logged item */
1757e02119d5SChris Mason 	if (found_key.objectid == log_key.objectid &&
1758e02119d5SChris Mason 	    found_key.type == log_key.type &&
1759e02119d5SChris Mason 	    found_key.offset == log_key.offset &&
1760e02119d5SChris Mason 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1761a2cc11dbSFilipe Manana 		update_size = false;
1762e02119d5SChris Mason 		goto out;
1763e02119d5SChris Mason 	}
1764e02119d5SChris Mason 
1765e02119d5SChris Mason 	/*
1766e02119d5SChris Mason 	 * don't drop the conflicting directory entry if the inode
1767e02119d5SChris Mason 	 * for the new entry doesn't exist
1768e02119d5SChris Mason 	 */
17694bef0848SChris Mason 	if (!exists)
1770e02119d5SChris Mason 		goto out;
1771e02119d5SChris Mason 
1772e02119d5SChris Mason 	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
17733650860bSJosef Bacik 	if (ret)
17743650860bSJosef Bacik 		goto out;
1775e02119d5SChris Mason 
1776e02119d5SChris Mason 	if (key->type == BTRFS_DIR_INDEX_KEY)
1777e02119d5SChris Mason 		goto insert;
1778e02119d5SChris Mason out:
1779b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1780d555438bSJosef Bacik 	if (!ret && update_size) {
1781d555438bSJosef Bacik 		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
1782d555438bSJosef Bacik 		ret = btrfs_update_inode(trans, root, dir);
1783d555438bSJosef Bacik 	}
1784e02119d5SChris Mason 	kfree(name);
1785e02119d5SChris Mason 	iput(dir);
1786bb53eda9SFilipe Manana 	if (!ret && name_added)
1787bb53eda9SFilipe Manana 		ret = 1;
17883650860bSJosef Bacik 	return ret;
1789e02119d5SChris Mason 
1790e02119d5SChris Mason insert:
1791df8d116fSFilipe Manana 	if (name_in_log_ref(root->log_root, name, name_len,
1792df8d116fSFilipe Manana 			    key->objectid, log_key.objectid)) {
1793df8d116fSFilipe Manana 		/* The dentry will be added later. */
1794df8d116fSFilipe Manana 		ret = 0;
1795df8d116fSFilipe Manana 		update_size = false;
1796df8d116fSFilipe Manana 		goto out;
1797df8d116fSFilipe Manana 	}
1798b3b4aa74SDavid Sterba 	btrfs_release_path(path);
179960d53eb3SZhaolei 	ret = insert_one_name(trans, root, key->objectid, key->offset,
180060d53eb3SZhaolei 			      name, name_len, &log_key);
1801df8d116fSFilipe Manana 	if (ret && ret != -ENOENT && ret != -EEXIST)
18023650860bSJosef Bacik 		goto out;
1803bb53eda9SFilipe Manana 	if (!ret)
1804bb53eda9SFilipe Manana 		name_added = true;
1805d555438bSJosef Bacik 	update_size = false;
18063650860bSJosef Bacik 	ret = 0;
1807e02119d5SChris Mason 	goto out;
1808e02119d5SChris Mason }
1809e02119d5SChris Mason 
1810e02119d5SChris Mason /*
1811e02119d5SChris Mason  * find all the names in a directory item and reconcile them into
1812e02119d5SChris Mason  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1813e02119d5SChris Mason  * one name in a directory item, but the same code gets used for
1814e02119d5SChris Mason  * both directory index types
1815e02119d5SChris Mason  */
1816e02119d5SChris Mason static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1817e02119d5SChris Mason 					struct btrfs_root *root,
1818e02119d5SChris Mason 					struct btrfs_path *path,
1819e02119d5SChris Mason 					struct extent_buffer *eb, int slot,
1820e02119d5SChris Mason 					struct btrfs_key *key)
1821e02119d5SChris Mason {
18222ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
1823bb53eda9SFilipe Manana 	int ret = 0;
1824e02119d5SChris Mason 	u32 item_size = btrfs_item_size_nr(eb, slot);
1825e02119d5SChris Mason 	struct btrfs_dir_item *di;
1826e02119d5SChris Mason 	int name_len;
1827e02119d5SChris Mason 	unsigned long ptr;
1828e02119d5SChris Mason 	unsigned long ptr_end;
1829bb53eda9SFilipe Manana 	struct btrfs_path *fixup_path = NULL;
1830e02119d5SChris Mason 
1831e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(eb, slot);
1832e02119d5SChris Mason 	ptr_end = ptr + item_size;
1833e02119d5SChris Mason 	while (ptr < ptr_end) {
1834e02119d5SChris Mason 		di = (struct btrfs_dir_item *)ptr;
18352ff7e61eSJeff Mahoney 		if (verify_dir_item(fs_info, eb, di))
183622a94d44SJosef Bacik 			return -EIO;
1837e02119d5SChris Mason 		name_len = btrfs_dir_name_len(eb, di);
1838e02119d5SChris Mason 		ret = replay_one_name(trans, root, path, eb, di, key);
1839bb53eda9SFilipe Manana 		if (ret < 0)
1840bb53eda9SFilipe Manana 			break;
1841e02119d5SChris Mason 		ptr = (unsigned long)(di + 1);
1842e02119d5SChris Mason 		ptr += name_len;
1843bb53eda9SFilipe Manana 
1844bb53eda9SFilipe Manana 		/*
1845bb53eda9SFilipe Manana 		 * If this entry refers to a non-directory (directories can not
1846bb53eda9SFilipe Manana 		 * have a link count > 1) and it was added in the transaction
1847bb53eda9SFilipe Manana 		 * that was not committed, make sure we fixup the link count of
1848bb53eda9SFilipe Manana 		 * the inode it the entry points to. Otherwise something like
1849bb53eda9SFilipe Manana 		 * the following would result in a directory pointing to an
1850bb53eda9SFilipe Manana 		 * inode with a wrong link that does not account for this dir
1851bb53eda9SFilipe Manana 		 * entry:
1852bb53eda9SFilipe Manana 		 *
1853bb53eda9SFilipe Manana 		 * mkdir testdir
1854bb53eda9SFilipe Manana 		 * touch testdir/foo
1855bb53eda9SFilipe Manana 		 * touch testdir/bar
1856bb53eda9SFilipe Manana 		 * sync
1857bb53eda9SFilipe Manana 		 *
1858bb53eda9SFilipe Manana 		 * ln testdir/bar testdir/bar_link
1859bb53eda9SFilipe Manana 		 * ln testdir/foo testdir/foo_link
1860bb53eda9SFilipe Manana 		 * xfs_io -c "fsync" testdir/bar
1861bb53eda9SFilipe Manana 		 *
1862bb53eda9SFilipe Manana 		 * <power failure>
1863bb53eda9SFilipe Manana 		 *
1864bb53eda9SFilipe Manana 		 * mount fs, log replay happens
1865bb53eda9SFilipe Manana 		 *
1866bb53eda9SFilipe Manana 		 * File foo would remain with a link count of 1 when it has two
1867bb53eda9SFilipe Manana 		 * entries pointing to it in the directory testdir. This would
1868bb53eda9SFilipe Manana 		 * make it impossible to ever delete the parent directory has
1869bb53eda9SFilipe Manana 		 * it would result in stale dentries that can never be deleted.
1870bb53eda9SFilipe Manana 		 */
1871bb53eda9SFilipe Manana 		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1872bb53eda9SFilipe Manana 			struct btrfs_key di_key;
1873bb53eda9SFilipe Manana 
1874bb53eda9SFilipe Manana 			if (!fixup_path) {
1875bb53eda9SFilipe Manana 				fixup_path = btrfs_alloc_path();
1876bb53eda9SFilipe Manana 				if (!fixup_path) {
1877bb53eda9SFilipe Manana 					ret = -ENOMEM;
1878bb53eda9SFilipe Manana 					break;
1879e02119d5SChris Mason 				}
1880bb53eda9SFilipe Manana 			}
1881bb53eda9SFilipe Manana 
1882bb53eda9SFilipe Manana 			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1883bb53eda9SFilipe Manana 			ret = link_to_fixup_dir(trans, root, fixup_path,
1884bb53eda9SFilipe Manana 						di_key.objectid);
1885bb53eda9SFilipe Manana 			if (ret)
1886bb53eda9SFilipe Manana 				break;
1887bb53eda9SFilipe Manana 		}
1888bb53eda9SFilipe Manana 		ret = 0;
1889bb53eda9SFilipe Manana 	}
1890bb53eda9SFilipe Manana 	btrfs_free_path(fixup_path);
1891bb53eda9SFilipe Manana 	return ret;
1892e02119d5SChris Mason }
1893e02119d5SChris Mason 
1894e02119d5SChris Mason /*
1895e02119d5SChris Mason  * directory replay has two parts.  There are the standard directory
1896e02119d5SChris Mason  * items in the log copied from the subvolume, and range items
1897e02119d5SChris Mason  * created in the log while the subvolume was logged.
1898e02119d5SChris Mason  *
1899e02119d5SChris Mason  * The range items tell us which parts of the key space the log
1900e02119d5SChris Mason  * is authoritative for.  During replay, if a key in the subvolume
1901e02119d5SChris Mason  * directory is in a logged range item, but not actually in the log
1902e02119d5SChris Mason  * that means it was deleted from the directory before the fsync
1903e02119d5SChris Mason  * and should be removed.
1904e02119d5SChris Mason  */
1905e02119d5SChris Mason static noinline int find_dir_range(struct btrfs_root *root,
1906e02119d5SChris Mason 				   struct btrfs_path *path,
1907e02119d5SChris Mason 				   u64 dirid, int key_type,
1908e02119d5SChris Mason 				   u64 *start_ret, u64 *end_ret)
1909e02119d5SChris Mason {
1910e02119d5SChris Mason 	struct btrfs_key key;
1911e02119d5SChris Mason 	u64 found_end;
1912e02119d5SChris Mason 	struct btrfs_dir_log_item *item;
1913e02119d5SChris Mason 	int ret;
1914e02119d5SChris Mason 	int nritems;
1915e02119d5SChris Mason 
1916e02119d5SChris Mason 	if (*start_ret == (u64)-1)
1917e02119d5SChris Mason 		return 1;
1918e02119d5SChris Mason 
1919e02119d5SChris Mason 	key.objectid = dirid;
1920e02119d5SChris Mason 	key.type = key_type;
1921e02119d5SChris Mason 	key.offset = *start_ret;
1922e02119d5SChris Mason 
1923e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1924e02119d5SChris Mason 	if (ret < 0)
1925e02119d5SChris Mason 		goto out;
1926e02119d5SChris Mason 	if (ret > 0) {
1927e02119d5SChris Mason 		if (path->slots[0] == 0)
1928e02119d5SChris Mason 			goto out;
1929e02119d5SChris Mason 		path->slots[0]--;
1930e02119d5SChris Mason 	}
1931e02119d5SChris Mason 	if (ret != 0)
1932e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1933e02119d5SChris Mason 
1934e02119d5SChris Mason 	if (key.type != key_type || key.objectid != dirid) {
1935e02119d5SChris Mason 		ret = 1;
1936e02119d5SChris Mason 		goto next;
1937e02119d5SChris Mason 	}
1938e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1939e02119d5SChris Mason 			      struct btrfs_dir_log_item);
1940e02119d5SChris Mason 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1941e02119d5SChris Mason 
1942e02119d5SChris Mason 	if (*start_ret >= key.offset && *start_ret <= found_end) {
1943e02119d5SChris Mason 		ret = 0;
1944e02119d5SChris Mason 		*start_ret = key.offset;
1945e02119d5SChris Mason 		*end_ret = found_end;
1946e02119d5SChris Mason 		goto out;
1947e02119d5SChris Mason 	}
1948e02119d5SChris Mason 	ret = 1;
1949e02119d5SChris Mason next:
1950e02119d5SChris Mason 	/* check the next slot in the tree to see if it is a valid item */
1951e02119d5SChris Mason 	nritems = btrfs_header_nritems(path->nodes[0]);
19522a7bf53fSRobbie Ko 	path->slots[0]++;
1953e02119d5SChris Mason 	if (path->slots[0] >= nritems) {
1954e02119d5SChris Mason 		ret = btrfs_next_leaf(root, path);
1955e02119d5SChris Mason 		if (ret)
1956e02119d5SChris Mason 			goto out;
1957e02119d5SChris Mason 	}
1958e02119d5SChris Mason 
1959e02119d5SChris Mason 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1960e02119d5SChris Mason 
1961e02119d5SChris Mason 	if (key.type != key_type || key.objectid != dirid) {
1962e02119d5SChris Mason 		ret = 1;
1963e02119d5SChris Mason 		goto out;
1964e02119d5SChris Mason 	}
1965e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1966e02119d5SChris Mason 			      struct btrfs_dir_log_item);
1967e02119d5SChris Mason 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1968e02119d5SChris Mason 	*start_ret = key.offset;
1969e02119d5SChris Mason 	*end_ret = found_end;
1970e02119d5SChris Mason 	ret = 0;
1971e02119d5SChris Mason out:
1972b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1973e02119d5SChris Mason 	return ret;
1974e02119d5SChris Mason }
1975e02119d5SChris Mason 
1976e02119d5SChris Mason /*
1977e02119d5SChris Mason  * this looks for a given directory item in the log.  If the directory
1978e02119d5SChris Mason  * item is not in the log, the item is removed and the inode it points
1979e02119d5SChris Mason  * to is unlinked
1980e02119d5SChris Mason  */
1981e02119d5SChris Mason static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1982e02119d5SChris Mason 				      struct btrfs_root *root,
1983e02119d5SChris Mason 				      struct btrfs_root *log,
1984e02119d5SChris Mason 				      struct btrfs_path *path,
1985e02119d5SChris Mason 				      struct btrfs_path *log_path,
1986e02119d5SChris Mason 				      struct inode *dir,
1987e02119d5SChris Mason 				      struct btrfs_key *dir_key)
1988e02119d5SChris Mason {
19892ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
1990e02119d5SChris Mason 	int ret;
1991e02119d5SChris Mason 	struct extent_buffer *eb;
1992e02119d5SChris Mason 	int slot;
1993e02119d5SChris Mason 	u32 item_size;
1994e02119d5SChris Mason 	struct btrfs_dir_item *di;
1995e02119d5SChris Mason 	struct btrfs_dir_item *log_di;
1996e02119d5SChris Mason 	int name_len;
1997e02119d5SChris Mason 	unsigned long ptr;
1998e02119d5SChris Mason 	unsigned long ptr_end;
1999e02119d5SChris Mason 	char *name;
2000e02119d5SChris Mason 	struct inode *inode;
2001e02119d5SChris Mason 	struct btrfs_key location;
2002e02119d5SChris Mason 
2003e02119d5SChris Mason again:
2004e02119d5SChris Mason 	eb = path->nodes[0];
2005e02119d5SChris Mason 	slot = path->slots[0];
2006e02119d5SChris Mason 	item_size = btrfs_item_size_nr(eb, slot);
2007e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(eb, slot);
2008e02119d5SChris Mason 	ptr_end = ptr + item_size;
2009e02119d5SChris Mason 	while (ptr < ptr_end) {
2010e02119d5SChris Mason 		di = (struct btrfs_dir_item *)ptr;
20112ff7e61eSJeff Mahoney 		if (verify_dir_item(fs_info, eb, di)) {
201222a94d44SJosef Bacik 			ret = -EIO;
201322a94d44SJosef Bacik 			goto out;
201422a94d44SJosef Bacik 		}
201522a94d44SJosef Bacik 
2016e02119d5SChris Mason 		name_len = btrfs_dir_name_len(eb, di);
2017e02119d5SChris Mason 		name = kmalloc(name_len, GFP_NOFS);
2018e02119d5SChris Mason 		if (!name) {
2019e02119d5SChris Mason 			ret = -ENOMEM;
2020e02119d5SChris Mason 			goto out;
2021e02119d5SChris Mason 		}
2022e02119d5SChris Mason 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
2023e02119d5SChris Mason 				  name_len);
2024e02119d5SChris Mason 		log_di = NULL;
202512fcfd22SChris Mason 		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2026e02119d5SChris Mason 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
2027e02119d5SChris Mason 						       dir_key->objectid,
2028e02119d5SChris Mason 						       name, name_len, 0);
202912fcfd22SChris Mason 		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2030e02119d5SChris Mason 			log_di = btrfs_lookup_dir_index_item(trans, log,
2031e02119d5SChris Mason 						     log_path,
2032e02119d5SChris Mason 						     dir_key->objectid,
2033e02119d5SChris Mason 						     dir_key->offset,
2034e02119d5SChris Mason 						     name, name_len, 0);
2035e02119d5SChris Mason 		}
2036269d040fSFilipe David Borba Manana 		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
2037e02119d5SChris Mason 			btrfs_dir_item_key_to_cpu(eb, di, &location);
2038b3b4aa74SDavid Sterba 			btrfs_release_path(path);
2039b3b4aa74SDavid Sterba 			btrfs_release_path(log_path);
2040e02119d5SChris Mason 			inode = read_one_inode(root, location.objectid);
2041c00e9493STsutomu Itoh 			if (!inode) {
2042c00e9493STsutomu Itoh 				kfree(name);
2043c00e9493STsutomu Itoh 				return -EIO;
2044c00e9493STsutomu Itoh 			}
2045e02119d5SChris Mason 
2046e02119d5SChris Mason 			ret = link_to_fixup_dir(trans, root,
2047e02119d5SChris Mason 						path, location.objectid);
20483650860bSJosef Bacik 			if (ret) {
20493650860bSJosef Bacik 				kfree(name);
20503650860bSJosef Bacik 				iput(inode);
20513650860bSJosef Bacik 				goto out;
20523650860bSJosef Bacik 			}
20533650860bSJosef Bacik 
20548b558c5fSZach Brown 			inc_nlink(inode);
2055e02119d5SChris Mason 			ret = btrfs_unlink_inode(trans, root, dir, inode,
2056e02119d5SChris Mason 						 name, name_len);
20573650860bSJosef Bacik 			if (!ret)
20582ff7e61eSJeff Mahoney 				ret = btrfs_run_delayed_items(trans, fs_info);
2059e02119d5SChris Mason 			kfree(name);
2060e02119d5SChris Mason 			iput(inode);
20613650860bSJosef Bacik 			if (ret)
20623650860bSJosef Bacik 				goto out;
2063e02119d5SChris Mason 
2064e02119d5SChris Mason 			/* there might still be more names under this key
2065e02119d5SChris Mason 			 * check and repeat if required
2066e02119d5SChris Mason 			 */
2067e02119d5SChris Mason 			ret = btrfs_search_slot(NULL, root, dir_key, path,
2068e02119d5SChris Mason 						0, 0);
2069e02119d5SChris Mason 			if (ret == 0)
2070e02119d5SChris Mason 				goto again;
2071e02119d5SChris Mason 			ret = 0;
2072e02119d5SChris Mason 			goto out;
2073269d040fSFilipe David Borba Manana 		} else if (IS_ERR(log_di)) {
2074269d040fSFilipe David Borba Manana 			kfree(name);
2075269d040fSFilipe David Borba Manana 			return PTR_ERR(log_di);
2076e02119d5SChris Mason 		}
2077b3b4aa74SDavid Sterba 		btrfs_release_path(log_path);
2078e02119d5SChris Mason 		kfree(name);
2079e02119d5SChris Mason 
2080e02119d5SChris Mason 		ptr = (unsigned long)(di + 1);
2081e02119d5SChris Mason 		ptr += name_len;
2082e02119d5SChris Mason 	}
2083e02119d5SChris Mason 	ret = 0;
2084e02119d5SChris Mason out:
2085b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2086b3b4aa74SDavid Sterba 	btrfs_release_path(log_path);
2087e02119d5SChris Mason 	return ret;
2088e02119d5SChris Mason }
2089e02119d5SChris Mason 
20904f764e51SFilipe Manana static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
20914f764e51SFilipe Manana 			      struct btrfs_root *root,
20924f764e51SFilipe Manana 			      struct btrfs_root *log,
20934f764e51SFilipe Manana 			      struct btrfs_path *path,
20944f764e51SFilipe Manana 			      const u64 ino)
20954f764e51SFilipe Manana {
20964f764e51SFilipe Manana 	struct btrfs_key search_key;
20974f764e51SFilipe Manana 	struct btrfs_path *log_path;
20984f764e51SFilipe Manana 	int i;
20994f764e51SFilipe Manana 	int nritems;
21004f764e51SFilipe Manana 	int ret;
21014f764e51SFilipe Manana 
21024f764e51SFilipe Manana 	log_path = btrfs_alloc_path();
21034f764e51SFilipe Manana 	if (!log_path)
21044f764e51SFilipe Manana 		return -ENOMEM;
21054f764e51SFilipe Manana 
21064f764e51SFilipe Manana 	search_key.objectid = ino;
21074f764e51SFilipe Manana 	search_key.type = BTRFS_XATTR_ITEM_KEY;
21084f764e51SFilipe Manana 	search_key.offset = 0;
21094f764e51SFilipe Manana again:
21104f764e51SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
21114f764e51SFilipe Manana 	if (ret < 0)
21124f764e51SFilipe Manana 		goto out;
21134f764e51SFilipe Manana process_leaf:
21144f764e51SFilipe Manana 	nritems = btrfs_header_nritems(path->nodes[0]);
21154f764e51SFilipe Manana 	for (i = path->slots[0]; i < nritems; i++) {
21164f764e51SFilipe Manana 		struct btrfs_key key;
21174f764e51SFilipe Manana 		struct btrfs_dir_item *di;
21184f764e51SFilipe Manana 		struct btrfs_dir_item *log_di;
21194f764e51SFilipe Manana 		u32 total_size;
21204f764e51SFilipe Manana 		u32 cur;
21214f764e51SFilipe Manana 
21224f764e51SFilipe Manana 		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
21234f764e51SFilipe Manana 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
21244f764e51SFilipe Manana 			ret = 0;
21254f764e51SFilipe Manana 			goto out;
21264f764e51SFilipe Manana 		}
21274f764e51SFilipe Manana 
21284f764e51SFilipe Manana 		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
21294f764e51SFilipe Manana 		total_size = btrfs_item_size_nr(path->nodes[0], i);
21304f764e51SFilipe Manana 		cur = 0;
21314f764e51SFilipe Manana 		while (cur < total_size) {
21324f764e51SFilipe Manana 			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
21334f764e51SFilipe Manana 			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
21344f764e51SFilipe Manana 			u32 this_len = sizeof(*di) + name_len + data_len;
21354f764e51SFilipe Manana 			char *name;
21364f764e51SFilipe Manana 
21374f764e51SFilipe Manana 			name = kmalloc(name_len, GFP_NOFS);
21384f764e51SFilipe Manana 			if (!name) {
21394f764e51SFilipe Manana 				ret = -ENOMEM;
21404f764e51SFilipe Manana 				goto out;
21414f764e51SFilipe Manana 			}
21424f764e51SFilipe Manana 			read_extent_buffer(path->nodes[0], name,
21434f764e51SFilipe Manana 					   (unsigned long)(di + 1), name_len);
21444f764e51SFilipe Manana 
21454f764e51SFilipe Manana 			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
21464f764e51SFilipe Manana 						    name, name_len, 0);
21474f764e51SFilipe Manana 			btrfs_release_path(log_path);
21484f764e51SFilipe Manana 			if (!log_di) {
21494f764e51SFilipe Manana 				/* Doesn't exist in log tree, so delete it. */
21504f764e51SFilipe Manana 				btrfs_release_path(path);
21514f764e51SFilipe Manana 				di = btrfs_lookup_xattr(trans, root, path, ino,
21524f764e51SFilipe Manana 							name, name_len, -1);
21534f764e51SFilipe Manana 				kfree(name);
21544f764e51SFilipe Manana 				if (IS_ERR(di)) {
21554f764e51SFilipe Manana 					ret = PTR_ERR(di);
21564f764e51SFilipe Manana 					goto out;
21574f764e51SFilipe Manana 				}
21584f764e51SFilipe Manana 				ASSERT(di);
21594f764e51SFilipe Manana 				ret = btrfs_delete_one_dir_name(trans, root,
21604f764e51SFilipe Manana 								path, di);
21614f764e51SFilipe Manana 				if (ret)
21624f764e51SFilipe Manana 					goto out;
21634f764e51SFilipe Manana 				btrfs_release_path(path);
21644f764e51SFilipe Manana 				search_key = key;
21654f764e51SFilipe Manana 				goto again;
21664f764e51SFilipe Manana 			}
21674f764e51SFilipe Manana 			kfree(name);
21684f764e51SFilipe Manana 			if (IS_ERR(log_di)) {
21694f764e51SFilipe Manana 				ret = PTR_ERR(log_di);
21704f764e51SFilipe Manana 				goto out;
21714f764e51SFilipe Manana 			}
21724f764e51SFilipe Manana 			cur += this_len;
21734f764e51SFilipe Manana 			di = (struct btrfs_dir_item *)((char *)di + this_len);
21744f764e51SFilipe Manana 		}
21754f764e51SFilipe Manana 	}
21764f764e51SFilipe Manana 	ret = btrfs_next_leaf(root, path);
21774f764e51SFilipe Manana 	if (ret > 0)
21784f764e51SFilipe Manana 		ret = 0;
21794f764e51SFilipe Manana 	else if (ret == 0)
21804f764e51SFilipe Manana 		goto process_leaf;
21814f764e51SFilipe Manana out:
21824f764e51SFilipe Manana 	btrfs_free_path(log_path);
21834f764e51SFilipe Manana 	btrfs_release_path(path);
21844f764e51SFilipe Manana 	return ret;
21854f764e51SFilipe Manana }
21864f764e51SFilipe Manana 
21874f764e51SFilipe Manana 
2188e02119d5SChris Mason /*
2189e02119d5SChris Mason  * deletion replay happens before we copy any new directory items
2190e02119d5SChris Mason  * out of the log or out of backreferences from inodes.  It
2191e02119d5SChris Mason  * scans the log to find ranges of keys that log is authoritative for,
2192e02119d5SChris Mason  * and then scans the directory to find items in those ranges that are
2193e02119d5SChris Mason  * not present in the log.
2194e02119d5SChris Mason  *
2195e02119d5SChris Mason  * Anything we don't find in the log is unlinked and removed from the
2196e02119d5SChris Mason  * directory.
2197e02119d5SChris Mason  */
2198e02119d5SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2199e02119d5SChris Mason 				       struct btrfs_root *root,
2200e02119d5SChris Mason 				       struct btrfs_root *log,
2201e02119d5SChris Mason 				       struct btrfs_path *path,
220212fcfd22SChris Mason 				       u64 dirid, int del_all)
2203e02119d5SChris Mason {
2204e02119d5SChris Mason 	u64 range_start;
2205e02119d5SChris Mason 	u64 range_end;
2206e02119d5SChris Mason 	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2207e02119d5SChris Mason 	int ret = 0;
2208e02119d5SChris Mason 	struct btrfs_key dir_key;
2209e02119d5SChris Mason 	struct btrfs_key found_key;
2210e02119d5SChris Mason 	struct btrfs_path *log_path;
2211e02119d5SChris Mason 	struct inode *dir;
2212e02119d5SChris Mason 
2213e02119d5SChris Mason 	dir_key.objectid = dirid;
2214e02119d5SChris Mason 	dir_key.type = BTRFS_DIR_ITEM_KEY;
2215e02119d5SChris Mason 	log_path = btrfs_alloc_path();
2216e02119d5SChris Mason 	if (!log_path)
2217e02119d5SChris Mason 		return -ENOMEM;
2218e02119d5SChris Mason 
2219e02119d5SChris Mason 	dir = read_one_inode(root, dirid);
2220e02119d5SChris Mason 	/* it isn't an error if the inode isn't there, that can happen
2221e02119d5SChris Mason 	 * because we replay the deletes before we copy in the inode item
2222e02119d5SChris Mason 	 * from the log
2223e02119d5SChris Mason 	 */
2224e02119d5SChris Mason 	if (!dir) {
2225e02119d5SChris Mason 		btrfs_free_path(log_path);
2226e02119d5SChris Mason 		return 0;
2227e02119d5SChris Mason 	}
2228e02119d5SChris Mason again:
2229e02119d5SChris Mason 	range_start = 0;
2230e02119d5SChris Mason 	range_end = 0;
2231e02119d5SChris Mason 	while (1) {
223212fcfd22SChris Mason 		if (del_all)
223312fcfd22SChris Mason 			range_end = (u64)-1;
223412fcfd22SChris Mason 		else {
2235e02119d5SChris Mason 			ret = find_dir_range(log, path, dirid, key_type,
2236e02119d5SChris Mason 					     &range_start, &range_end);
2237e02119d5SChris Mason 			if (ret != 0)
2238e02119d5SChris Mason 				break;
223912fcfd22SChris Mason 		}
2240e02119d5SChris Mason 
2241e02119d5SChris Mason 		dir_key.offset = range_start;
2242e02119d5SChris Mason 		while (1) {
2243e02119d5SChris Mason 			int nritems;
2244e02119d5SChris Mason 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2245e02119d5SChris Mason 						0, 0);
2246e02119d5SChris Mason 			if (ret < 0)
2247e02119d5SChris Mason 				goto out;
2248e02119d5SChris Mason 
2249e02119d5SChris Mason 			nritems = btrfs_header_nritems(path->nodes[0]);
2250e02119d5SChris Mason 			if (path->slots[0] >= nritems) {
2251e02119d5SChris Mason 				ret = btrfs_next_leaf(root, path);
2252e02119d5SChris Mason 				if (ret)
2253e02119d5SChris Mason 					break;
2254e02119d5SChris Mason 			}
2255e02119d5SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2256e02119d5SChris Mason 					      path->slots[0]);
2257e02119d5SChris Mason 			if (found_key.objectid != dirid ||
2258e02119d5SChris Mason 			    found_key.type != dir_key.type)
2259e02119d5SChris Mason 				goto next_type;
2260e02119d5SChris Mason 
2261e02119d5SChris Mason 			if (found_key.offset > range_end)
2262e02119d5SChris Mason 				break;
2263e02119d5SChris Mason 
2264e02119d5SChris Mason 			ret = check_item_in_log(trans, root, log, path,
226512fcfd22SChris Mason 						log_path, dir,
226612fcfd22SChris Mason 						&found_key);
22673650860bSJosef Bacik 			if (ret)
22683650860bSJosef Bacik 				goto out;
2269e02119d5SChris Mason 			if (found_key.offset == (u64)-1)
2270e02119d5SChris Mason 				break;
2271e02119d5SChris Mason 			dir_key.offset = found_key.offset + 1;
2272e02119d5SChris Mason 		}
2273b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2274e02119d5SChris Mason 		if (range_end == (u64)-1)
2275e02119d5SChris Mason 			break;
2276e02119d5SChris Mason 		range_start = range_end + 1;
2277e02119d5SChris Mason 	}
2278e02119d5SChris Mason 
2279e02119d5SChris Mason next_type:
2280e02119d5SChris Mason 	ret = 0;
2281e02119d5SChris Mason 	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2282e02119d5SChris Mason 		key_type = BTRFS_DIR_LOG_INDEX_KEY;
2283e02119d5SChris Mason 		dir_key.type = BTRFS_DIR_INDEX_KEY;
2284b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2285e02119d5SChris Mason 		goto again;
2286e02119d5SChris Mason 	}
2287e02119d5SChris Mason out:
2288b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2289e02119d5SChris Mason 	btrfs_free_path(log_path);
2290e02119d5SChris Mason 	iput(dir);
2291e02119d5SChris Mason 	return ret;
2292e02119d5SChris Mason }
2293e02119d5SChris Mason 
2294e02119d5SChris Mason /*
2295e02119d5SChris Mason  * the process_func used to replay items from the log tree.  This
2296e02119d5SChris Mason  * gets called in two different stages.  The first stage just looks
2297e02119d5SChris Mason  * for inodes and makes sure they are all copied into the subvolume.
2298e02119d5SChris Mason  *
2299e02119d5SChris Mason  * The second stage copies all the other item types from the log into
2300e02119d5SChris Mason  * the subvolume.  The two stage approach is slower, but gets rid of
2301e02119d5SChris Mason  * lots of complexity around inodes referencing other inodes that exist
2302e02119d5SChris Mason  * only in the log (references come from either directory items or inode
2303e02119d5SChris Mason  * back refs).
2304e02119d5SChris Mason  */
2305e02119d5SChris Mason static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2306e02119d5SChris Mason 			     struct walk_control *wc, u64 gen)
2307e02119d5SChris Mason {
2308e02119d5SChris Mason 	int nritems;
2309e02119d5SChris Mason 	struct btrfs_path *path;
2310e02119d5SChris Mason 	struct btrfs_root *root = wc->replay_dest;
2311e02119d5SChris Mason 	struct btrfs_key key;
2312e02119d5SChris Mason 	int level;
2313e02119d5SChris Mason 	int i;
2314e02119d5SChris Mason 	int ret;
2315e02119d5SChris Mason 
2316018642a1STsutomu Itoh 	ret = btrfs_read_buffer(eb, gen);
2317018642a1STsutomu Itoh 	if (ret)
2318018642a1STsutomu Itoh 		return ret;
2319e02119d5SChris Mason 
2320e02119d5SChris Mason 	level = btrfs_header_level(eb);
2321e02119d5SChris Mason 
2322e02119d5SChris Mason 	if (level != 0)
2323e02119d5SChris Mason 		return 0;
2324e02119d5SChris Mason 
2325e02119d5SChris Mason 	path = btrfs_alloc_path();
23261e5063d0SMark Fasheh 	if (!path)
23271e5063d0SMark Fasheh 		return -ENOMEM;
2328e02119d5SChris Mason 
2329e02119d5SChris Mason 	nritems = btrfs_header_nritems(eb);
2330e02119d5SChris Mason 	for (i = 0; i < nritems; i++) {
2331e02119d5SChris Mason 		btrfs_item_key_to_cpu(eb, &key, i);
2332e02119d5SChris Mason 
2333e02119d5SChris Mason 		/* inode keys are done during the first stage */
2334e02119d5SChris Mason 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2335e02119d5SChris Mason 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2336e02119d5SChris Mason 			struct btrfs_inode_item *inode_item;
2337e02119d5SChris Mason 			u32 mode;
2338e02119d5SChris Mason 
2339e02119d5SChris Mason 			inode_item = btrfs_item_ptr(eb, i,
2340e02119d5SChris Mason 					    struct btrfs_inode_item);
23414f764e51SFilipe Manana 			ret = replay_xattr_deletes(wc->trans, root, log,
23424f764e51SFilipe Manana 						   path, key.objectid);
23434f764e51SFilipe Manana 			if (ret)
23444f764e51SFilipe Manana 				break;
2345e02119d5SChris Mason 			mode = btrfs_inode_mode(eb, inode_item);
2346e02119d5SChris Mason 			if (S_ISDIR(mode)) {
2347e02119d5SChris Mason 				ret = replay_dir_deletes(wc->trans,
234812fcfd22SChris Mason 					 root, log, path, key.objectid, 0);
2349b50c6e25SJosef Bacik 				if (ret)
2350b50c6e25SJosef Bacik 					break;
2351e02119d5SChris Mason 			}
2352e02119d5SChris Mason 			ret = overwrite_item(wc->trans, root, path,
2353e02119d5SChris Mason 					     eb, i, &key);
2354b50c6e25SJosef Bacik 			if (ret)
2355b50c6e25SJosef Bacik 				break;
2356e02119d5SChris Mason 
2357c71bf099SYan, Zheng 			/* for regular files, make sure corresponding
235801327610SNicholas D Steeves 			 * orphan item exist. extents past the new EOF
2359c71bf099SYan, Zheng 			 * will be truncated later by orphan cleanup.
2360e02119d5SChris Mason 			 */
2361e02119d5SChris Mason 			if (S_ISREG(mode)) {
2362c71bf099SYan, Zheng 				ret = insert_orphan_item(wc->trans, root,
2363e02119d5SChris Mason 							 key.objectid);
2364b50c6e25SJosef Bacik 				if (ret)
2365b50c6e25SJosef Bacik 					break;
2366c71bf099SYan, Zheng 			}
2367a74ac322SChris Mason 
2368e02119d5SChris Mason 			ret = link_to_fixup_dir(wc->trans, root,
2369e02119d5SChris Mason 						path, key.objectid);
2370b50c6e25SJosef Bacik 			if (ret)
2371b50c6e25SJosef Bacik 				break;
2372e02119d5SChris Mason 		}
2373dd8e7217SJosef Bacik 
2374dd8e7217SJosef Bacik 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2375dd8e7217SJosef Bacik 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2376dd8e7217SJosef Bacik 			ret = replay_one_dir_item(wc->trans, root, path,
2377dd8e7217SJosef Bacik 						  eb, i, &key);
2378dd8e7217SJosef Bacik 			if (ret)
2379dd8e7217SJosef Bacik 				break;
2380dd8e7217SJosef Bacik 		}
2381dd8e7217SJosef Bacik 
2382e02119d5SChris Mason 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2383e02119d5SChris Mason 			continue;
2384e02119d5SChris Mason 
2385e02119d5SChris Mason 		/* these keys are simply copied */
2386e02119d5SChris Mason 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2387e02119d5SChris Mason 			ret = overwrite_item(wc->trans, root, path,
2388e02119d5SChris Mason 					     eb, i, &key);
2389b50c6e25SJosef Bacik 			if (ret)
2390b50c6e25SJosef Bacik 				break;
23912da1c669SLiu Bo 		} else if (key.type == BTRFS_INODE_REF_KEY ||
23922da1c669SLiu Bo 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2393f186373fSMark Fasheh 			ret = add_inode_ref(wc->trans, root, log, path,
2394f186373fSMark Fasheh 					    eb, i, &key);
2395b50c6e25SJosef Bacik 			if (ret && ret != -ENOENT)
2396b50c6e25SJosef Bacik 				break;
2397b50c6e25SJosef Bacik 			ret = 0;
2398e02119d5SChris Mason 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2399e02119d5SChris Mason 			ret = replay_one_extent(wc->trans, root, path,
2400e02119d5SChris Mason 						eb, i, &key);
2401b50c6e25SJosef Bacik 			if (ret)
2402b50c6e25SJosef Bacik 				break;
2403dd8e7217SJosef Bacik 		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2404e02119d5SChris Mason 			ret = replay_one_dir_item(wc->trans, root, path,
2405e02119d5SChris Mason 						  eb, i, &key);
2406b50c6e25SJosef Bacik 			if (ret)
2407b50c6e25SJosef Bacik 				break;
2408e02119d5SChris Mason 		}
2409e02119d5SChris Mason 	}
2410e02119d5SChris Mason 	btrfs_free_path(path);
2411b50c6e25SJosef Bacik 	return ret;
2412e02119d5SChris Mason }
2413e02119d5SChris Mason 
2414d397712bSChris Mason static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2415e02119d5SChris Mason 				   struct btrfs_root *root,
2416e02119d5SChris Mason 				   struct btrfs_path *path, int *level,
2417e02119d5SChris Mason 				   struct walk_control *wc)
2418e02119d5SChris Mason {
24190b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2420e02119d5SChris Mason 	u64 root_owner;
2421e02119d5SChris Mason 	u64 bytenr;
2422e02119d5SChris Mason 	u64 ptr_gen;
2423e02119d5SChris Mason 	struct extent_buffer *next;
2424e02119d5SChris Mason 	struct extent_buffer *cur;
2425e02119d5SChris Mason 	struct extent_buffer *parent;
2426e02119d5SChris Mason 	u32 blocksize;
2427e02119d5SChris Mason 	int ret = 0;
2428e02119d5SChris Mason 
2429e02119d5SChris Mason 	WARN_ON(*level < 0);
2430e02119d5SChris Mason 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2431e02119d5SChris Mason 
2432e02119d5SChris Mason 	while (*level > 0) {
2433e02119d5SChris Mason 		WARN_ON(*level < 0);
2434e02119d5SChris Mason 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
2435e02119d5SChris Mason 		cur = path->nodes[*level];
2436e02119d5SChris Mason 
2437fae7f21cSDulshani Gunawardhana 		WARN_ON(btrfs_header_level(cur) != *level);
2438e02119d5SChris Mason 
2439e02119d5SChris Mason 		if (path->slots[*level] >=
2440e02119d5SChris Mason 		    btrfs_header_nritems(cur))
2441e02119d5SChris Mason 			break;
2442e02119d5SChris Mason 
2443e02119d5SChris Mason 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2444e02119d5SChris Mason 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
24450b246afaSJeff Mahoney 		blocksize = fs_info->nodesize;
2446e02119d5SChris Mason 
2447e02119d5SChris Mason 		parent = path->nodes[*level];
2448e02119d5SChris Mason 		root_owner = btrfs_header_owner(parent);
2449e02119d5SChris Mason 
24502ff7e61eSJeff Mahoney 		next = btrfs_find_create_tree_block(fs_info, bytenr);
2451c871b0f2SLiu Bo 		if (IS_ERR(next))
2452c871b0f2SLiu Bo 			return PTR_ERR(next);
2453e02119d5SChris Mason 
24544a500fd1SYan, Zheng 		if (*level == 1) {
24551e5063d0SMark Fasheh 			ret = wc->process_func(root, next, wc, ptr_gen);
2456b50c6e25SJosef Bacik 			if (ret) {
2457b50c6e25SJosef Bacik 				free_extent_buffer(next);
24581e5063d0SMark Fasheh 				return ret;
2459b50c6e25SJosef Bacik 			}
2460e02119d5SChris Mason 
2461e02119d5SChris Mason 			path->slots[*level]++;
2462e02119d5SChris Mason 			if (wc->free) {
2463018642a1STsutomu Itoh 				ret = btrfs_read_buffer(next, ptr_gen);
2464018642a1STsutomu Itoh 				if (ret) {
2465018642a1STsutomu Itoh 					free_extent_buffer(next);
2466018642a1STsutomu Itoh 					return ret;
2467018642a1STsutomu Itoh 				}
2468e02119d5SChris Mason 
2469681ae509SJosef Bacik 				if (trans) {
2470e02119d5SChris Mason 					btrfs_tree_lock(next);
2471b4ce94deSChris Mason 					btrfs_set_lock_blocking(next);
24720b246afaSJeff Mahoney 					clean_tree_block(trans, fs_info, next);
2473e02119d5SChris Mason 					btrfs_wait_tree_block_writeback(next);
2474e02119d5SChris Mason 					btrfs_tree_unlock(next);
2475681ae509SJosef Bacik 				}
2476e02119d5SChris Mason 
2477e02119d5SChris Mason 				WARN_ON(root_owner !=
2478e02119d5SChris Mason 					BTRFS_TREE_LOG_OBJECTID);
24792ff7e61eSJeff Mahoney 				ret = btrfs_free_and_pin_reserved_extent(
24802ff7e61eSJeff Mahoney 							fs_info, bytenr,
24812ff7e61eSJeff Mahoney 							blocksize);
24823650860bSJosef Bacik 				if (ret) {
24833650860bSJosef Bacik 					free_extent_buffer(next);
24843650860bSJosef Bacik 					return ret;
24853650860bSJosef Bacik 				}
2486e02119d5SChris Mason 			}
2487e02119d5SChris Mason 			free_extent_buffer(next);
2488e02119d5SChris Mason 			continue;
2489e02119d5SChris Mason 		}
2490018642a1STsutomu Itoh 		ret = btrfs_read_buffer(next, ptr_gen);
2491018642a1STsutomu Itoh 		if (ret) {
2492018642a1STsutomu Itoh 			free_extent_buffer(next);
2493018642a1STsutomu Itoh 			return ret;
2494018642a1STsutomu Itoh 		}
2495e02119d5SChris Mason 
2496e02119d5SChris Mason 		WARN_ON(*level <= 0);
2497e02119d5SChris Mason 		if (path->nodes[*level-1])
2498e02119d5SChris Mason 			free_extent_buffer(path->nodes[*level-1]);
2499e02119d5SChris Mason 		path->nodes[*level-1] = next;
2500e02119d5SChris Mason 		*level = btrfs_header_level(next);
2501e02119d5SChris Mason 		path->slots[*level] = 0;
2502e02119d5SChris Mason 		cond_resched();
2503e02119d5SChris Mason 	}
2504e02119d5SChris Mason 	WARN_ON(*level < 0);
2505e02119d5SChris Mason 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2506e02119d5SChris Mason 
25074a500fd1SYan, Zheng 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2508e02119d5SChris Mason 
2509e02119d5SChris Mason 	cond_resched();
2510e02119d5SChris Mason 	return 0;
2511e02119d5SChris Mason }
2512e02119d5SChris Mason 
2513d397712bSChris Mason static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2514e02119d5SChris Mason 				 struct btrfs_root *root,
2515e02119d5SChris Mason 				 struct btrfs_path *path, int *level,
2516e02119d5SChris Mason 				 struct walk_control *wc)
2517e02119d5SChris Mason {
25180b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2519e02119d5SChris Mason 	u64 root_owner;
2520e02119d5SChris Mason 	int i;
2521e02119d5SChris Mason 	int slot;
2522e02119d5SChris Mason 	int ret;
2523e02119d5SChris Mason 
2524e02119d5SChris Mason 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2525e02119d5SChris Mason 		slot = path->slots[i];
25264a500fd1SYan, Zheng 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2527e02119d5SChris Mason 			path->slots[i]++;
2528e02119d5SChris Mason 			*level = i;
2529e02119d5SChris Mason 			WARN_ON(*level == 0);
2530e02119d5SChris Mason 			return 0;
2531e02119d5SChris Mason 		} else {
253231840ae1SZheng Yan 			struct extent_buffer *parent;
253331840ae1SZheng Yan 			if (path->nodes[*level] == root->node)
253431840ae1SZheng Yan 				parent = path->nodes[*level];
253531840ae1SZheng Yan 			else
253631840ae1SZheng Yan 				parent = path->nodes[*level + 1];
253731840ae1SZheng Yan 
253831840ae1SZheng Yan 			root_owner = btrfs_header_owner(parent);
25391e5063d0SMark Fasheh 			ret = wc->process_func(root, path->nodes[*level], wc,
2540e02119d5SChris Mason 				 btrfs_header_generation(path->nodes[*level]));
25411e5063d0SMark Fasheh 			if (ret)
25421e5063d0SMark Fasheh 				return ret;
25431e5063d0SMark Fasheh 
2544e02119d5SChris Mason 			if (wc->free) {
2545e02119d5SChris Mason 				struct extent_buffer *next;
2546e02119d5SChris Mason 
2547e02119d5SChris Mason 				next = path->nodes[*level];
2548e02119d5SChris Mason 
2549681ae509SJosef Bacik 				if (trans) {
2550e02119d5SChris Mason 					btrfs_tree_lock(next);
2551b4ce94deSChris Mason 					btrfs_set_lock_blocking(next);
25520b246afaSJeff Mahoney 					clean_tree_block(trans, fs_info, next);
2553e02119d5SChris Mason 					btrfs_wait_tree_block_writeback(next);
2554e02119d5SChris Mason 					btrfs_tree_unlock(next);
2555681ae509SJosef Bacik 				}
2556e02119d5SChris Mason 
2557e02119d5SChris Mason 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
25582ff7e61eSJeff Mahoney 				ret = btrfs_free_and_pin_reserved_extent(
25592ff7e61eSJeff Mahoney 						fs_info,
2560e02119d5SChris Mason 						path->nodes[*level]->start,
2561d00aff00SChris Mason 						path->nodes[*level]->len);
25623650860bSJosef Bacik 				if (ret)
25633650860bSJosef Bacik 					return ret;
2564e02119d5SChris Mason 			}
2565e02119d5SChris Mason 			free_extent_buffer(path->nodes[*level]);
2566e02119d5SChris Mason 			path->nodes[*level] = NULL;
2567e02119d5SChris Mason 			*level = i + 1;
2568e02119d5SChris Mason 		}
2569e02119d5SChris Mason 	}
2570e02119d5SChris Mason 	return 1;
2571e02119d5SChris Mason }
2572e02119d5SChris Mason 
2573e02119d5SChris Mason /*
2574e02119d5SChris Mason  * drop the reference count on the tree rooted at 'snap'.  This traverses
2575e02119d5SChris Mason  * the tree freeing any blocks that have a ref count of zero after being
2576e02119d5SChris Mason  * decremented.
2577e02119d5SChris Mason  */
2578e02119d5SChris Mason static int walk_log_tree(struct btrfs_trans_handle *trans,
2579e02119d5SChris Mason 			 struct btrfs_root *log, struct walk_control *wc)
2580e02119d5SChris Mason {
25812ff7e61eSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
2582e02119d5SChris Mason 	int ret = 0;
2583e02119d5SChris Mason 	int wret;
2584e02119d5SChris Mason 	int level;
2585e02119d5SChris Mason 	struct btrfs_path *path;
2586e02119d5SChris Mason 	int orig_level;
2587e02119d5SChris Mason 
2588e02119d5SChris Mason 	path = btrfs_alloc_path();
2589db5b493aSTsutomu Itoh 	if (!path)
2590db5b493aSTsutomu Itoh 		return -ENOMEM;
2591e02119d5SChris Mason 
2592e02119d5SChris Mason 	level = btrfs_header_level(log->node);
2593e02119d5SChris Mason 	orig_level = level;
2594e02119d5SChris Mason 	path->nodes[level] = log->node;
2595e02119d5SChris Mason 	extent_buffer_get(log->node);
2596e02119d5SChris Mason 	path->slots[level] = 0;
2597e02119d5SChris Mason 
2598e02119d5SChris Mason 	while (1) {
2599e02119d5SChris Mason 		wret = walk_down_log_tree(trans, log, path, &level, wc);
2600e02119d5SChris Mason 		if (wret > 0)
2601e02119d5SChris Mason 			break;
260279787eaaSJeff Mahoney 		if (wret < 0) {
2603e02119d5SChris Mason 			ret = wret;
260479787eaaSJeff Mahoney 			goto out;
260579787eaaSJeff Mahoney 		}
2606e02119d5SChris Mason 
2607e02119d5SChris Mason 		wret = walk_up_log_tree(trans, log, path, &level, wc);
2608e02119d5SChris Mason 		if (wret > 0)
2609e02119d5SChris Mason 			break;
261079787eaaSJeff Mahoney 		if (wret < 0) {
2611e02119d5SChris Mason 			ret = wret;
261279787eaaSJeff Mahoney 			goto out;
261379787eaaSJeff Mahoney 		}
2614e02119d5SChris Mason 	}
2615e02119d5SChris Mason 
2616e02119d5SChris Mason 	/* was the root node processed? if not, catch it here */
2617e02119d5SChris Mason 	if (path->nodes[orig_level]) {
261879787eaaSJeff Mahoney 		ret = wc->process_func(log, path->nodes[orig_level], wc,
2619e02119d5SChris Mason 			 btrfs_header_generation(path->nodes[orig_level]));
262079787eaaSJeff Mahoney 		if (ret)
262179787eaaSJeff Mahoney 			goto out;
2622e02119d5SChris Mason 		if (wc->free) {
2623e02119d5SChris Mason 			struct extent_buffer *next;
2624e02119d5SChris Mason 
2625e02119d5SChris Mason 			next = path->nodes[orig_level];
2626e02119d5SChris Mason 
2627681ae509SJosef Bacik 			if (trans) {
2628e02119d5SChris Mason 				btrfs_tree_lock(next);
2629b4ce94deSChris Mason 				btrfs_set_lock_blocking(next);
26302ff7e61eSJeff Mahoney 				clean_tree_block(trans, fs_info, next);
2631e02119d5SChris Mason 				btrfs_wait_tree_block_writeback(next);
2632e02119d5SChris Mason 				btrfs_tree_unlock(next);
2633681ae509SJosef Bacik 			}
2634e02119d5SChris Mason 
2635e02119d5SChris Mason 			WARN_ON(log->root_key.objectid !=
2636e02119d5SChris Mason 				BTRFS_TREE_LOG_OBJECTID);
26372ff7e61eSJeff Mahoney 			ret = btrfs_free_and_pin_reserved_extent(fs_info,
26382ff7e61eSJeff Mahoney 							next->start, next->len);
26393650860bSJosef Bacik 			if (ret)
26403650860bSJosef Bacik 				goto out;
2641e02119d5SChris Mason 		}
2642e02119d5SChris Mason 	}
2643e02119d5SChris Mason 
264479787eaaSJeff Mahoney out:
2645e02119d5SChris Mason 	btrfs_free_path(path);
2646e02119d5SChris Mason 	return ret;
2647e02119d5SChris Mason }
2648e02119d5SChris Mason 
26497237f183SYan Zheng /*
26507237f183SYan Zheng  * helper function to update the item for a given subvolumes log root
26517237f183SYan Zheng  * in the tree of log roots
26527237f183SYan Zheng  */
26537237f183SYan Zheng static int update_log_root(struct btrfs_trans_handle *trans,
26547237f183SYan Zheng 			   struct btrfs_root *log)
26557237f183SYan Zheng {
26560b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = log->fs_info;
26577237f183SYan Zheng 	int ret;
26587237f183SYan Zheng 
26597237f183SYan Zheng 	if (log->log_transid == 1) {
26607237f183SYan Zheng 		/* insert root item on the first sync */
26610b246afaSJeff Mahoney 		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
26627237f183SYan Zheng 				&log->root_key, &log->root_item);
26637237f183SYan Zheng 	} else {
26640b246afaSJeff Mahoney 		ret = btrfs_update_root(trans, fs_info->log_root_tree,
26657237f183SYan Zheng 				&log->root_key, &log->root_item);
26667237f183SYan Zheng 	}
26677237f183SYan Zheng 	return ret;
26687237f183SYan Zheng }
26697237f183SYan Zheng 
267060d53eb3SZhaolei static void wait_log_commit(struct btrfs_root *root, int transid)
2671e02119d5SChris Mason {
2672e02119d5SChris Mason 	DEFINE_WAIT(wait);
26737237f183SYan Zheng 	int index = transid % 2;
2674e02119d5SChris Mason 
26757237f183SYan Zheng 	/*
26767237f183SYan Zheng 	 * we only allow two pending log transactions at a time,
26777237f183SYan Zheng 	 * so we know that if ours is more than 2 older than the
26787237f183SYan Zheng 	 * current transaction, we're done
26797237f183SYan Zheng 	 */
2680e02119d5SChris Mason 	do {
26817237f183SYan Zheng 		prepare_to_wait(&root->log_commit_wait[index],
26827237f183SYan Zheng 				&wait, TASK_UNINTERRUPTIBLE);
26837237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
268412fcfd22SChris Mason 
2685d1433debSMiao Xie 		if (root->log_transid_committed < transid &&
26867237f183SYan Zheng 		    atomic_read(&root->log_commit[index]))
2687e02119d5SChris Mason 			schedule();
268812fcfd22SChris Mason 
26897237f183SYan Zheng 		finish_wait(&root->log_commit_wait[index], &wait);
26907237f183SYan Zheng 		mutex_lock(&root->log_mutex);
2691d1433debSMiao Xie 	} while (root->log_transid_committed < transid &&
26927237f183SYan Zheng 		 atomic_read(&root->log_commit[index]));
26937237f183SYan Zheng }
26947237f183SYan Zheng 
269560d53eb3SZhaolei static void wait_for_writer(struct btrfs_root *root)
26967237f183SYan Zheng {
26977237f183SYan Zheng 	DEFINE_WAIT(wait);
26988b050d35SMiao Xie 
26998b050d35SMiao Xie 	while (atomic_read(&root->log_writers)) {
27007237f183SYan Zheng 		prepare_to_wait(&root->log_writer_wait,
27017237f183SYan Zheng 				&wait, TASK_UNINTERRUPTIBLE);
27027237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
27038b050d35SMiao Xie 		if (atomic_read(&root->log_writers))
27047237f183SYan Zheng 			schedule();
27057237f183SYan Zheng 		finish_wait(&root->log_writer_wait, &wait);
2706575849ecSFilipe Manana 		mutex_lock(&root->log_mutex);
27077237f183SYan Zheng 	}
2708e02119d5SChris Mason }
2709e02119d5SChris Mason 
27108b050d35SMiao Xie static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
27118b050d35SMiao Xie 					struct btrfs_log_ctx *ctx)
27128b050d35SMiao Xie {
27138b050d35SMiao Xie 	if (!ctx)
27148b050d35SMiao Xie 		return;
27158b050d35SMiao Xie 
27168b050d35SMiao Xie 	mutex_lock(&root->log_mutex);
27178b050d35SMiao Xie 	list_del_init(&ctx->list);
27188b050d35SMiao Xie 	mutex_unlock(&root->log_mutex);
27198b050d35SMiao Xie }
27208b050d35SMiao Xie 
27218b050d35SMiao Xie /*
27228b050d35SMiao Xie  * Invoked in log mutex context, or be sure there is no other task which
27238b050d35SMiao Xie  * can access the list.
27248b050d35SMiao Xie  */
27258b050d35SMiao Xie static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
27268b050d35SMiao Xie 					     int index, int error)
27278b050d35SMiao Xie {
27288b050d35SMiao Xie 	struct btrfs_log_ctx *ctx;
2729570dd450SChris Mason 	struct btrfs_log_ctx *safe;
27308b050d35SMiao Xie 
2731570dd450SChris Mason 	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2732570dd450SChris Mason 		list_del_init(&ctx->list);
27338b050d35SMiao Xie 		ctx->log_ret = error;
2734570dd450SChris Mason 	}
27358b050d35SMiao Xie 
27368b050d35SMiao Xie 	INIT_LIST_HEAD(&root->log_ctxs[index]);
27378b050d35SMiao Xie }
27388b050d35SMiao Xie 
2739e02119d5SChris Mason /*
2740e02119d5SChris Mason  * btrfs_sync_log sends a given tree log down to the disk and
2741e02119d5SChris Mason  * updates the super blocks to record it.  When this call is done,
274212fcfd22SChris Mason  * you know that any inodes previously logged are safely on disk only
274312fcfd22SChris Mason  * if it returns 0.
274412fcfd22SChris Mason  *
274512fcfd22SChris Mason  * Any other return value means you need to call btrfs_commit_transaction.
274612fcfd22SChris Mason  * Some of the edge cases for fsyncing directories that have had unlinks
274712fcfd22SChris Mason  * or renames done in the past mean that sometimes the only safe
274812fcfd22SChris Mason  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
274912fcfd22SChris Mason  * that has happened.
 *
 * ctx->log_transid names the log transaction to sync.  If that transaction
 * has already been committed, or another task is found committing it,
 * ctx->log_ret is returned instead of doing the work here.
2750e02119d5SChris Mason  */
2751e02119d5SChris Mason int btrfs_sync_log(struct btrfs_trans_handle *trans,
27528b050d35SMiao Xie 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2753e02119d5SChris Mason {
27547237f183SYan Zheng 	int index1;
27557237f183SYan Zheng 	int index2;
27568cef4e16SYan, Zheng 	int mark;
2757e02119d5SChris Mason 	int ret;
27580b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
2759e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
27600b246afaSJeff Mahoney 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2761bb14a59bSMiao Xie 	int log_transid = 0;
27628b050d35SMiao Xie 	struct btrfs_log_ctx root_log_ctx;
2763c6adc9ccSMiao Xie 	struct blk_plug plug;
2764e02119d5SChris Mason 
27657237f183SYan Zheng 	mutex_lock(&root->log_mutex);
2766d1433debSMiao Xie 	log_transid = ctx->log_transid;
2767d1433debSMiao Xie 	if (root->log_transid_committed >= log_transid) {
27687237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
27698b050d35SMiao Xie 		return ctx->log_ret;
2770e02119d5SChris Mason 	}
2771d1433debSMiao Xie 
	/* log_commit[] has two slots, picked by the parity of the transid */
2772d1433debSMiao Xie 	index1 = log_transid % 2;
	/* slot already claimed: wait for that commit and use its result */
2773d1433debSMiao Xie 	if (atomic_read(&root->log_commit[index1])) {
277460d53eb3SZhaolei 		wait_log_commit(root, log_transid);
2775d1433debSMiao Xie 		mutex_unlock(&root->log_mutex);
2776d1433debSMiao Xie 		return ctx->log_ret;
2777d1433debSMiao Xie 	}
2778d1433debSMiao Xie 	ASSERT(log_transid == root->log_transid);
27797237f183SYan Zheng 	atomic_set(&root->log_commit[index1], 1);
27807237f183SYan Zheng 
27817237f183SYan Zheng 	/* wait for previous tree log sync to complete */
27827237f183SYan Zheng 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
278360d53eb3SZhaolei 		wait_log_commit(root, log_transid - 1);
278448cab2e0SMiao Xie 
	/* keep waiting for log writers until log_batch stops changing */
278586df7eb9SYan, Zheng 	while (1) {
27862ecb7923SMiao Xie 		int batch = atomic_read(&root->log_batch);
2787cd354ad6SChris Mason 		/* when we're on an ssd, just kick the log commit out */
27880b246afaSJeff Mahoney 		if (!btrfs_test_opt(fs_info, SSD) &&
278927cdeb70SMiao Xie 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
27907237f183SYan Zheng 			mutex_unlock(&root->log_mutex);
2791e02119d5SChris Mason 			schedule_timeout_uninterruptible(1);
27927237f183SYan Zheng 			mutex_lock(&root->log_mutex);
279386df7eb9SYan, Zheng 		}
279460d53eb3SZhaolei 		wait_for_writer(root);
27952ecb7923SMiao Xie 		if (batch == atomic_read(&root->log_batch))
2796e02119d5SChris Mason 			break;
2797e02119d5SChris Mason 	}
2798d0c803c4SChris Mason 
279912fcfd22SChris Mason 	/* bail out if we need to do a full commit */
28000b246afaSJeff Mahoney 	if (btrfs_need_log_full_commit(fs_info, trans)) {
280112fcfd22SChris Mason 		ret = -EAGAIN;
28022ab28f32SJosef Bacik 		btrfs_free_logged_extents(log, log_transid);
280312fcfd22SChris Mason 		mutex_unlock(&root->log_mutex);
280412fcfd22SChris Mason 		goto out;
280512fcfd22SChris Mason 	}
280612fcfd22SChris Mason 
	/* even transids use EXTENT_DIRTY, odd ones EXTENT_NEW */
28078cef4e16SYan, Zheng 	if (log_transid % 2 == 0)
28088cef4e16SYan, Zheng 		mark = EXTENT_DIRTY;
28098cef4e16SYan, Zheng 	else
28108cef4e16SYan, Zheng 		mark = EXTENT_NEW;
28118cef4e16SYan, Zheng 
2812690587d1SChris Mason 	/* we start IO on  all the marked extents here, but we don't actually
2813690587d1SChris Mason 	 * wait for them until later.
2814690587d1SChris Mason 	 */
2815c6adc9ccSMiao Xie 	blk_start_plug(&plug);
28162ff7e61eSJeff Mahoney 	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
281779787eaaSJeff Mahoney 	if (ret) {
2818c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
281966642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
28202ab28f32SJosef Bacik 		btrfs_free_logged_extents(log, log_transid);
28210b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
282279787eaaSJeff Mahoney 		mutex_unlock(&root->log_mutex);
282379787eaaSJeff Mahoney 		goto out;
282479787eaaSJeff Mahoney 	}
28257237f183SYan Zheng 
28265d4f98a2SYan Zheng 	btrfs_set_root_node(&log->root_item, log->node);
28277237f183SYan Zheng 
28287237f183SYan Zheng 	root->log_transid++;
28297237f183SYan Zheng 	log->log_transid = root->log_transid;
2830ff782e0aSJosef Bacik 	root->log_start_pid = 0;
28317237f183SYan Zheng 	/*
28328cef4e16SYan, Zheng 	 * IO has been started, blocks of the log tree have WRITTEN flag set
28338cef4e16SYan, Zheng 	 * in their headers. new modifications of the log will be written to
28348cef4e16SYan, Zheng 	 * new positions. so it's safe to allow log writers to go in.
28357237f183SYan Zheng 	 */
28367237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
28377237f183SYan Zheng 
283828a23593SFilipe Manana 	btrfs_init_log_ctx(&root_log_ctx, NULL);
2839d1433debSMiao Xie 
	/*
	 * Count ourselves as a writer of the tree of log roots and queue a
	 * private ctx on its current transaction before updating our item.
	 */
28407237f183SYan Zheng 	mutex_lock(&log_root_tree->log_mutex);
28412ecb7923SMiao Xie 	atomic_inc(&log_root_tree->log_batch);
28427237f183SYan Zheng 	atomic_inc(&log_root_tree->log_writers);
2843d1433debSMiao Xie 
2844d1433debSMiao Xie 	index2 = log_root_tree->log_transid % 2;
2845d1433debSMiao Xie 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2846d1433debSMiao Xie 	root_log_ctx.log_transid = log_root_tree->log_transid;
2847d1433debSMiao Xie 
28487237f183SYan Zheng 	mutex_unlock(&log_root_tree->log_mutex);
28497237f183SYan Zheng 
28507237f183SYan Zheng 	ret = update_log_root(trans, log);
28517237f183SYan Zheng 
28527237f183SYan Zheng 	mutex_lock(&log_root_tree->log_mutex);
28537237f183SYan Zheng 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2854779adf0fSDavid Sterba 		/*
2855779adf0fSDavid Sterba 		 * Implicit memory barrier after atomic_dec_and_test
2856779adf0fSDavid Sterba 		 */
28577237f183SYan Zheng 		if (waitqueue_active(&log_root_tree->log_writer_wait))
28587237f183SYan Zheng 			wake_up(&log_root_tree->log_writer_wait);
28597237f183SYan Zheng 	}
28607237f183SYan Zheng 
	/* update_log_root() failed: only -ENOSPC falls back to -EAGAIN */
28614a500fd1SYan, Zheng 	if (ret) {
2862d1433debSMiao Xie 		if (!list_empty(&root_log_ctx.list))
2863d1433debSMiao Xie 			list_del_init(&root_log_ctx.list);
2864d1433debSMiao Xie 
2865c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
28660b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
2867995946ddSMiao Xie 
286879787eaaSJeff Mahoney 		if (ret != -ENOSPC) {
286966642832SJeff Mahoney 			btrfs_abort_transaction(trans, ret);
287079787eaaSJeff Mahoney 			mutex_unlock(&log_root_tree->log_mutex);
287179787eaaSJeff Mahoney 			goto out;
287279787eaaSJeff Mahoney 		}
2873bf89d38fSJeff Mahoney 		btrfs_wait_tree_log_extents(log, mark);
28742ab28f32SJosef Bacik 		btrfs_free_logged_extents(log, log_transid);
28754a500fd1SYan, Zheng 		mutex_unlock(&log_root_tree->log_mutex);
28764a500fd1SYan, Zheng 		ret = -EAGAIN;
28774a500fd1SYan, Zheng 		goto out;
28784a500fd1SYan, Zheng 	}
28794a500fd1SYan, Zheng 
	/* the log root tree transaction we queued on is already committed */
2880d1433debSMiao Xie 	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
28813da5ab56SForrest Liu 		blk_finish_plug(&plug);
2882cbd60aa7SChris Mason 		list_del_init(&root_log_ctx.list);
2883d1433debSMiao Xie 		mutex_unlock(&log_root_tree->log_mutex);
2884d1433debSMiao Xie 		ret = root_log_ctx.log_ret;
2885d1433debSMiao Xie 		goto out;
2886d1433debSMiao Xie 	}
28878b050d35SMiao Xie 
2888d1433debSMiao Xie 	index2 = root_log_ctx.log_transid % 2;
	/*
	 * The commit slot of the log root tree is already claimed: wait for
	 * our own log's IO, then for that commit, and report its result.
	 */
28897237f183SYan Zheng 	if (atomic_read(&log_root_tree->log_commit[index2])) {
2890c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
2891bf89d38fSJeff Mahoney 		ret = btrfs_wait_tree_log_extents(log, mark);
289250d9aa99SJosef Bacik 		btrfs_wait_logged_extents(trans, log, log_transid);
289360d53eb3SZhaolei 		wait_log_commit(log_root_tree,
2894d1433debSMiao Xie 				root_log_ctx.log_transid);
28957237f183SYan Zheng 		mutex_unlock(&log_root_tree->log_mutex);
28965ab5e44aSFilipe Manana 		if (!ret)
28978b050d35SMiao Xie 			ret = root_log_ctx.log_ret;
28987237f183SYan Zheng 		goto out;
28997237f183SYan Zheng 	}
2900d1433debSMiao Xie 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
29017237f183SYan Zheng 	atomic_set(&log_root_tree->log_commit[index2], 1);
29027237f183SYan Zheng 
290312fcfd22SChris Mason 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
290460d53eb3SZhaolei 		wait_log_commit(log_root_tree,
2905d1433debSMiao Xie 				root_log_ctx.log_transid - 1);
290612fcfd22SChris Mason 	}
29077237f183SYan Zheng 
290860d53eb3SZhaolei 	wait_for_writer(log_root_tree);
290912fcfd22SChris Mason 
291012fcfd22SChris Mason 	/*
291112fcfd22SChris Mason 	 * now that we've moved on to the tree of log tree roots,
291212fcfd22SChris Mason 	 * check the full commit flag again
291312fcfd22SChris Mason 	 */
29140b246afaSJeff Mahoney 	if (btrfs_need_log_full_commit(fs_info, trans)) {
2915c6adc9ccSMiao Xie 		blk_finish_plug(&plug);
2916bf89d38fSJeff Mahoney 		btrfs_wait_tree_log_extents(log, mark);
29172ab28f32SJosef Bacik 		btrfs_free_logged_extents(log, log_transid);
291812fcfd22SChris Mason 		mutex_unlock(&log_root_tree->log_mutex);
291912fcfd22SChris Mason 		ret = -EAGAIN;
292012fcfd22SChris Mason 		goto out_wake_log_root;
292112fcfd22SChris Mason 	}
29227237f183SYan Zheng 
29232ff7e61eSJeff Mahoney 	ret = btrfs_write_marked_extents(fs_info,
29248cef4e16SYan, Zheng 					 &log_root_tree->dirty_log_pages,
29258cef4e16SYan, Zheng 					 EXTENT_DIRTY | EXTENT_NEW);
2926c6adc9ccSMiao Xie 	blk_finish_plug(&plug);
292779787eaaSJeff Mahoney 	if (ret) {
29280b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
292966642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
29302ab28f32SJosef Bacik 		btrfs_free_logged_extents(log, log_transid);
293179787eaaSJeff Mahoney 		mutex_unlock(&log_root_tree->log_mutex);
293279787eaaSJeff Mahoney 		goto out_wake_log_root;
293379787eaaSJeff Mahoney 	}
	/* wait for the IO of our log first, then of the tree of log roots */
2934bf89d38fSJeff Mahoney 	ret = btrfs_wait_tree_log_extents(log, mark);
29355ab5e44aSFilipe Manana 	if (!ret)
2936bf89d38fSJeff Mahoney 		ret = btrfs_wait_tree_log_extents(log_root_tree,
2937c6adc9ccSMiao Xie 						  EXTENT_NEW | EXTENT_DIRTY);
29385ab5e44aSFilipe Manana 	if (ret) {
29390b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
29405ab5e44aSFilipe Manana 		btrfs_free_logged_extents(log, log_transid);
29415ab5e44aSFilipe Manana 		mutex_unlock(&log_root_tree->log_mutex);
29425ab5e44aSFilipe Manana 		goto out_wake_log_root;
29435ab5e44aSFilipe Manana 	}
294450d9aa99SJosef Bacik 	btrfs_wait_logged_extents(trans, log, log_transid);
2945e02119d5SChris Mason 
	/* record location and level of the log root tree in the super copy */
29460b246afaSJeff Mahoney 	btrfs_set_super_log_root(fs_info->super_for_commit,
29477237f183SYan Zheng 				 log_root_tree->node->start);
29480b246afaSJeff Mahoney 	btrfs_set_super_log_root_level(fs_info->super_for_commit,
29497237f183SYan Zheng 				       btrfs_header_level(log_root_tree->node));
2950e02119d5SChris Mason 
29517237f183SYan Zheng 	log_root_tree->log_transid++;
29527237f183SYan Zheng 	mutex_unlock(&log_root_tree->log_mutex);
29537237f183SYan Zheng 
29547237f183SYan Zheng 	/*
29557237f183SYan Zheng 	 * nobody else is going to jump in and write the ctree
29567237f183SYan Zheng 	 * super here because the log_commit atomic below is protecting
29577237f183SYan Zheng 	 * us.  We must be called with a transaction handle pinning
29587237f183SYan Zheng 	 * the running transaction open, so a full commit can't hop
29597237f183SYan Zheng 	 * in and cause problems either.
29607237f183SYan Zheng 	 */
29612ff7e61eSJeff Mahoney 	ret = write_ctree_super(trans, fs_info, 1);
29625af3e8ccSStefan Behrens 	if (ret) {
29630b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
296466642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
29655af3e8ccSStefan Behrens 		goto out_wake_log_root;
29665af3e8ccSStefan Behrens 	}
29677237f183SYan Zheng 
2968257c62e1SChris Mason 	mutex_lock(&root->log_mutex);
2969257c62e1SChris Mason 	if (root->last_log_commit < log_transid)
2970257c62e1SChris Mason 		root->last_log_commit = log_transid;
2971257c62e1SChris Mason 	mutex_unlock(&root->log_mutex);
2972257c62e1SChris Mason 
297312fcfd22SChris Mason out_wake_log_root:
	/*
	 * Publish ret to every ctx queued on this log root tree commit,
	 * release the commit slot and wake anybody waiting on it.
	 */
2974570dd450SChris Mason 	mutex_lock(&log_root_tree->log_mutex);
29758b050d35SMiao Xie 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
29768b050d35SMiao Xie 
2977d1433debSMiao Xie 	log_root_tree->log_transid_committed++;
29787237f183SYan Zheng 	atomic_set(&log_root_tree->log_commit[index2], 0);
2979d1433debSMiao Xie 	mutex_unlock(&log_root_tree->log_mutex);
2980d1433debSMiao Xie 
298133a9eca7SDavid Sterba 	/*
298233a9eca7SDavid Sterba 	 * The barrier before waitqueue_active is implied by mutex_unlock
298333a9eca7SDavid Sterba 	 */
29847237f183SYan Zheng 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
29857237f183SYan Zheng 		wake_up(&log_root_tree->log_commit_wait[index2]);
2986e02119d5SChris Mason out:
	/* same hand-off for the subvolume's own log: publish ret, free slot */
2987d1433debSMiao Xie 	mutex_lock(&root->log_mutex);
2988570dd450SChris Mason 	btrfs_remove_all_log_ctxs(root, index1, ret);
2989d1433debSMiao Xie 	root->log_transid_committed++;
29907237f183SYan Zheng 	atomic_set(&root->log_commit[index1], 0);
2991d1433debSMiao Xie 	mutex_unlock(&root->log_mutex);
29928b050d35SMiao Xie 
299333a9eca7SDavid Sterba 	/*
299433a9eca7SDavid Sterba 	 * The barrier before waitqueue_active is implied by mutex_unlock
299533a9eca7SDavid Sterba 	 */
29967237f183SYan Zheng 	if (waitqueue_active(&root->log_commit_wait[index1]))
29977237f183SYan Zheng 		wake_up(&root->log_commit_wait[index1]);
2998b31eabd8SChris Mason 	return ret;
2999e02119d5SChris Mason }
3000e02119d5SChris Mason 
30014a500fd1SYan, Zheng static void free_log_tree(struct btrfs_trans_handle *trans,
30024a500fd1SYan, Zheng 			  struct btrfs_root *log)
3003e02119d5SChris Mason {
3004e02119d5SChris Mason 	int ret;
3005d0c803c4SChris Mason 	u64 start;
3006d0c803c4SChris Mason 	u64 end;
3007e02119d5SChris Mason 	struct walk_control wc = {
3008e02119d5SChris Mason 		.free = 1,
3009e02119d5SChris Mason 		.process_func = process_one_buffer
3010e02119d5SChris Mason 	};
3011e02119d5SChris Mason 
3012e02119d5SChris Mason 	ret = walk_log_tree(trans, log, &wc);
30133650860bSJosef Bacik 	/* I don't think this can happen but just in case */
30143650860bSJosef Bacik 	if (ret)
301566642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
3016e02119d5SChris Mason 
3017d0c803c4SChris Mason 	while (1) {
3018d0c803c4SChris Mason 		ret = find_first_extent_bit(&log->dirty_log_pages,
3019e6138876SJosef Bacik 				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
3020e6138876SJosef Bacik 				NULL);
3021d0c803c4SChris Mason 		if (ret)
3022d0c803c4SChris Mason 			break;
3023d0c803c4SChris Mason 
30248cef4e16SYan, Zheng 		clear_extent_bits(&log->dirty_log_pages, start, end,
302591166212SDavid Sterba 				  EXTENT_DIRTY | EXTENT_NEW);
3026d0c803c4SChris Mason 	}
3027d0c803c4SChris Mason 
30282ab28f32SJosef Bacik 	/*
30292ab28f32SJosef Bacik 	 * We may have short-circuited the log tree with the full commit logic
30302ab28f32SJosef Bacik 	 * and left ordered extents on our list, so clear these out to keep us
30312ab28f32SJosef Bacik 	 * from leaking inodes and memory.
30322ab28f32SJosef Bacik 	 */
30332ab28f32SJosef Bacik 	btrfs_free_logged_extents(log, 0);
30342ab28f32SJosef Bacik 	btrfs_free_logged_extents(log, 1);
30352ab28f32SJosef Bacik 
30367237f183SYan Zheng 	free_extent_buffer(log->node);
30377237f183SYan Zheng 	kfree(log);
30384a500fd1SYan, Zheng }
30394a500fd1SYan, Zheng 
30404a500fd1SYan, Zheng /*
30414a500fd1SYan, Zheng  * free all the extents used by the tree log.  This should be called
30424a500fd1SYan, Zheng  * at commit time of the full transaction
30434a500fd1SYan, Zheng  */
30444a500fd1SYan, Zheng int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
30454a500fd1SYan, Zheng {
30464a500fd1SYan, Zheng 	if (root->log_root) {
30474a500fd1SYan, Zheng 		free_log_tree(trans, root->log_root);
30484a500fd1SYan, Zheng 		root->log_root = NULL;
30494a500fd1SYan, Zheng 	}
30504a500fd1SYan, Zheng 	return 0;
30514a500fd1SYan, Zheng }
30524a500fd1SYan, Zheng 
30534a500fd1SYan, Zheng int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
30544a500fd1SYan, Zheng 			     struct btrfs_fs_info *fs_info)
30554a500fd1SYan, Zheng {
30564a500fd1SYan, Zheng 	if (fs_info->log_root_tree) {
30574a500fd1SYan, Zheng 		free_log_tree(trans, fs_info->log_root_tree);
30584a500fd1SYan, Zheng 		fs_info->log_root_tree = NULL;
30594a500fd1SYan, Zheng 	}
3060e02119d5SChris Mason 	return 0;
3061e02119d5SChris Mason }
3062e02119d5SChris Mason 
3063e02119d5SChris Mason /*
3064e02119d5SChris Mason  * If both a file and directory are logged, and unlinks or renames are
3065e02119d5SChris Mason  * mixed in, we have a few interesting corners:
3066e02119d5SChris Mason  *
3067e02119d5SChris Mason  * create file X in dir Y
3068e02119d5SChris Mason  * link file X to X.link in dir Y
3069e02119d5SChris Mason  * fsync file X
3070e02119d5SChris Mason  * unlink file X but leave X.link
3071e02119d5SChris Mason  * fsync dir Y
3072e02119d5SChris Mason  *
3073e02119d5SChris Mason  * After a crash we would expect only X.link to exist.  But file X
3074e02119d5SChris Mason  * didn't get fsync'd again so the log has back refs for X and X.link.
3075e02119d5SChris Mason  *
3076e02119d5SChris Mason  * We solve this by removing directory entries and inode backrefs from the
3077e02119d5SChris Mason  * log when a file that was logged in the current transaction is
3078e02119d5SChris Mason  * unlinked.  Any later fsync will include the updated log entries, and
3079e02119d5SChris Mason  * we'll be able to reconstruct the proper directory items from backrefs.
3080e02119d5SChris Mason  *
3081e02119d5SChris Mason  * This optimization allows us to avoid relogging the entire inode
3082e02119d5SChris Mason  * or the entire directory.
3083e02119d5SChris Mason  */
3084e02119d5SChris Mason int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3085e02119d5SChris Mason 				 struct btrfs_root *root,
3086e02119d5SChris Mason 				 const char *name, int name_len,
308749f34d1fSNikolay Borisov 				 struct btrfs_inode *dir, u64 index)
3088e02119d5SChris Mason {
3089e02119d5SChris Mason 	struct btrfs_root *log;
3090e02119d5SChris Mason 	struct btrfs_dir_item *di;
3091e02119d5SChris Mason 	struct btrfs_path *path;
3092e02119d5SChris Mason 	int ret;
30934a500fd1SYan, Zheng 	int err = 0;
3094e02119d5SChris Mason 	int bytes_del = 0;
309549f34d1fSNikolay Borisov 	u64 dir_ino = btrfs_ino(dir);
3096e02119d5SChris Mason 
309749f34d1fSNikolay Borisov 	if (dir->logged_trans < trans->transid)
30983a5f1d45SChris Mason 		return 0;
30993a5f1d45SChris Mason 
3100e02119d5SChris Mason 	ret = join_running_log_trans(root);
3101e02119d5SChris Mason 	if (ret)
3102e02119d5SChris Mason 		return 0;
3103e02119d5SChris Mason 
310449f34d1fSNikolay Borisov 	mutex_lock(&dir->log_mutex);
3105e02119d5SChris Mason 
3106e02119d5SChris Mason 	log = root->log_root;
3107e02119d5SChris Mason 	path = btrfs_alloc_path();
3108a62f44a5STsutomu Itoh 	if (!path) {
3109a62f44a5STsutomu Itoh 		err = -ENOMEM;
3110a62f44a5STsutomu Itoh 		goto out_unlock;
3111a62f44a5STsutomu Itoh 	}
31122a29edc6Sliubo 
311333345d01SLi Zefan 	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3114e02119d5SChris Mason 				   name, name_len, -1);
31154a500fd1SYan, Zheng 	if (IS_ERR(di)) {
31164a500fd1SYan, Zheng 		err = PTR_ERR(di);
31174a500fd1SYan, Zheng 		goto fail;
31184a500fd1SYan, Zheng 	}
31194a500fd1SYan, Zheng 	if (di) {
3120e02119d5SChris Mason 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3121e02119d5SChris Mason 		bytes_del += name_len;
31223650860bSJosef Bacik 		if (ret) {
31233650860bSJosef Bacik 			err = ret;
31243650860bSJosef Bacik 			goto fail;
31253650860bSJosef Bacik 		}
3126e02119d5SChris Mason 	}
3127b3b4aa74SDavid Sterba 	btrfs_release_path(path);
312833345d01SLi Zefan 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3129e02119d5SChris Mason 					 index, name, name_len, -1);
31304a500fd1SYan, Zheng 	if (IS_ERR(di)) {
31314a500fd1SYan, Zheng 		err = PTR_ERR(di);
31324a500fd1SYan, Zheng 		goto fail;
31334a500fd1SYan, Zheng 	}
31344a500fd1SYan, Zheng 	if (di) {
3135e02119d5SChris Mason 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3136e02119d5SChris Mason 		bytes_del += name_len;
31373650860bSJosef Bacik 		if (ret) {
31383650860bSJosef Bacik 			err = ret;
31393650860bSJosef Bacik 			goto fail;
31403650860bSJosef Bacik 		}
3141e02119d5SChris Mason 	}
3142e02119d5SChris Mason 
3143e02119d5SChris Mason 	/* update the directory size in the log to reflect the names
3144e02119d5SChris Mason 	 * we have removed
3145e02119d5SChris Mason 	 */
3146e02119d5SChris Mason 	if (bytes_del) {
3147e02119d5SChris Mason 		struct btrfs_key key;
3148e02119d5SChris Mason 
314933345d01SLi Zefan 		key.objectid = dir_ino;
3150e02119d5SChris Mason 		key.offset = 0;
3151e02119d5SChris Mason 		key.type = BTRFS_INODE_ITEM_KEY;
3152b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3153e02119d5SChris Mason 
3154e02119d5SChris Mason 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
31554a500fd1SYan, Zheng 		if (ret < 0) {
31564a500fd1SYan, Zheng 			err = ret;
31574a500fd1SYan, Zheng 			goto fail;
31584a500fd1SYan, Zheng 		}
3159e02119d5SChris Mason 		if (ret == 0) {
3160e02119d5SChris Mason 			struct btrfs_inode_item *item;
3161e02119d5SChris Mason 			u64 i_size;
3162e02119d5SChris Mason 
3163e02119d5SChris Mason 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3164e02119d5SChris Mason 					      struct btrfs_inode_item);
3165e02119d5SChris Mason 			i_size = btrfs_inode_size(path->nodes[0], item);
3166e02119d5SChris Mason 			if (i_size > bytes_del)
3167e02119d5SChris Mason 				i_size -= bytes_del;
3168e02119d5SChris Mason 			else
3169e02119d5SChris Mason 				i_size = 0;
3170e02119d5SChris Mason 			btrfs_set_inode_size(path->nodes[0], item, i_size);
3171e02119d5SChris Mason 			btrfs_mark_buffer_dirty(path->nodes[0]);
3172e02119d5SChris Mason 		} else
3173e02119d5SChris Mason 			ret = 0;
3174b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3175e02119d5SChris Mason 	}
31764a500fd1SYan, Zheng fail:
3177e02119d5SChris Mason 	btrfs_free_path(path);
3178a62f44a5STsutomu Itoh out_unlock:
317949f34d1fSNikolay Borisov 	mutex_unlock(&dir->log_mutex);
31804a500fd1SYan, Zheng 	if (ret == -ENOSPC) {
3181995946ddSMiao Xie 		btrfs_set_log_full_commit(root->fs_info, trans);
31824a500fd1SYan, Zheng 		ret = 0;
318379787eaaSJeff Mahoney 	} else if (ret < 0)
318466642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
318579787eaaSJeff Mahoney 
318612fcfd22SChris Mason 	btrfs_end_log_trans(root);
3187e02119d5SChris Mason 
3188411fc6bcSAndi Kleen 	return err;
3189e02119d5SChris Mason }
3190e02119d5SChris Mason 
3191e02119d5SChris Mason /* see comments for btrfs_del_dir_entries_in_log */
3192e02119d5SChris Mason int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3193e02119d5SChris Mason 			       struct btrfs_root *root,
3194e02119d5SChris Mason 			       const char *name, int name_len,
3195a491abb2SNikolay Borisov 			       struct btrfs_inode *inode, u64 dirid)
3196e02119d5SChris Mason {
31970b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
3198e02119d5SChris Mason 	struct btrfs_root *log;
3199e02119d5SChris Mason 	u64 index;
3200e02119d5SChris Mason 	int ret;
3201e02119d5SChris Mason 
3202a491abb2SNikolay Borisov 	if (inode->logged_trans < trans->transid)
32033a5f1d45SChris Mason 		return 0;
32043a5f1d45SChris Mason 
3205e02119d5SChris Mason 	ret = join_running_log_trans(root);
3206e02119d5SChris Mason 	if (ret)
3207e02119d5SChris Mason 		return 0;
3208e02119d5SChris Mason 	log = root->log_root;
3209a491abb2SNikolay Borisov 	mutex_lock(&inode->log_mutex);
3210e02119d5SChris Mason 
3211a491abb2SNikolay Borisov 	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3212e02119d5SChris Mason 				  dirid, &index);
3213a491abb2SNikolay Borisov 	mutex_unlock(&inode->log_mutex);
32144a500fd1SYan, Zheng 	if (ret == -ENOSPC) {
32150b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
32164a500fd1SYan, Zheng 		ret = 0;
321779787eaaSJeff Mahoney 	} else if (ret < 0 && ret != -ENOENT)
321866642832SJeff Mahoney 		btrfs_abort_transaction(trans, ret);
321912fcfd22SChris Mason 	btrfs_end_log_trans(root);
3220e02119d5SChris Mason 
3221e02119d5SChris Mason 	return ret;
3222e02119d5SChris Mason }
3223e02119d5SChris Mason 
3224e02119d5SChris Mason /*
3225e02119d5SChris Mason  * creates a range item in the log for 'dirid'.  first_offset and
3226e02119d5SChris Mason  * last_offset tell us which parts of the key space the log should
3227e02119d5SChris Mason  * be considered authoritative for.
3228e02119d5SChris Mason  */
3229e02119d5SChris Mason static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3230e02119d5SChris Mason 				       struct btrfs_root *log,
3231e02119d5SChris Mason 				       struct btrfs_path *path,
3232e02119d5SChris Mason 				       int key_type, u64 dirid,
3233e02119d5SChris Mason 				       u64 first_offset, u64 last_offset)
3234e02119d5SChris Mason {
3235e02119d5SChris Mason 	int ret;
3236e02119d5SChris Mason 	struct btrfs_key key;
3237e02119d5SChris Mason 	struct btrfs_dir_log_item *item;
3238e02119d5SChris Mason 
3239e02119d5SChris Mason 	key.objectid = dirid;
3240e02119d5SChris Mason 	key.offset = first_offset;
3241e02119d5SChris Mason 	if (key_type == BTRFS_DIR_ITEM_KEY)
3242e02119d5SChris Mason 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
3243e02119d5SChris Mason 	else
3244e02119d5SChris Mason 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
3245e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
32464a500fd1SYan, Zheng 	if (ret)
32474a500fd1SYan, Zheng 		return ret;
3248e02119d5SChris Mason 
3249e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3250e02119d5SChris Mason 			      struct btrfs_dir_log_item);
3251e02119d5SChris Mason 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3252e02119d5SChris Mason 	btrfs_mark_buffer_dirty(path->nodes[0]);
3253b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3254e02119d5SChris Mason 	return 0;
3255e02119d5SChris Mason }
3256e02119d5SChris Mason 
3257e02119d5SChris Mason /*
3258e02119d5SChris Mason  * log all the items included in the current transaction for a given
3259e02119d5SChris Mason  * directory.  This also creates the range items in the log tree required
3260e02119d5SChris Mason  * to replay anything deleted before the fsync
3261e02119d5SChris Mason  */
3262e02119d5SChris Mason static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3263e02119d5SChris Mason 			  struct btrfs_root *root, struct inode *inode,
3264e02119d5SChris Mason 			  struct btrfs_path *path,
3265e02119d5SChris Mason 			  struct btrfs_path *dst_path, int key_type,
32662f2ff0eeSFilipe Manana 			  struct btrfs_log_ctx *ctx,
3267e02119d5SChris Mason 			  u64 min_offset, u64 *last_offset_ret)
3268e02119d5SChris Mason {
3269e02119d5SChris Mason 	struct btrfs_key min_key;
3270e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
3271e02119d5SChris Mason 	struct extent_buffer *src;
32724a500fd1SYan, Zheng 	int err = 0;
3273e02119d5SChris Mason 	int ret;
3274e02119d5SChris Mason 	int i;
3275e02119d5SChris Mason 	int nritems;
3276e02119d5SChris Mason 	u64 first_offset = min_offset;
3277e02119d5SChris Mason 	u64 last_offset = (u64)-1;
32784a0cc7caSNikolay Borisov 	u64 ino = btrfs_ino(BTRFS_I(inode));
3279e02119d5SChris Mason 
3280e02119d5SChris Mason 	log = root->log_root;
3281e02119d5SChris Mason 
328233345d01SLi Zefan 	min_key.objectid = ino;
3283e02119d5SChris Mason 	min_key.type = key_type;
3284e02119d5SChris Mason 	min_key.offset = min_offset;
3285e02119d5SChris Mason 
32866174d3cbSFilipe David Borba Manana 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3287e02119d5SChris Mason 
3288e02119d5SChris Mason 	/*
3289e02119d5SChris Mason 	 * we didn't find anything from this transaction, see if there
3290e02119d5SChris Mason 	 * is anything at all
3291e02119d5SChris Mason 	 */
329233345d01SLi Zefan 	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
329333345d01SLi Zefan 		min_key.objectid = ino;
3294e02119d5SChris Mason 		min_key.type = key_type;
3295e02119d5SChris Mason 		min_key.offset = (u64)-1;
3296b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3297e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3298e02119d5SChris Mason 		if (ret < 0) {
3299b3b4aa74SDavid Sterba 			btrfs_release_path(path);
3300e02119d5SChris Mason 			return ret;
3301e02119d5SChris Mason 		}
330233345d01SLi Zefan 		ret = btrfs_previous_item(root, path, ino, key_type);
3303e02119d5SChris Mason 
3304e02119d5SChris Mason 		/* if ret == 0 there are items for this type,
3305e02119d5SChris Mason 		 * create a range to tell us the last key of this type.
3306e02119d5SChris Mason 		 * otherwise, there are no items in this directory after
3307e02119d5SChris Mason 		 * *min_offset, and we create a range to indicate that.
3308e02119d5SChris Mason 		 */
3309e02119d5SChris Mason 		if (ret == 0) {
3310e02119d5SChris Mason 			struct btrfs_key tmp;
3311e02119d5SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3312e02119d5SChris Mason 					      path->slots[0]);
3313d397712bSChris Mason 			if (key_type == tmp.type)
3314e02119d5SChris Mason 				first_offset = max(min_offset, tmp.offset) + 1;
3315e02119d5SChris Mason 		}
3316e02119d5SChris Mason 		goto done;
3317e02119d5SChris Mason 	}
3318e02119d5SChris Mason 
3319e02119d5SChris Mason 	/* go backward to find any previous key */
332033345d01SLi Zefan 	ret = btrfs_previous_item(root, path, ino, key_type);
3321e02119d5SChris Mason 	if (ret == 0) {
3322e02119d5SChris Mason 		struct btrfs_key tmp;
3323e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3324e02119d5SChris Mason 		if (key_type == tmp.type) {
3325e02119d5SChris Mason 			first_offset = tmp.offset;
3326e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path,
3327e02119d5SChris Mason 					     path->nodes[0], path->slots[0],
3328e02119d5SChris Mason 					     &tmp);
33294a500fd1SYan, Zheng 			if (ret) {
33304a500fd1SYan, Zheng 				err = ret;
33314a500fd1SYan, Zheng 				goto done;
33324a500fd1SYan, Zheng 			}
3333e02119d5SChris Mason 		}
3334e02119d5SChris Mason 	}
3335b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3336e02119d5SChris Mason 
3337e02119d5SChris Mason 	/* find the first key from this transaction again */
3338e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3339fae7f21cSDulshani Gunawardhana 	if (WARN_ON(ret != 0))
3340e02119d5SChris Mason 		goto done;
3341e02119d5SChris Mason 
3342e02119d5SChris Mason 	/*
3343e02119d5SChris Mason 	 * we have a block from this transaction, log every item in it
3344e02119d5SChris Mason 	 * from our directory
3345e02119d5SChris Mason 	 */
3346e02119d5SChris Mason 	while (1) {
3347e02119d5SChris Mason 		struct btrfs_key tmp;
3348e02119d5SChris Mason 		src = path->nodes[0];
3349e02119d5SChris Mason 		nritems = btrfs_header_nritems(src);
3350e02119d5SChris Mason 		for (i = path->slots[0]; i < nritems; i++) {
33512f2ff0eeSFilipe Manana 			struct btrfs_dir_item *di;
33522f2ff0eeSFilipe Manana 
3353e02119d5SChris Mason 			btrfs_item_key_to_cpu(src, &min_key, i);
3354e02119d5SChris Mason 
335533345d01SLi Zefan 			if (min_key.objectid != ino || min_key.type != key_type)
3356e02119d5SChris Mason 				goto done;
3357e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path, src, i,
3358e02119d5SChris Mason 					     &min_key);
33594a500fd1SYan, Zheng 			if (ret) {
33604a500fd1SYan, Zheng 				err = ret;
33614a500fd1SYan, Zheng 				goto done;
33624a500fd1SYan, Zheng 			}
33632f2ff0eeSFilipe Manana 
33642f2ff0eeSFilipe Manana 			/*
33652f2ff0eeSFilipe Manana 			 * We must make sure that when we log a directory entry,
33662f2ff0eeSFilipe Manana 			 * the corresponding inode, after log replay, has a
33672f2ff0eeSFilipe Manana 			 * matching link count. For example:
33682f2ff0eeSFilipe Manana 			 *
33692f2ff0eeSFilipe Manana 			 * touch foo
33702f2ff0eeSFilipe Manana 			 * mkdir mydir
33712f2ff0eeSFilipe Manana 			 * sync
33722f2ff0eeSFilipe Manana 			 * ln foo mydir/bar
33732f2ff0eeSFilipe Manana 			 * xfs_io -c "fsync" mydir
33742f2ff0eeSFilipe Manana 			 * <crash>
33752f2ff0eeSFilipe Manana 			 * <mount fs and log replay>
33762f2ff0eeSFilipe Manana 			 *
33772f2ff0eeSFilipe Manana 			 * Would result in a fsync log that when replayed, our
33782f2ff0eeSFilipe Manana 			 * file inode would have a link count of 1, but we get
33792f2ff0eeSFilipe Manana 			 * two directory entries pointing to the same inode.
33802f2ff0eeSFilipe Manana 			 * After removing one of the names, it would not be
33812f2ff0eeSFilipe Manana 			 * possible to remove the other name, which resulted
33822f2ff0eeSFilipe Manana 			 * always in stale file handle errors, and would not
33832f2ff0eeSFilipe Manana 			 * be possible to rmdir the parent directory, since
33842f2ff0eeSFilipe Manana 			 * its i_size could never decrement to the value
33852f2ff0eeSFilipe Manana 			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
33862f2ff0eeSFilipe Manana 			 */
33872f2ff0eeSFilipe Manana 			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
33882f2ff0eeSFilipe Manana 			btrfs_dir_item_key_to_cpu(src, di, &tmp);
33892f2ff0eeSFilipe Manana 			if (ctx &&
33902f2ff0eeSFilipe Manana 			    (btrfs_dir_transid(src, di) == trans->transid ||
33912f2ff0eeSFilipe Manana 			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
33922f2ff0eeSFilipe Manana 			    tmp.type != BTRFS_ROOT_ITEM_KEY)
33932f2ff0eeSFilipe Manana 				ctx->log_new_dentries = true;
3394e02119d5SChris Mason 		}
3395e02119d5SChris Mason 		path->slots[0] = nritems;
3396e02119d5SChris Mason 
3397e02119d5SChris Mason 		/*
3398e02119d5SChris Mason 		 * look ahead to the next item and see if it is also
3399e02119d5SChris Mason 		 * from this directory and from this transaction
3400e02119d5SChris Mason 		 */
3401e02119d5SChris Mason 		ret = btrfs_next_leaf(root, path);
3402e02119d5SChris Mason 		if (ret == 1) {
3403e02119d5SChris Mason 			last_offset = (u64)-1;
3404e02119d5SChris Mason 			goto done;
3405e02119d5SChris Mason 		}
3406e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
340733345d01SLi Zefan 		if (tmp.objectid != ino || tmp.type != key_type) {
3408e02119d5SChris Mason 			last_offset = (u64)-1;
3409e02119d5SChris Mason 			goto done;
3410e02119d5SChris Mason 		}
3411e02119d5SChris Mason 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3412e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path,
3413e02119d5SChris Mason 					     path->nodes[0], path->slots[0],
3414e02119d5SChris Mason 					     &tmp);
34154a500fd1SYan, Zheng 			if (ret)
34164a500fd1SYan, Zheng 				err = ret;
34174a500fd1SYan, Zheng 			else
3418e02119d5SChris Mason 				last_offset = tmp.offset;
3419e02119d5SChris Mason 			goto done;
3420e02119d5SChris Mason 		}
3421e02119d5SChris Mason 	}
3422e02119d5SChris Mason done:
3423b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3424b3b4aa74SDavid Sterba 	btrfs_release_path(dst_path);
3425e02119d5SChris Mason 
34264a500fd1SYan, Zheng 	if (err == 0) {
34274a500fd1SYan, Zheng 		*last_offset_ret = last_offset;
34284a500fd1SYan, Zheng 		/*
34294a500fd1SYan, Zheng 		 * insert the log range keys to indicate where the log
34304a500fd1SYan, Zheng 		 * is valid
34314a500fd1SYan, Zheng 		 */
34324a500fd1SYan, Zheng 		ret = insert_dir_log_key(trans, log, path, key_type,
343333345d01SLi Zefan 					 ino, first_offset, last_offset);
34344a500fd1SYan, Zheng 		if (ret)
34354a500fd1SYan, Zheng 			err = ret;
34364a500fd1SYan, Zheng 	}
34374a500fd1SYan, Zheng 	return err;
3438e02119d5SChris Mason }
3439e02119d5SChris Mason 
3440e02119d5SChris Mason /*
3441e02119d5SChris Mason  * logging directories is very similar to logging inodes, We find all the items
3442e02119d5SChris Mason  * from the current transaction and write them to the log.
3443e02119d5SChris Mason  *
3444e02119d5SChris Mason  * The recovery code scans the directory in the subvolume, and if it finds a
3445e02119d5SChris Mason  * key in the range logged that is not present in the log tree, then it means
3446e02119d5SChris Mason  * that dir entry was unlinked during the transaction.
3447e02119d5SChris Mason  *
3448e02119d5SChris Mason  * In order for that scan to work, we must include one key smaller than
3449e02119d5SChris Mason  * the smallest logged by this transaction and one key larger than the largest
3450e02119d5SChris Mason  * key logged by this transaction.
3451e02119d5SChris Mason  */
3452e02119d5SChris Mason static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3453e02119d5SChris Mason 			  struct btrfs_root *root, struct inode *inode,
3454e02119d5SChris Mason 			  struct btrfs_path *path,
34552f2ff0eeSFilipe Manana 			  struct btrfs_path *dst_path,
34562f2ff0eeSFilipe Manana 			  struct btrfs_log_ctx *ctx)
3457e02119d5SChris Mason {
3458e02119d5SChris Mason 	u64 min_key;
3459e02119d5SChris Mason 	u64 max_key;
3460e02119d5SChris Mason 	int ret;
3461e02119d5SChris Mason 	int key_type = BTRFS_DIR_ITEM_KEY;
3462e02119d5SChris Mason 
3463e02119d5SChris Mason again:
3464e02119d5SChris Mason 	min_key = 0;
3465e02119d5SChris Mason 	max_key = 0;
3466e02119d5SChris Mason 	while (1) {
3467e02119d5SChris Mason 		ret = log_dir_items(trans, root, inode, path,
34682f2ff0eeSFilipe Manana 				    dst_path, key_type, ctx, min_key,
3469e02119d5SChris Mason 				    &max_key);
34704a500fd1SYan, Zheng 		if (ret)
34714a500fd1SYan, Zheng 			return ret;
3472e02119d5SChris Mason 		if (max_key == (u64)-1)
3473e02119d5SChris Mason 			break;
3474e02119d5SChris Mason 		min_key = max_key + 1;
3475e02119d5SChris Mason 	}
3476e02119d5SChris Mason 
3477e02119d5SChris Mason 	if (key_type == BTRFS_DIR_ITEM_KEY) {
3478e02119d5SChris Mason 		key_type = BTRFS_DIR_INDEX_KEY;
3479e02119d5SChris Mason 		goto again;
3480e02119d5SChris Mason 	}
3481e02119d5SChris Mason 	return 0;
3482e02119d5SChris Mason }
3483e02119d5SChris Mason 
3484e02119d5SChris Mason /*
3485e02119d5SChris Mason  * a helper function to drop items from the log before we relog an
3486e02119d5SChris Mason  * inode.  max_key_type indicates the highest item type to remove.
3487e02119d5SChris Mason  * This cannot be run for file data extents because it does not
3488e02119d5SChris Mason  * free the extents they point to.
3489e02119d5SChris Mason  */
3490e02119d5SChris Mason static int drop_objectid_items(struct btrfs_trans_handle *trans,
3491e02119d5SChris Mason 				  struct btrfs_root *log,
3492e02119d5SChris Mason 				  struct btrfs_path *path,
3493e02119d5SChris Mason 				  u64 objectid, int max_key_type)
3494e02119d5SChris Mason {
3495e02119d5SChris Mason 	int ret;
3496e02119d5SChris Mason 	struct btrfs_key key;
3497e02119d5SChris Mason 	struct btrfs_key found_key;
349818ec90d6SJosef Bacik 	int start_slot;
3499e02119d5SChris Mason 
3500e02119d5SChris Mason 	key.objectid = objectid;
3501e02119d5SChris Mason 	key.type = max_key_type;
3502e02119d5SChris Mason 	key.offset = (u64)-1;
3503e02119d5SChris Mason 
3504e02119d5SChris Mason 	while (1) {
3505e02119d5SChris Mason 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
35063650860bSJosef Bacik 		BUG_ON(ret == 0); /* Logic error */
35074a500fd1SYan, Zheng 		if (ret < 0)
3508e02119d5SChris Mason 			break;
3509e02119d5SChris Mason 
3510e02119d5SChris Mason 		if (path->slots[0] == 0)
3511e02119d5SChris Mason 			break;
3512e02119d5SChris Mason 
3513e02119d5SChris Mason 		path->slots[0]--;
3514e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3515e02119d5SChris Mason 				      path->slots[0]);
3516e02119d5SChris Mason 
3517e02119d5SChris Mason 		if (found_key.objectid != objectid)
3518e02119d5SChris Mason 			break;
3519e02119d5SChris Mason 
352018ec90d6SJosef Bacik 		found_key.offset = 0;
352118ec90d6SJosef Bacik 		found_key.type = 0;
352218ec90d6SJosef Bacik 		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
352318ec90d6SJosef Bacik 				       &start_slot);
352418ec90d6SJosef Bacik 
352518ec90d6SJosef Bacik 		ret = btrfs_del_items(trans, log, path, start_slot,
352618ec90d6SJosef Bacik 				      path->slots[0] - start_slot + 1);
352718ec90d6SJosef Bacik 		/*
352818ec90d6SJosef Bacik 		 * If start slot isn't 0 then we don't need to re-search, we've
352918ec90d6SJosef Bacik 		 * found the last guy with the objectid in this tree.
353018ec90d6SJosef Bacik 		 */
353118ec90d6SJosef Bacik 		if (ret || start_slot != 0)
353265a246c5STsutomu Itoh 			break;
3533b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3534e02119d5SChris Mason 	}
3535b3b4aa74SDavid Sterba 	btrfs_release_path(path);
35365bdbeb21SJosef Bacik 	if (ret > 0)
35375bdbeb21SJosef Bacik 		ret = 0;
35384a500fd1SYan, Zheng 	return ret;
3539e02119d5SChris Mason }
3540e02119d5SChris Mason 
354194edf4aeSJosef Bacik static void fill_inode_item(struct btrfs_trans_handle *trans,
354294edf4aeSJosef Bacik 			    struct extent_buffer *leaf,
354394edf4aeSJosef Bacik 			    struct btrfs_inode_item *item,
35441a4bcf47SFilipe Manana 			    struct inode *inode, int log_inode_only,
35451a4bcf47SFilipe Manana 			    u64 logged_isize)
354694edf4aeSJosef Bacik {
35470b1c6ccaSJosef Bacik 	struct btrfs_map_token token;
354894edf4aeSJosef Bacik 
35490b1c6ccaSJosef Bacik 	btrfs_init_map_token(&token);
355094edf4aeSJosef Bacik 
355194edf4aeSJosef Bacik 	if (log_inode_only) {
355294edf4aeSJosef Bacik 		/* set the generation to zero so the recover code
355394edf4aeSJosef Bacik 		 * can tell the difference between an logging
355494edf4aeSJosef Bacik 		 * just to say 'this inode exists' and a logging
355594edf4aeSJosef Bacik 		 * to say 'update this inode with these values'
355694edf4aeSJosef Bacik 		 */
35570b1c6ccaSJosef Bacik 		btrfs_set_token_inode_generation(leaf, item, 0, &token);
35581a4bcf47SFilipe Manana 		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
355994edf4aeSJosef Bacik 	} else {
35600b1c6ccaSJosef Bacik 		btrfs_set_token_inode_generation(leaf, item,
35610b1c6ccaSJosef Bacik 						 BTRFS_I(inode)->generation,
35620b1c6ccaSJosef Bacik 						 &token);
35630b1c6ccaSJosef Bacik 		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
356494edf4aeSJosef Bacik 	}
356594edf4aeSJosef Bacik 
35660b1c6ccaSJosef Bacik 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
35670b1c6ccaSJosef Bacik 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
35680b1c6ccaSJosef Bacik 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
35690b1c6ccaSJosef Bacik 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
35700b1c6ccaSJosef Bacik 
3571a937b979SDavid Sterba 	btrfs_set_token_timespec_sec(leaf, &item->atime,
35720b1c6ccaSJosef Bacik 				     inode->i_atime.tv_sec, &token);
3573a937b979SDavid Sterba 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
35740b1c6ccaSJosef Bacik 				      inode->i_atime.tv_nsec, &token);
35750b1c6ccaSJosef Bacik 
3576a937b979SDavid Sterba 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
35770b1c6ccaSJosef Bacik 				     inode->i_mtime.tv_sec, &token);
3578a937b979SDavid Sterba 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
35790b1c6ccaSJosef Bacik 				      inode->i_mtime.tv_nsec, &token);
35800b1c6ccaSJosef Bacik 
3581a937b979SDavid Sterba 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
35820b1c6ccaSJosef Bacik 				     inode->i_ctime.tv_sec, &token);
3583a937b979SDavid Sterba 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
35840b1c6ccaSJosef Bacik 				      inode->i_ctime.tv_nsec, &token);
35850b1c6ccaSJosef Bacik 
35860b1c6ccaSJosef Bacik 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
35870b1c6ccaSJosef Bacik 				     &token);
35880b1c6ccaSJosef Bacik 
35890b1c6ccaSJosef Bacik 	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
35900b1c6ccaSJosef Bacik 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
35910b1c6ccaSJosef Bacik 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
35920b1c6ccaSJosef Bacik 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
35930b1c6ccaSJosef Bacik 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
359494edf4aeSJosef Bacik }
359594edf4aeSJosef Bacik 
3596a95249b3SJosef Bacik static int log_inode_item(struct btrfs_trans_handle *trans,
3597a95249b3SJosef Bacik 			  struct btrfs_root *log, struct btrfs_path *path,
3598a95249b3SJosef Bacik 			  struct inode *inode)
3599a95249b3SJosef Bacik {
3600a95249b3SJosef Bacik 	struct btrfs_inode_item *inode_item;
3601a95249b3SJosef Bacik 	int ret;
3602a95249b3SJosef Bacik 
3603efd0c405SFilipe David Borba Manana 	ret = btrfs_insert_empty_item(trans, log, path,
3604efd0c405SFilipe David Borba Manana 				      &BTRFS_I(inode)->location,
3605a95249b3SJosef Bacik 				      sizeof(*inode_item));
3606a95249b3SJosef Bacik 	if (ret && ret != -EEXIST)
3607a95249b3SJosef Bacik 		return ret;
3608a95249b3SJosef Bacik 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3609a95249b3SJosef Bacik 				    struct btrfs_inode_item);
36101a4bcf47SFilipe Manana 	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
3611a95249b3SJosef Bacik 	btrfs_release_path(path);
3612a95249b3SJosef Bacik 	return 0;
3613a95249b3SJosef Bacik }
3614a95249b3SJosef Bacik 
361531ff1cd2SChris Mason static noinline int copy_items(struct btrfs_trans_handle *trans,
361644d70e19SNikolay Borisov 			       struct btrfs_inode *inode,
361731ff1cd2SChris Mason 			       struct btrfs_path *dst_path,
361816e7549fSJosef Bacik 			       struct btrfs_path *src_path, u64 *last_extent,
36191a4bcf47SFilipe Manana 			       int start_slot, int nr, int inode_only,
36201a4bcf47SFilipe Manana 			       u64 logged_isize)
362131ff1cd2SChris Mason {
362244d70e19SNikolay Borisov 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
362331ff1cd2SChris Mason 	unsigned long src_offset;
362431ff1cd2SChris Mason 	unsigned long dst_offset;
362544d70e19SNikolay Borisov 	struct btrfs_root *log = inode->root->log_root;
362631ff1cd2SChris Mason 	struct btrfs_file_extent_item *extent;
362731ff1cd2SChris Mason 	struct btrfs_inode_item *inode_item;
362816e7549fSJosef Bacik 	struct extent_buffer *src = src_path->nodes[0];
362916e7549fSJosef Bacik 	struct btrfs_key first_key, last_key, key;
363031ff1cd2SChris Mason 	int ret;
363131ff1cd2SChris Mason 	struct btrfs_key *ins_keys;
363231ff1cd2SChris Mason 	u32 *ins_sizes;
363331ff1cd2SChris Mason 	char *ins_data;
363431ff1cd2SChris Mason 	int i;
3635d20f7043SChris Mason 	struct list_head ordered_sums;
363644d70e19SNikolay Borisov 	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
363716e7549fSJosef Bacik 	bool has_extents = false;
363874121f7cSFilipe Manana 	bool need_find_last_extent = true;
363916e7549fSJosef Bacik 	bool done = false;
3640d20f7043SChris Mason 
3641d20f7043SChris Mason 	INIT_LIST_HEAD(&ordered_sums);
364231ff1cd2SChris Mason 
364331ff1cd2SChris Mason 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
364431ff1cd2SChris Mason 			   nr * sizeof(u32), GFP_NOFS);
36452a29edc6Sliubo 	if (!ins_data)
36462a29edc6Sliubo 		return -ENOMEM;
36472a29edc6Sliubo 
364816e7549fSJosef Bacik 	first_key.objectid = (u64)-1;
364916e7549fSJosef Bacik 
365031ff1cd2SChris Mason 	ins_sizes = (u32 *)ins_data;
365131ff1cd2SChris Mason 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
365231ff1cd2SChris Mason 
365331ff1cd2SChris Mason 	for (i = 0; i < nr; i++) {
365431ff1cd2SChris Mason 		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
365531ff1cd2SChris Mason 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
365631ff1cd2SChris Mason 	}
365731ff1cd2SChris Mason 	ret = btrfs_insert_empty_items(trans, log, dst_path,
365831ff1cd2SChris Mason 				       ins_keys, ins_sizes, nr);
36594a500fd1SYan, Zheng 	if (ret) {
36604a500fd1SYan, Zheng 		kfree(ins_data);
36614a500fd1SYan, Zheng 		return ret;
36624a500fd1SYan, Zheng 	}
366331ff1cd2SChris Mason 
36645d4f98a2SYan Zheng 	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
366531ff1cd2SChris Mason 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
366631ff1cd2SChris Mason 						   dst_path->slots[0]);
366731ff1cd2SChris Mason 
366831ff1cd2SChris Mason 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
366931ff1cd2SChris Mason 
367016e7549fSJosef Bacik 		if ((i == (nr - 1)))
367116e7549fSJosef Bacik 			last_key = ins_keys[i];
367216e7549fSJosef Bacik 
367394edf4aeSJosef Bacik 		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
367431ff1cd2SChris Mason 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
367531ff1cd2SChris Mason 						    dst_path->slots[0],
367631ff1cd2SChris Mason 						    struct btrfs_inode_item);
367794edf4aeSJosef Bacik 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
367844d70e19SNikolay Borisov 					&inode->vfs_inode, inode_only == LOG_INODE_EXISTS,
36791a4bcf47SFilipe Manana 					logged_isize);
368094edf4aeSJosef Bacik 		} else {
368194edf4aeSJosef Bacik 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
368294edf4aeSJosef Bacik 					   src_offset, ins_sizes[i]);
368331ff1cd2SChris Mason 		}
368494edf4aeSJosef Bacik 
368516e7549fSJosef Bacik 		/*
368616e7549fSJosef Bacik 		 * We set need_find_last_extent here in case we know we were
368716e7549fSJosef Bacik 		 * processing other items and then walk into the first extent in
368816e7549fSJosef Bacik 		 * the inode.  If we don't hit an extent then nothing changes,
368916e7549fSJosef Bacik 		 * we'll do the last search the next time around.
369016e7549fSJosef Bacik 		 */
369116e7549fSJosef Bacik 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
369216e7549fSJosef Bacik 			has_extents = true;
369374121f7cSFilipe Manana 			if (first_key.objectid == (u64)-1)
369416e7549fSJosef Bacik 				first_key = ins_keys[i];
369516e7549fSJosef Bacik 		} else {
369616e7549fSJosef Bacik 			need_find_last_extent = false;
369716e7549fSJosef Bacik 		}
369816e7549fSJosef Bacik 
369931ff1cd2SChris Mason 		/* take a reference on file data extents so that truncates
370031ff1cd2SChris Mason 		 * or deletes of this inode don't have to relog the inode
370131ff1cd2SChris Mason 		 * again
370231ff1cd2SChris Mason 		 */
3703962a298fSDavid Sterba 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3704d2794405SLiu Bo 		    !skip_csum) {
370531ff1cd2SChris Mason 			int found_type;
370631ff1cd2SChris Mason 			extent = btrfs_item_ptr(src, start_slot + i,
370731ff1cd2SChris Mason 						struct btrfs_file_extent_item);
370831ff1cd2SChris Mason 
37098e531cdfSliubo 			if (btrfs_file_extent_generation(src, extent) < trans->transid)
37108e531cdfSliubo 				continue;
37118e531cdfSliubo 
371231ff1cd2SChris Mason 			found_type = btrfs_file_extent_type(src, extent);
37136f1fed77SJosef Bacik 			if (found_type == BTRFS_FILE_EXTENT_REG) {
37145d4f98a2SYan Zheng 				u64 ds, dl, cs, cl;
37155d4f98a2SYan Zheng 				ds = btrfs_file_extent_disk_bytenr(src,
371631ff1cd2SChris Mason 								extent);
37175d4f98a2SYan Zheng 				/* ds == 0 is a hole */
37185d4f98a2SYan Zheng 				if (ds == 0)
37195d4f98a2SYan Zheng 					continue;
37205d4f98a2SYan Zheng 
37215d4f98a2SYan Zheng 				dl = btrfs_file_extent_disk_num_bytes(src,
372231ff1cd2SChris Mason 								extent);
37235d4f98a2SYan Zheng 				cs = btrfs_file_extent_offset(src, extent);
37245d4f98a2SYan Zheng 				cl = btrfs_file_extent_num_bytes(src,
3725a419aef8SJoe Perches 								extent);
3726580afd76SChris Mason 				if (btrfs_file_extent_compression(src,
3727580afd76SChris Mason 								  extent)) {
3728580afd76SChris Mason 					cs = 0;
3729580afd76SChris Mason 					cl = dl;
3730580afd76SChris Mason 				}
37315d4f98a2SYan Zheng 
373207d400a6SYan Zheng 				ret = btrfs_lookup_csums_range(
37330b246afaSJeff Mahoney 						fs_info->csum_root,
373407d400a6SYan Zheng 						ds + cs, ds + cs + cl - 1,
3735a2de733cSArne Jansen 						&ordered_sums, 0);
37363650860bSJosef Bacik 				if (ret) {
37373650860bSJosef Bacik 					btrfs_release_path(dst_path);
37383650860bSJosef Bacik 					kfree(ins_data);
37393650860bSJosef Bacik 					return ret;
37403650860bSJosef Bacik 				}
374131ff1cd2SChris Mason 			}
374231ff1cd2SChris Mason 		}
374331ff1cd2SChris Mason 	}
374431ff1cd2SChris Mason 
374531ff1cd2SChris Mason 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3746b3b4aa74SDavid Sterba 	btrfs_release_path(dst_path);
374731ff1cd2SChris Mason 	kfree(ins_data);
3748d20f7043SChris Mason 
3749d20f7043SChris Mason 	/*
3750d20f7043SChris Mason 	 * we have to do this after the loop above to avoid changing the
3751d20f7043SChris Mason 	 * log tree while trying to change the log tree.
3752d20f7043SChris Mason 	 */
37534a500fd1SYan, Zheng 	ret = 0;
3754d20f7043SChris Mason 	while (!list_empty(&ordered_sums)) {
3755d20f7043SChris Mason 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3756d20f7043SChris Mason 						   struct btrfs_ordered_sum,
3757d20f7043SChris Mason 						   list);
37584a500fd1SYan, Zheng 		if (!ret)
3759d20f7043SChris Mason 			ret = btrfs_csum_file_blocks(trans, log, sums);
3760d20f7043SChris Mason 		list_del(&sums->list);
3761d20f7043SChris Mason 		kfree(sums);
3762d20f7043SChris Mason 	}
376316e7549fSJosef Bacik 
376416e7549fSJosef Bacik 	if (!has_extents)
376516e7549fSJosef Bacik 		return ret;
376616e7549fSJosef Bacik 
376774121f7cSFilipe Manana 	if (need_find_last_extent && *last_extent == first_key.offset) {
376874121f7cSFilipe Manana 		/*
376974121f7cSFilipe Manana 		 * We don't have any leafs between our current one and the one
377074121f7cSFilipe Manana 		 * we processed before that can have file extent items for our
377174121f7cSFilipe Manana 		 * inode (and have a generation number smaller than our current
377274121f7cSFilipe Manana 		 * transaction id).
377374121f7cSFilipe Manana 		 */
377474121f7cSFilipe Manana 		need_find_last_extent = false;
377574121f7cSFilipe Manana 	}
377674121f7cSFilipe Manana 
377716e7549fSJosef Bacik 	/*
377816e7549fSJosef Bacik 	 * Because we use btrfs_search_forward we could skip leaves that were
377916e7549fSJosef Bacik 	 * not modified and then assume *last_extent is valid when it really
378016e7549fSJosef Bacik 	 * isn't.  So back up to the previous leaf and read the end of the last
378116e7549fSJosef Bacik 	 * extent before we go and fill in holes.
378216e7549fSJosef Bacik 	 */
378316e7549fSJosef Bacik 	if (need_find_last_extent) {
378416e7549fSJosef Bacik 		u64 len;
378516e7549fSJosef Bacik 
378644d70e19SNikolay Borisov 		ret = btrfs_prev_leaf(inode->root, src_path);
378716e7549fSJosef Bacik 		if (ret < 0)
378816e7549fSJosef Bacik 			return ret;
378916e7549fSJosef Bacik 		if (ret)
379016e7549fSJosef Bacik 			goto fill_holes;
379116e7549fSJosef Bacik 		if (src_path->slots[0])
379216e7549fSJosef Bacik 			src_path->slots[0]--;
379316e7549fSJosef Bacik 		src = src_path->nodes[0];
379416e7549fSJosef Bacik 		btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
379544d70e19SNikolay Borisov 		if (key.objectid != btrfs_ino(inode) ||
379616e7549fSJosef Bacik 		    key.type != BTRFS_EXTENT_DATA_KEY)
379716e7549fSJosef Bacik 			goto fill_holes;
379816e7549fSJosef Bacik 		extent = btrfs_item_ptr(src, src_path->slots[0],
379916e7549fSJosef Bacik 					struct btrfs_file_extent_item);
380016e7549fSJosef Bacik 		if (btrfs_file_extent_type(src, extent) ==
380116e7549fSJosef Bacik 		    BTRFS_FILE_EXTENT_INLINE) {
3802514ac8adSChris Mason 			len = btrfs_file_extent_inline_len(src,
3803514ac8adSChris Mason 							   src_path->slots[0],
3804514ac8adSChris Mason 							   extent);
380516e7549fSJosef Bacik 			*last_extent = ALIGN(key.offset + len,
38060b246afaSJeff Mahoney 					     fs_info->sectorsize);
380716e7549fSJosef Bacik 		} else {
380816e7549fSJosef Bacik 			len = btrfs_file_extent_num_bytes(src, extent);
380916e7549fSJosef Bacik 			*last_extent = key.offset + len;
381016e7549fSJosef Bacik 		}
381116e7549fSJosef Bacik 	}
381216e7549fSJosef Bacik fill_holes:
381316e7549fSJosef Bacik 	/* So we did prev_leaf, now we need to move to the next leaf, but a few
381416e7549fSJosef Bacik 	 * things could have happened
381516e7549fSJosef Bacik 	 *
381616e7549fSJosef Bacik 	 * 1) A merge could have happened, so we could currently be on a leaf
381716e7549fSJosef Bacik 	 * that holds what we were copying in the first place.
381816e7549fSJosef Bacik 	 * 2) A split could have happened, and now not all of the items we want
381916e7549fSJosef Bacik 	 * are on the same leaf.
382016e7549fSJosef Bacik 	 *
382116e7549fSJosef Bacik 	 * So we need to adjust how we search for holes, we need to drop the
382216e7549fSJosef Bacik 	 * path and re-search for the first extent key we found, and then walk
382316e7549fSJosef Bacik 	 * forward until we hit the last one we copied.
382416e7549fSJosef Bacik 	 */
382516e7549fSJosef Bacik 	if (need_find_last_extent) {
382616e7549fSJosef Bacik 		/* btrfs_prev_leaf could return 1 without releasing the path */
382716e7549fSJosef Bacik 		btrfs_release_path(src_path);
382844d70e19SNikolay Borisov 		ret = btrfs_search_slot(NULL, inode->root, &first_key, src_path, 0, 0);
382916e7549fSJosef Bacik 		if (ret < 0)
383016e7549fSJosef Bacik 			return ret;
383116e7549fSJosef Bacik 		ASSERT(ret == 0);
383216e7549fSJosef Bacik 		src = src_path->nodes[0];
383316e7549fSJosef Bacik 		i = src_path->slots[0];
383416e7549fSJosef Bacik 	} else {
383516e7549fSJosef Bacik 		i = start_slot;
383616e7549fSJosef Bacik 	}
383716e7549fSJosef Bacik 
383816e7549fSJosef Bacik 	/*
383916e7549fSJosef Bacik 	 * Ok so here we need to go through and fill in any holes we may have
384016e7549fSJosef Bacik 	 * to make sure that holes are punched for those areas in case they had
384116e7549fSJosef Bacik 	 * extents previously.
384216e7549fSJosef Bacik 	 */
384316e7549fSJosef Bacik 	while (!done) {
384416e7549fSJosef Bacik 		u64 offset, len;
384516e7549fSJosef Bacik 		u64 extent_end;
384616e7549fSJosef Bacik 
384716e7549fSJosef Bacik 		if (i >= btrfs_header_nritems(src_path->nodes[0])) {
384844d70e19SNikolay Borisov 			ret = btrfs_next_leaf(inode->root, src_path);
384916e7549fSJosef Bacik 			if (ret < 0)
385016e7549fSJosef Bacik 				return ret;
385116e7549fSJosef Bacik 			ASSERT(ret == 0);
385216e7549fSJosef Bacik 			src = src_path->nodes[0];
385316e7549fSJosef Bacik 			i = 0;
385416e7549fSJosef Bacik 		}
385516e7549fSJosef Bacik 
385616e7549fSJosef Bacik 		btrfs_item_key_to_cpu(src, &key, i);
385716e7549fSJosef Bacik 		if (!btrfs_comp_cpu_keys(&key, &last_key))
385816e7549fSJosef Bacik 			done = true;
385944d70e19SNikolay Borisov 		if (key.objectid != btrfs_ino(inode) ||
386016e7549fSJosef Bacik 		    key.type != BTRFS_EXTENT_DATA_KEY) {
386116e7549fSJosef Bacik 			i++;
386216e7549fSJosef Bacik 			continue;
386316e7549fSJosef Bacik 		}
386416e7549fSJosef Bacik 		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
386516e7549fSJosef Bacik 		if (btrfs_file_extent_type(src, extent) ==
386616e7549fSJosef Bacik 		    BTRFS_FILE_EXTENT_INLINE) {
3867514ac8adSChris Mason 			len = btrfs_file_extent_inline_len(src, i, extent);
3868da17066cSJeff Mahoney 			extent_end = ALIGN(key.offset + len,
38690b246afaSJeff Mahoney 					   fs_info->sectorsize);
387016e7549fSJosef Bacik 		} else {
387116e7549fSJosef Bacik 			len = btrfs_file_extent_num_bytes(src, extent);
387216e7549fSJosef Bacik 			extent_end = key.offset + len;
387316e7549fSJosef Bacik 		}
387416e7549fSJosef Bacik 		i++;
387516e7549fSJosef Bacik 
387616e7549fSJosef Bacik 		if (*last_extent == key.offset) {
387716e7549fSJosef Bacik 			*last_extent = extent_end;
387816e7549fSJosef Bacik 			continue;
387916e7549fSJosef Bacik 		}
388016e7549fSJosef Bacik 		offset = *last_extent;
388116e7549fSJosef Bacik 		len = key.offset - *last_extent;
388244d70e19SNikolay Borisov 		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
388344d70e19SNikolay Borisov 					       offset, 0, 0, len, 0, len, 0, 0, 0);
388416e7549fSJosef Bacik 		if (ret)
388516e7549fSJosef Bacik 			break;
388674121f7cSFilipe Manana 		*last_extent = extent_end;
388716e7549fSJosef Bacik 	}
388816e7549fSJosef Bacik 	/*
388916e7549fSJosef Bacik 	 * Need to let the callers know we dropped the path so they should
389016e7549fSJosef Bacik 	 * re-search.
389116e7549fSJosef Bacik 	 */
389216e7549fSJosef Bacik 	if (!ret && need_find_last_extent)
389316e7549fSJosef Bacik 		ret = 1;
38944a500fd1SYan, Zheng 	return ret;
389531ff1cd2SChris Mason }
389631ff1cd2SChris Mason 
38975dc562c5SJosef Bacik static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
38985dc562c5SJosef Bacik {
38995dc562c5SJosef Bacik 	struct extent_map *em1, *em2;
39005dc562c5SJosef Bacik 
39015dc562c5SJosef Bacik 	em1 = list_entry(a, struct extent_map, list);
39025dc562c5SJosef Bacik 	em2 = list_entry(b, struct extent_map, list);
39035dc562c5SJosef Bacik 
39045dc562c5SJosef Bacik 	if (em1->start < em2->start)
39055dc562c5SJosef Bacik 		return -1;
39065dc562c5SJosef Bacik 	else if (em1->start > em2->start)
39075dc562c5SJosef Bacik 		return 1;
39085dc562c5SJosef Bacik 	return 0;
39095dc562c5SJosef Bacik }
39105dc562c5SJosef Bacik 
39118407f553SFilipe Manana static int wait_ordered_extents(struct btrfs_trans_handle *trans,
39128407f553SFilipe Manana 				struct inode *inode,
39138407f553SFilipe Manana 				struct btrfs_root *root,
39148407f553SFilipe Manana 				const struct extent_map *em,
39158407f553SFilipe Manana 				const struct list_head *logged_list,
39168407f553SFilipe Manana 				bool *ordered_io_error)
39175dc562c5SJosef Bacik {
39180b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
39192ab28f32SJosef Bacik 	struct btrfs_ordered_extent *ordered;
39208407f553SFilipe Manana 	struct btrfs_root *log = root->log_root;
39212ab28f32SJosef Bacik 	u64 mod_start = em->mod_start;
39222ab28f32SJosef Bacik 	u64 mod_len = em->mod_len;
39238407f553SFilipe Manana 	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
39242ab28f32SJosef Bacik 	u64 csum_offset;
39252ab28f32SJosef Bacik 	u64 csum_len;
39268407f553SFilipe Manana 	LIST_HEAD(ordered_sums);
39278407f553SFilipe Manana 	int ret = 0;
392809a2a8f9SJosef Bacik 
39298407f553SFilipe Manana 	*ordered_io_error = false;
39301acae57bSFilipe David Borba Manana 
39318407f553SFilipe Manana 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
39328407f553SFilipe Manana 	    em->block_start == EXTENT_MAP_HOLE)
393370c8a91cSJosef Bacik 		return 0;
393470c8a91cSJosef Bacik 
39352ab28f32SJosef Bacik 	/*
39368407f553SFilipe Manana 	 * Wait far any ordered extent that covers our extent map. If it
39378407f553SFilipe Manana 	 * finishes without an error, first check and see if our csums are on
39388407f553SFilipe Manana 	 * our outstanding ordered extents.
39392ab28f32SJosef Bacik 	 */
3940827463c4SMiao Xie 	list_for_each_entry(ordered, logged_list, log_list) {
39412ab28f32SJosef Bacik 		struct btrfs_ordered_sum *sum;
39422ab28f32SJosef Bacik 
39432ab28f32SJosef Bacik 		if (!mod_len)
39442ab28f32SJosef Bacik 			break;
39452ab28f32SJosef Bacik 
39462ab28f32SJosef Bacik 		if (ordered->file_offset + ordered->len <= mod_start ||
39472ab28f32SJosef Bacik 		    mod_start + mod_len <= ordered->file_offset)
39482ab28f32SJosef Bacik 			continue;
39492ab28f32SJosef Bacik 
39508407f553SFilipe Manana 		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
39518407f553SFilipe Manana 		    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
39528407f553SFilipe Manana 		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
39538407f553SFilipe Manana 			const u64 start = ordered->file_offset;
39548407f553SFilipe Manana 			const u64 end = ordered->file_offset + ordered->len - 1;
39558407f553SFilipe Manana 
39568407f553SFilipe Manana 			WARN_ON(ordered->inode != inode);
39578407f553SFilipe Manana 			filemap_fdatawrite_range(inode->i_mapping, start, end);
39588407f553SFilipe Manana 		}
39598407f553SFilipe Manana 
39608407f553SFilipe Manana 		wait_event(ordered->wait,
39618407f553SFilipe Manana 			   (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
39628407f553SFilipe Manana 			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
39638407f553SFilipe Manana 
39648407f553SFilipe Manana 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3965b38ef71cSFilipe Manana 			/*
3966b38ef71cSFilipe Manana 			 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3967b38ef71cSFilipe Manana 			 * i_mapping flags, so that the next fsync won't get
3968b38ef71cSFilipe Manana 			 * an outdated io error too.
3969b38ef71cSFilipe Manana 			 */
3970f0312210SMiklos Szeredi 			filemap_check_errors(inode->i_mapping);
39718407f553SFilipe Manana 			*ordered_io_error = true;
39728407f553SFilipe Manana 			break;
39738407f553SFilipe Manana 		}
39742ab28f32SJosef Bacik 		/*
39752ab28f32SJosef Bacik 		 * We are going to copy all the csums on this ordered extent, so
39762ab28f32SJosef Bacik 		 * go ahead and adjust mod_start and mod_len in case this
39772ab28f32SJosef Bacik 		 * ordered extent has already been logged.
39782ab28f32SJosef Bacik 		 */
39792ab28f32SJosef Bacik 		if (ordered->file_offset > mod_start) {
39802ab28f32SJosef Bacik 			if (ordered->file_offset + ordered->len >=
39812ab28f32SJosef Bacik 			    mod_start + mod_len)
39822ab28f32SJosef Bacik 				mod_len = ordered->file_offset - mod_start;
39832ab28f32SJosef Bacik 			/*
39842ab28f32SJosef Bacik 			 * If we have this case
39852ab28f32SJosef Bacik 			 *
39862ab28f32SJosef Bacik 			 * |--------- logged extent ---------|
39872ab28f32SJosef Bacik 			 *       |----- ordered extent ----|
39882ab28f32SJosef Bacik 			 *
39892ab28f32SJosef Bacik 			 * Just don't mess with mod_start and mod_len, we'll
39902ab28f32SJosef Bacik 			 * just end up logging more csums than we need and it
39912ab28f32SJosef Bacik 			 * will be ok.
39922ab28f32SJosef Bacik 			 */
39932ab28f32SJosef Bacik 		} else {
39942ab28f32SJosef Bacik 			if (ordered->file_offset + ordered->len <
39952ab28f32SJosef Bacik 			    mod_start + mod_len) {
39962ab28f32SJosef Bacik 				mod_len = (mod_start + mod_len) -
39972ab28f32SJosef Bacik 					(ordered->file_offset + ordered->len);
39982ab28f32SJosef Bacik 				mod_start = ordered->file_offset +
39992ab28f32SJosef Bacik 					ordered->len;
40002ab28f32SJosef Bacik 			} else {
40012ab28f32SJosef Bacik 				mod_len = 0;
40022ab28f32SJosef Bacik 			}
40032ab28f32SJosef Bacik 		}
40042ab28f32SJosef Bacik 
40058407f553SFilipe Manana 		if (skip_csum)
40068407f553SFilipe Manana 			continue;
40078407f553SFilipe Manana 
40082ab28f32SJosef Bacik 		/*
40092ab28f32SJosef Bacik 		 * To keep us from looping for the above case of an ordered
40102ab28f32SJosef Bacik 		 * extent that falls inside of the logged extent.
40112ab28f32SJosef Bacik 		 */
40122ab28f32SJosef Bacik 		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
40132ab28f32SJosef Bacik 				     &ordered->flags))
40142ab28f32SJosef Bacik 			continue;
40152ab28f32SJosef Bacik 
40162ab28f32SJosef Bacik 		list_for_each_entry(sum, &ordered->list, list) {
40172ab28f32SJosef Bacik 			ret = btrfs_csum_file_blocks(trans, log, sum);
4018827463c4SMiao Xie 			if (ret)
40198407f553SFilipe Manana 				break;
40208407f553SFilipe Manana 		}
40212ab28f32SJosef Bacik 	}
40222ab28f32SJosef Bacik 
40238407f553SFilipe Manana 	if (*ordered_io_error || !mod_len || ret || skip_csum)
40242ab28f32SJosef Bacik 		return ret;
40252ab28f32SJosef Bacik 
4026488111aaSFilipe David Borba Manana 	if (em->compress_type) {
4027488111aaSFilipe David Borba Manana 		csum_offset = 0;
40288407f553SFilipe Manana 		csum_len = max(em->block_len, em->orig_block_len);
4029488111aaSFilipe David Borba Manana 	} else {
40302ab28f32SJosef Bacik 		csum_offset = mod_start - em->start;
40312ab28f32SJosef Bacik 		csum_len = mod_len;
4032488111aaSFilipe David Borba Manana 	}
40332ab28f32SJosef Bacik 
403470c8a91cSJosef Bacik 	/* block start is already adjusted for the file extent offset. */
40350b246afaSJeff Mahoney 	ret = btrfs_lookup_csums_range(fs_info->csum_root,
403670c8a91cSJosef Bacik 				       em->block_start + csum_offset,
403770c8a91cSJosef Bacik 				       em->block_start + csum_offset +
403870c8a91cSJosef Bacik 				       csum_len - 1, &ordered_sums, 0);
40395dc562c5SJosef Bacik 	if (ret)
40405dc562c5SJosef Bacik 		return ret;
404170c8a91cSJosef Bacik 
404270c8a91cSJosef Bacik 	while (!list_empty(&ordered_sums)) {
404370c8a91cSJosef Bacik 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
404470c8a91cSJosef Bacik 						   struct btrfs_ordered_sum,
404570c8a91cSJosef Bacik 						   list);
404670c8a91cSJosef Bacik 		if (!ret)
404770c8a91cSJosef Bacik 			ret = btrfs_csum_file_blocks(trans, log, sums);
404870c8a91cSJosef Bacik 		list_del(&sums->list);
404970c8a91cSJosef Bacik 		kfree(sums);
40505dc562c5SJosef Bacik 	}
40515dc562c5SJosef Bacik 
405270c8a91cSJosef Bacik 	return ret;
40535dc562c5SJosef Bacik }
40545dc562c5SJosef Bacik 
40558407f553SFilipe Manana static int log_one_extent(struct btrfs_trans_handle *trans,
40568407f553SFilipe Manana 			  struct inode *inode, struct btrfs_root *root,
40578407f553SFilipe Manana 			  const struct extent_map *em,
40588407f553SFilipe Manana 			  struct btrfs_path *path,
40598407f553SFilipe Manana 			  const struct list_head *logged_list,
40608407f553SFilipe Manana 			  struct btrfs_log_ctx *ctx)
40618407f553SFilipe Manana {
40628407f553SFilipe Manana 	struct btrfs_root *log = root->log_root;
40638407f553SFilipe Manana 	struct btrfs_file_extent_item *fi;
40648407f553SFilipe Manana 	struct extent_buffer *leaf;
40658407f553SFilipe Manana 	struct btrfs_map_token token;
40668407f553SFilipe Manana 	struct btrfs_key key;
40678407f553SFilipe Manana 	u64 extent_offset = em->start - em->orig_start;
40688407f553SFilipe Manana 	u64 block_len;
40698407f553SFilipe Manana 	int ret;
40708407f553SFilipe Manana 	int extent_inserted = 0;
40718407f553SFilipe Manana 	bool ordered_io_err = false;
40728407f553SFilipe Manana 
40738407f553SFilipe Manana 	ret = wait_ordered_extents(trans, inode, root, em, logged_list,
40748407f553SFilipe Manana 				   &ordered_io_err);
40758407f553SFilipe Manana 	if (ret)
40768407f553SFilipe Manana 		return ret;
40778407f553SFilipe Manana 
40788407f553SFilipe Manana 	if (ordered_io_err) {
40798407f553SFilipe Manana 		ctx->io_err = -EIO;
40808407f553SFilipe Manana 		return 0;
40818407f553SFilipe Manana 	}
40828407f553SFilipe Manana 
40838407f553SFilipe Manana 	btrfs_init_map_token(&token);
40848407f553SFilipe Manana 
40858407f553SFilipe Manana 	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
40868407f553SFilipe Manana 				   em->start + em->len, NULL, 0, 1,
40878407f553SFilipe Manana 				   sizeof(*fi), &extent_inserted);
40888407f553SFilipe Manana 	if (ret)
40898407f553SFilipe Manana 		return ret;
40908407f553SFilipe Manana 
40918407f553SFilipe Manana 	if (!extent_inserted) {
40924a0cc7caSNikolay Borisov 		key.objectid = btrfs_ino(BTRFS_I(inode));
40938407f553SFilipe Manana 		key.type = BTRFS_EXTENT_DATA_KEY;
40948407f553SFilipe Manana 		key.offset = em->start;
40958407f553SFilipe Manana 
40968407f553SFilipe Manana 		ret = btrfs_insert_empty_item(trans, log, path, &key,
40978407f553SFilipe Manana 					      sizeof(*fi));
40988407f553SFilipe Manana 		if (ret)
40998407f553SFilipe Manana 			return ret;
41008407f553SFilipe Manana 	}
41018407f553SFilipe Manana 	leaf = path->nodes[0];
41028407f553SFilipe Manana 	fi = btrfs_item_ptr(leaf, path->slots[0],
41038407f553SFilipe Manana 			    struct btrfs_file_extent_item);
41048407f553SFilipe Manana 
410550d9aa99SJosef Bacik 	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
41068407f553SFilipe Manana 					       &token);
41078407f553SFilipe Manana 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
41088407f553SFilipe Manana 		btrfs_set_token_file_extent_type(leaf, fi,
41098407f553SFilipe Manana 						 BTRFS_FILE_EXTENT_PREALLOC,
41108407f553SFilipe Manana 						 &token);
41118407f553SFilipe Manana 	else
41128407f553SFilipe Manana 		btrfs_set_token_file_extent_type(leaf, fi,
41138407f553SFilipe Manana 						 BTRFS_FILE_EXTENT_REG,
41148407f553SFilipe Manana 						 &token);
41158407f553SFilipe Manana 
41168407f553SFilipe Manana 	block_len = max(em->block_len, em->orig_block_len);
41178407f553SFilipe Manana 	if (em->compress_type != BTRFS_COMPRESS_NONE) {
41188407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
41198407f553SFilipe Manana 							em->block_start,
41208407f553SFilipe Manana 							&token);
41218407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
41228407f553SFilipe Manana 							   &token);
41238407f553SFilipe Manana 	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
41248407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
41258407f553SFilipe Manana 							em->block_start -
41268407f553SFilipe Manana 							extent_offset, &token);
41278407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
41288407f553SFilipe Manana 							   &token);
41298407f553SFilipe Manana 	} else {
41308407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
41318407f553SFilipe Manana 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
41328407f553SFilipe Manana 							   &token);
41338407f553SFilipe Manana 	}
41348407f553SFilipe Manana 
41358407f553SFilipe Manana 	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
41368407f553SFilipe Manana 	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
41378407f553SFilipe Manana 	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
41388407f553SFilipe Manana 	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
41398407f553SFilipe Manana 						&token);
41408407f553SFilipe Manana 	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
41418407f553SFilipe Manana 	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
41428407f553SFilipe Manana 	btrfs_mark_buffer_dirty(leaf);
41438407f553SFilipe Manana 
41448407f553SFilipe Manana 	btrfs_release_path(path);
41458407f553SFilipe Manana 
41468407f553SFilipe Manana 	return ret;
41478407f553SFilipe Manana }
41488407f553SFilipe Manana 
41495dc562c5SJosef Bacik static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
41505dc562c5SJosef Bacik 				     struct btrfs_root *root,
41515dc562c5SJosef Bacik 				     struct inode *inode,
4152827463c4SMiao Xie 				     struct btrfs_path *path,
41538407f553SFilipe Manana 				     struct list_head *logged_list,
4154de0ee0edSFilipe Manana 				     struct btrfs_log_ctx *ctx,
4155de0ee0edSFilipe Manana 				     const u64 start,
4156de0ee0edSFilipe Manana 				     const u64 end)
41575dc562c5SJosef Bacik {
41585dc562c5SJosef Bacik 	struct extent_map *em, *n;
41595dc562c5SJosef Bacik 	struct list_head extents;
41605dc562c5SJosef Bacik 	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
41615dc562c5SJosef Bacik 	u64 test_gen;
41625dc562c5SJosef Bacik 	int ret = 0;
41632ab28f32SJosef Bacik 	int num = 0;
41645dc562c5SJosef Bacik 
41655dc562c5SJosef Bacik 	INIT_LIST_HEAD(&extents);
41665dc562c5SJosef Bacik 
41675f9a8a51SFilipe Manana 	down_write(&BTRFS_I(inode)->dio_sem);
41685dc562c5SJosef Bacik 	write_lock(&tree->lock);
41695dc562c5SJosef Bacik 	test_gen = root->fs_info->last_trans_committed;
41705dc562c5SJosef Bacik 
41715dc562c5SJosef Bacik 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
41725dc562c5SJosef Bacik 		list_del_init(&em->list);
41732ab28f32SJosef Bacik 
41742ab28f32SJosef Bacik 		/*
41752ab28f32SJosef Bacik 		 * Just an arbitrary number, this can be really CPU intensive
41762ab28f32SJosef Bacik 		 * once we start getting a lot of extents, and really once we
41772ab28f32SJosef Bacik 		 * have a bunch of extents we just want to commit since it will
41782ab28f32SJosef Bacik 		 * be faster.
41792ab28f32SJosef Bacik 		 */
41802ab28f32SJosef Bacik 		if (++num > 32768) {
41812ab28f32SJosef Bacik 			list_del_init(&tree->modified_extents);
41822ab28f32SJosef Bacik 			ret = -EFBIG;
41832ab28f32SJosef Bacik 			goto process;
41842ab28f32SJosef Bacik 		}
41852ab28f32SJosef Bacik 
41865dc562c5SJosef Bacik 		if (em->generation <= test_gen)
41875dc562c5SJosef Bacik 			continue;
4188ff44c6e3SJosef Bacik 		/* Need a ref to keep it from getting evicted from cache */
4189ff44c6e3SJosef Bacik 		atomic_inc(&em->refs);
4190ff44c6e3SJosef Bacik 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
41915dc562c5SJosef Bacik 		list_add_tail(&em->list, &extents);
41922ab28f32SJosef Bacik 		num++;
41935dc562c5SJosef Bacik 	}
41945dc562c5SJosef Bacik 
41955dc562c5SJosef Bacik 	list_sort(NULL, &extents, extent_cmp);
4196de0ee0edSFilipe Manana 	btrfs_get_logged_extents(inode, logged_list, start, end);
41975f9a8a51SFilipe Manana 	/*
41985f9a8a51SFilipe Manana 	 * Some ordered extents started by fsync might have completed
41995f9a8a51SFilipe Manana 	 * before we could collect them into the list logged_list, which
42005f9a8a51SFilipe Manana 	 * means they're gone, not in our logged_list nor in the inode's
42015f9a8a51SFilipe Manana 	 * ordered tree. We want the application/user space to know an
42025f9a8a51SFilipe Manana 	 * error happened while attempting to persist file data so that
42035f9a8a51SFilipe Manana 	 * it can take proper action. If such error happened, we leave
42045f9a8a51SFilipe Manana 	 * without writing to the log tree and the fsync must report the
42055f9a8a51SFilipe Manana 	 * file data write error and not commit the current transaction.
42065f9a8a51SFilipe Manana 	 */
4207f0312210SMiklos Szeredi 	ret = filemap_check_errors(inode->i_mapping);
42085f9a8a51SFilipe Manana 	if (ret)
42095f9a8a51SFilipe Manana 		ctx->io_err = ret;
42102ab28f32SJosef Bacik process:
42115dc562c5SJosef Bacik 	while (!list_empty(&extents)) {
42125dc562c5SJosef Bacik 		em = list_entry(extents.next, struct extent_map, list);
42135dc562c5SJosef Bacik 
42145dc562c5SJosef Bacik 		list_del_init(&em->list);
42155dc562c5SJosef Bacik 
42165dc562c5SJosef Bacik 		/*
42175dc562c5SJosef Bacik 		 * If we had an error we just need to delete everybody from our
42185dc562c5SJosef Bacik 		 * private list.
42195dc562c5SJosef Bacik 		 */
4220ff44c6e3SJosef Bacik 		if (ret) {
4221201a9038SJosef Bacik 			clear_em_logging(tree, em);
4222ff44c6e3SJosef Bacik 			free_extent_map(em);
42235dc562c5SJosef Bacik 			continue;
4224ff44c6e3SJosef Bacik 		}
4225ff44c6e3SJosef Bacik 
4226ff44c6e3SJosef Bacik 		write_unlock(&tree->lock);
42275dc562c5SJosef Bacik 
42288407f553SFilipe Manana 		ret = log_one_extent(trans, inode, root, em, path, logged_list,
42298407f553SFilipe Manana 				     ctx);
4230ff44c6e3SJosef Bacik 		write_lock(&tree->lock);
4231201a9038SJosef Bacik 		clear_em_logging(tree, em);
4232201a9038SJosef Bacik 		free_extent_map(em);
42335dc562c5SJosef Bacik 	}
4234ff44c6e3SJosef Bacik 	WARN_ON(!list_empty(&extents));
4235ff44c6e3SJosef Bacik 	write_unlock(&tree->lock);
42365f9a8a51SFilipe Manana 	up_write(&BTRFS_I(inode)->dio_sem);
42375dc562c5SJosef Bacik 
42385dc562c5SJosef Bacik 	btrfs_release_path(path);
42395dc562c5SJosef Bacik 	return ret;
42405dc562c5SJosef Bacik }
42415dc562c5SJosef Bacik 
4242481b01c0SNikolay Borisov static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
42431a4bcf47SFilipe Manana 			     struct btrfs_path *path, u64 *size_ret)
42441a4bcf47SFilipe Manana {
42451a4bcf47SFilipe Manana 	struct btrfs_key key;
42461a4bcf47SFilipe Manana 	int ret;
42471a4bcf47SFilipe Manana 
4248481b01c0SNikolay Borisov 	key.objectid = btrfs_ino(inode);
42491a4bcf47SFilipe Manana 	key.type = BTRFS_INODE_ITEM_KEY;
42501a4bcf47SFilipe Manana 	key.offset = 0;
42511a4bcf47SFilipe Manana 
42521a4bcf47SFilipe Manana 	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
42531a4bcf47SFilipe Manana 	if (ret < 0) {
42541a4bcf47SFilipe Manana 		return ret;
42551a4bcf47SFilipe Manana 	} else if (ret > 0) {
42562f2ff0eeSFilipe Manana 		*size_ret = 0;
42571a4bcf47SFilipe Manana 	} else {
42581a4bcf47SFilipe Manana 		struct btrfs_inode_item *item;
42591a4bcf47SFilipe Manana 
42601a4bcf47SFilipe Manana 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
42611a4bcf47SFilipe Manana 				      struct btrfs_inode_item);
42621a4bcf47SFilipe Manana 		*size_ret = btrfs_inode_size(path->nodes[0], item);
42631a4bcf47SFilipe Manana 	}
42641a4bcf47SFilipe Manana 
42651a4bcf47SFilipe Manana 	btrfs_release_path(path);
42661a4bcf47SFilipe Manana 	return 0;
42671a4bcf47SFilipe Manana }
42681a4bcf47SFilipe Manana 
426936283bf7SFilipe Manana /*
427036283bf7SFilipe Manana  * At the moment we always log all xattrs. This is to figure out at log replay
427136283bf7SFilipe Manana  * time which xattrs must have their deletion replayed. If a xattr is missing
427236283bf7SFilipe Manana  * in the log tree and exists in the fs/subvol tree, we delete it. This is
427336283bf7SFilipe Manana  * because if a xattr is deleted, the inode is fsynced and a power failure
427436283bf7SFilipe Manana  * happens, causing the log to be replayed the next time the fs is mounted,
427536283bf7SFilipe Manana  * we want the xattr to not exist anymore (same behaviour as other filesystems
427636283bf7SFilipe Manana  * with a journal, ext3/4, xfs, f2fs, etc).
427736283bf7SFilipe Manana  */
427836283bf7SFilipe Manana static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
427936283bf7SFilipe Manana 				struct btrfs_root *root,
4280*1a93c36aSNikolay Borisov 				struct btrfs_inode *inode,
428136283bf7SFilipe Manana 				struct btrfs_path *path,
428236283bf7SFilipe Manana 				struct btrfs_path *dst_path)
428336283bf7SFilipe Manana {
428436283bf7SFilipe Manana 	int ret;
428536283bf7SFilipe Manana 	struct btrfs_key key;
4286*1a93c36aSNikolay Borisov 	const u64 ino = btrfs_ino(inode);
428736283bf7SFilipe Manana 	int ins_nr = 0;
428836283bf7SFilipe Manana 	int start_slot = 0;
428936283bf7SFilipe Manana 
429036283bf7SFilipe Manana 	key.objectid = ino;
429136283bf7SFilipe Manana 	key.type = BTRFS_XATTR_ITEM_KEY;
429236283bf7SFilipe Manana 	key.offset = 0;
429336283bf7SFilipe Manana 
429436283bf7SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
429536283bf7SFilipe Manana 	if (ret < 0)
429636283bf7SFilipe Manana 		return ret;
429736283bf7SFilipe Manana 
429836283bf7SFilipe Manana 	while (true) {
429936283bf7SFilipe Manana 		int slot = path->slots[0];
430036283bf7SFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
430136283bf7SFilipe Manana 		int nritems = btrfs_header_nritems(leaf);
430236283bf7SFilipe Manana 
430336283bf7SFilipe Manana 		if (slot >= nritems) {
430436283bf7SFilipe Manana 			if (ins_nr > 0) {
430536283bf7SFilipe Manana 				u64 last_extent = 0;
430636283bf7SFilipe Manana 
4307*1a93c36aSNikolay Borisov 				ret = copy_items(trans, inode, dst_path, path,
430836283bf7SFilipe Manana 						 &last_extent, start_slot,
430936283bf7SFilipe Manana 						 ins_nr, 1, 0);
431036283bf7SFilipe Manana 				/* can't be 1, extent items aren't processed */
431136283bf7SFilipe Manana 				ASSERT(ret <= 0);
431236283bf7SFilipe Manana 				if (ret < 0)
431336283bf7SFilipe Manana 					return ret;
431436283bf7SFilipe Manana 				ins_nr = 0;
431536283bf7SFilipe Manana 			}
431636283bf7SFilipe Manana 			ret = btrfs_next_leaf(root, path);
431736283bf7SFilipe Manana 			if (ret < 0)
431836283bf7SFilipe Manana 				return ret;
431936283bf7SFilipe Manana 			else if (ret > 0)
432036283bf7SFilipe Manana 				break;
432136283bf7SFilipe Manana 			continue;
432236283bf7SFilipe Manana 		}
432336283bf7SFilipe Manana 
432436283bf7SFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, slot);
432536283bf7SFilipe Manana 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
432636283bf7SFilipe Manana 			break;
432736283bf7SFilipe Manana 
432836283bf7SFilipe Manana 		if (ins_nr == 0)
432936283bf7SFilipe Manana 			start_slot = slot;
433036283bf7SFilipe Manana 		ins_nr++;
433136283bf7SFilipe Manana 		path->slots[0]++;
433236283bf7SFilipe Manana 		cond_resched();
433336283bf7SFilipe Manana 	}
433436283bf7SFilipe Manana 	if (ins_nr > 0) {
433536283bf7SFilipe Manana 		u64 last_extent = 0;
433636283bf7SFilipe Manana 
4337*1a93c36aSNikolay Borisov 		ret = copy_items(trans, inode, dst_path, path,
433836283bf7SFilipe Manana 				 &last_extent, start_slot,
433936283bf7SFilipe Manana 				 ins_nr, 1, 0);
434036283bf7SFilipe Manana 		/* can't be 1, extent items aren't processed */
434136283bf7SFilipe Manana 		ASSERT(ret <= 0);
434236283bf7SFilipe Manana 		if (ret < 0)
434336283bf7SFilipe Manana 			return ret;
434436283bf7SFilipe Manana 	}
434536283bf7SFilipe Manana 
434636283bf7SFilipe Manana 	return 0;
434736283bf7SFilipe Manana }
434836283bf7SFilipe Manana 
4349a89ca6f2SFilipe Manana /*
4350a89ca6f2SFilipe Manana  * If the no holes feature is enabled we need to make sure any hole between the
4351a89ca6f2SFilipe Manana  * last extent and the i_size of our inode is explicitly marked in the log. This
4352a89ca6f2SFilipe Manana  * is to make sure that doing something like:
4353a89ca6f2SFilipe Manana  *
4354a89ca6f2SFilipe Manana  *      1) create file with 128Kb of data
4355a89ca6f2SFilipe Manana  *      2) truncate file to 64Kb
4356a89ca6f2SFilipe Manana  *      3) truncate file to 256Kb
4357a89ca6f2SFilipe Manana  *      4) fsync file
4358a89ca6f2SFilipe Manana  *      5) <crash/power failure>
4359a89ca6f2SFilipe Manana  *      6) mount fs and trigger log replay
4360a89ca6f2SFilipe Manana  *
4361a89ca6f2SFilipe Manana  * Will give us a file with a size of 256Kb, the first 64Kb of data match what
4362a89ca6f2SFilipe Manana  * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4363a89ca6f2SFilipe Manana  * file correspond to a hole. The presence of explicit holes in a log tree is
4364a89ca6f2SFilipe Manana  * what guarantees that log replay will remove/adjust file extent items in the
4365a89ca6f2SFilipe Manana  * fs/subvol tree.
4366a89ca6f2SFilipe Manana  *
4367a89ca6f2SFilipe Manana  * Here we do not need to care about holes between extents, that is already done
4368a89ca6f2SFilipe Manana  * by copy_items(). We also only need to do this in the full sync path, where we
4369a89ca6f2SFilipe Manana  * lookup for extents from the fs/subvol tree only. In the fast path case, we
4370a89ca6f2SFilipe Manana  * lookup the list of modified extent maps and if any represents a hole, we
4371a89ca6f2SFilipe Manana  * insert a corresponding extent representing a hole in the log tree.
4372a89ca6f2SFilipe Manana  */
4373a89ca6f2SFilipe Manana static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4374a89ca6f2SFilipe Manana 				   struct btrfs_root *root,
4375a89ca6f2SFilipe Manana 				   struct inode *inode,
4376a89ca6f2SFilipe Manana 				   struct btrfs_path *path)
4377a89ca6f2SFilipe Manana {
43780b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
4379a89ca6f2SFilipe Manana 	int ret;
4380a89ca6f2SFilipe Manana 	struct btrfs_key key;
4381a89ca6f2SFilipe Manana 	u64 hole_start;
4382a89ca6f2SFilipe Manana 	u64 hole_size;
4383a89ca6f2SFilipe Manana 	struct extent_buffer *leaf;
4384a89ca6f2SFilipe Manana 	struct btrfs_root *log = root->log_root;
43854a0cc7caSNikolay Borisov 	const u64 ino = btrfs_ino(BTRFS_I(inode));
4386a89ca6f2SFilipe Manana 	const u64 i_size = i_size_read(inode);
4387a89ca6f2SFilipe Manana 
43880b246afaSJeff Mahoney 	if (!btrfs_fs_incompat(fs_info, NO_HOLES))
4389a89ca6f2SFilipe Manana 		return 0;
4390a89ca6f2SFilipe Manana 
4391a89ca6f2SFilipe Manana 	key.objectid = ino;
4392a89ca6f2SFilipe Manana 	key.type = BTRFS_EXTENT_DATA_KEY;
4393a89ca6f2SFilipe Manana 	key.offset = (u64)-1;
4394a89ca6f2SFilipe Manana 
4395a89ca6f2SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4396a89ca6f2SFilipe Manana 	ASSERT(ret != 0);
4397a89ca6f2SFilipe Manana 	if (ret < 0)
4398a89ca6f2SFilipe Manana 		return ret;
4399a89ca6f2SFilipe Manana 
4400a89ca6f2SFilipe Manana 	ASSERT(path->slots[0] > 0);
4401a89ca6f2SFilipe Manana 	path->slots[0]--;
4402a89ca6f2SFilipe Manana 	leaf = path->nodes[0];
4403a89ca6f2SFilipe Manana 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4404a89ca6f2SFilipe Manana 
4405a89ca6f2SFilipe Manana 	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4406a89ca6f2SFilipe Manana 		/* inode does not have any extents */
4407a89ca6f2SFilipe Manana 		hole_start = 0;
4408a89ca6f2SFilipe Manana 		hole_size = i_size;
4409a89ca6f2SFilipe Manana 	} else {
4410a89ca6f2SFilipe Manana 		struct btrfs_file_extent_item *extent;
4411a89ca6f2SFilipe Manana 		u64 len;
4412a89ca6f2SFilipe Manana 
4413a89ca6f2SFilipe Manana 		/*
4414a89ca6f2SFilipe Manana 		 * If there's an extent beyond i_size, an explicit hole was
4415a89ca6f2SFilipe Manana 		 * already inserted by copy_items().
4416a89ca6f2SFilipe Manana 		 */
4417a89ca6f2SFilipe Manana 		if (key.offset >= i_size)
4418a89ca6f2SFilipe Manana 			return 0;
4419a89ca6f2SFilipe Manana 
4420a89ca6f2SFilipe Manana 		extent = btrfs_item_ptr(leaf, path->slots[0],
4421a89ca6f2SFilipe Manana 					struct btrfs_file_extent_item);
4422a89ca6f2SFilipe Manana 
4423a89ca6f2SFilipe Manana 		if (btrfs_file_extent_type(leaf, extent) ==
4424a89ca6f2SFilipe Manana 		    BTRFS_FILE_EXTENT_INLINE) {
4425a89ca6f2SFilipe Manana 			len = btrfs_file_extent_inline_len(leaf,
4426a89ca6f2SFilipe Manana 							   path->slots[0],
4427a89ca6f2SFilipe Manana 							   extent);
4428a89ca6f2SFilipe Manana 			ASSERT(len == i_size);
4429a89ca6f2SFilipe Manana 			return 0;
4430a89ca6f2SFilipe Manana 		}
4431a89ca6f2SFilipe Manana 
4432a89ca6f2SFilipe Manana 		len = btrfs_file_extent_num_bytes(leaf, extent);
4433a89ca6f2SFilipe Manana 		/* Last extent goes beyond i_size, no need to log a hole. */
4434a89ca6f2SFilipe Manana 		if (key.offset + len > i_size)
4435a89ca6f2SFilipe Manana 			return 0;
4436a89ca6f2SFilipe Manana 		hole_start = key.offset + len;
4437a89ca6f2SFilipe Manana 		hole_size = i_size - hole_start;
4438a89ca6f2SFilipe Manana 	}
4439a89ca6f2SFilipe Manana 	btrfs_release_path(path);
4440a89ca6f2SFilipe Manana 
4441a89ca6f2SFilipe Manana 	/* Last extent ends at i_size. */
4442a89ca6f2SFilipe Manana 	if (hole_size == 0)
4443a89ca6f2SFilipe Manana 		return 0;
4444a89ca6f2SFilipe Manana 
44450b246afaSJeff Mahoney 	hole_size = ALIGN(hole_size, fs_info->sectorsize);
4446a89ca6f2SFilipe Manana 	ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4447a89ca6f2SFilipe Manana 				       hole_size, 0, hole_size, 0, 0, 0);
4448a89ca6f2SFilipe Manana 	return ret;
4449a89ca6f2SFilipe Manana }
4450a89ca6f2SFilipe Manana 
445156f23fdbSFilipe Manana /*
445256f23fdbSFilipe Manana  * When we are logging a new inode X, check if it doesn't have a reference that
445356f23fdbSFilipe Manana  * matches the reference from some other inode Y created in a past transaction
445456f23fdbSFilipe Manana  * and that was renamed in the current transaction. If we don't do this, then at
445556f23fdbSFilipe Manana  * log replay time we can lose inode Y (and all its files if it's a directory):
445656f23fdbSFilipe Manana  *
445756f23fdbSFilipe Manana  * mkdir /mnt/x
445856f23fdbSFilipe Manana  * echo "hello world" > /mnt/x/foobar
445956f23fdbSFilipe Manana  * sync
446056f23fdbSFilipe Manana  * mv /mnt/x /mnt/y
446156f23fdbSFilipe Manana  * mkdir /mnt/x                 # or touch /mnt/x
446256f23fdbSFilipe Manana  * xfs_io -c fsync /mnt/x
446356f23fdbSFilipe Manana  * <power fail>
446456f23fdbSFilipe Manana  * mount fs, trigger log replay
446556f23fdbSFilipe Manana  *
446656f23fdbSFilipe Manana  * After the log replay procedure, we would lose the first directory and all its
446756f23fdbSFilipe Manana  * files (file foobar).
446856f23fdbSFilipe Manana  * For the case where inode Y is not a directory we simply end up losing it:
446956f23fdbSFilipe Manana  *
447056f23fdbSFilipe Manana  * echo "123" > /mnt/foo
447156f23fdbSFilipe Manana  * sync
447256f23fdbSFilipe Manana  * mv /mnt/foo /mnt/bar
447356f23fdbSFilipe Manana  * echo "abc" > /mnt/foo
447456f23fdbSFilipe Manana  * xfs_io -c fsync /mnt/foo
447556f23fdbSFilipe Manana  * <power fail>
447656f23fdbSFilipe Manana  *
447756f23fdbSFilipe Manana  * We also need this for cases where a snapshot entry is replaced by some other
447856f23fdbSFilipe Manana  * entry (file or directory) otherwise we end up with an unreplayable log due to
447956f23fdbSFilipe Manana  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
448056f23fdbSFilipe Manana  * if it were a regular entry:
448156f23fdbSFilipe Manana  *
448256f23fdbSFilipe Manana  * mkdir /mnt/x
448356f23fdbSFilipe Manana  * btrfs subvolume snapshot /mnt /mnt/x/snap
448456f23fdbSFilipe Manana  * btrfs subvolume delete /mnt/x/snap
448556f23fdbSFilipe Manana  * rmdir /mnt/x
448656f23fdbSFilipe Manana  * mkdir /mnt/x
448756f23fdbSFilipe Manana  * fsync /mnt/x or fsync some new file inside it
448856f23fdbSFilipe Manana  * <power fail>
448956f23fdbSFilipe Manana  *
449056f23fdbSFilipe Manana  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
449156f23fdbSFilipe Manana  * the same transaction.
449256f23fdbSFilipe Manana  */
449356f23fdbSFilipe Manana static int btrfs_check_ref_name_override(struct extent_buffer *eb,
449456f23fdbSFilipe Manana 					 const int slot,
449556f23fdbSFilipe Manana 					 const struct btrfs_key *key,
44964791c8f1SNikolay Borisov 					 struct btrfs_inode *inode,
449744f714daSFilipe Manana 					 u64 *other_ino)
449856f23fdbSFilipe Manana {
449956f23fdbSFilipe Manana 	int ret;
450056f23fdbSFilipe Manana 	struct btrfs_path *search_path;
450156f23fdbSFilipe Manana 	char *name = NULL;
450256f23fdbSFilipe Manana 	u32 name_len = 0;
450356f23fdbSFilipe Manana 	u32 item_size = btrfs_item_size_nr(eb, slot);
450456f23fdbSFilipe Manana 	u32 cur_offset = 0;
450556f23fdbSFilipe Manana 	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
450656f23fdbSFilipe Manana 
450756f23fdbSFilipe Manana 	search_path = btrfs_alloc_path();
450856f23fdbSFilipe Manana 	if (!search_path)
450956f23fdbSFilipe Manana 		return -ENOMEM;
451056f23fdbSFilipe Manana 	search_path->search_commit_root = 1;
451156f23fdbSFilipe Manana 	search_path->skip_locking = 1;
451256f23fdbSFilipe Manana 
451356f23fdbSFilipe Manana 	while (cur_offset < item_size) {
451456f23fdbSFilipe Manana 		u64 parent;
451556f23fdbSFilipe Manana 		u32 this_name_len;
451656f23fdbSFilipe Manana 		u32 this_len;
451756f23fdbSFilipe Manana 		unsigned long name_ptr;
451856f23fdbSFilipe Manana 		struct btrfs_dir_item *di;
451956f23fdbSFilipe Manana 
452056f23fdbSFilipe Manana 		if (key->type == BTRFS_INODE_REF_KEY) {
452156f23fdbSFilipe Manana 			struct btrfs_inode_ref *iref;
452256f23fdbSFilipe Manana 
452356f23fdbSFilipe Manana 			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
452456f23fdbSFilipe Manana 			parent = key->offset;
452556f23fdbSFilipe Manana 			this_name_len = btrfs_inode_ref_name_len(eb, iref);
452656f23fdbSFilipe Manana 			name_ptr = (unsigned long)(iref + 1);
452756f23fdbSFilipe Manana 			this_len = sizeof(*iref) + this_name_len;
452856f23fdbSFilipe Manana 		} else {
452956f23fdbSFilipe Manana 			struct btrfs_inode_extref *extref;
453056f23fdbSFilipe Manana 
453156f23fdbSFilipe Manana 			extref = (struct btrfs_inode_extref *)(ptr +
453256f23fdbSFilipe Manana 							       cur_offset);
453356f23fdbSFilipe Manana 			parent = btrfs_inode_extref_parent(eb, extref);
453456f23fdbSFilipe Manana 			this_name_len = btrfs_inode_extref_name_len(eb, extref);
453556f23fdbSFilipe Manana 			name_ptr = (unsigned long)&extref->name;
453656f23fdbSFilipe Manana 			this_len = sizeof(*extref) + this_name_len;
453756f23fdbSFilipe Manana 		}
453856f23fdbSFilipe Manana 
453956f23fdbSFilipe Manana 		if (this_name_len > name_len) {
454056f23fdbSFilipe Manana 			char *new_name;
454156f23fdbSFilipe Manana 
454256f23fdbSFilipe Manana 			new_name = krealloc(name, this_name_len, GFP_NOFS);
454356f23fdbSFilipe Manana 			if (!new_name) {
454456f23fdbSFilipe Manana 				ret = -ENOMEM;
454556f23fdbSFilipe Manana 				goto out;
454656f23fdbSFilipe Manana 			}
454756f23fdbSFilipe Manana 			name_len = this_name_len;
454856f23fdbSFilipe Manana 			name = new_name;
454956f23fdbSFilipe Manana 		}
455056f23fdbSFilipe Manana 
455156f23fdbSFilipe Manana 		read_extent_buffer(eb, name, name_ptr, this_name_len);
45524791c8f1SNikolay Borisov 		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
45534791c8f1SNikolay Borisov 				parent, name, this_name_len, 0);
455456f23fdbSFilipe Manana 		if (di && !IS_ERR(di)) {
455544f714daSFilipe Manana 			struct btrfs_key di_key;
455644f714daSFilipe Manana 
455744f714daSFilipe Manana 			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
455844f714daSFilipe Manana 						  di, &di_key);
455944f714daSFilipe Manana 			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
456056f23fdbSFilipe Manana 				ret = 1;
456144f714daSFilipe Manana 				*other_ino = di_key.objectid;
456244f714daSFilipe Manana 			} else {
456344f714daSFilipe Manana 				ret = -EAGAIN;
456444f714daSFilipe Manana 			}
456556f23fdbSFilipe Manana 			goto out;
456656f23fdbSFilipe Manana 		} else if (IS_ERR(di)) {
456756f23fdbSFilipe Manana 			ret = PTR_ERR(di);
456856f23fdbSFilipe Manana 			goto out;
456956f23fdbSFilipe Manana 		}
457056f23fdbSFilipe Manana 		btrfs_release_path(search_path);
457156f23fdbSFilipe Manana 
457256f23fdbSFilipe Manana 		cur_offset += this_len;
457356f23fdbSFilipe Manana 	}
457456f23fdbSFilipe Manana 	ret = 0;
457556f23fdbSFilipe Manana out:
457656f23fdbSFilipe Manana 	btrfs_free_path(search_path);
457756f23fdbSFilipe Manana 	kfree(name);
457856f23fdbSFilipe Manana 	return ret;
457956f23fdbSFilipe Manana }
458056f23fdbSFilipe Manana 
4581e02119d5SChris Mason /* log a single inode in the tree log.
4582e02119d5SChris Mason  * At least one parent directory for this inode must exist in the tree
4583e02119d5SChris Mason  * or be logged already.
4584e02119d5SChris Mason  *
4585e02119d5SChris Mason  * Any items from this inode changed by the current transaction are copied
4586e02119d5SChris Mason  * to the log tree.  An extra reference is taken on any extents in this
4587e02119d5SChris Mason  * file, allowing us to avoid a whole pile of corner cases around logging
4588e02119d5SChris Mason  * blocks that have been removed from the tree.
4589e02119d5SChris Mason  *
4590e02119d5SChris Mason  * See LOG_INODE_ALL and related defines for a description of what inode_only
4591e02119d5SChris Mason  * does.
4592e02119d5SChris Mason  *
4593e02119d5SChris Mason  * This handles both files and directories.
4594e02119d5SChris Mason  */
459512fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4596e02119d5SChris Mason 			   struct btrfs_root *root, struct inode *inode,
459749dae1bcSFilipe Manana 			   int inode_only,
459849dae1bcSFilipe Manana 			   const loff_t start,
45998407f553SFilipe Manana 			   const loff_t end,
46008407f553SFilipe Manana 			   struct btrfs_log_ctx *ctx)
4601e02119d5SChris Mason {
46020b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
4603e02119d5SChris Mason 	struct btrfs_path *path;
4604e02119d5SChris Mason 	struct btrfs_path *dst_path;
4605e02119d5SChris Mason 	struct btrfs_key min_key;
4606e02119d5SChris Mason 	struct btrfs_key max_key;
4607e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
460831ff1cd2SChris Mason 	struct extent_buffer *src = NULL;
4609827463c4SMiao Xie 	LIST_HEAD(logged_list);
461016e7549fSJosef Bacik 	u64 last_extent = 0;
46114a500fd1SYan, Zheng 	int err = 0;
4612e02119d5SChris Mason 	int ret;
46133a5f1d45SChris Mason 	int nritems;
461431ff1cd2SChris Mason 	int ins_start_slot = 0;
461531ff1cd2SChris Mason 	int ins_nr;
46165dc562c5SJosef Bacik 	bool fast_search = false;
46174a0cc7caSNikolay Borisov 	u64 ino = btrfs_ino(BTRFS_I(inode));
461849dae1bcSFilipe Manana 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
46191a4bcf47SFilipe Manana 	u64 logged_isize = 0;
4620e4545de5SFilipe Manana 	bool need_log_inode_item = true;
4621e02119d5SChris Mason 
4622e02119d5SChris Mason 	path = btrfs_alloc_path();
46235df67083STsutomu Itoh 	if (!path)
46245df67083STsutomu Itoh 		return -ENOMEM;
4625e02119d5SChris Mason 	dst_path = btrfs_alloc_path();
46265df67083STsutomu Itoh 	if (!dst_path) {
46275df67083STsutomu Itoh 		btrfs_free_path(path);
46285df67083STsutomu Itoh 		return -ENOMEM;
46295df67083STsutomu Itoh 	}
4630e02119d5SChris Mason 
463133345d01SLi Zefan 	min_key.objectid = ino;
4632e02119d5SChris Mason 	min_key.type = BTRFS_INODE_ITEM_KEY;
4633e02119d5SChris Mason 	min_key.offset = 0;
4634e02119d5SChris Mason 
463533345d01SLi Zefan 	max_key.objectid = ino;
463612fcfd22SChris Mason 
463712fcfd22SChris Mason 
46385dc562c5SJosef Bacik 	/* today the code can only do partial logging of directories */
46395269b67eSMiao Xie 	if (S_ISDIR(inode->i_mode) ||
46405269b67eSMiao Xie 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
46415269b67eSMiao Xie 		       &BTRFS_I(inode)->runtime_flags) &&
4642781feef7SLiu Bo 	     inode_only >= LOG_INODE_EXISTS))
4643e02119d5SChris Mason 		max_key.type = BTRFS_XATTR_ITEM_KEY;
4644e02119d5SChris Mason 	else
4645e02119d5SChris Mason 		max_key.type = (u8)-1;
4646e02119d5SChris Mason 	max_key.offset = (u64)-1;
4647e02119d5SChris Mason 
46482c2c452bSFilipe Manana 	/*
46492c2c452bSFilipe Manana 	 * Only run delayed items if we are a dir or a new file.
46502c2c452bSFilipe Manana 	 * Otherwise commit the delayed inode only, which is needed in
46512c2c452bSFilipe Manana 	 * order for the log replay code to mark inodes for link count
46522c2c452bSFilipe Manana 	 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
46532c2c452bSFilipe Manana 	 */
465494edf4aeSJosef Bacik 	if (S_ISDIR(inode->i_mode) ||
46550b246afaSJeff Mahoney 	    BTRFS_I(inode)->generation > fs_info->last_trans_committed)
46565f4b32e9SNikolay Borisov 		ret = btrfs_commit_inode_delayed_items(trans, BTRFS_I(inode));
46572c2c452bSFilipe Manana 	else
4658aa79021fSNikolay Borisov 		ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
46592c2c452bSFilipe Manana 
466016cdcec7SMiao Xie 	if (ret) {
466116cdcec7SMiao Xie 		btrfs_free_path(path);
466216cdcec7SMiao Xie 		btrfs_free_path(dst_path);
466316cdcec7SMiao Xie 		return ret;
466416cdcec7SMiao Xie 	}
466516cdcec7SMiao Xie 
4666781feef7SLiu Bo 	if (inode_only == LOG_OTHER_INODE) {
4667781feef7SLiu Bo 		inode_only = LOG_INODE_EXISTS;
4668781feef7SLiu Bo 		mutex_lock_nested(&BTRFS_I(inode)->log_mutex,
4669781feef7SLiu Bo 				  SINGLE_DEPTH_NESTING);
4670781feef7SLiu Bo 	} else {
4671e02119d5SChris Mason 		mutex_lock(&BTRFS_I(inode)->log_mutex);
4672781feef7SLiu Bo 	}
4673e02119d5SChris Mason 
46745e33a2bdSFilipe Manana 	/*
4675e02119d5SChris Mason 	 * a brute force approach to making sure we get the most uptodate
4676e02119d5SChris Mason 	 * copies of everything.
4677e02119d5SChris Mason 	 */
4678e02119d5SChris Mason 	if (S_ISDIR(inode->i_mode)) {
4679e02119d5SChris Mason 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
4680e02119d5SChris Mason 
46814f764e51SFilipe Manana 		if (inode_only == LOG_INODE_EXISTS)
46824f764e51SFilipe Manana 			max_key_type = BTRFS_XATTR_ITEM_KEY;
468333345d01SLi Zefan 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4684e02119d5SChris Mason 	} else {
46851a4bcf47SFilipe Manana 		if (inode_only == LOG_INODE_EXISTS) {
46861a4bcf47SFilipe Manana 			/*
46871a4bcf47SFilipe Manana 			 * Make sure the new inode item we write to the log has
46881a4bcf47SFilipe Manana 			 * the same isize as the current one (if it exists).
46891a4bcf47SFilipe Manana 			 * This is necessary to prevent data loss after log
46901a4bcf47SFilipe Manana 			 * replay, and also to prevent doing a wrong expanding
46911a4bcf47SFilipe Manana 			 * truncate - for e.g. create file, write 4K into offset
46921a4bcf47SFilipe Manana 			 * 0, fsync, write 4K into offset 4096, add hard link,
46931a4bcf47SFilipe Manana 			 * fsync some other file (to sync log), power fail - if
46941a4bcf47SFilipe Manana 			 * we use the inode's current i_size, after log replay
46951a4bcf47SFilipe Manana 			 * we get a 8Kb file, with the last 4Kb extent as a hole
46961a4bcf47SFilipe Manana 			 * (zeroes), as if an expanding truncate happened,
46971a4bcf47SFilipe Manana 			 * instead of getting a file of 4Kb only.
46981a4bcf47SFilipe Manana 			 */
4699481b01c0SNikolay Borisov 			err = logged_inode_size(log, BTRFS_I(inode), path,
47001a4bcf47SFilipe Manana 						&logged_isize);
47011a4bcf47SFilipe Manana 			if (err)
47021a4bcf47SFilipe Manana 				goto out_unlock;
47031a4bcf47SFilipe Manana 		}
4704a742994aSFilipe Manana 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
47055dc562c5SJosef Bacik 			     &BTRFS_I(inode)->runtime_flags)) {
4706a742994aSFilipe Manana 			if (inode_only == LOG_INODE_EXISTS) {
47074f764e51SFilipe Manana 				max_key.type = BTRFS_XATTR_ITEM_KEY;
4708a742994aSFilipe Manana 				ret = drop_objectid_items(trans, log, path, ino,
4709a742994aSFilipe Manana 							  max_key.type);
4710a742994aSFilipe Manana 			} else {
4711a742994aSFilipe Manana 				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4712a742994aSFilipe Manana 					  &BTRFS_I(inode)->runtime_flags);
4713e9976151SJosef Bacik 				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4714e9976151SJosef Bacik 					  &BTRFS_I(inode)->runtime_flags);
471528ed1345SChris Mason 				while(1) {
471628ed1345SChris Mason 					ret = btrfs_truncate_inode_items(trans,
471728ed1345SChris Mason 							 log, inode, 0, 0);
471828ed1345SChris Mason 					if (ret != -EAGAIN)
471928ed1345SChris Mason 						break;
472028ed1345SChris Mason 				}
4721a742994aSFilipe Manana 			}
47224f764e51SFilipe Manana 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
47236cfab851SJosef Bacik 					      &BTRFS_I(inode)->runtime_flags) ||
47246cfab851SJosef Bacik 			   inode_only == LOG_INODE_EXISTS) {
47254f764e51SFilipe Manana 			if (inode_only == LOG_INODE_ALL)
4726a95249b3SJosef Bacik 				fast_search = true;
4727a95249b3SJosef Bacik 			max_key.type = BTRFS_XATTR_ITEM_KEY;
4728a95249b3SJosef Bacik 			ret = drop_objectid_items(trans, log, path, ino,
4729a95249b3SJosef Bacik 						  max_key.type);
47305dc562c5SJosef Bacik 		} else {
4731183f37faSLiu Bo 			if (inode_only == LOG_INODE_ALL)
47325dc562c5SJosef Bacik 				fast_search = true;
4733a95249b3SJosef Bacik 			goto log_extents;
4734a95249b3SJosef Bacik 		}
4735a95249b3SJosef Bacik 
4736e02119d5SChris Mason 	}
47374a500fd1SYan, Zheng 	if (ret) {
47384a500fd1SYan, Zheng 		err = ret;
47394a500fd1SYan, Zheng 		goto out_unlock;
47404a500fd1SYan, Zheng 	}
4741e02119d5SChris Mason 
4742e02119d5SChris Mason 	while (1) {
474331ff1cd2SChris Mason 		ins_nr = 0;
47446174d3cbSFilipe David Borba Manana 		ret = btrfs_search_forward(root, &min_key,
4745de78b51aSEric Sandeen 					   path, trans->transid);
4746fb770ae4SLiu Bo 		if (ret < 0) {
4747fb770ae4SLiu Bo 			err = ret;
4748fb770ae4SLiu Bo 			goto out_unlock;
4749fb770ae4SLiu Bo 		}
4750e02119d5SChris Mason 		if (ret != 0)
4751e02119d5SChris Mason 			break;
47523a5f1d45SChris Mason again:
475331ff1cd2SChris Mason 		/* note, ins_nr might be > 0 here, cleanup outside the loop */
475433345d01SLi Zefan 		if (min_key.objectid != ino)
4755e02119d5SChris Mason 			break;
4756e02119d5SChris Mason 		if (min_key.type > max_key.type)
4757e02119d5SChris Mason 			break;
475831ff1cd2SChris Mason 
4759e4545de5SFilipe Manana 		if (min_key.type == BTRFS_INODE_ITEM_KEY)
4760e4545de5SFilipe Manana 			need_log_inode_item = false;
4761e4545de5SFilipe Manana 
476256f23fdbSFilipe Manana 		if ((min_key.type == BTRFS_INODE_REF_KEY ||
476356f23fdbSFilipe Manana 		     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
476456f23fdbSFilipe Manana 		    BTRFS_I(inode)->generation == trans->transid) {
476544f714daSFilipe Manana 			u64 other_ino = 0;
476644f714daSFilipe Manana 
476756f23fdbSFilipe Manana 			ret = btrfs_check_ref_name_override(path->nodes[0],
476856f23fdbSFilipe Manana 							    path->slots[0],
47694791c8f1SNikolay Borisov 							    &min_key, BTRFS_I(inode),
477044f714daSFilipe Manana 							    &other_ino);
477156f23fdbSFilipe Manana 			if (ret < 0) {
477256f23fdbSFilipe Manana 				err = ret;
477356f23fdbSFilipe Manana 				goto out_unlock;
477428a23593SFilipe Manana 			} else if (ret > 0 && ctx &&
47754a0cc7caSNikolay Borisov 				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
477644f714daSFilipe Manana 				struct btrfs_key inode_key;
477744f714daSFilipe Manana 				struct inode *other_inode;
477844f714daSFilipe Manana 
477944f714daSFilipe Manana 				if (ins_nr > 0) {
478044f714daSFilipe Manana 					ins_nr++;
478144f714daSFilipe Manana 				} else {
478244f714daSFilipe Manana 					ins_nr = 1;
478344f714daSFilipe Manana 					ins_start_slot = path->slots[0];
478444f714daSFilipe Manana 				}
478544d70e19SNikolay Borisov 				ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
478644f714daSFilipe Manana 						 &last_extent, ins_start_slot,
478744f714daSFilipe Manana 						 ins_nr, inode_only,
478844f714daSFilipe Manana 						 logged_isize);
478944f714daSFilipe Manana 				if (ret < 0) {
479044f714daSFilipe Manana 					err = ret;
479156f23fdbSFilipe Manana 					goto out_unlock;
479256f23fdbSFilipe Manana 				}
479344f714daSFilipe Manana 				ins_nr = 0;
479444f714daSFilipe Manana 				btrfs_release_path(path);
479544f714daSFilipe Manana 				inode_key.objectid = other_ino;
479644f714daSFilipe Manana 				inode_key.type = BTRFS_INODE_ITEM_KEY;
479744f714daSFilipe Manana 				inode_key.offset = 0;
47980b246afaSJeff Mahoney 				other_inode = btrfs_iget(fs_info->sb,
479944f714daSFilipe Manana 							 &inode_key, root,
480044f714daSFilipe Manana 							 NULL);
480144f714daSFilipe Manana 				/*
480244f714daSFilipe Manana 				 * If the other inode that had a conflicting dir
480344f714daSFilipe Manana 				 * entry was deleted in the current transaction,
480444f714daSFilipe Manana 				 * we don't need to do more work nor fallback to
480544f714daSFilipe Manana 				 * a transaction commit.
480644f714daSFilipe Manana 				 */
480744f714daSFilipe Manana 				if (IS_ERR(other_inode) &&
480844f714daSFilipe Manana 				    PTR_ERR(other_inode) == -ENOENT) {
480944f714daSFilipe Manana 					goto next_key;
481044f714daSFilipe Manana 				} else if (IS_ERR(other_inode)) {
481144f714daSFilipe Manana 					err = PTR_ERR(other_inode);
481244f714daSFilipe Manana 					goto out_unlock;
481344f714daSFilipe Manana 				}
481444f714daSFilipe Manana 				/*
481544f714daSFilipe Manana 				 * We are safe logging the other inode without
481644f714daSFilipe Manana 				 * acquiring its i_mutex as long as we log with
481744f714daSFilipe Manana 				 * the LOG_INODE_EXISTS mode. We're safe against
481844f714daSFilipe Manana 				 * concurrent renames of the other inode as well
481944f714daSFilipe Manana 				 * because during a rename we pin the log and
482044f714daSFilipe Manana 				 * update the log with the new name before we
482144f714daSFilipe Manana 				 * unpin it.
482244f714daSFilipe Manana 				 */
482344f714daSFilipe Manana 				err = btrfs_log_inode(trans, root, other_inode,
4824781feef7SLiu Bo 						      LOG_OTHER_INODE,
482544f714daSFilipe Manana 						      0, LLONG_MAX, ctx);
482644f714daSFilipe Manana 				iput(other_inode);
482744f714daSFilipe Manana 				if (err)
482844f714daSFilipe Manana 					goto out_unlock;
482944f714daSFilipe Manana 				else
483044f714daSFilipe Manana 					goto next_key;
483144f714daSFilipe Manana 			}
483256f23fdbSFilipe Manana 		}
483356f23fdbSFilipe Manana 
483436283bf7SFilipe Manana 		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
483536283bf7SFilipe Manana 		if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
483636283bf7SFilipe Manana 			if (ins_nr == 0)
483736283bf7SFilipe Manana 				goto next_slot;
483844d70e19SNikolay Borisov 			ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
483936283bf7SFilipe Manana 					 &last_extent, ins_start_slot,
484036283bf7SFilipe Manana 					 ins_nr, inode_only, logged_isize);
484136283bf7SFilipe Manana 			if (ret < 0) {
484236283bf7SFilipe Manana 				err = ret;
484336283bf7SFilipe Manana 				goto out_unlock;
484436283bf7SFilipe Manana 			}
484536283bf7SFilipe Manana 			ins_nr = 0;
484636283bf7SFilipe Manana 			if (ret) {
484736283bf7SFilipe Manana 				btrfs_release_path(path);
484836283bf7SFilipe Manana 				continue;
484936283bf7SFilipe Manana 			}
485036283bf7SFilipe Manana 			goto next_slot;
485136283bf7SFilipe Manana 		}
485236283bf7SFilipe Manana 
4853e02119d5SChris Mason 		src = path->nodes[0];
485431ff1cd2SChris Mason 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
485531ff1cd2SChris Mason 			ins_nr++;
485631ff1cd2SChris Mason 			goto next_slot;
485731ff1cd2SChris Mason 		} else if (!ins_nr) {
485831ff1cd2SChris Mason 			ins_start_slot = path->slots[0];
485931ff1cd2SChris Mason 			ins_nr = 1;
486031ff1cd2SChris Mason 			goto next_slot;
4861e02119d5SChris Mason 		}
4862e02119d5SChris Mason 
486344d70e19SNikolay Borisov 		ret = copy_items(trans, BTRFS_I(inode), dst_path, path, &last_extent,
48641a4bcf47SFilipe Manana 				 ins_start_slot, ins_nr, inode_only,
48651a4bcf47SFilipe Manana 				 logged_isize);
486616e7549fSJosef Bacik 		if (ret < 0) {
48674a500fd1SYan, Zheng 			err = ret;
48684a500fd1SYan, Zheng 			goto out_unlock;
4869a71db86eSRasmus Villemoes 		}
4870a71db86eSRasmus Villemoes 		if (ret) {
487116e7549fSJosef Bacik 			ins_nr = 0;
487216e7549fSJosef Bacik 			btrfs_release_path(path);
487316e7549fSJosef Bacik 			continue;
48744a500fd1SYan, Zheng 		}
487531ff1cd2SChris Mason 		ins_nr = 1;
487631ff1cd2SChris Mason 		ins_start_slot = path->slots[0];
487731ff1cd2SChris Mason next_slot:
4878e02119d5SChris Mason 
48793a5f1d45SChris Mason 		nritems = btrfs_header_nritems(path->nodes[0]);
48803a5f1d45SChris Mason 		path->slots[0]++;
48813a5f1d45SChris Mason 		if (path->slots[0] < nritems) {
48823a5f1d45SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
48833a5f1d45SChris Mason 					      path->slots[0]);
48843a5f1d45SChris Mason 			goto again;
48853a5f1d45SChris Mason 		}
488631ff1cd2SChris Mason 		if (ins_nr) {
488744d70e19SNikolay Borisov 			ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
488816e7549fSJosef Bacik 					 &last_extent, ins_start_slot,
48891a4bcf47SFilipe Manana 					 ins_nr, inode_only, logged_isize);
489016e7549fSJosef Bacik 			if (ret < 0) {
48914a500fd1SYan, Zheng 				err = ret;
48924a500fd1SYan, Zheng 				goto out_unlock;
48934a500fd1SYan, Zheng 			}
489416e7549fSJosef Bacik 			ret = 0;
489531ff1cd2SChris Mason 			ins_nr = 0;
489631ff1cd2SChris Mason 		}
4897b3b4aa74SDavid Sterba 		btrfs_release_path(path);
489844f714daSFilipe Manana next_key:
48993d41d702SFilipe David Borba Manana 		if (min_key.offset < (u64)-1) {
4900e02119d5SChris Mason 			min_key.offset++;
49013d41d702SFilipe David Borba Manana 		} else if (min_key.type < max_key.type) {
4902e02119d5SChris Mason 			min_key.type++;
49033d41d702SFilipe David Borba Manana 			min_key.offset = 0;
49043d41d702SFilipe David Borba Manana 		} else {
4905e02119d5SChris Mason 			break;
4906e02119d5SChris Mason 		}
49073d41d702SFilipe David Borba Manana 	}
490831ff1cd2SChris Mason 	if (ins_nr) {
490944d70e19SNikolay Borisov 		ret = copy_items(trans, BTRFS_I(inode), dst_path, path, &last_extent,
49101a4bcf47SFilipe Manana 				 ins_start_slot, ins_nr, inode_only,
49111a4bcf47SFilipe Manana 				 logged_isize);
491216e7549fSJosef Bacik 		if (ret < 0) {
49134a500fd1SYan, Zheng 			err = ret;
49144a500fd1SYan, Zheng 			goto out_unlock;
49154a500fd1SYan, Zheng 		}
491616e7549fSJosef Bacik 		ret = 0;
491731ff1cd2SChris Mason 		ins_nr = 0;
491831ff1cd2SChris Mason 	}
49195dc562c5SJosef Bacik 
492036283bf7SFilipe Manana 	btrfs_release_path(path);
492136283bf7SFilipe Manana 	btrfs_release_path(dst_path);
4922*1a93c36aSNikolay Borisov 	err = btrfs_log_all_xattrs(trans, root, BTRFS_I(inode), path, dst_path);
492336283bf7SFilipe Manana 	if (err)
492436283bf7SFilipe Manana 		goto out_unlock;
4925a89ca6f2SFilipe Manana 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
4926a89ca6f2SFilipe Manana 		btrfs_release_path(path);
4927a89ca6f2SFilipe Manana 		btrfs_release_path(dst_path);
4928a89ca6f2SFilipe Manana 		err = btrfs_log_trailing_hole(trans, root, inode, path);
4929a89ca6f2SFilipe Manana 		if (err)
4930a89ca6f2SFilipe Manana 			goto out_unlock;
4931a89ca6f2SFilipe Manana 	}
4932a95249b3SJosef Bacik log_extents:
4933f3b15ccdSJosef Bacik 	btrfs_release_path(path);
49345dc562c5SJosef Bacik 	btrfs_release_path(dst_path);
4935e4545de5SFilipe Manana 	if (need_log_inode_item) {
4936e4545de5SFilipe Manana 		err = log_inode_item(trans, log, dst_path, inode);
4937e4545de5SFilipe Manana 		if (err)
4938e4545de5SFilipe Manana 			goto out_unlock;
4939e4545de5SFilipe Manana 	}
4940f3b15ccdSJosef Bacik 	if (fast_search) {
4941827463c4SMiao Xie 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4942de0ee0edSFilipe Manana 						&logged_list, ctx, start, end);
49435dc562c5SJosef Bacik 		if (ret) {
49445dc562c5SJosef Bacik 			err = ret;
49455dc562c5SJosef Bacik 			goto out_unlock;
49465dc562c5SJosef Bacik 		}
4947d006a048SJosef Bacik 	} else if (inode_only == LOG_INODE_ALL) {
494806d3d22bSLiu Bo 		struct extent_map *em, *n;
494906d3d22bSLiu Bo 
495049dae1bcSFilipe Manana 		write_lock(&em_tree->lock);
495149dae1bcSFilipe Manana 		/*
495249dae1bcSFilipe Manana 		 * We can't just remove every em if we're called for a ranged
495349dae1bcSFilipe Manana 		 * fsync - that is, one that doesn't cover the whole possible
495449dae1bcSFilipe Manana 		 * file range (0 to LLONG_MAX). This is because we can have
495549dae1bcSFilipe Manana 		 * em's that fall outside the range we're logging and therefore
495649dae1bcSFilipe Manana 		 * their ordered operations haven't completed yet
495749dae1bcSFilipe Manana 		 * (btrfs_finish_ordered_io() not invoked yet). This means we
495849dae1bcSFilipe Manana 		 * didn't get their respective file extent item in the fs/subvol
495949dae1bcSFilipe Manana 		 * tree yet, and need to let the next fast fsync (one which
496049dae1bcSFilipe Manana 		 * consults the list of modified extent maps) find the em so
496149dae1bcSFilipe Manana 		 * that it logs a matching file extent item and waits for the
496249dae1bcSFilipe Manana 		 * respective ordered operation to complete (if it's still
496349dae1bcSFilipe Manana 		 * running).
496449dae1bcSFilipe Manana 		 *
496549dae1bcSFilipe Manana 		 * Removing every em outside the range we're logging would make
496649dae1bcSFilipe Manana 		 * the next fast fsync not log their matching file extent items,
496749dae1bcSFilipe Manana 		 * therefore making us lose data after a log replay.
496849dae1bcSFilipe Manana 		 */
496949dae1bcSFilipe Manana 		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
497049dae1bcSFilipe Manana 					 list) {
497149dae1bcSFilipe Manana 			const u64 mod_end = em->mod_start + em->mod_len - 1;
497249dae1bcSFilipe Manana 
497349dae1bcSFilipe Manana 			if (em->mod_start >= start && mod_end <= end)
497406d3d22bSLiu Bo 				list_del_init(&em->list);
497549dae1bcSFilipe Manana 		}
497649dae1bcSFilipe Manana 		write_unlock(&em_tree->lock);
49775dc562c5SJosef Bacik 	}
49785dc562c5SJosef Bacik 
49799623f9a3SChris Mason 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
49802f2ff0eeSFilipe Manana 		ret = log_directory_changes(trans, root, inode, path, dst_path,
49812f2ff0eeSFilipe Manana 					    ctx);
49824a500fd1SYan, Zheng 		if (ret) {
49834a500fd1SYan, Zheng 			err = ret;
49844a500fd1SYan, Zheng 			goto out_unlock;
49854a500fd1SYan, Zheng 		}
4986e02119d5SChris Mason 	}
498749dae1bcSFilipe Manana 
49882f2ff0eeSFilipe Manana 	spin_lock(&BTRFS_I(inode)->lock);
49893a5f1d45SChris Mason 	BTRFS_I(inode)->logged_trans = trans->transid;
4990125c4cf9SFilipe Manana 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
49912f2ff0eeSFilipe Manana 	spin_unlock(&BTRFS_I(inode)->lock);
49924a500fd1SYan, Zheng out_unlock:
4993827463c4SMiao Xie 	if (unlikely(err))
4994827463c4SMiao Xie 		btrfs_put_logged_extents(&logged_list);
4995827463c4SMiao Xie 	else
4996827463c4SMiao Xie 		btrfs_submit_logged_extents(&logged_list, log);
4997e02119d5SChris Mason 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
4998e02119d5SChris Mason 
4999e02119d5SChris Mason 	btrfs_free_path(path);
5000e02119d5SChris Mason 	btrfs_free_path(dst_path);
50014a500fd1SYan, Zheng 	return err;
5002e02119d5SChris Mason }
5003e02119d5SChris Mason 
500412fcfd22SChris Mason /*
50052be63d5cSFilipe Manana  * Check if we must fallback to a transaction commit when logging an inode.
50062be63d5cSFilipe Manana  * This must be called after logging the inode and is used only in the context
50072be63d5cSFilipe Manana  * when fsyncing an inode requires the need to log some other inode - in which
50082be63d5cSFilipe Manana  * case we can't lock the i_mutex of each other inode we need to log as that
50092be63d5cSFilipe Manana  * can lead to deadlocks with concurrent fsync against other inodes (as we can
50102be63d5cSFilipe Manana  * log inodes up or down in the hierarchy) or rename operations for example. So
50112be63d5cSFilipe Manana  * we take the log_mutex of the inode after we have logged it and then check for
50122be63d5cSFilipe Manana  * its last_unlink_trans value - this is safe because any task setting
50132be63d5cSFilipe Manana  * last_unlink_trans must take the log_mutex and it must do this before it does
50142be63d5cSFilipe Manana  * the actual unlink operation, so if we do this check before a concurrent task
50152be63d5cSFilipe Manana  * sets last_unlink_trans it means we've logged a consistent version/state of
50162be63d5cSFilipe Manana  * all the inode items, otherwise we are not sure and must do a transaction
501701327610SNicholas D Steeves  * commit (the concurrent task might have only updated last_unlink_trans before
50182be63d5cSFilipe Manana  * we logged the inode or it might have also done the unlink).
50192be63d5cSFilipe Manana  */
50202be63d5cSFilipe Manana static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5021ab1717b2SNikolay Borisov 					  struct btrfs_inode *inode)
50222be63d5cSFilipe Manana {
5023ab1717b2SNikolay Borisov 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
50242be63d5cSFilipe Manana 	bool ret = false;
50252be63d5cSFilipe Manana 
5026ab1717b2SNikolay Borisov 	mutex_lock(&inode->log_mutex);
5027ab1717b2SNikolay Borisov 	if (inode->last_unlink_trans > fs_info->last_trans_committed) {
50282be63d5cSFilipe Manana 		/*
50292be63d5cSFilipe Manana 		 * Make sure any commits to the log are forced to be full
50302be63d5cSFilipe Manana 		 * commits.
50312be63d5cSFilipe Manana 		 */
50322be63d5cSFilipe Manana 		btrfs_set_log_full_commit(fs_info, trans);
50332be63d5cSFilipe Manana 		ret = true;
50342be63d5cSFilipe Manana 	}
5035ab1717b2SNikolay Borisov 	mutex_unlock(&inode->log_mutex);
50362be63d5cSFilipe Manana 
50372be63d5cSFilipe Manana 	return ret;
50382be63d5cSFilipe Manana }
50392be63d5cSFilipe Manana 
50402be63d5cSFilipe Manana /*
504112fcfd22SChris Mason  * follow the dentry parent pointers up the chain and see if any
504212fcfd22SChris Mason  * of the directories in it require a full commit before they can
504312fcfd22SChris Mason  * be logged.  Returns zero if nothing special needs to be done or 1 if
504412fcfd22SChris Mason  * a full commit is required.
504512fcfd22SChris Mason  */
504612fcfd22SChris Mason static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
504712fcfd22SChris Mason 					       struct inode *inode,
504812fcfd22SChris Mason 					       struct dentry *parent,
504912fcfd22SChris Mason 					       struct super_block *sb,
505012fcfd22SChris Mason 					       u64 last_committed)
5051e02119d5SChris Mason {
505212fcfd22SChris Mason 	int ret = 0;
50536a912213SJosef Bacik 	struct dentry *old_parent = NULL;
5054de2b530bSJosef Bacik 	struct inode *orig_inode = inode;
5055e02119d5SChris Mason 
5056af4176b4SChris Mason 	/*
5057af4176b4SChris Mason 	 * for regular files, if its inode is already on disk, we don't
5058af4176b4SChris Mason 	 * have to worry about the parents at all.  This is because
5059af4176b4SChris Mason 	 * we can use the last_unlink_trans field to record renames
5060af4176b4SChris Mason 	 * and other fun in this file.
5061af4176b4SChris Mason 	 */
5062af4176b4SChris Mason 	if (S_ISREG(inode->i_mode) &&
5063af4176b4SChris Mason 	    BTRFS_I(inode)->generation <= last_committed &&
5064af4176b4SChris Mason 	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
5065af4176b4SChris Mason 			goto out;
5066af4176b4SChris Mason 
506712fcfd22SChris Mason 	if (!S_ISDIR(inode->i_mode)) {
5068fc64005cSAl Viro 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
506912fcfd22SChris Mason 			goto out;
50702b0143b5SDavid Howells 		inode = d_inode(parent);
507112fcfd22SChris Mason 	}
507212fcfd22SChris Mason 
507312fcfd22SChris Mason 	while (1) {
5074de2b530bSJosef Bacik 		/*
5075de2b530bSJosef Bacik 		 * If we are logging a directory then we start with our inode,
507601327610SNicholas D Steeves 		 * not our parent's inode, so we need to skip setting the
5077de2b530bSJosef Bacik 		 * logged_trans so that further down in the log code we don't
5078de2b530bSJosef Bacik 		 * think this inode has already been logged.
5079de2b530bSJosef Bacik 		 */
5080de2b530bSJosef Bacik 		if (inode != orig_inode)
508112fcfd22SChris Mason 			BTRFS_I(inode)->logged_trans = trans->transid;
508212fcfd22SChris Mason 		smp_mb();
508312fcfd22SChris Mason 
5084ab1717b2SNikolay Borisov 		if (btrfs_must_commit_transaction(trans, BTRFS_I(inode))) {
508512fcfd22SChris Mason 			ret = 1;
508612fcfd22SChris Mason 			break;
508712fcfd22SChris Mason 		}
508812fcfd22SChris Mason 
5089fc64005cSAl Viro 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
509012fcfd22SChris Mason 			break;
509112fcfd22SChris Mason 
509244f714daSFilipe Manana 		if (IS_ROOT(parent)) {
509344f714daSFilipe Manana 			inode = d_inode(parent);
5094ab1717b2SNikolay Borisov 			if (btrfs_must_commit_transaction(trans, BTRFS_I(inode)))
509544f714daSFilipe Manana 				ret = 1;
509612fcfd22SChris Mason 			break;
509744f714daSFilipe Manana 		}
509812fcfd22SChris Mason 
50996a912213SJosef Bacik 		parent = dget_parent(parent);
51006a912213SJosef Bacik 		dput(old_parent);
51016a912213SJosef Bacik 		old_parent = parent;
51022b0143b5SDavid Howells 		inode = d_inode(parent);
510312fcfd22SChris Mason 
510412fcfd22SChris Mason 	}
51056a912213SJosef Bacik 	dput(old_parent);
510612fcfd22SChris Mason out:
5107e02119d5SChris Mason 	return ret;
5108e02119d5SChris Mason }
5109e02119d5SChris Mason 
51102f2ff0eeSFilipe Manana struct btrfs_dir_list {
51112f2ff0eeSFilipe Manana 	u64 ino;
51122f2ff0eeSFilipe Manana 	struct list_head list;
51132f2ff0eeSFilipe Manana };
51142f2ff0eeSFilipe Manana 
51152f2ff0eeSFilipe Manana /*
51162f2ff0eeSFilipe Manana  * Log the inodes of the new dentries of a directory. See log_dir_items() for
51172f2ff0eeSFilipe Manana  * details about the why it is needed.
51182f2ff0eeSFilipe Manana  * This is a recursive operation - if an existing dentry corresponds to a
51192f2ff0eeSFilipe Manana  * directory, that directory's new entries are logged too (same behaviour as
51202f2ff0eeSFilipe Manana  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
51212f2ff0eeSFilipe Manana  * the dentries point to we do not lock their i_mutex, otherwise lockdep
51222f2ff0eeSFilipe Manana  * complains about the following circular lock dependency / possible deadlock:
51232f2ff0eeSFilipe Manana  *
51242f2ff0eeSFilipe Manana  *        CPU0                                        CPU1
51252f2ff0eeSFilipe Manana  *        ----                                        ----
51262f2ff0eeSFilipe Manana  * lock(&type->i_mutex_dir_key#3/2);
51272f2ff0eeSFilipe Manana  *                                            lock(sb_internal#2);
51282f2ff0eeSFilipe Manana  *                                            lock(&type->i_mutex_dir_key#3/2);
51292f2ff0eeSFilipe Manana  * lock(&sb->s_type->i_mutex_key#14);
51302f2ff0eeSFilipe Manana  *
51312f2ff0eeSFilipe Manana  * Where sb_internal is the lock (a counter that works as a lock) acquired by
51322f2ff0eeSFilipe Manana  * sb_start_intwrite() in btrfs_start_transaction().
51332f2ff0eeSFilipe Manana  * Not locking i_mutex of the inodes is still safe because:
51342f2ff0eeSFilipe Manana  *
51352f2ff0eeSFilipe Manana  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
51362f2ff0eeSFilipe Manana  *    that while logging the inode new references (names) are added or removed
51372f2ff0eeSFilipe Manana  *    from the inode, leaving the logged inode item with a link count that does
51382f2ff0eeSFilipe Manana  *    not match the number of logged inode reference items. This is fine because
51392f2ff0eeSFilipe Manana  *    at log replay time we compute the real number of links and correct the
51402f2ff0eeSFilipe Manana  *    link count in the inode item (see replay_one_buffer() and
51412f2ff0eeSFilipe Manana  *    link_to_fixup_dir());
51422f2ff0eeSFilipe Manana  *
51432f2ff0eeSFilipe Manana  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
51442f2ff0eeSFilipe Manana  *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
51452f2ff0eeSFilipe Manana  *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
51462f2ff0eeSFilipe Manana  *    has a size that doesn't match the sum of the lengths of all the logged
51472f2ff0eeSFilipe Manana  *    names. This does not result in a problem because if a dir_item key is
51482f2ff0eeSFilipe Manana  *    logged but its matching dir_index key is not logged, at log replay time we
51492f2ff0eeSFilipe Manana  *    don't use it to replay the respective name (see replay_one_name()). On the
51502f2ff0eeSFilipe Manana  *    other hand if only the dir_index key ends up being logged, the respective
51512f2ff0eeSFilipe Manana  *    name is added to the fs/subvol tree with both the dir_item and dir_index
51522f2ff0eeSFilipe Manana  *    keys created (see replay_one_name()).
51532f2ff0eeSFilipe Manana  *    The directory's inode item with a wrong i_size is not a problem as well,
51542f2ff0eeSFilipe Manana  *    since we don't use it at log replay time to set the i_size in the inode
51552f2ff0eeSFilipe Manana  *    item of the fs/subvol tree (see overwrite_item()).
51562f2ff0eeSFilipe Manana  */
51572f2ff0eeSFilipe Manana static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
51582f2ff0eeSFilipe Manana 				struct btrfs_root *root,
51592f2ff0eeSFilipe Manana 				struct inode *start_inode,
51602f2ff0eeSFilipe Manana 				struct btrfs_log_ctx *ctx)
51612f2ff0eeSFilipe Manana {
51620b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
51632f2ff0eeSFilipe Manana 	struct btrfs_root *log = root->log_root;
51642f2ff0eeSFilipe Manana 	struct btrfs_path *path;
51652f2ff0eeSFilipe Manana 	LIST_HEAD(dir_list);
51662f2ff0eeSFilipe Manana 	struct btrfs_dir_list *dir_elem;
51672f2ff0eeSFilipe Manana 	int ret = 0;
51682f2ff0eeSFilipe Manana 
51692f2ff0eeSFilipe Manana 	path = btrfs_alloc_path();
51702f2ff0eeSFilipe Manana 	if (!path)
51712f2ff0eeSFilipe Manana 		return -ENOMEM;
51722f2ff0eeSFilipe Manana 
51732f2ff0eeSFilipe Manana 	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
51742f2ff0eeSFilipe Manana 	if (!dir_elem) {
51752f2ff0eeSFilipe Manana 		btrfs_free_path(path);
51762f2ff0eeSFilipe Manana 		return -ENOMEM;
51772f2ff0eeSFilipe Manana 	}
51784a0cc7caSNikolay Borisov 	dir_elem->ino = btrfs_ino(BTRFS_I(start_inode));
51792f2ff0eeSFilipe Manana 	list_add_tail(&dir_elem->list, &dir_list);
51802f2ff0eeSFilipe Manana 
51812f2ff0eeSFilipe Manana 	while (!list_empty(&dir_list)) {
51822f2ff0eeSFilipe Manana 		struct extent_buffer *leaf;
51832f2ff0eeSFilipe Manana 		struct btrfs_key min_key;
51842f2ff0eeSFilipe Manana 		int nritems;
51852f2ff0eeSFilipe Manana 		int i;
51862f2ff0eeSFilipe Manana 
51872f2ff0eeSFilipe Manana 		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
51882f2ff0eeSFilipe Manana 					    list);
51892f2ff0eeSFilipe Manana 		if (ret)
51902f2ff0eeSFilipe Manana 			goto next_dir_inode;
51912f2ff0eeSFilipe Manana 
51922f2ff0eeSFilipe Manana 		min_key.objectid = dir_elem->ino;
51932f2ff0eeSFilipe Manana 		min_key.type = BTRFS_DIR_ITEM_KEY;
51942f2ff0eeSFilipe Manana 		min_key.offset = 0;
51952f2ff0eeSFilipe Manana again:
51962f2ff0eeSFilipe Manana 		btrfs_release_path(path);
51972f2ff0eeSFilipe Manana 		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
51982f2ff0eeSFilipe Manana 		if (ret < 0) {
51992f2ff0eeSFilipe Manana 			goto next_dir_inode;
52002f2ff0eeSFilipe Manana 		} else if (ret > 0) {
52012f2ff0eeSFilipe Manana 			ret = 0;
52022f2ff0eeSFilipe Manana 			goto next_dir_inode;
52032f2ff0eeSFilipe Manana 		}
52042f2ff0eeSFilipe Manana 
52052f2ff0eeSFilipe Manana process_leaf:
52062f2ff0eeSFilipe Manana 		leaf = path->nodes[0];
52072f2ff0eeSFilipe Manana 		nritems = btrfs_header_nritems(leaf);
52082f2ff0eeSFilipe Manana 		for (i = path->slots[0]; i < nritems; i++) {
52092f2ff0eeSFilipe Manana 			struct btrfs_dir_item *di;
52102f2ff0eeSFilipe Manana 			struct btrfs_key di_key;
52112f2ff0eeSFilipe Manana 			struct inode *di_inode;
52122f2ff0eeSFilipe Manana 			struct btrfs_dir_list *new_dir_elem;
52132f2ff0eeSFilipe Manana 			int log_mode = LOG_INODE_EXISTS;
52142f2ff0eeSFilipe Manana 			int type;
52152f2ff0eeSFilipe Manana 
52162f2ff0eeSFilipe Manana 			btrfs_item_key_to_cpu(leaf, &min_key, i);
52172f2ff0eeSFilipe Manana 			if (min_key.objectid != dir_elem->ino ||
52182f2ff0eeSFilipe Manana 			    min_key.type != BTRFS_DIR_ITEM_KEY)
52192f2ff0eeSFilipe Manana 				goto next_dir_inode;
52202f2ff0eeSFilipe Manana 
52212f2ff0eeSFilipe Manana 			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
52222f2ff0eeSFilipe Manana 			type = btrfs_dir_type(leaf, di);
52232f2ff0eeSFilipe Manana 			if (btrfs_dir_transid(leaf, di) < trans->transid &&
52242f2ff0eeSFilipe Manana 			    type != BTRFS_FT_DIR)
52252f2ff0eeSFilipe Manana 				continue;
52262f2ff0eeSFilipe Manana 			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
52272f2ff0eeSFilipe Manana 			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
52282f2ff0eeSFilipe Manana 				continue;
52292f2ff0eeSFilipe Manana 
5230ec125cfbSRobbie Ko 			btrfs_release_path(path);
52310b246afaSJeff Mahoney 			di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
52322f2ff0eeSFilipe Manana 			if (IS_ERR(di_inode)) {
52332f2ff0eeSFilipe Manana 				ret = PTR_ERR(di_inode);
52342f2ff0eeSFilipe Manana 				goto next_dir_inode;
52352f2ff0eeSFilipe Manana 			}
52362f2ff0eeSFilipe Manana 
52370f8939b8SNikolay Borisov 			if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
52382f2ff0eeSFilipe Manana 				iput(di_inode);
5239ec125cfbSRobbie Ko 				break;
52402f2ff0eeSFilipe Manana 			}
52412f2ff0eeSFilipe Manana 
52422f2ff0eeSFilipe Manana 			ctx->log_new_dentries = false;
52433f9749f6SFilipe Manana 			if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
52442f2ff0eeSFilipe Manana 				log_mode = LOG_INODE_ALL;
52452f2ff0eeSFilipe Manana 			ret = btrfs_log_inode(trans, root, di_inode,
52462f2ff0eeSFilipe Manana 					      log_mode, 0, LLONG_MAX, ctx);
52472be63d5cSFilipe Manana 			if (!ret &&
5248ab1717b2SNikolay Borisov 			    btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
52492be63d5cSFilipe Manana 				ret = 1;
52502f2ff0eeSFilipe Manana 			iput(di_inode);
52512f2ff0eeSFilipe Manana 			if (ret)
52522f2ff0eeSFilipe Manana 				goto next_dir_inode;
52532f2ff0eeSFilipe Manana 			if (ctx->log_new_dentries) {
52542f2ff0eeSFilipe Manana 				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
52552f2ff0eeSFilipe Manana 						       GFP_NOFS);
52562f2ff0eeSFilipe Manana 				if (!new_dir_elem) {
52572f2ff0eeSFilipe Manana 					ret = -ENOMEM;
52582f2ff0eeSFilipe Manana 					goto next_dir_inode;
52592f2ff0eeSFilipe Manana 				}
52602f2ff0eeSFilipe Manana 				new_dir_elem->ino = di_key.objectid;
52612f2ff0eeSFilipe Manana 				list_add_tail(&new_dir_elem->list, &dir_list);
52622f2ff0eeSFilipe Manana 			}
52632f2ff0eeSFilipe Manana 			break;
52642f2ff0eeSFilipe Manana 		}
52652f2ff0eeSFilipe Manana 		if (i == nritems) {
52662f2ff0eeSFilipe Manana 			ret = btrfs_next_leaf(log, path);
52672f2ff0eeSFilipe Manana 			if (ret < 0) {
52682f2ff0eeSFilipe Manana 				goto next_dir_inode;
52692f2ff0eeSFilipe Manana 			} else if (ret > 0) {
52702f2ff0eeSFilipe Manana 				ret = 0;
52712f2ff0eeSFilipe Manana 				goto next_dir_inode;
52722f2ff0eeSFilipe Manana 			}
52732f2ff0eeSFilipe Manana 			goto process_leaf;
52742f2ff0eeSFilipe Manana 		}
52752f2ff0eeSFilipe Manana 		if (min_key.offset < (u64)-1) {
52762f2ff0eeSFilipe Manana 			min_key.offset++;
52772f2ff0eeSFilipe Manana 			goto again;
52782f2ff0eeSFilipe Manana 		}
52792f2ff0eeSFilipe Manana next_dir_inode:
52802f2ff0eeSFilipe Manana 		list_del(&dir_elem->list);
52812f2ff0eeSFilipe Manana 		kfree(dir_elem);
52822f2ff0eeSFilipe Manana 	}
52832f2ff0eeSFilipe Manana 
52842f2ff0eeSFilipe Manana 	btrfs_free_path(path);
52852f2ff0eeSFilipe Manana 	return ret;
52862f2ff0eeSFilipe Manana }
52872f2ff0eeSFilipe Manana 
528818aa0922SFilipe Manana static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
528918aa0922SFilipe Manana 				 struct inode *inode,
529018aa0922SFilipe Manana 				 struct btrfs_log_ctx *ctx)
529118aa0922SFilipe Manana {
52920b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
529318aa0922SFilipe Manana 	int ret;
529418aa0922SFilipe Manana 	struct btrfs_path *path;
529518aa0922SFilipe Manana 	struct btrfs_key key;
529618aa0922SFilipe Manana 	struct btrfs_root *root = BTRFS_I(inode)->root;
52974a0cc7caSNikolay Borisov 	const u64 ino = btrfs_ino(BTRFS_I(inode));
529818aa0922SFilipe Manana 
529918aa0922SFilipe Manana 	path = btrfs_alloc_path();
530018aa0922SFilipe Manana 	if (!path)
530118aa0922SFilipe Manana 		return -ENOMEM;
530218aa0922SFilipe Manana 	path->skip_locking = 1;
530318aa0922SFilipe Manana 	path->search_commit_root = 1;
530418aa0922SFilipe Manana 
530518aa0922SFilipe Manana 	key.objectid = ino;
530618aa0922SFilipe Manana 	key.type = BTRFS_INODE_REF_KEY;
530718aa0922SFilipe Manana 	key.offset = 0;
530818aa0922SFilipe Manana 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
530918aa0922SFilipe Manana 	if (ret < 0)
531018aa0922SFilipe Manana 		goto out;
531118aa0922SFilipe Manana 
531218aa0922SFilipe Manana 	while (true) {
531318aa0922SFilipe Manana 		struct extent_buffer *leaf = path->nodes[0];
531418aa0922SFilipe Manana 		int slot = path->slots[0];
531518aa0922SFilipe Manana 		u32 cur_offset = 0;
531618aa0922SFilipe Manana 		u32 item_size;
531718aa0922SFilipe Manana 		unsigned long ptr;
531818aa0922SFilipe Manana 
531918aa0922SFilipe Manana 		if (slot >= btrfs_header_nritems(leaf)) {
532018aa0922SFilipe Manana 			ret = btrfs_next_leaf(root, path);
532118aa0922SFilipe Manana 			if (ret < 0)
532218aa0922SFilipe Manana 				goto out;
532318aa0922SFilipe Manana 			else if (ret > 0)
532418aa0922SFilipe Manana 				break;
532518aa0922SFilipe Manana 			continue;
532618aa0922SFilipe Manana 		}
532718aa0922SFilipe Manana 
532818aa0922SFilipe Manana 		btrfs_item_key_to_cpu(leaf, &key, slot);
532918aa0922SFilipe Manana 		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
533018aa0922SFilipe Manana 		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
533118aa0922SFilipe Manana 			break;
533218aa0922SFilipe Manana 
533318aa0922SFilipe Manana 		item_size = btrfs_item_size_nr(leaf, slot);
533418aa0922SFilipe Manana 		ptr = btrfs_item_ptr_offset(leaf, slot);
533518aa0922SFilipe Manana 		while (cur_offset < item_size) {
533618aa0922SFilipe Manana 			struct btrfs_key inode_key;
533718aa0922SFilipe Manana 			struct inode *dir_inode;
533818aa0922SFilipe Manana 
533918aa0922SFilipe Manana 			inode_key.type = BTRFS_INODE_ITEM_KEY;
534018aa0922SFilipe Manana 			inode_key.offset = 0;
534118aa0922SFilipe Manana 
534218aa0922SFilipe Manana 			if (key.type == BTRFS_INODE_EXTREF_KEY) {
534318aa0922SFilipe Manana 				struct btrfs_inode_extref *extref;
534418aa0922SFilipe Manana 
534518aa0922SFilipe Manana 				extref = (struct btrfs_inode_extref *)
534618aa0922SFilipe Manana 					(ptr + cur_offset);
534718aa0922SFilipe Manana 				inode_key.objectid = btrfs_inode_extref_parent(
534818aa0922SFilipe Manana 					leaf, extref);
534918aa0922SFilipe Manana 				cur_offset += sizeof(*extref);
535018aa0922SFilipe Manana 				cur_offset += btrfs_inode_extref_name_len(leaf,
535118aa0922SFilipe Manana 					extref);
535218aa0922SFilipe Manana 			} else {
535318aa0922SFilipe Manana 				inode_key.objectid = key.offset;
535418aa0922SFilipe Manana 				cur_offset = item_size;
535518aa0922SFilipe Manana 			}
535618aa0922SFilipe Manana 
53570b246afaSJeff Mahoney 			dir_inode = btrfs_iget(fs_info->sb, &inode_key,
535818aa0922SFilipe Manana 					       root, NULL);
535918aa0922SFilipe Manana 			/* If parent inode was deleted, skip it. */
536018aa0922SFilipe Manana 			if (IS_ERR(dir_inode))
536118aa0922SFilipe Manana 				continue;
536218aa0922SFilipe Manana 
5363657ed1aaSFilipe Manana 			if (ctx)
5364657ed1aaSFilipe Manana 				ctx->log_new_dentries = false;
536518aa0922SFilipe Manana 			ret = btrfs_log_inode(trans, root, dir_inode,
536618aa0922SFilipe Manana 					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
53672be63d5cSFilipe Manana 			if (!ret &&
5368ab1717b2SNikolay Borisov 			    btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
53692be63d5cSFilipe Manana 				ret = 1;
5370657ed1aaSFilipe Manana 			if (!ret && ctx && ctx->log_new_dentries)
5371657ed1aaSFilipe Manana 				ret = log_new_dir_dentries(trans, root,
5372657ed1aaSFilipe Manana 							   dir_inode, ctx);
537318aa0922SFilipe Manana 			iput(dir_inode);
537418aa0922SFilipe Manana 			if (ret)
537518aa0922SFilipe Manana 				goto out;
537618aa0922SFilipe Manana 		}
537718aa0922SFilipe Manana 		path->slots[0]++;
537818aa0922SFilipe Manana 	}
537918aa0922SFilipe Manana 	ret = 0;
538018aa0922SFilipe Manana out:
538118aa0922SFilipe Manana 	btrfs_free_path(path);
538218aa0922SFilipe Manana 	return ret;
538318aa0922SFilipe Manana }
538418aa0922SFilipe Manana 
5385e02119d5SChris Mason /*
5386e02119d5SChris Mason  * helper function around btrfs_log_inode to make sure newly created
5387e02119d5SChris Mason  * parent directories also end up in the log.  A minimal inode and backref
5388e02119d5SChris Mason  * only logging is done of any parent directories that are older than
5389e02119d5SChris Mason  * the last committed transaction
5390e02119d5SChris Mason  */
539148a3b636SEric Sandeen static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
539212fcfd22SChris Mason 			    	  struct btrfs_root *root, struct inode *inode,
539349dae1bcSFilipe Manana 				  struct dentry *parent,
539449dae1bcSFilipe Manana 				  const loff_t start,
539549dae1bcSFilipe Manana 				  const loff_t end,
539649dae1bcSFilipe Manana 				  int exists_only,
53978b050d35SMiao Xie 				  struct btrfs_log_ctx *ctx)
5398e02119d5SChris Mason {
53990b246afaSJeff Mahoney 	struct btrfs_fs_info *fs_info = root->fs_info;
540012fcfd22SChris Mason 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
5401e02119d5SChris Mason 	struct super_block *sb;
54026a912213SJosef Bacik 	struct dentry *old_parent = NULL;
540312fcfd22SChris Mason 	int ret = 0;
54040b246afaSJeff Mahoney 	u64 last_committed = fs_info->last_trans_committed;
54052f2ff0eeSFilipe Manana 	bool log_dentries = false;
54062f2ff0eeSFilipe Manana 	struct inode *orig_inode = inode;
540712fcfd22SChris Mason 
540812fcfd22SChris Mason 	sb = inode->i_sb;
540912fcfd22SChris Mason 
54100b246afaSJeff Mahoney 	if (btrfs_test_opt(fs_info, NOTREELOG)) {
54113a5e1404SSage Weil 		ret = 1;
54123a5e1404SSage Weil 		goto end_no_trans;
54133a5e1404SSage Weil 	}
54143a5e1404SSage Weil 
5415995946ddSMiao Xie 	/*
5416995946ddSMiao Xie 	 * The prev transaction commit doesn't complete, we need do
5417995946ddSMiao Xie 	 * full commit by ourselves.
5418995946ddSMiao Xie 	 */
54190b246afaSJeff Mahoney 	if (fs_info->last_trans_log_full_commit >
54200b246afaSJeff Mahoney 	    fs_info->last_trans_committed) {
542112fcfd22SChris Mason 		ret = 1;
542212fcfd22SChris Mason 		goto end_no_trans;
542312fcfd22SChris Mason 	}
542412fcfd22SChris Mason 
542576dda93cSYan, Zheng 	if (root != BTRFS_I(inode)->root ||
542676dda93cSYan, Zheng 	    btrfs_root_refs(&root->root_item) == 0) {
542776dda93cSYan, Zheng 		ret = 1;
542876dda93cSYan, Zheng 		goto end_no_trans;
542976dda93cSYan, Zheng 	}
543076dda93cSYan, Zheng 
543112fcfd22SChris Mason 	ret = check_parent_dirs_for_sync(trans, inode, parent,
543212fcfd22SChris Mason 					 sb, last_committed);
543312fcfd22SChris Mason 	if (ret)
543412fcfd22SChris Mason 		goto end_no_trans;
5435e02119d5SChris Mason 
54360f8939b8SNikolay Borisov 	if (btrfs_inode_in_log(BTRFS_I(inode), trans->transid)) {
5437257c62e1SChris Mason 		ret = BTRFS_NO_LOG_SYNC;
5438257c62e1SChris Mason 		goto end_no_trans;
5439257c62e1SChris Mason 	}
5440257c62e1SChris Mason 
54418b050d35SMiao Xie 	ret = start_log_trans(trans, root, ctx);
54424a500fd1SYan, Zheng 	if (ret)
5443e87ac136SMiao Xie 		goto end_no_trans;
544412fcfd22SChris Mason 
54458407f553SFilipe Manana 	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
54464a500fd1SYan, Zheng 	if (ret)
54474a500fd1SYan, Zheng 		goto end_trans;
5448e02119d5SChris Mason 
5449af4176b4SChris Mason 	/*
5450af4176b4SChris Mason 	 * for regular files, if its inode is already on disk, we don't
5451af4176b4SChris Mason 	 * have to worry about the parents at all.  This is because
5452af4176b4SChris Mason 	 * we can use the last_unlink_trans field to record renames
5453af4176b4SChris Mason 	 * and other fun in this file.
5454af4176b4SChris Mason 	 */
5455af4176b4SChris Mason 	if (S_ISREG(inode->i_mode) &&
5456af4176b4SChris Mason 	    BTRFS_I(inode)->generation <= last_committed &&
54574a500fd1SYan, Zheng 	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
54584a500fd1SYan, Zheng 		ret = 0;
54594a500fd1SYan, Zheng 		goto end_trans;
54604a500fd1SYan, Zheng 	}
5461af4176b4SChris Mason 
54622f2ff0eeSFilipe Manana 	if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
54632f2ff0eeSFilipe Manana 		log_dentries = true;
54642f2ff0eeSFilipe Manana 
546518aa0922SFilipe Manana 	/*
546601327610SNicholas D Steeves 	 * On unlink we must make sure all our current and old parent directory
546718aa0922SFilipe Manana 	 * inodes are fully logged. This is to prevent leaving dangling
546818aa0922SFilipe Manana 	 * directory index entries in directories that were our parents but are
546918aa0922SFilipe Manana 	 * not anymore. Not doing this results in old parent directory being
547018aa0922SFilipe Manana 	 * impossible to delete after log replay (rmdir will always fail with
547118aa0922SFilipe Manana 	 * error -ENOTEMPTY).
547218aa0922SFilipe Manana 	 *
547318aa0922SFilipe Manana 	 * Example 1:
547418aa0922SFilipe Manana 	 *
547518aa0922SFilipe Manana 	 * mkdir testdir
547618aa0922SFilipe Manana 	 * touch testdir/foo
547718aa0922SFilipe Manana 	 * ln testdir/foo testdir/bar
547818aa0922SFilipe Manana 	 * sync
547918aa0922SFilipe Manana 	 * unlink testdir/bar
548018aa0922SFilipe Manana 	 * xfs_io -c fsync testdir/foo
548118aa0922SFilipe Manana 	 * <power failure>
548218aa0922SFilipe Manana 	 * mount fs, triggers log replay
548318aa0922SFilipe Manana 	 *
548418aa0922SFilipe Manana 	 * If we don't log the parent directory (testdir), after log replay the
548518aa0922SFilipe Manana 	 * directory still has an entry pointing to the file inode using the bar
548618aa0922SFilipe Manana 	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
548718aa0922SFilipe Manana 	 * the file inode has a link count of 1.
548818aa0922SFilipe Manana 	 *
548918aa0922SFilipe Manana 	 * Example 2:
549018aa0922SFilipe Manana 	 *
549118aa0922SFilipe Manana 	 * mkdir testdir
549218aa0922SFilipe Manana 	 * touch foo
549318aa0922SFilipe Manana 	 * ln foo testdir/foo2
549418aa0922SFilipe Manana 	 * ln foo testdir/foo3
549518aa0922SFilipe Manana 	 * sync
549618aa0922SFilipe Manana 	 * unlink testdir/foo3
549718aa0922SFilipe Manana 	 * xfs_io -c fsync foo
549818aa0922SFilipe Manana 	 * <power failure>
549918aa0922SFilipe Manana 	 * mount fs, triggers log replay
550018aa0922SFilipe Manana 	 *
550118aa0922SFilipe Manana 	 * Similar as the first example, after log replay the parent directory
550218aa0922SFilipe Manana 	 * testdir still has an entry pointing to the inode file with name foo3
550318aa0922SFilipe Manana 	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
550418aa0922SFilipe Manana 	 * and has a link count of 2.
550518aa0922SFilipe Manana 	 */
550618aa0922SFilipe Manana 	if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
550718aa0922SFilipe Manana 		ret = btrfs_log_all_parents(trans, orig_inode, ctx);
550818aa0922SFilipe Manana 		if (ret)
550918aa0922SFilipe Manana 			goto end_trans;
551018aa0922SFilipe Manana 	}
551118aa0922SFilipe Manana 
551212fcfd22SChris Mason 	while (1) {
5513fc64005cSAl Viro 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5514e02119d5SChris Mason 			break;
5515e02119d5SChris Mason 
55162b0143b5SDavid Howells 		inode = d_inode(parent);
551776dda93cSYan, Zheng 		if (root != BTRFS_I(inode)->root)
551876dda93cSYan, Zheng 			break;
551976dda93cSYan, Zheng 
552018aa0922SFilipe Manana 		if (BTRFS_I(inode)->generation > last_committed) {
552118aa0922SFilipe Manana 			ret = btrfs_log_inode(trans, root, inode,
552218aa0922SFilipe Manana 					      LOG_INODE_EXISTS,
55238407f553SFilipe Manana 					      0, LLONG_MAX, ctx);
55244a500fd1SYan, Zheng 			if (ret)
55254a500fd1SYan, Zheng 				goto end_trans;
5526e02119d5SChris Mason 		}
552776dda93cSYan, Zheng 		if (IS_ROOT(parent))
552812fcfd22SChris Mason 			break;
552912fcfd22SChris Mason 
55306a912213SJosef Bacik 		parent = dget_parent(parent);
55316a912213SJosef Bacik 		dput(old_parent);
55326a912213SJosef Bacik 		old_parent = parent;
553312fcfd22SChris Mason 	}
55342f2ff0eeSFilipe Manana 	if (log_dentries)
55352f2ff0eeSFilipe Manana 		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
55362f2ff0eeSFilipe Manana 	else
553712fcfd22SChris Mason 		ret = 0;
55384a500fd1SYan, Zheng end_trans:
55396a912213SJosef Bacik 	dput(old_parent);
55404a500fd1SYan, Zheng 	if (ret < 0) {
55410b246afaSJeff Mahoney 		btrfs_set_log_full_commit(fs_info, trans);
55424a500fd1SYan, Zheng 		ret = 1;
55434a500fd1SYan, Zheng 	}
55448b050d35SMiao Xie 
55458b050d35SMiao Xie 	if (ret)
55468b050d35SMiao Xie 		btrfs_remove_log_ctx(root, ctx);
554712fcfd22SChris Mason 	btrfs_end_log_trans(root);
554812fcfd22SChris Mason end_no_trans:
554912fcfd22SChris Mason 	return ret;
5550e02119d5SChris Mason }
5551e02119d5SChris Mason 
5552e02119d5SChris Mason /*
5553e02119d5SChris Mason  * it is not safe to log dentry if the chunk root has added new
5554e02119d5SChris Mason  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
5555e02119d5SChris Mason  * If this returns 1, you must commit the transaction to safely get your
5556e02119d5SChris Mason  * data on disk.
5557e02119d5SChris Mason  */
5558e02119d5SChris Mason int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
55598b050d35SMiao Xie 			  struct btrfs_root *root, struct dentry *dentry,
556049dae1bcSFilipe Manana 			  const loff_t start,
556149dae1bcSFilipe Manana 			  const loff_t end,
55628b050d35SMiao Xie 			  struct btrfs_log_ctx *ctx)
5563e02119d5SChris Mason {
55646a912213SJosef Bacik 	struct dentry *parent = dget_parent(dentry);
55656a912213SJosef Bacik 	int ret;
55666a912213SJosef Bacik 
55672b0143b5SDavid Howells 	ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent,
556849dae1bcSFilipe Manana 				     start, end, 0, ctx);
55696a912213SJosef Bacik 	dput(parent);
55706a912213SJosef Bacik 
55716a912213SJosef Bacik 	return ret;
5572e02119d5SChris Mason }
5573e02119d5SChris Mason 
5574e02119d5SChris Mason /*
5575e02119d5SChris Mason  * should be called during mount to recover any replay any log trees
5576e02119d5SChris Mason  * from the FS
5577e02119d5SChris Mason  */
5578e02119d5SChris Mason int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5579e02119d5SChris Mason {
5580e02119d5SChris Mason 	int ret;
5581e02119d5SChris Mason 	struct btrfs_path *path;
5582e02119d5SChris Mason 	struct btrfs_trans_handle *trans;
5583e02119d5SChris Mason 	struct btrfs_key key;
5584e02119d5SChris Mason 	struct btrfs_key found_key;
5585e02119d5SChris Mason 	struct btrfs_key tmp_key;
5586e02119d5SChris Mason 	struct btrfs_root *log;
5587e02119d5SChris Mason 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
5588e02119d5SChris Mason 	struct walk_control wc = {
5589e02119d5SChris Mason 		.process_func = process_one_buffer,
5590e02119d5SChris Mason 		.stage = 0,
5591e02119d5SChris Mason 	};
5592e02119d5SChris Mason 
5593e02119d5SChris Mason 	path = btrfs_alloc_path();
5594db5b493aSTsutomu Itoh 	if (!path)
5595db5b493aSTsutomu Itoh 		return -ENOMEM;
5596db5b493aSTsutomu Itoh 
5597afcdd129SJosef Bacik 	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5598e02119d5SChris Mason 
55994a500fd1SYan, Zheng 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
560079787eaaSJeff Mahoney 	if (IS_ERR(trans)) {
560179787eaaSJeff Mahoney 		ret = PTR_ERR(trans);
560279787eaaSJeff Mahoney 		goto error;
560379787eaaSJeff Mahoney 	}
5604e02119d5SChris Mason 
5605e02119d5SChris Mason 	wc.trans = trans;
5606e02119d5SChris Mason 	wc.pin = 1;
5607e02119d5SChris Mason 
5608db5b493aSTsutomu Itoh 	ret = walk_log_tree(trans, log_root_tree, &wc);
560979787eaaSJeff Mahoney 	if (ret) {
56105d163e0eSJeff Mahoney 		btrfs_handle_fs_error(fs_info, ret,
56115d163e0eSJeff Mahoney 			"Failed to pin buffers while recovering log root tree.");
561279787eaaSJeff Mahoney 		goto error;
561379787eaaSJeff Mahoney 	}
5614e02119d5SChris Mason 
5615e02119d5SChris Mason again:
5616e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
5617e02119d5SChris Mason 	key.offset = (u64)-1;
5618962a298fSDavid Sterba 	key.type = BTRFS_ROOT_ITEM_KEY;
5619e02119d5SChris Mason 
5620e02119d5SChris Mason 	while (1) {
5621e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
562279787eaaSJeff Mahoney 
562379787eaaSJeff Mahoney 		if (ret < 0) {
562434d97007SAnand Jain 			btrfs_handle_fs_error(fs_info, ret,
562579787eaaSJeff Mahoney 				    "Couldn't find tree log root.");
562679787eaaSJeff Mahoney 			goto error;
562779787eaaSJeff Mahoney 		}
5628e02119d5SChris Mason 		if (ret > 0) {
5629e02119d5SChris Mason 			if (path->slots[0] == 0)
5630e02119d5SChris Mason 				break;
5631e02119d5SChris Mason 			path->slots[0]--;
5632e02119d5SChris Mason 		}
5633e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
5634e02119d5SChris Mason 				      path->slots[0]);
5635b3b4aa74SDavid Sterba 		btrfs_release_path(path);
5636e02119d5SChris Mason 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
5637e02119d5SChris Mason 			break;
5638e02119d5SChris Mason 
5639cb517eabSMiao Xie 		log = btrfs_read_fs_root(log_root_tree, &found_key);
564079787eaaSJeff Mahoney 		if (IS_ERR(log)) {
564179787eaaSJeff Mahoney 			ret = PTR_ERR(log);
564234d97007SAnand Jain 			btrfs_handle_fs_error(fs_info, ret,
564379787eaaSJeff Mahoney 				    "Couldn't read tree log root.");
564479787eaaSJeff Mahoney 			goto error;
564579787eaaSJeff Mahoney 		}
5646e02119d5SChris Mason 
5647e02119d5SChris Mason 		tmp_key.objectid = found_key.offset;
5648e02119d5SChris Mason 		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
5649e02119d5SChris Mason 		tmp_key.offset = (u64)-1;
5650e02119d5SChris Mason 
5651e02119d5SChris Mason 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
565279787eaaSJeff Mahoney 		if (IS_ERR(wc.replay_dest)) {
565379787eaaSJeff Mahoney 			ret = PTR_ERR(wc.replay_dest);
5654b50c6e25SJosef Bacik 			free_extent_buffer(log->node);
5655b50c6e25SJosef Bacik 			free_extent_buffer(log->commit_root);
5656b50c6e25SJosef Bacik 			kfree(log);
56575d163e0eSJeff Mahoney 			btrfs_handle_fs_error(fs_info, ret,
56585d163e0eSJeff Mahoney 				"Couldn't read target root for tree log recovery.");
565979787eaaSJeff Mahoney 			goto error;
566079787eaaSJeff Mahoney 		}
5661e02119d5SChris Mason 
566207d400a6SYan Zheng 		wc.replay_dest->log_root = log;
56635d4f98a2SYan Zheng 		btrfs_record_root_in_trans(trans, wc.replay_dest);
5664e02119d5SChris Mason 		ret = walk_log_tree(trans, log, &wc);
5665e02119d5SChris Mason 
5666b50c6e25SJosef Bacik 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5667e02119d5SChris Mason 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
5668e02119d5SChris Mason 						      path);
5669e02119d5SChris Mason 		}
5670e02119d5SChris Mason 
5671e02119d5SChris Mason 		key.offset = found_key.offset - 1;
567207d400a6SYan Zheng 		wc.replay_dest->log_root = NULL;
5673e02119d5SChris Mason 		free_extent_buffer(log->node);
5674b263c2c8SChris Mason 		free_extent_buffer(log->commit_root);
5675e02119d5SChris Mason 		kfree(log);
5676e02119d5SChris Mason 
5677b50c6e25SJosef Bacik 		if (ret)
5678b50c6e25SJosef Bacik 			goto error;
5679b50c6e25SJosef Bacik 
5680e02119d5SChris Mason 		if (found_key.offset == 0)
5681e02119d5SChris Mason 			break;
5682e02119d5SChris Mason 	}
5683b3b4aa74SDavid Sterba 	btrfs_release_path(path);
5684e02119d5SChris Mason 
5685e02119d5SChris Mason 	/* step one is to pin it all, step two is to replay just inodes */
5686e02119d5SChris Mason 	if (wc.pin) {
5687e02119d5SChris Mason 		wc.pin = 0;
5688e02119d5SChris Mason 		wc.process_func = replay_one_buffer;
5689e02119d5SChris Mason 		wc.stage = LOG_WALK_REPLAY_INODES;
5690e02119d5SChris Mason 		goto again;
5691e02119d5SChris Mason 	}
5692e02119d5SChris Mason 	/* step three is to replay everything */
5693e02119d5SChris Mason 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
5694e02119d5SChris Mason 		wc.stage++;
5695e02119d5SChris Mason 		goto again;
5696e02119d5SChris Mason 	}
5697e02119d5SChris Mason 
5698e02119d5SChris Mason 	btrfs_free_path(path);
5699e02119d5SChris Mason 
5700abefa55aSJosef Bacik 	/* step 4: commit the transaction, which also unpins the blocks */
57013a45bb20SJeff Mahoney 	ret = btrfs_commit_transaction(trans);
5702abefa55aSJosef Bacik 	if (ret)
5703abefa55aSJosef Bacik 		return ret;
5704abefa55aSJosef Bacik 
5705e02119d5SChris Mason 	free_extent_buffer(log_root_tree->node);
5706e02119d5SChris Mason 	log_root_tree->log_root = NULL;
5707afcdd129SJosef Bacik 	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5708e02119d5SChris Mason 	kfree(log_root_tree);
570979787eaaSJeff Mahoney 
5710abefa55aSJosef Bacik 	return 0;
571179787eaaSJeff Mahoney error:
5712b50c6e25SJosef Bacik 	if (wc.trans)
57133a45bb20SJeff Mahoney 		btrfs_end_transaction(wc.trans);
571479787eaaSJeff Mahoney 	btrfs_free_path(path);
571579787eaaSJeff Mahoney 	return ret;
5716e02119d5SChris Mason }
571712fcfd22SChris Mason 
571812fcfd22SChris Mason /*
571912fcfd22SChris Mason  * there are some corner cases where we want to force a full
572012fcfd22SChris Mason  * commit instead of allowing a directory to be logged.
572112fcfd22SChris Mason  *
572212fcfd22SChris Mason  * They revolve around files there were unlinked from the directory, and
572312fcfd22SChris Mason  * this function updates the parent directory so that a full commit is
572412fcfd22SChris Mason  * properly done if it is fsync'd later after the unlinks are done.
57252be63d5cSFilipe Manana  *
57262be63d5cSFilipe Manana  * Must be called before the unlink operations (updates to the subvolume tree,
57272be63d5cSFilipe Manana  * inodes, etc) are done.
572812fcfd22SChris Mason  */
572912fcfd22SChris Mason void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
57304176bdbfSNikolay Borisov 			     struct btrfs_inode *dir, struct btrfs_inode *inode,
573112fcfd22SChris Mason 			     int for_rename)
573212fcfd22SChris Mason {
573312fcfd22SChris Mason 	/*
5734af4176b4SChris Mason 	 * when we're logging a file, if it hasn't been renamed
5735af4176b4SChris Mason 	 * or unlinked, and its inode is fully committed on disk,
5736af4176b4SChris Mason 	 * we don't have to worry about walking up the directory chain
5737af4176b4SChris Mason 	 * to log its parents.
5738af4176b4SChris Mason 	 *
5739af4176b4SChris Mason 	 * So, we use the last_unlink_trans field to put this transid
5740af4176b4SChris Mason 	 * into the file.  When the file is logged we check it and
5741af4176b4SChris Mason 	 * don't log the parents if the file is fully on disk.
5742af4176b4SChris Mason 	 */
57434176bdbfSNikolay Borisov 	mutex_lock(&inode->log_mutex);
57444176bdbfSNikolay Borisov 	inode->last_unlink_trans = trans->transid;
57454176bdbfSNikolay Borisov 	mutex_unlock(&inode->log_mutex);
5746af4176b4SChris Mason 
5747af4176b4SChris Mason 	/*
574812fcfd22SChris Mason 	 * if this directory was already logged any new
574912fcfd22SChris Mason 	 * names for this file/dir will get recorded
575012fcfd22SChris Mason 	 */
575112fcfd22SChris Mason 	smp_mb();
57524176bdbfSNikolay Borisov 	if (dir->logged_trans == trans->transid)
575312fcfd22SChris Mason 		return;
575412fcfd22SChris Mason 
575512fcfd22SChris Mason 	/*
575612fcfd22SChris Mason 	 * if the inode we're about to unlink was logged,
575712fcfd22SChris Mason 	 * the log will be properly updated for any new names
575812fcfd22SChris Mason 	 */
57594176bdbfSNikolay Borisov 	if (inode->logged_trans == trans->transid)
576012fcfd22SChris Mason 		return;
576112fcfd22SChris Mason 
576212fcfd22SChris Mason 	/*
576312fcfd22SChris Mason 	 * when renaming files across directories, if the directory
576412fcfd22SChris Mason 	 * there we're unlinking from gets fsync'd later on, there's
576512fcfd22SChris Mason 	 * no way to find the destination directory later and fsync it
576612fcfd22SChris Mason 	 * properly.  So, we have to be conservative and force commits
576712fcfd22SChris Mason 	 * so the new name gets discovered.
576812fcfd22SChris Mason 	 */
576912fcfd22SChris Mason 	if (for_rename)
577012fcfd22SChris Mason 		goto record;
577112fcfd22SChris Mason 
577212fcfd22SChris Mason 	/* we can safely do the unlink without any special recording */
577312fcfd22SChris Mason 	return;
577412fcfd22SChris Mason 
577512fcfd22SChris Mason record:
57764176bdbfSNikolay Borisov 	mutex_lock(&dir->log_mutex);
57774176bdbfSNikolay Borisov 	dir->last_unlink_trans = trans->transid;
57784176bdbfSNikolay Borisov 	mutex_unlock(&dir->log_mutex);
577912fcfd22SChris Mason }
578012fcfd22SChris Mason 
578112fcfd22SChris Mason /*
57821ec9a1aeSFilipe Manana  * Make sure that if someone attempts to fsync the parent directory of a deleted
57831ec9a1aeSFilipe Manana  * snapshot, it ends up triggering a transaction commit. This is to guarantee
57841ec9a1aeSFilipe Manana  * that after replaying the log tree of the parent directory's root we will not
57851ec9a1aeSFilipe Manana  * see the snapshot anymore and at log replay time we will not see any log tree
57861ec9a1aeSFilipe Manana  * corresponding to the deleted snapshot's root, which could lead to replaying
57871ec9a1aeSFilipe Manana  * it after replaying the log tree of the parent directory (which would replay
57881ec9a1aeSFilipe Manana  * the snapshot delete operation).
57892be63d5cSFilipe Manana  *
57902be63d5cSFilipe Manana  * Must be called before the actual snapshot destroy operation (updates to the
57912be63d5cSFilipe Manana  * parent root and tree of tree roots trees, etc) are done.
57921ec9a1aeSFilipe Manana  */
57931ec9a1aeSFilipe Manana void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
579443663557SNikolay Borisov 				   struct btrfs_inode *dir)
57951ec9a1aeSFilipe Manana {
579643663557SNikolay Borisov 	mutex_lock(&dir->log_mutex);
579743663557SNikolay Borisov 	dir->last_unlink_trans = trans->transid;
579843663557SNikolay Borisov 	mutex_unlock(&dir->log_mutex);
57991ec9a1aeSFilipe Manana }
58001ec9a1aeSFilipe Manana 
58011ec9a1aeSFilipe Manana /*
580212fcfd22SChris Mason  * Call this after adding a new name for a file and it will properly
580312fcfd22SChris Mason  * update the log to reflect the new name.
580412fcfd22SChris Mason  *
580512fcfd22SChris Mason  * It will return zero if all goes well, and it will return 1 if a
580612fcfd22SChris Mason  * full transaction commit is required.
580712fcfd22SChris Mason  */
580812fcfd22SChris Mason int btrfs_log_new_name(struct btrfs_trans_handle *trans,
58099ca5fbfbSNikolay Borisov 			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
581012fcfd22SChris Mason 			struct dentry *parent)
581112fcfd22SChris Mason {
58129ca5fbfbSNikolay Borisov 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
58139ca5fbfbSNikolay Borisov 	struct btrfs_root * root = inode->root;
581412fcfd22SChris Mason 
581512fcfd22SChris Mason 	/*
5816af4176b4SChris Mason 	 * this will force the logging code to walk the dentry chain
5817af4176b4SChris Mason 	 * up for the file
5818af4176b4SChris Mason 	 */
58199ca5fbfbSNikolay Borisov 	if (S_ISREG(inode->vfs_inode.i_mode))
58209ca5fbfbSNikolay Borisov 		inode->last_unlink_trans = trans->transid;
5821af4176b4SChris Mason 
5822af4176b4SChris Mason 	/*
582312fcfd22SChris Mason 	 * if this inode hasn't been logged and directory we're renaming it
582412fcfd22SChris Mason 	 * from hasn't been logged, we don't need to log it
582512fcfd22SChris Mason 	 */
58269ca5fbfbSNikolay Borisov 	if (inode->logged_trans <= fs_info->last_trans_committed &&
58279ca5fbfbSNikolay Borisov 	    (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
582812fcfd22SChris Mason 		return 0;
582912fcfd22SChris Mason 
58309ca5fbfbSNikolay Borisov 	return btrfs_log_inode_parent(trans, root, &inode->vfs_inode, parent, 0,
583149dae1bcSFilipe Manana 				      LLONG_MAX, 1, NULL);
583212fcfd22SChris Mason }
583312fcfd22SChris Mason 
5834