xref: /openbmc/linux/fs/btrfs/tree-log.c (revision 2ecb79239bcd04c9d410f4cdce16adb6840b19da)
1e02119d5SChris Mason /*
2e02119d5SChris Mason  * Copyright (C) 2008 Oracle.  All rights reserved.
3e02119d5SChris Mason  *
4e02119d5SChris Mason  * This program is free software; you can redistribute it and/or
5e02119d5SChris Mason  * modify it under the terms of the GNU General Public
6e02119d5SChris Mason  * License v2 as published by the Free Software Foundation.
7e02119d5SChris Mason  *
8e02119d5SChris Mason  * This program is distributed in the hope that it will be useful,
9e02119d5SChris Mason  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10e02119d5SChris Mason  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11e02119d5SChris Mason  * General Public License for more details.
12e02119d5SChris Mason  *
13e02119d5SChris Mason  * You should have received a copy of the GNU General Public
14e02119d5SChris Mason  * License along with this program; if not, write to the
15e02119d5SChris Mason  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16e02119d5SChris Mason  * Boston, MA 021110-1307, USA.
17e02119d5SChris Mason  */
18e02119d5SChris Mason 
19e02119d5SChris Mason #include <linux/sched.h>
205a0e3ad6STejun Heo #include <linux/slab.h>
215dc562c5SJosef Bacik #include <linux/list_sort.h>
22e02119d5SChris Mason #include "ctree.h"
23e02119d5SChris Mason #include "transaction.h"
24e02119d5SChris Mason #include "disk-io.h"
25e02119d5SChris Mason #include "locking.h"
26e02119d5SChris Mason #include "print-tree.h"
27e02119d5SChris Mason #include "compat.h"
28b2950863SChristoph Hellwig #include "tree-log.h"
29e02119d5SChris Mason 
/*
 * Values for the inode_only argument of btrfs_log_inode():
 *
 * LOG_INODE_ALL    - log everything
 * LOG_INODE_EXISTS - log just enough to recreate the inode during
 *                    log replay
 *
 * Kept in an enum so the related constants stay grouped, obey scope and
 * are visible to a debugger; the numeric values are unchanged.
 */
enum {
	LOG_INODE_ALL = 0,
	LOG_INODE_EXISTS = 1,
};
38e02119d5SChris Mason 
/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on f1/foo, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */
8112fcfd22SChris Mason 
/*
 * Stages of the log tree walk:
 *
 * LOG_WALK_PIN_ONLY      - stage 0, only pin down the blocks we find
 * LOG_WALK_REPLAY_INODES - stage 1, make sure every inode found in the
 *                          log is created in the subvolume
 * LOG_WALK_REPLAY_ALL    - last stage, deal with directories, links,
 *                          extents and all the other fun semantics
 *
 * Kept in an enum so the related constants stay grouped; the numeric
 * values are unchanged.
 */
enum {
	LOG_WALK_PIN_ONLY = 0,
	LOG_WALK_REPLAY_INODES = 1,
	LOG_WALK_REPLAY_ALL = 2,
};
94e02119d5SChris Mason 
9512fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans,
96e02119d5SChris Mason 			     struct btrfs_root *root, struct inode *inode,
97e02119d5SChris Mason 			     int inode_only);
98ec051c0fSYan Zheng static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
99ec051c0fSYan Zheng 			     struct btrfs_root *root,
100ec051c0fSYan Zheng 			     struct btrfs_path *path, u64 objectid);
10112fcfd22SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
10212fcfd22SChris Mason 				       struct btrfs_root *root,
10312fcfd22SChris Mason 				       struct btrfs_root *log,
10412fcfd22SChris Mason 				       struct btrfs_path *path,
10512fcfd22SChris Mason 				       u64 dirid, int del_all);
106e02119d5SChris Mason 
/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */
129e02119d5SChris Mason 
130e02119d5SChris Mason /*
131e02119d5SChris Mason  * start a sub transaction and setup the log tree
132e02119d5SChris Mason  * this increments the log tree writer count to make the people
133e02119d5SChris Mason  * syncing the tree wait for us to finish
134e02119d5SChris Mason  */
135e02119d5SChris Mason static int start_log_trans(struct btrfs_trans_handle *trans,
136e02119d5SChris Mason 			   struct btrfs_root *root)
137e02119d5SChris Mason {
138e02119d5SChris Mason 	int ret;
1394a500fd1SYan, Zheng 	int err = 0;
1407237f183SYan Zheng 
1417237f183SYan Zheng 	mutex_lock(&root->log_mutex);
1427237f183SYan Zheng 	if (root->log_root) {
143ff782e0aSJosef Bacik 		if (!root->log_start_pid) {
144ff782e0aSJosef Bacik 			root->log_start_pid = current->pid;
145ff782e0aSJosef Bacik 			root->log_multiple_pids = false;
146ff782e0aSJosef Bacik 		} else if (root->log_start_pid != current->pid) {
147ff782e0aSJosef Bacik 			root->log_multiple_pids = true;
148ff782e0aSJosef Bacik 		}
149ff782e0aSJosef Bacik 
150*2ecb7923SMiao Xie 		atomic_inc(&root->log_batch);
1517237f183SYan Zheng 		atomic_inc(&root->log_writers);
1527237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
1537237f183SYan Zheng 		return 0;
1547237f183SYan Zheng 	}
155ff782e0aSJosef Bacik 	root->log_multiple_pids = false;
156ff782e0aSJosef Bacik 	root->log_start_pid = current->pid;
157e02119d5SChris Mason 	mutex_lock(&root->fs_info->tree_log_mutex);
158e02119d5SChris Mason 	if (!root->fs_info->log_root_tree) {
159e02119d5SChris Mason 		ret = btrfs_init_log_root_tree(trans, root->fs_info);
1604a500fd1SYan, Zheng 		if (ret)
1614a500fd1SYan, Zheng 			err = ret;
162e02119d5SChris Mason 	}
1634a500fd1SYan, Zheng 	if (err == 0 && !root->log_root) {
164e02119d5SChris Mason 		ret = btrfs_add_log_tree(trans, root);
1654a500fd1SYan, Zheng 		if (ret)
1664a500fd1SYan, Zheng 			err = ret;
167e02119d5SChris Mason 	}
168e02119d5SChris Mason 	mutex_unlock(&root->fs_info->tree_log_mutex);
169*2ecb7923SMiao Xie 	atomic_inc(&root->log_batch);
1707237f183SYan Zheng 	atomic_inc(&root->log_writers);
1717237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
1724a500fd1SYan, Zheng 	return err;
173e02119d5SChris Mason }
174e02119d5SChris Mason 
175e02119d5SChris Mason /*
176e02119d5SChris Mason  * returns 0 if there was a log transaction running and we were able
177e02119d5SChris Mason  * to join, or returns -ENOENT if there were not transactions
178e02119d5SChris Mason  * in progress
179e02119d5SChris Mason  */
180e02119d5SChris Mason static int join_running_log_trans(struct btrfs_root *root)
181e02119d5SChris Mason {
182e02119d5SChris Mason 	int ret = -ENOENT;
183e02119d5SChris Mason 
184e02119d5SChris Mason 	smp_mb();
185e02119d5SChris Mason 	if (!root->log_root)
186e02119d5SChris Mason 		return -ENOENT;
187e02119d5SChris Mason 
1887237f183SYan Zheng 	mutex_lock(&root->log_mutex);
189e02119d5SChris Mason 	if (root->log_root) {
190e02119d5SChris Mason 		ret = 0;
1917237f183SYan Zheng 		atomic_inc(&root->log_writers);
192e02119d5SChris Mason 	}
1937237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
194e02119d5SChris Mason 	return ret;
195e02119d5SChris Mason }
196e02119d5SChris Mason 
197e02119d5SChris Mason /*
19812fcfd22SChris Mason  * This either makes the current running log transaction wait
19912fcfd22SChris Mason  * until you call btrfs_end_log_trans() or it makes any future
20012fcfd22SChris Mason  * log transactions wait until you call btrfs_end_log_trans()
20112fcfd22SChris Mason  */
20212fcfd22SChris Mason int btrfs_pin_log_trans(struct btrfs_root *root)
20312fcfd22SChris Mason {
20412fcfd22SChris Mason 	int ret = -ENOENT;
20512fcfd22SChris Mason 
20612fcfd22SChris Mason 	mutex_lock(&root->log_mutex);
20712fcfd22SChris Mason 	atomic_inc(&root->log_writers);
20812fcfd22SChris Mason 	mutex_unlock(&root->log_mutex);
20912fcfd22SChris Mason 	return ret;
21012fcfd22SChris Mason }
21112fcfd22SChris Mason 
21212fcfd22SChris Mason /*
213e02119d5SChris Mason  * indicate we're done making changes to the log tree
214e02119d5SChris Mason  * and wake up anyone waiting to do a sync
215e02119d5SChris Mason  */
216143bede5SJeff Mahoney void btrfs_end_log_trans(struct btrfs_root *root)
217e02119d5SChris Mason {
2187237f183SYan Zheng 	if (atomic_dec_and_test(&root->log_writers)) {
219e02119d5SChris Mason 		smp_mb();
2207237f183SYan Zheng 		if (waitqueue_active(&root->log_writer_wait))
2217237f183SYan Zheng 			wake_up(&root->log_writer_wait);
2227237f183SYan Zheng 	}
223e02119d5SChris Mason }
224e02119d5SChris Mason 
225e02119d5SChris Mason 
226e02119d5SChris Mason /*
227e02119d5SChris Mason  * the walk control struct is used to pass state down the chain when
228e02119d5SChris Mason  * processing the log tree.  The stage field tells us which part
229e02119d5SChris Mason  * of the log tree processing we are currently doing.  The others
230e02119d5SChris Mason  * are state fields used for that specific part
231e02119d5SChris Mason  */
232e02119d5SChris Mason struct walk_control {
233e02119d5SChris Mason 	/* should we free the extent on disk when done?  This is used
234e02119d5SChris Mason 	 * at transaction commit time while freeing a log tree
235e02119d5SChris Mason 	 */
236e02119d5SChris Mason 	int free;
237e02119d5SChris Mason 
238e02119d5SChris Mason 	/* should we write out the extent buffer?  This is used
239e02119d5SChris Mason 	 * while flushing the log tree to disk during a sync
240e02119d5SChris Mason 	 */
241e02119d5SChris Mason 	int write;
242e02119d5SChris Mason 
243e02119d5SChris Mason 	/* should we wait for the extent buffer io to finish?  Also used
244e02119d5SChris Mason 	 * while flushing the log tree to disk for a sync
245e02119d5SChris Mason 	 */
246e02119d5SChris Mason 	int wait;
247e02119d5SChris Mason 
248e02119d5SChris Mason 	/* pin only walk, we record which extents on disk belong to the
249e02119d5SChris Mason 	 * log trees
250e02119d5SChris Mason 	 */
251e02119d5SChris Mason 	int pin;
252e02119d5SChris Mason 
253e02119d5SChris Mason 	/* what stage of the replay code we're currently in */
254e02119d5SChris Mason 	int stage;
255e02119d5SChris Mason 
256e02119d5SChris Mason 	/* the root we are currently replaying */
257e02119d5SChris Mason 	struct btrfs_root *replay_dest;
258e02119d5SChris Mason 
259e02119d5SChris Mason 	/* the trans handle for the current replay */
260e02119d5SChris Mason 	struct btrfs_trans_handle *trans;
261e02119d5SChris Mason 
262e02119d5SChris Mason 	/* the function that gets used to process blocks we find in the
263e02119d5SChris Mason 	 * tree.  Note the extent_buffer might not be up to date when it is
264e02119d5SChris Mason 	 * passed in, and it must be checked or read if you need the data
265e02119d5SChris Mason 	 * inside it
266e02119d5SChris Mason 	 */
267e02119d5SChris Mason 	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
268e02119d5SChris Mason 			    struct walk_control *wc, u64 gen);
269e02119d5SChris Mason };
270e02119d5SChris Mason 
271e02119d5SChris Mason /*
272e02119d5SChris Mason  * process_func used to pin down extents, write them or wait on them
273e02119d5SChris Mason  */
274e02119d5SChris Mason static int process_one_buffer(struct btrfs_root *log,
275e02119d5SChris Mason 			      struct extent_buffer *eb,
276e02119d5SChris Mason 			      struct walk_control *wc, u64 gen)
277e02119d5SChris Mason {
27804018de5SJosef Bacik 	if (wc->pin)
279e688b725SChris Mason 		btrfs_pin_extent_for_log_replay(wc->trans,
280e688b725SChris Mason 						log->fs_info->extent_root,
281e688b725SChris Mason 						eb->start, eb->len);
282e02119d5SChris Mason 
283b9fab919SChris Mason 	if (btrfs_buffer_uptodate(eb, gen, 0)) {
284e02119d5SChris Mason 		if (wc->write)
285e02119d5SChris Mason 			btrfs_write_tree_block(eb);
286e02119d5SChris Mason 		if (wc->wait)
287e02119d5SChris Mason 			btrfs_wait_tree_block_writeback(eb);
288e02119d5SChris Mason 	}
289e02119d5SChris Mason 	return 0;
290e02119d5SChris Mason }
291e02119d5SChris Mason 
292e02119d5SChris Mason /*
293e02119d5SChris Mason  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
294e02119d5SChris Mason  * to the src data we are copying out.
295e02119d5SChris Mason  *
296e02119d5SChris Mason  * root is the tree we are copying into, and path is a scratch
297e02119d5SChris Mason  * path for use in this function (it should be released on entry and
298e02119d5SChris Mason  * will be released on exit).
299e02119d5SChris Mason  *
300e02119d5SChris Mason  * If the key is already in the destination tree the existing item is
301e02119d5SChris Mason  * overwritten.  If the existing item isn't big enough, it is extended.
302e02119d5SChris Mason  * If it is too large, it is truncated.
303e02119d5SChris Mason  *
304e02119d5SChris Mason  * If the key isn't in the destination yet, a new item is inserted.
305e02119d5SChris Mason  */
306e02119d5SChris Mason static noinline int overwrite_item(struct btrfs_trans_handle *trans,
307e02119d5SChris Mason 				   struct btrfs_root *root,
308e02119d5SChris Mason 				   struct btrfs_path *path,
309e02119d5SChris Mason 				   struct extent_buffer *eb, int slot,
310e02119d5SChris Mason 				   struct btrfs_key *key)
311e02119d5SChris Mason {
312e02119d5SChris Mason 	int ret;
313e02119d5SChris Mason 	u32 item_size;
314e02119d5SChris Mason 	u64 saved_i_size = 0;
315e02119d5SChris Mason 	int save_old_i_size = 0;
316e02119d5SChris Mason 	unsigned long src_ptr;
317e02119d5SChris Mason 	unsigned long dst_ptr;
318e02119d5SChris Mason 	int overwrite_root = 0;
319e02119d5SChris Mason 
320e02119d5SChris Mason 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
321e02119d5SChris Mason 		overwrite_root = 1;
322e02119d5SChris Mason 
323e02119d5SChris Mason 	item_size = btrfs_item_size_nr(eb, slot);
324e02119d5SChris Mason 	src_ptr = btrfs_item_ptr_offset(eb, slot);
325e02119d5SChris Mason 
326e02119d5SChris Mason 	/* look for the key in the destination tree */
327e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
328e02119d5SChris Mason 	if (ret == 0) {
329e02119d5SChris Mason 		char *src_copy;
330e02119d5SChris Mason 		char *dst_copy;
331e02119d5SChris Mason 		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
332e02119d5SChris Mason 						  path->slots[0]);
333e02119d5SChris Mason 		if (dst_size != item_size)
334e02119d5SChris Mason 			goto insert;
335e02119d5SChris Mason 
336e02119d5SChris Mason 		if (item_size == 0) {
337b3b4aa74SDavid Sterba 			btrfs_release_path(path);
338e02119d5SChris Mason 			return 0;
339e02119d5SChris Mason 		}
340e02119d5SChris Mason 		dst_copy = kmalloc(item_size, GFP_NOFS);
341e02119d5SChris Mason 		src_copy = kmalloc(item_size, GFP_NOFS);
3422a29edc6Sliubo 		if (!dst_copy || !src_copy) {
343b3b4aa74SDavid Sterba 			btrfs_release_path(path);
3442a29edc6Sliubo 			kfree(dst_copy);
3452a29edc6Sliubo 			kfree(src_copy);
3462a29edc6Sliubo 			return -ENOMEM;
3472a29edc6Sliubo 		}
348e02119d5SChris Mason 
349e02119d5SChris Mason 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
350e02119d5SChris Mason 
351e02119d5SChris Mason 		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
352e02119d5SChris Mason 		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
353e02119d5SChris Mason 				   item_size);
354e02119d5SChris Mason 		ret = memcmp(dst_copy, src_copy, item_size);
355e02119d5SChris Mason 
356e02119d5SChris Mason 		kfree(dst_copy);
357e02119d5SChris Mason 		kfree(src_copy);
358e02119d5SChris Mason 		/*
359e02119d5SChris Mason 		 * they have the same contents, just return, this saves
360e02119d5SChris Mason 		 * us from cowing blocks in the destination tree and doing
361e02119d5SChris Mason 		 * extra writes that may not have been done by a previous
362e02119d5SChris Mason 		 * sync
363e02119d5SChris Mason 		 */
364e02119d5SChris Mason 		if (ret == 0) {
365b3b4aa74SDavid Sterba 			btrfs_release_path(path);
366e02119d5SChris Mason 			return 0;
367e02119d5SChris Mason 		}
368e02119d5SChris Mason 
369e02119d5SChris Mason 	}
370e02119d5SChris Mason insert:
371b3b4aa74SDavid Sterba 	btrfs_release_path(path);
372e02119d5SChris Mason 	/* try to insert the key into the destination tree */
373e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path,
374e02119d5SChris Mason 				      key, item_size);
375e02119d5SChris Mason 
376e02119d5SChris Mason 	/* make sure any existing item is the correct size */
377e02119d5SChris Mason 	if (ret == -EEXIST) {
378e02119d5SChris Mason 		u32 found_size;
379e02119d5SChris Mason 		found_size = btrfs_item_size_nr(path->nodes[0],
380e02119d5SChris Mason 						path->slots[0]);
381143bede5SJeff Mahoney 		if (found_size > item_size)
382e02119d5SChris Mason 			btrfs_truncate_item(trans, root, path, item_size, 1);
383143bede5SJeff Mahoney 		else if (found_size < item_size)
384143bede5SJeff Mahoney 			btrfs_extend_item(trans, root, path,
38587b29b20SYan Zheng 					  item_size - found_size);
386e02119d5SChris Mason 	} else if (ret) {
3874a500fd1SYan, Zheng 		return ret;
388e02119d5SChris Mason 	}
389e02119d5SChris Mason 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
390e02119d5SChris Mason 					path->slots[0]);
391e02119d5SChris Mason 
392e02119d5SChris Mason 	/* don't overwrite an existing inode if the generation number
393e02119d5SChris Mason 	 * was logged as zero.  This is done when the tree logging code
394e02119d5SChris Mason 	 * is just logging an inode to make sure it exists after recovery.
395e02119d5SChris Mason 	 *
396e02119d5SChris Mason 	 * Also, don't overwrite i_size on directories during replay.
397e02119d5SChris Mason 	 * log replay inserts and removes directory items based on the
398e02119d5SChris Mason 	 * state of the tree found in the subvolume, and i_size is modified
399e02119d5SChris Mason 	 * as it goes
400e02119d5SChris Mason 	 */
401e02119d5SChris Mason 	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
402e02119d5SChris Mason 		struct btrfs_inode_item *src_item;
403e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
404e02119d5SChris Mason 
405e02119d5SChris Mason 		src_item = (struct btrfs_inode_item *)src_ptr;
406e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
407e02119d5SChris Mason 
408e02119d5SChris Mason 		if (btrfs_inode_generation(eb, src_item) == 0)
409e02119d5SChris Mason 			goto no_copy;
410e02119d5SChris Mason 
411e02119d5SChris Mason 		if (overwrite_root &&
412e02119d5SChris Mason 		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
413e02119d5SChris Mason 		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
414e02119d5SChris Mason 			save_old_i_size = 1;
415e02119d5SChris Mason 			saved_i_size = btrfs_inode_size(path->nodes[0],
416e02119d5SChris Mason 							dst_item);
417e02119d5SChris Mason 		}
418e02119d5SChris Mason 	}
419e02119d5SChris Mason 
420e02119d5SChris Mason 	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
421e02119d5SChris Mason 			   src_ptr, item_size);
422e02119d5SChris Mason 
423e02119d5SChris Mason 	if (save_old_i_size) {
424e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
425e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
426e02119d5SChris Mason 		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
427e02119d5SChris Mason 	}
428e02119d5SChris Mason 
429e02119d5SChris Mason 	/* make sure the generation is filled in */
430e02119d5SChris Mason 	if (key->type == BTRFS_INODE_ITEM_KEY) {
431e02119d5SChris Mason 		struct btrfs_inode_item *dst_item;
432e02119d5SChris Mason 		dst_item = (struct btrfs_inode_item *)dst_ptr;
433e02119d5SChris Mason 		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
434e02119d5SChris Mason 			btrfs_set_inode_generation(path->nodes[0], dst_item,
435e02119d5SChris Mason 						   trans->transid);
436e02119d5SChris Mason 		}
437e02119d5SChris Mason 	}
438e02119d5SChris Mason no_copy:
439e02119d5SChris Mason 	btrfs_mark_buffer_dirty(path->nodes[0]);
440b3b4aa74SDavid Sterba 	btrfs_release_path(path);
441e02119d5SChris Mason 	return 0;
442e02119d5SChris Mason }
443e02119d5SChris Mason 
444e02119d5SChris Mason /*
445e02119d5SChris Mason  * simple helper to read an inode off the disk from a given root
446e02119d5SChris Mason  * This can only be called for subvolume roots and not for the log
447e02119d5SChris Mason  */
448e02119d5SChris Mason static noinline struct inode *read_one_inode(struct btrfs_root *root,
449e02119d5SChris Mason 					     u64 objectid)
450e02119d5SChris Mason {
4515d4f98a2SYan Zheng 	struct btrfs_key key;
452e02119d5SChris Mason 	struct inode *inode;
453e02119d5SChris Mason 
4545d4f98a2SYan Zheng 	key.objectid = objectid;
4555d4f98a2SYan Zheng 	key.type = BTRFS_INODE_ITEM_KEY;
4565d4f98a2SYan Zheng 	key.offset = 0;
45773f73415SJosef Bacik 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
4585d4f98a2SYan Zheng 	if (IS_ERR(inode)) {
4595d4f98a2SYan Zheng 		inode = NULL;
4605d4f98a2SYan Zheng 	} else if (is_bad_inode(inode)) {
461e02119d5SChris Mason 		iput(inode);
462e02119d5SChris Mason 		inode = NULL;
463e02119d5SChris Mason 	}
464e02119d5SChris Mason 	return inode;
465e02119d5SChris Mason }
466e02119d5SChris Mason 
467e02119d5SChris Mason /* replays a single extent in 'eb' at 'slot' with 'key' into the
468e02119d5SChris Mason  * subvolume 'root'.  path is released on entry and should be released
469e02119d5SChris Mason  * on exit.
470e02119d5SChris Mason  *
471e02119d5SChris Mason  * extents in the log tree have not been allocated out of the extent
472e02119d5SChris Mason  * tree yet.  So, this completes the allocation, taking a reference
473e02119d5SChris Mason  * as required if the extent already exists or creating a new extent
474e02119d5SChris Mason  * if it isn't in the extent allocation tree yet.
475e02119d5SChris Mason  *
476e02119d5SChris Mason  * The extent is inserted into the file, dropping any existing extents
477e02119d5SChris Mason  * from the file that overlap the new one.
478e02119d5SChris Mason  */
479e02119d5SChris Mason static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
480e02119d5SChris Mason 				      struct btrfs_root *root,
481e02119d5SChris Mason 				      struct btrfs_path *path,
482e02119d5SChris Mason 				      struct extent_buffer *eb, int slot,
483e02119d5SChris Mason 				      struct btrfs_key *key)
484e02119d5SChris Mason {
485e02119d5SChris Mason 	int found_type;
486e02119d5SChris Mason 	u64 mask = root->sectorsize - 1;
487e02119d5SChris Mason 	u64 extent_end;
488e02119d5SChris Mason 	u64 start = key->offset;
48907d400a6SYan Zheng 	u64 saved_nbytes;
490e02119d5SChris Mason 	struct btrfs_file_extent_item *item;
491e02119d5SChris Mason 	struct inode *inode = NULL;
492e02119d5SChris Mason 	unsigned long size;
493e02119d5SChris Mason 	int ret = 0;
494e02119d5SChris Mason 
495e02119d5SChris Mason 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
496e02119d5SChris Mason 	found_type = btrfs_file_extent_type(eb, item);
497e02119d5SChris Mason 
498d899e052SYan Zheng 	if (found_type == BTRFS_FILE_EXTENT_REG ||
499d899e052SYan Zheng 	    found_type == BTRFS_FILE_EXTENT_PREALLOC)
500e02119d5SChris Mason 		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
501e02119d5SChris Mason 	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
502c8b97818SChris Mason 		size = btrfs_file_extent_inline_len(eb, item);
503e02119d5SChris Mason 		extent_end = (start + size + mask) & ~mask;
504e02119d5SChris Mason 	} else {
505e02119d5SChris Mason 		ret = 0;
506e02119d5SChris Mason 		goto out;
507e02119d5SChris Mason 	}
508e02119d5SChris Mason 
509e02119d5SChris Mason 	inode = read_one_inode(root, key->objectid);
510e02119d5SChris Mason 	if (!inode) {
511e02119d5SChris Mason 		ret = -EIO;
512e02119d5SChris Mason 		goto out;
513e02119d5SChris Mason 	}
514e02119d5SChris Mason 
515e02119d5SChris Mason 	/*
516e02119d5SChris Mason 	 * first check to see if we already have this extent in the
517e02119d5SChris Mason 	 * file.  This must be done before the btrfs_drop_extents run
518e02119d5SChris Mason 	 * so we don't try to drop this extent.
519e02119d5SChris Mason 	 */
52033345d01SLi Zefan 	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
521e02119d5SChris Mason 				       start, 0);
522e02119d5SChris Mason 
523d899e052SYan Zheng 	if (ret == 0 &&
524d899e052SYan Zheng 	    (found_type == BTRFS_FILE_EXTENT_REG ||
525d899e052SYan Zheng 	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
526e02119d5SChris Mason 		struct btrfs_file_extent_item cmp1;
527e02119d5SChris Mason 		struct btrfs_file_extent_item cmp2;
528e02119d5SChris Mason 		struct btrfs_file_extent_item *existing;
529e02119d5SChris Mason 		struct extent_buffer *leaf;
530e02119d5SChris Mason 
531e02119d5SChris Mason 		leaf = path->nodes[0];
532e02119d5SChris Mason 		existing = btrfs_item_ptr(leaf, path->slots[0],
533e02119d5SChris Mason 					  struct btrfs_file_extent_item);
534e02119d5SChris Mason 
535e02119d5SChris Mason 		read_extent_buffer(eb, &cmp1, (unsigned long)item,
536e02119d5SChris Mason 				   sizeof(cmp1));
537e02119d5SChris Mason 		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
538e02119d5SChris Mason 				   sizeof(cmp2));
539e02119d5SChris Mason 
540e02119d5SChris Mason 		/*
541e02119d5SChris Mason 		 * we already have a pointer to this exact extent,
542e02119d5SChris Mason 		 * we don't have to do anything
543e02119d5SChris Mason 		 */
544e02119d5SChris Mason 		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
545b3b4aa74SDavid Sterba 			btrfs_release_path(path);
546e02119d5SChris Mason 			goto out;
547e02119d5SChris Mason 		}
548e02119d5SChris Mason 	}
549b3b4aa74SDavid Sterba 	btrfs_release_path(path);
550e02119d5SChris Mason 
55107d400a6SYan Zheng 	saved_nbytes = inode_get_bytes(inode);
552e02119d5SChris Mason 	/* drop any overlapping extents */
5532671485dSJosef Bacik 	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
554e02119d5SChris Mason 	BUG_ON(ret);
555e02119d5SChris Mason 
55607d400a6SYan Zheng 	if (found_type == BTRFS_FILE_EXTENT_REG ||
55707d400a6SYan Zheng 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5585d4f98a2SYan Zheng 		u64 offset;
55907d400a6SYan Zheng 		unsigned long dest_offset;
56007d400a6SYan Zheng 		struct btrfs_key ins;
56107d400a6SYan Zheng 
56207d400a6SYan Zheng 		ret = btrfs_insert_empty_item(trans, root, path, key,
56307d400a6SYan Zheng 					      sizeof(*item));
56407d400a6SYan Zheng 		BUG_ON(ret);
56507d400a6SYan Zheng 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
56607d400a6SYan Zheng 						    path->slots[0]);
56707d400a6SYan Zheng 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
56807d400a6SYan Zheng 				(unsigned long)item,  sizeof(*item));
56907d400a6SYan Zheng 
57007d400a6SYan Zheng 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
57107d400a6SYan Zheng 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
57207d400a6SYan Zheng 		ins.type = BTRFS_EXTENT_ITEM_KEY;
5735d4f98a2SYan Zheng 		offset = key->offset - btrfs_file_extent_offset(eb, item);
57407d400a6SYan Zheng 
57507d400a6SYan Zheng 		if (ins.objectid > 0) {
57607d400a6SYan Zheng 			u64 csum_start;
57707d400a6SYan Zheng 			u64 csum_end;
57807d400a6SYan Zheng 			LIST_HEAD(ordered_sums);
57907d400a6SYan Zheng 			/*
58007d400a6SYan Zheng 			 * is this extent already allocated in the extent
58107d400a6SYan Zheng 			 * allocation tree?  If so, just add a reference
58207d400a6SYan Zheng 			 */
58307d400a6SYan Zheng 			ret = btrfs_lookup_extent(root, ins.objectid,
58407d400a6SYan Zheng 						ins.offset);
58507d400a6SYan Zheng 			if (ret == 0) {
58607d400a6SYan Zheng 				ret = btrfs_inc_extent_ref(trans, root,
58707d400a6SYan Zheng 						ins.objectid, ins.offset,
5885d4f98a2SYan Zheng 						0, root->root_key.objectid,
58966d7e7f0SArne Jansen 						key->objectid, offset, 0);
59037daa4f9STsutomu Itoh 				BUG_ON(ret);
59107d400a6SYan Zheng 			} else {
59207d400a6SYan Zheng 				/*
59307d400a6SYan Zheng 				 * insert the extent pointer in the extent
59407d400a6SYan Zheng 				 * allocation tree
59507d400a6SYan Zheng 				 */
5965d4f98a2SYan Zheng 				ret = btrfs_alloc_logged_file_extent(trans,
5975d4f98a2SYan Zheng 						root, root->root_key.objectid,
5985d4f98a2SYan Zheng 						key->objectid, offset, &ins);
59907d400a6SYan Zheng 				BUG_ON(ret);
60007d400a6SYan Zheng 			}
601b3b4aa74SDavid Sterba 			btrfs_release_path(path);
60207d400a6SYan Zheng 
60307d400a6SYan Zheng 			if (btrfs_file_extent_compression(eb, item)) {
60407d400a6SYan Zheng 				csum_start = ins.objectid;
60507d400a6SYan Zheng 				csum_end = csum_start + ins.offset;
60607d400a6SYan Zheng 			} else {
60707d400a6SYan Zheng 				csum_start = ins.objectid +
60807d400a6SYan Zheng 					btrfs_file_extent_offset(eb, item);
60907d400a6SYan Zheng 				csum_end = csum_start +
61007d400a6SYan Zheng 					btrfs_file_extent_num_bytes(eb, item);
61107d400a6SYan Zheng 			}
61207d400a6SYan Zheng 
61307d400a6SYan Zheng 			ret = btrfs_lookup_csums_range(root->log_root,
61407d400a6SYan Zheng 						csum_start, csum_end - 1,
615a2de733cSArne Jansen 						&ordered_sums, 0);
61607d400a6SYan Zheng 			BUG_ON(ret);
61707d400a6SYan Zheng 			while (!list_empty(&ordered_sums)) {
61807d400a6SYan Zheng 				struct btrfs_ordered_sum *sums;
61907d400a6SYan Zheng 				sums = list_entry(ordered_sums.next,
62007d400a6SYan Zheng 						struct btrfs_ordered_sum,
62107d400a6SYan Zheng 						list);
62207d400a6SYan Zheng 				ret = btrfs_csum_file_blocks(trans,
62307d400a6SYan Zheng 						root->fs_info->csum_root,
62407d400a6SYan Zheng 						sums);
62507d400a6SYan Zheng 				BUG_ON(ret);
62607d400a6SYan Zheng 				list_del(&sums->list);
62707d400a6SYan Zheng 				kfree(sums);
62807d400a6SYan Zheng 			}
62907d400a6SYan Zheng 		} else {
630b3b4aa74SDavid Sterba 			btrfs_release_path(path);
63107d400a6SYan Zheng 		}
63207d400a6SYan Zheng 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
63307d400a6SYan Zheng 		/* inline extents are easy, we just overwrite them */
634e02119d5SChris Mason 		ret = overwrite_item(trans, root, path, eb, slot, key);
635e02119d5SChris Mason 		BUG_ON(ret);
63607d400a6SYan Zheng 	}
637e02119d5SChris Mason 
63807d400a6SYan Zheng 	inode_set_bytes(inode, saved_nbytes);
639b9959295STsutomu Itoh 	ret = btrfs_update_inode(trans, root, inode);
640e02119d5SChris Mason out:
641e02119d5SChris Mason 	if (inode)
642e02119d5SChris Mason 		iput(inode);
643e02119d5SChris Mason 	return ret;
644e02119d5SChris Mason }
645e02119d5SChris Mason 
646e02119d5SChris Mason /*
647e02119d5SChris Mason  * when cleaning up conflicts between the directory names in the
648e02119d5SChris Mason  * subvolume, directory names in the log and directory names in the
649e02119d5SChris Mason  * inode back references, we may have to unlink inodes from directories.
650e02119d5SChris Mason  *
651e02119d5SChris Mason  * This is a helper function to do the unlink of a specific directory
652e02119d5SChris Mason  * item
653e02119d5SChris Mason  */
654e02119d5SChris Mason static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
655e02119d5SChris Mason 				      struct btrfs_root *root,
656e02119d5SChris Mason 				      struct btrfs_path *path,
657e02119d5SChris Mason 				      struct inode *dir,
658e02119d5SChris Mason 				      struct btrfs_dir_item *di)
659e02119d5SChris Mason {
660e02119d5SChris Mason 	struct inode *inode;
661e02119d5SChris Mason 	char *name;
662e02119d5SChris Mason 	int name_len;
663e02119d5SChris Mason 	struct extent_buffer *leaf;
664e02119d5SChris Mason 	struct btrfs_key location;
665e02119d5SChris Mason 	int ret;
666e02119d5SChris Mason 
667e02119d5SChris Mason 	leaf = path->nodes[0];
668e02119d5SChris Mason 
669e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
670e02119d5SChris Mason 	name_len = btrfs_dir_name_len(leaf, di);
671e02119d5SChris Mason 	name = kmalloc(name_len, GFP_NOFS);
6722a29edc6Sliubo 	if (!name)
6732a29edc6Sliubo 		return -ENOMEM;
6742a29edc6Sliubo 
675e02119d5SChris Mason 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
676b3b4aa74SDavid Sterba 	btrfs_release_path(path);
677e02119d5SChris Mason 
678e02119d5SChris Mason 	inode = read_one_inode(root, location.objectid);
679c00e9493STsutomu Itoh 	if (!inode) {
680c00e9493STsutomu Itoh 		kfree(name);
681c00e9493STsutomu Itoh 		return -EIO;
682c00e9493STsutomu Itoh 	}
683e02119d5SChris Mason 
684ec051c0fSYan Zheng 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
685ec051c0fSYan Zheng 	BUG_ON(ret);
68612fcfd22SChris Mason 
687e02119d5SChris Mason 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
688ec051c0fSYan Zheng 	BUG_ON(ret);
689e02119d5SChris Mason 	kfree(name);
690e02119d5SChris Mason 
691e02119d5SChris Mason 	iput(inode);
692b6305567SChris Mason 
693b6305567SChris Mason 	btrfs_run_delayed_items(trans, root);
694e02119d5SChris Mason 	return ret;
695e02119d5SChris Mason }
696e02119d5SChris Mason 
697e02119d5SChris Mason /*
698e02119d5SChris Mason  * helper function to see if a given name and sequence number found
699e02119d5SChris Mason  * in an inode back reference are already in a directory and correctly
700e02119d5SChris Mason  * point to this inode
701e02119d5SChris Mason  */
702e02119d5SChris Mason static noinline int inode_in_dir(struct btrfs_root *root,
703e02119d5SChris Mason 				 struct btrfs_path *path,
704e02119d5SChris Mason 				 u64 dirid, u64 objectid, u64 index,
705e02119d5SChris Mason 				 const char *name, int name_len)
706e02119d5SChris Mason {
707e02119d5SChris Mason 	struct btrfs_dir_item *di;
708e02119d5SChris Mason 	struct btrfs_key location;
709e02119d5SChris Mason 	int match = 0;
710e02119d5SChris Mason 
711e02119d5SChris Mason 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
712e02119d5SChris Mason 					 index, name, name_len, 0);
713e02119d5SChris Mason 	if (di && !IS_ERR(di)) {
714e02119d5SChris Mason 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
715e02119d5SChris Mason 		if (location.objectid != objectid)
716e02119d5SChris Mason 			goto out;
717e02119d5SChris Mason 	} else
718e02119d5SChris Mason 		goto out;
719b3b4aa74SDavid Sterba 	btrfs_release_path(path);
720e02119d5SChris Mason 
721e02119d5SChris Mason 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
722e02119d5SChris Mason 	if (di && !IS_ERR(di)) {
723e02119d5SChris Mason 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
724e02119d5SChris Mason 		if (location.objectid != objectid)
725e02119d5SChris Mason 			goto out;
726e02119d5SChris Mason 	} else
727e02119d5SChris Mason 		goto out;
728e02119d5SChris Mason 	match = 1;
729e02119d5SChris Mason out:
730b3b4aa74SDavid Sterba 	btrfs_release_path(path);
731e02119d5SChris Mason 	return match;
732e02119d5SChris Mason }
733e02119d5SChris Mason 
734e02119d5SChris Mason /*
735e02119d5SChris Mason  * helper function to check a log tree for a named back reference in
736e02119d5SChris Mason  * an inode.  This is used to decide if a back reference that is
737e02119d5SChris Mason  * found in the subvolume conflicts with what we find in the log.
738e02119d5SChris Mason  *
739e02119d5SChris Mason  * inode backreferences may have multiple refs in a single item,
740e02119d5SChris Mason  * during replay we process one reference at a time, and we don't
741e02119d5SChris Mason  * want to delete valid links to a file from the subvolume if that
742e02119d5SChris Mason  * link is also in the log.
743e02119d5SChris Mason  */
744e02119d5SChris Mason static noinline int backref_in_log(struct btrfs_root *log,
745e02119d5SChris Mason 				   struct btrfs_key *key,
746e02119d5SChris Mason 				   char *name, int namelen)
747e02119d5SChris Mason {
748e02119d5SChris Mason 	struct btrfs_path *path;
749e02119d5SChris Mason 	struct btrfs_inode_ref *ref;
750e02119d5SChris Mason 	unsigned long ptr;
751e02119d5SChris Mason 	unsigned long ptr_end;
752e02119d5SChris Mason 	unsigned long name_ptr;
753e02119d5SChris Mason 	int found_name_len;
754e02119d5SChris Mason 	int item_size;
755e02119d5SChris Mason 	int ret;
756e02119d5SChris Mason 	int match = 0;
757e02119d5SChris Mason 
758e02119d5SChris Mason 	path = btrfs_alloc_path();
7592a29edc6Sliubo 	if (!path)
7602a29edc6Sliubo 		return -ENOMEM;
7612a29edc6Sliubo 
762e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
763e02119d5SChris Mason 	if (ret != 0)
764e02119d5SChris Mason 		goto out;
765e02119d5SChris Mason 
766e02119d5SChris Mason 	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
767e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
768e02119d5SChris Mason 	ptr_end = ptr + item_size;
769e02119d5SChris Mason 	while (ptr < ptr_end) {
770e02119d5SChris Mason 		ref = (struct btrfs_inode_ref *)ptr;
771e02119d5SChris Mason 		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
772e02119d5SChris Mason 		if (found_name_len == namelen) {
773e02119d5SChris Mason 			name_ptr = (unsigned long)(ref + 1);
774e02119d5SChris Mason 			ret = memcmp_extent_buffer(path->nodes[0], name,
775e02119d5SChris Mason 						   name_ptr, namelen);
776e02119d5SChris Mason 			if (ret == 0) {
777e02119d5SChris Mason 				match = 1;
778e02119d5SChris Mason 				goto out;
779e02119d5SChris Mason 			}
780e02119d5SChris Mason 		}
781e02119d5SChris Mason 		ptr = (unsigned long)(ref + 1) + found_name_len;
782e02119d5SChris Mason 	}
783e02119d5SChris Mason out:
784e02119d5SChris Mason 	btrfs_free_path(path);
785e02119d5SChris Mason 	return match;
786e02119d5SChris Mason }
787e02119d5SChris Mason 
788e02119d5SChris Mason 
789e02119d5SChris Mason /*
790e02119d5SChris Mason  * replay one inode back reference item found in the log tree.
791e02119d5SChris Mason  * eb, slot and key refer to the buffer and key found in the log tree.
792e02119d5SChris Mason  * root is the destination we are replaying into, and path is for temp
793e02119d5SChris Mason  * use by this function.  (it should be released on return).
794e02119d5SChris Mason  */
795e02119d5SChris Mason static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
796e02119d5SChris Mason 				  struct btrfs_root *root,
797e02119d5SChris Mason 				  struct btrfs_root *log,
798e02119d5SChris Mason 				  struct btrfs_path *path,
799e02119d5SChris Mason 				  struct extent_buffer *eb, int slot,
800e02119d5SChris Mason 				  struct btrfs_key *key)
801e02119d5SChris Mason {
802e02119d5SChris Mason 	struct btrfs_inode_ref *ref;
80334f3e4f2Sliubo 	struct btrfs_dir_item *di;
80434f3e4f2Sliubo 	struct inode *dir;
805e02119d5SChris Mason 	struct inode *inode;
806e02119d5SChris Mason 	unsigned long ref_ptr;
807e02119d5SChris Mason 	unsigned long ref_end;
80834f3e4f2Sliubo 	char *name;
80934f3e4f2Sliubo 	int namelen;
81034f3e4f2Sliubo 	int ret;
811c622ae60Sliubo 	int search_done = 0;
812e02119d5SChris Mason 
813e02119d5SChris Mason 	/*
814e02119d5SChris Mason 	 * it is possible that we didn't log all the parent directories
815e02119d5SChris Mason 	 * for a given inode.  If we don't find the dir, just don't
816e02119d5SChris Mason 	 * copy the back ref in.  The link count fixup code will take
817e02119d5SChris Mason 	 * care of the rest
818e02119d5SChris Mason 	 */
819e02119d5SChris Mason 	dir = read_one_inode(root, key->offset);
820e02119d5SChris Mason 	if (!dir)
821e02119d5SChris Mason 		return -ENOENT;
822e02119d5SChris Mason 
823e02119d5SChris Mason 	inode = read_one_inode(root, key->objectid);
824c00e9493STsutomu Itoh 	if (!inode) {
825c00e9493STsutomu Itoh 		iput(dir);
826c00e9493STsutomu Itoh 		return -EIO;
827c00e9493STsutomu Itoh 	}
828e02119d5SChris Mason 
829e02119d5SChris Mason 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
830e02119d5SChris Mason 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
831e02119d5SChris Mason 
832e02119d5SChris Mason again:
833e02119d5SChris Mason 	ref = (struct btrfs_inode_ref *)ref_ptr;
834e02119d5SChris Mason 
835e02119d5SChris Mason 	namelen = btrfs_inode_ref_name_len(eb, ref);
836e02119d5SChris Mason 	name = kmalloc(namelen, GFP_NOFS);
837e02119d5SChris Mason 	BUG_ON(!name);
838e02119d5SChris Mason 
839e02119d5SChris Mason 	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
840e02119d5SChris Mason 
841e02119d5SChris Mason 	/* if we already have a perfect match, we're done */
84233345d01SLi Zefan 	if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
843e02119d5SChris Mason 			 btrfs_inode_ref_index(eb, ref),
844e02119d5SChris Mason 			 name, namelen)) {
845e02119d5SChris Mason 		goto out;
846e02119d5SChris Mason 	}
847e02119d5SChris Mason 
848e02119d5SChris Mason 	/*
849e02119d5SChris Mason 	 * look for a conflicting back reference in the metadata.
850e02119d5SChris Mason 	 * if we find one we have to unlink that name of the file
851e02119d5SChris Mason 	 * before we add our new link.  Later on, we overwrite any
852e02119d5SChris Mason 	 * existing back reference, and we don't want to create
853e02119d5SChris Mason 	 * dangling pointers in the directory.
854e02119d5SChris Mason 	 */
855c622ae60Sliubo 
856c622ae60Sliubo 	if (search_done)
857c622ae60Sliubo 		goto insert;
858c622ae60Sliubo 
859e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
860e02119d5SChris Mason 	if (ret == 0) {
861e02119d5SChris Mason 		char *victim_name;
862e02119d5SChris Mason 		int victim_name_len;
863e02119d5SChris Mason 		struct btrfs_inode_ref *victim_ref;
864e02119d5SChris Mason 		unsigned long ptr;
865e02119d5SChris Mason 		unsigned long ptr_end;
866e02119d5SChris Mason 		struct extent_buffer *leaf = path->nodes[0];
867e02119d5SChris Mason 
868e02119d5SChris Mason 		/* are we trying to overwrite a back ref for the root directory
869e02119d5SChris Mason 		 * if so, just jump out, we're done
870e02119d5SChris Mason 		 */
871e02119d5SChris Mason 		if (key->objectid == key->offset)
872e02119d5SChris Mason 			goto out_nowrite;
873e02119d5SChris Mason 
874e02119d5SChris Mason 		/* check all the names in this back reference to see
875e02119d5SChris Mason 		 * if they are in the log.  if so, we allow them to stay
876e02119d5SChris Mason 		 * otherwise they must be unlinked as a conflict
877e02119d5SChris Mason 		 */
878e02119d5SChris Mason 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
879e02119d5SChris Mason 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
880e02119d5SChris Mason 		while (ptr < ptr_end) {
881e02119d5SChris Mason 			victim_ref = (struct btrfs_inode_ref *)ptr;
882e02119d5SChris Mason 			victim_name_len = btrfs_inode_ref_name_len(leaf,
883e02119d5SChris Mason 								   victim_ref);
884e02119d5SChris Mason 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
885e02119d5SChris Mason 			BUG_ON(!victim_name);
886e02119d5SChris Mason 
887e02119d5SChris Mason 			read_extent_buffer(leaf, victim_name,
888e02119d5SChris Mason 					   (unsigned long)(victim_ref + 1),
889e02119d5SChris Mason 					   victim_name_len);
890e02119d5SChris Mason 
891e02119d5SChris Mason 			if (!backref_in_log(log, key, victim_name,
892e02119d5SChris Mason 					    victim_name_len)) {
893e02119d5SChris Mason 				btrfs_inc_nlink(inode);
894b3b4aa74SDavid Sterba 				btrfs_release_path(path);
89512fcfd22SChris Mason 
896e02119d5SChris Mason 				ret = btrfs_unlink_inode(trans, root, dir,
897e02119d5SChris Mason 							 inode, victim_name,
898e02119d5SChris Mason 							 victim_name_len);
899b6305567SChris Mason 				btrfs_run_delayed_items(trans, root);
900e02119d5SChris Mason 			}
901e02119d5SChris Mason 			kfree(victim_name);
902e02119d5SChris Mason 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
903e02119d5SChris Mason 		}
904e02119d5SChris Mason 		BUG_ON(ret);
905c622ae60Sliubo 
906c622ae60Sliubo 		/*
907c622ae60Sliubo 		 * NOTE: we have searched root tree and checked the
908c622ae60Sliubo 		 * coresponding ref, it does not need to check again.
909c622ae60Sliubo 		 */
910c622ae60Sliubo 		search_done = 1;
911e02119d5SChris Mason 	}
912b3b4aa74SDavid Sterba 	btrfs_release_path(path);
913e02119d5SChris Mason 
91434f3e4f2Sliubo 	/* look for a conflicting sequence number */
91534f3e4f2Sliubo 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
91634f3e4f2Sliubo 					 btrfs_inode_ref_index(eb, ref),
91734f3e4f2Sliubo 					 name, namelen, 0);
91834f3e4f2Sliubo 	if (di && !IS_ERR(di)) {
91934f3e4f2Sliubo 		ret = drop_one_dir_item(trans, root, path, dir, di);
92034f3e4f2Sliubo 		BUG_ON(ret);
92134f3e4f2Sliubo 	}
92234f3e4f2Sliubo 	btrfs_release_path(path);
92334f3e4f2Sliubo 
92434f3e4f2Sliubo 	/* look for a conflicing name */
92534f3e4f2Sliubo 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
92634f3e4f2Sliubo 				   name, namelen, 0);
92734f3e4f2Sliubo 	if (di && !IS_ERR(di)) {
92834f3e4f2Sliubo 		ret = drop_one_dir_item(trans, root, path, dir, di);
92934f3e4f2Sliubo 		BUG_ON(ret);
93034f3e4f2Sliubo 	}
93134f3e4f2Sliubo 	btrfs_release_path(path);
93234f3e4f2Sliubo 
933c622ae60Sliubo insert:
934e02119d5SChris Mason 	/* insert our name */
935e02119d5SChris Mason 	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
936e02119d5SChris Mason 			     btrfs_inode_ref_index(eb, ref));
937e02119d5SChris Mason 	BUG_ON(ret);
938e02119d5SChris Mason 
939e02119d5SChris Mason 	btrfs_update_inode(trans, root, inode);
940e02119d5SChris Mason 
941e02119d5SChris Mason out:
942e02119d5SChris Mason 	ref_ptr = (unsigned long)(ref + 1) + namelen;
943e02119d5SChris Mason 	kfree(name);
944e02119d5SChris Mason 	if (ref_ptr < ref_end)
945e02119d5SChris Mason 		goto again;
946e02119d5SChris Mason 
947e02119d5SChris Mason 	/* finally write the back reference in the inode */
948e02119d5SChris Mason 	ret = overwrite_item(trans, root, path, eb, slot, key);
949e02119d5SChris Mason 	BUG_ON(ret);
950e02119d5SChris Mason 
951e02119d5SChris Mason out_nowrite:
952b3b4aa74SDavid Sterba 	btrfs_release_path(path);
953e02119d5SChris Mason 	iput(dir);
954e02119d5SChris Mason 	iput(inode);
955e02119d5SChris Mason 	return 0;
956e02119d5SChris Mason }
957e02119d5SChris Mason 
958c71bf099SYan, Zheng static int insert_orphan_item(struct btrfs_trans_handle *trans,
959c71bf099SYan, Zheng 			      struct btrfs_root *root, u64 offset)
960c71bf099SYan, Zheng {
961c71bf099SYan, Zheng 	int ret;
962c71bf099SYan, Zheng 	ret = btrfs_find_orphan_item(root, offset);
963c71bf099SYan, Zheng 	if (ret > 0)
964c71bf099SYan, Zheng 		ret = btrfs_insert_orphan_item(trans, root, offset);
965c71bf099SYan, Zheng 	return ret;
966c71bf099SYan, Zheng }
967c71bf099SYan, Zheng 
968c71bf099SYan, Zheng 
969e02119d5SChris Mason /*
970e02119d5SChris Mason  * There are a few corners where the link count of the file can't
971e02119d5SChris Mason  * be properly maintained during replay.  So, instead of adding
972e02119d5SChris Mason  * lots of complexity to the log code, we just scan the backrefs
973e02119d5SChris Mason  * for any file that has been through replay.
974e02119d5SChris Mason  *
975e02119d5SChris Mason  * The scan will update the link count on the inode to reflect the
976e02119d5SChris Mason  * number of back refs found.  If it goes down to zero, the iput
977e02119d5SChris Mason  * will free the inode.
978e02119d5SChris Mason  */
979e02119d5SChris Mason static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
980e02119d5SChris Mason 					   struct btrfs_root *root,
981e02119d5SChris Mason 					   struct inode *inode)
982e02119d5SChris Mason {
983e02119d5SChris Mason 	struct btrfs_path *path;
984e02119d5SChris Mason 	int ret;
985e02119d5SChris Mason 	struct btrfs_key key;
986e02119d5SChris Mason 	u64 nlink = 0;
987e02119d5SChris Mason 	unsigned long ptr;
988e02119d5SChris Mason 	unsigned long ptr_end;
989e02119d5SChris Mason 	int name_len;
99033345d01SLi Zefan 	u64 ino = btrfs_ino(inode);
991e02119d5SChris Mason 
99233345d01SLi Zefan 	key.objectid = ino;
993e02119d5SChris Mason 	key.type = BTRFS_INODE_REF_KEY;
994e02119d5SChris Mason 	key.offset = (u64)-1;
995e02119d5SChris Mason 
996e02119d5SChris Mason 	path = btrfs_alloc_path();
9972a29edc6Sliubo 	if (!path)
9982a29edc6Sliubo 		return -ENOMEM;
999e02119d5SChris Mason 
1000e02119d5SChris Mason 	while (1) {
1001e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1002e02119d5SChris Mason 		if (ret < 0)
1003e02119d5SChris Mason 			break;
1004e02119d5SChris Mason 		if (ret > 0) {
1005e02119d5SChris Mason 			if (path->slots[0] == 0)
1006e02119d5SChris Mason 				break;
1007e02119d5SChris Mason 			path->slots[0]--;
1008e02119d5SChris Mason 		}
1009e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key,
1010e02119d5SChris Mason 				      path->slots[0]);
101133345d01SLi Zefan 		if (key.objectid != ino ||
1012e02119d5SChris Mason 		    key.type != BTRFS_INODE_REF_KEY)
1013e02119d5SChris Mason 			break;
1014e02119d5SChris Mason 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1015e02119d5SChris Mason 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1016e02119d5SChris Mason 						   path->slots[0]);
1017e02119d5SChris Mason 		while (ptr < ptr_end) {
1018e02119d5SChris Mason 			struct btrfs_inode_ref *ref;
1019e02119d5SChris Mason 
1020e02119d5SChris Mason 			ref = (struct btrfs_inode_ref *)ptr;
1021e02119d5SChris Mason 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1022e02119d5SChris Mason 							    ref);
1023e02119d5SChris Mason 			ptr = (unsigned long)(ref + 1) + name_len;
1024e02119d5SChris Mason 			nlink++;
1025e02119d5SChris Mason 		}
1026e02119d5SChris Mason 
1027e02119d5SChris Mason 		if (key.offset == 0)
1028e02119d5SChris Mason 			break;
1029e02119d5SChris Mason 		key.offset--;
1030b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1031e02119d5SChris Mason 	}
1032b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1033e02119d5SChris Mason 	if (nlink != inode->i_nlink) {
1034bfe86848SMiklos Szeredi 		set_nlink(inode, nlink);
1035e02119d5SChris Mason 		btrfs_update_inode(trans, root, inode);
1036e02119d5SChris Mason 	}
10378d5bf1cbSChris Mason 	BTRFS_I(inode)->index_cnt = (u64)-1;
1038e02119d5SChris Mason 
1039c71bf099SYan, Zheng 	if (inode->i_nlink == 0) {
1040c71bf099SYan, Zheng 		if (S_ISDIR(inode->i_mode)) {
104112fcfd22SChris Mason 			ret = replay_dir_deletes(trans, root, NULL, path,
104233345d01SLi Zefan 						 ino, 1);
104312fcfd22SChris Mason 			BUG_ON(ret);
104412fcfd22SChris Mason 		}
104533345d01SLi Zefan 		ret = insert_orphan_item(trans, root, ino);
1046c71bf099SYan, Zheng 		BUG_ON(ret);
1047c71bf099SYan, Zheng 	}
104812fcfd22SChris Mason 	btrfs_free_path(path);
104912fcfd22SChris Mason 
1050e02119d5SChris Mason 	return 0;
1051e02119d5SChris Mason }
1052e02119d5SChris Mason 
1053e02119d5SChris Mason static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1054e02119d5SChris Mason 					    struct btrfs_root *root,
1055e02119d5SChris Mason 					    struct btrfs_path *path)
1056e02119d5SChris Mason {
1057e02119d5SChris Mason 	int ret;
1058e02119d5SChris Mason 	struct btrfs_key key;
1059e02119d5SChris Mason 	struct inode *inode;
1060e02119d5SChris Mason 
1061e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1062e02119d5SChris Mason 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1063e02119d5SChris Mason 	key.offset = (u64)-1;
1064e02119d5SChris Mason 	while (1) {
1065e02119d5SChris Mason 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1066e02119d5SChris Mason 		if (ret < 0)
1067e02119d5SChris Mason 			break;
1068e02119d5SChris Mason 
1069e02119d5SChris Mason 		if (ret == 1) {
1070e02119d5SChris Mason 			if (path->slots[0] == 0)
1071e02119d5SChris Mason 				break;
1072e02119d5SChris Mason 			path->slots[0]--;
1073e02119d5SChris Mason 		}
1074e02119d5SChris Mason 
1075e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1076e02119d5SChris Mason 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1077e02119d5SChris Mason 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1078e02119d5SChris Mason 			break;
1079e02119d5SChris Mason 
1080e02119d5SChris Mason 		ret = btrfs_del_item(trans, root, path);
108165a246c5STsutomu Itoh 		if (ret)
108265a246c5STsutomu Itoh 			goto out;
1083e02119d5SChris Mason 
1084b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1085e02119d5SChris Mason 		inode = read_one_inode(root, key.offset);
1086c00e9493STsutomu Itoh 		if (!inode)
1087c00e9493STsutomu Itoh 			return -EIO;
1088e02119d5SChris Mason 
1089e02119d5SChris Mason 		ret = fixup_inode_link_count(trans, root, inode);
1090e02119d5SChris Mason 		BUG_ON(ret);
1091e02119d5SChris Mason 
1092e02119d5SChris Mason 		iput(inode);
1093e02119d5SChris Mason 
109412fcfd22SChris Mason 		/*
109512fcfd22SChris Mason 		 * fixup on a directory may create new entries,
109612fcfd22SChris Mason 		 * make sure we always look for the highset possible
109712fcfd22SChris Mason 		 * offset
109812fcfd22SChris Mason 		 */
109912fcfd22SChris Mason 		key.offset = (u64)-1;
1100e02119d5SChris Mason 	}
110165a246c5STsutomu Itoh 	ret = 0;
110265a246c5STsutomu Itoh out:
1103b3b4aa74SDavid Sterba 	btrfs_release_path(path);
110465a246c5STsutomu Itoh 	return ret;
1105e02119d5SChris Mason }
1106e02119d5SChris Mason 
1107e02119d5SChris Mason 
1108e02119d5SChris Mason /*
1109e02119d5SChris Mason  * record a given inode in the fixup dir so we can check its link
1110e02119d5SChris Mason  * count when replay is done.  The link count is incremented here
1111e02119d5SChris Mason  * so the inode won't go away until we check it
1112e02119d5SChris Mason  */
1113e02119d5SChris Mason static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1114e02119d5SChris Mason 				      struct btrfs_root *root,
1115e02119d5SChris Mason 				      struct btrfs_path *path,
1116e02119d5SChris Mason 				      u64 objectid)
1117e02119d5SChris Mason {
1118e02119d5SChris Mason 	struct btrfs_key key;
1119e02119d5SChris Mason 	int ret = 0;
1120e02119d5SChris Mason 	struct inode *inode;
1121e02119d5SChris Mason 
1122e02119d5SChris Mason 	inode = read_one_inode(root, objectid);
1123c00e9493STsutomu Itoh 	if (!inode)
1124c00e9493STsutomu Itoh 		return -EIO;
1125e02119d5SChris Mason 
1126e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1127e02119d5SChris Mason 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1128e02119d5SChris Mason 	key.offset = objectid;
1129e02119d5SChris Mason 
1130e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1131e02119d5SChris Mason 
1132b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1133e02119d5SChris Mason 	if (ret == 0) {
1134e02119d5SChris Mason 		btrfs_inc_nlink(inode);
1135b9959295STsutomu Itoh 		ret = btrfs_update_inode(trans, root, inode);
1136e02119d5SChris Mason 	} else if (ret == -EEXIST) {
1137e02119d5SChris Mason 		ret = 0;
1138e02119d5SChris Mason 	} else {
1139e02119d5SChris Mason 		BUG();
1140e02119d5SChris Mason 	}
1141e02119d5SChris Mason 	iput(inode);
1142e02119d5SChris Mason 
1143e02119d5SChris Mason 	return ret;
1144e02119d5SChris Mason }
1145e02119d5SChris Mason 
1146e02119d5SChris Mason /*
1147e02119d5SChris Mason  * when replaying the log for a directory, we only insert names
1148e02119d5SChris Mason  * for inodes that actually exist.  This means an fsync on a directory
1149e02119d5SChris Mason  * does not implicitly fsync all the new files in it
1150e02119d5SChris Mason  */
1151e02119d5SChris Mason static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1152e02119d5SChris Mason 				    struct btrfs_root *root,
1153e02119d5SChris Mason 				    struct btrfs_path *path,
1154e02119d5SChris Mason 				    u64 dirid, u64 index,
1155e02119d5SChris Mason 				    char *name, int name_len, u8 type,
1156e02119d5SChris Mason 				    struct btrfs_key *location)
1157e02119d5SChris Mason {
1158e02119d5SChris Mason 	struct inode *inode;
1159e02119d5SChris Mason 	struct inode *dir;
1160e02119d5SChris Mason 	int ret;
1161e02119d5SChris Mason 
1162e02119d5SChris Mason 	inode = read_one_inode(root, location->objectid);
1163e02119d5SChris Mason 	if (!inode)
1164e02119d5SChris Mason 		return -ENOENT;
1165e02119d5SChris Mason 
1166e02119d5SChris Mason 	dir = read_one_inode(root, dirid);
1167e02119d5SChris Mason 	if (!dir) {
1168e02119d5SChris Mason 		iput(inode);
1169e02119d5SChris Mason 		return -EIO;
1170e02119d5SChris Mason 	}
1171e02119d5SChris Mason 	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1172e02119d5SChris Mason 
1173e02119d5SChris Mason 	/* FIXME, put inode into FIXUP list */
1174e02119d5SChris Mason 
1175e02119d5SChris Mason 	iput(inode);
1176e02119d5SChris Mason 	iput(dir);
1177e02119d5SChris Mason 	return ret;
1178e02119d5SChris Mason }
1179e02119d5SChris Mason 
1180e02119d5SChris Mason /*
1181e02119d5SChris Mason  * take a single entry in a log directory item and replay it into
1182e02119d5SChris Mason  * the subvolume.
1183e02119d5SChris Mason  *
1184e02119d5SChris Mason  * if a conflicting item exists in the subdirectory already,
1185e02119d5SChris Mason  * the inode it points to is unlinked and put into the link count
1186e02119d5SChris Mason  * fix up tree.
1187e02119d5SChris Mason  *
1188e02119d5SChris Mason  * If a name from the log points to a file or directory that does
1189e02119d5SChris Mason  * not exist in the FS, it is skipped.  fsyncs on directories
1190e02119d5SChris Mason  * do not force down inodes inside that directory, just changes to the
1191e02119d5SChris Mason  * names or unlinks in a directory.
1192e02119d5SChris Mason  */
1193e02119d5SChris Mason static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1194e02119d5SChris Mason 				    struct btrfs_root *root,
1195e02119d5SChris Mason 				    struct btrfs_path *path,
1196e02119d5SChris Mason 				    struct extent_buffer *eb,
1197e02119d5SChris Mason 				    struct btrfs_dir_item *di,
1198e02119d5SChris Mason 				    struct btrfs_key *key)
1199e02119d5SChris Mason {
1200e02119d5SChris Mason 	char *name;
1201e02119d5SChris Mason 	int name_len;
1202e02119d5SChris Mason 	struct btrfs_dir_item *dst_di;
1203e02119d5SChris Mason 	struct btrfs_key found_key;
1204e02119d5SChris Mason 	struct btrfs_key log_key;
1205e02119d5SChris Mason 	struct inode *dir;
1206e02119d5SChris Mason 	u8 log_type;
12074bef0848SChris Mason 	int exists;
1208e02119d5SChris Mason 	int ret;
1209e02119d5SChris Mason 
1210e02119d5SChris Mason 	dir = read_one_inode(root, key->objectid);
1211c00e9493STsutomu Itoh 	if (!dir)
1212c00e9493STsutomu Itoh 		return -EIO;
1213e02119d5SChris Mason 
1214e02119d5SChris Mason 	name_len = btrfs_dir_name_len(eb, di);
1215e02119d5SChris Mason 	name = kmalloc(name_len, GFP_NOFS);
12162a29edc6Sliubo 	if (!name)
12172a29edc6Sliubo 		return -ENOMEM;
12182a29edc6Sliubo 
1219e02119d5SChris Mason 	log_type = btrfs_dir_type(eb, di);
1220e02119d5SChris Mason 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1221e02119d5SChris Mason 		   name_len);
1222e02119d5SChris Mason 
1223e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
12244bef0848SChris Mason 	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
12254bef0848SChris Mason 	if (exists == 0)
12264bef0848SChris Mason 		exists = 1;
12274bef0848SChris Mason 	else
12284bef0848SChris Mason 		exists = 0;
1229b3b4aa74SDavid Sterba 	btrfs_release_path(path);
12304bef0848SChris Mason 
1231e02119d5SChris Mason 	if (key->type == BTRFS_DIR_ITEM_KEY) {
1232e02119d5SChris Mason 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1233e02119d5SChris Mason 				       name, name_len, 1);
1234d397712bSChris Mason 	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1235e02119d5SChris Mason 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1236e02119d5SChris Mason 						     key->objectid,
1237e02119d5SChris Mason 						     key->offset, name,
1238e02119d5SChris Mason 						     name_len, 1);
1239e02119d5SChris Mason 	} else {
1240e02119d5SChris Mason 		BUG();
1241e02119d5SChris Mason 	}
1242c704005dSDavid Sterba 	if (IS_ERR_OR_NULL(dst_di)) {
1243e02119d5SChris Mason 		/* we need a sequence number to insert, so we only
1244e02119d5SChris Mason 		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1245e02119d5SChris Mason 		 */
1246e02119d5SChris Mason 		if (key->type != BTRFS_DIR_INDEX_KEY)
1247e02119d5SChris Mason 			goto out;
1248e02119d5SChris Mason 		goto insert;
1249e02119d5SChris Mason 	}
1250e02119d5SChris Mason 
1251e02119d5SChris Mason 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1252e02119d5SChris Mason 	/* the existing item matches the logged item */
1253e02119d5SChris Mason 	if (found_key.objectid == log_key.objectid &&
1254e02119d5SChris Mason 	    found_key.type == log_key.type &&
1255e02119d5SChris Mason 	    found_key.offset == log_key.offset &&
1256e02119d5SChris Mason 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1257e02119d5SChris Mason 		goto out;
1258e02119d5SChris Mason 	}
1259e02119d5SChris Mason 
1260e02119d5SChris Mason 	/*
1261e02119d5SChris Mason 	 * don't drop the conflicting directory entry if the inode
1262e02119d5SChris Mason 	 * for the new entry doesn't exist
1263e02119d5SChris Mason 	 */
12644bef0848SChris Mason 	if (!exists)
1265e02119d5SChris Mason 		goto out;
1266e02119d5SChris Mason 
1267e02119d5SChris Mason 	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1268e02119d5SChris Mason 	BUG_ON(ret);
1269e02119d5SChris Mason 
1270e02119d5SChris Mason 	if (key->type == BTRFS_DIR_INDEX_KEY)
1271e02119d5SChris Mason 		goto insert;
1272e02119d5SChris Mason out:
1273b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1274e02119d5SChris Mason 	kfree(name);
1275e02119d5SChris Mason 	iput(dir);
1276e02119d5SChris Mason 	return 0;
1277e02119d5SChris Mason 
1278e02119d5SChris Mason insert:
1279b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1280e02119d5SChris Mason 	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1281e02119d5SChris Mason 			      name, name_len, log_type, &log_key);
1282e02119d5SChris Mason 
1283c293498bSStoyan Gaydarov 	BUG_ON(ret && ret != -ENOENT);
1284e02119d5SChris Mason 	goto out;
1285e02119d5SChris Mason }
1286e02119d5SChris Mason 
1287e02119d5SChris Mason /*
1288e02119d5SChris Mason  * find all the names in a directory item and reconcile them into
1289e02119d5SChris Mason  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1290e02119d5SChris Mason  * one name in a directory item, but the same code gets used for
1291e02119d5SChris Mason  * both directory index types
1292e02119d5SChris Mason  */
1293e02119d5SChris Mason static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1294e02119d5SChris Mason 					struct btrfs_root *root,
1295e02119d5SChris Mason 					struct btrfs_path *path,
1296e02119d5SChris Mason 					struct extent_buffer *eb, int slot,
1297e02119d5SChris Mason 					struct btrfs_key *key)
1298e02119d5SChris Mason {
1299e02119d5SChris Mason 	int ret;
1300e02119d5SChris Mason 	u32 item_size = btrfs_item_size_nr(eb, slot);
1301e02119d5SChris Mason 	struct btrfs_dir_item *di;
1302e02119d5SChris Mason 	int name_len;
1303e02119d5SChris Mason 	unsigned long ptr;
1304e02119d5SChris Mason 	unsigned long ptr_end;
1305e02119d5SChris Mason 
1306e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(eb, slot);
1307e02119d5SChris Mason 	ptr_end = ptr + item_size;
1308e02119d5SChris Mason 	while (ptr < ptr_end) {
1309e02119d5SChris Mason 		di = (struct btrfs_dir_item *)ptr;
131022a94d44SJosef Bacik 		if (verify_dir_item(root, eb, di))
131122a94d44SJosef Bacik 			return -EIO;
1312e02119d5SChris Mason 		name_len = btrfs_dir_name_len(eb, di);
1313e02119d5SChris Mason 		ret = replay_one_name(trans, root, path, eb, di, key);
1314e02119d5SChris Mason 		BUG_ON(ret);
1315e02119d5SChris Mason 		ptr = (unsigned long)(di + 1);
1316e02119d5SChris Mason 		ptr += name_len;
1317e02119d5SChris Mason 	}
1318e02119d5SChris Mason 	return 0;
1319e02119d5SChris Mason }
1320e02119d5SChris Mason 
1321e02119d5SChris Mason /*
1322e02119d5SChris Mason  * directory replay has two parts.  There are the standard directory
1323e02119d5SChris Mason  * items in the log copied from the subvolume, and range items
1324e02119d5SChris Mason  * created in the log while the subvolume was logged.
1325e02119d5SChris Mason  *
1326e02119d5SChris Mason  * The range items tell us which parts of the key space the log
1327e02119d5SChris Mason  * is authoritative for.  During replay, if a key in the subvolume
1328e02119d5SChris Mason  * directory is in a logged range item, but not actually in the log
1329e02119d5SChris Mason  * that means it was deleted from the directory before the fsync
1330e02119d5SChris Mason  * and should be removed.
1331e02119d5SChris Mason  */
1332e02119d5SChris Mason static noinline int find_dir_range(struct btrfs_root *root,
1333e02119d5SChris Mason 				   struct btrfs_path *path,
1334e02119d5SChris Mason 				   u64 dirid, int key_type,
1335e02119d5SChris Mason 				   u64 *start_ret, u64 *end_ret)
1336e02119d5SChris Mason {
1337e02119d5SChris Mason 	struct btrfs_key key;
1338e02119d5SChris Mason 	u64 found_end;
1339e02119d5SChris Mason 	struct btrfs_dir_log_item *item;
1340e02119d5SChris Mason 	int ret;
1341e02119d5SChris Mason 	int nritems;
1342e02119d5SChris Mason 
1343e02119d5SChris Mason 	if (*start_ret == (u64)-1)
1344e02119d5SChris Mason 		return 1;
1345e02119d5SChris Mason 
1346e02119d5SChris Mason 	key.objectid = dirid;
1347e02119d5SChris Mason 	key.type = key_type;
1348e02119d5SChris Mason 	key.offset = *start_ret;
1349e02119d5SChris Mason 
1350e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1351e02119d5SChris Mason 	if (ret < 0)
1352e02119d5SChris Mason 		goto out;
1353e02119d5SChris Mason 	if (ret > 0) {
1354e02119d5SChris Mason 		if (path->slots[0] == 0)
1355e02119d5SChris Mason 			goto out;
1356e02119d5SChris Mason 		path->slots[0]--;
1357e02119d5SChris Mason 	}
1358e02119d5SChris Mason 	if (ret != 0)
1359e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1360e02119d5SChris Mason 
1361e02119d5SChris Mason 	if (key.type != key_type || key.objectid != dirid) {
1362e02119d5SChris Mason 		ret = 1;
1363e02119d5SChris Mason 		goto next;
1364e02119d5SChris Mason 	}
1365e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1366e02119d5SChris Mason 			      struct btrfs_dir_log_item);
1367e02119d5SChris Mason 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1368e02119d5SChris Mason 
1369e02119d5SChris Mason 	if (*start_ret >= key.offset && *start_ret <= found_end) {
1370e02119d5SChris Mason 		ret = 0;
1371e02119d5SChris Mason 		*start_ret = key.offset;
1372e02119d5SChris Mason 		*end_ret = found_end;
1373e02119d5SChris Mason 		goto out;
1374e02119d5SChris Mason 	}
1375e02119d5SChris Mason 	ret = 1;
1376e02119d5SChris Mason next:
1377e02119d5SChris Mason 	/* check the next slot in the tree to see if it is a valid item */
1378e02119d5SChris Mason 	nritems = btrfs_header_nritems(path->nodes[0]);
1379e02119d5SChris Mason 	if (path->slots[0] >= nritems) {
1380e02119d5SChris Mason 		ret = btrfs_next_leaf(root, path);
1381e02119d5SChris Mason 		if (ret)
1382e02119d5SChris Mason 			goto out;
1383e02119d5SChris Mason 	} else {
1384e02119d5SChris Mason 		path->slots[0]++;
1385e02119d5SChris Mason 	}
1386e02119d5SChris Mason 
1387e02119d5SChris Mason 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1388e02119d5SChris Mason 
1389e02119d5SChris Mason 	if (key.type != key_type || key.objectid != dirid) {
1390e02119d5SChris Mason 		ret = 1;
1391e02119d5SChris Mason 		goto out;
1392e02119d5SChris Mason 	}
1393e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1394e02119d5SChris Mason 			      struct btrfs_dir_log_item);
1395e02119d5SChris Mason 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1396e02119d5SChris Mason 	*start_ret = key.offset;
1397e02119d5SChris Mason 	*end_ret = found_end;
1398e02119d5SChris Mason 	ret = 0;
1399e02119d5SChris Mason out:
1400b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1401e02119d5SChris Mason 	return ret;
1402e02119d5SChris Mason }
1403e02119d5SChris Mason 
1404e02119d5SChris Mason /*
1405e02119d5SChris Mason  * this looks for a given directory item in the log.  If the directory
1406e02119d5SChris Mason  * item is not in the log, the item is removed and the inode it points
1407e02119d5SChris Mason  * to is unlinked
1408e02119d5SChris Mason  */
1409e02119d5SChris Mason static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1410e02119d5SChris Mason 				      struct btrfs_root *root,
1411e02119d5SChris Mason 				      struct btrfs_root *log,
1412e02119d5SChris Mason 				      struct btrfs_path *path,
1413e02119d5SChris Mason 				      struct btrfs_path *log_path,
1414e02119d5SChris Mason 				      struct inode *dir,
1415e02119d5SChris Mason 				      struct btrfs_key *dir_key)
1416e02119d5SChris Mason {
1417e02119d5SChris Mason 	int ret;
1418e02119d5SChris Mason 	struct extent_buffer *eb;
1419e02119d5SChris Mason 	int slot;
1420e02119d5SChris Mason 	u32 item_size;
1421e02119d5SChris Mason 	struct btrfs_dir_item *di;
1422e02119d5SChris Mason 	struct btrfs_dir_item *log_di;
1423e02119d5SChris Mason 	int name_len;
1424e02119d5SChris Mason 	unsigned long ptr;
1425e02119d5SChris Mason 	unsigned long ptr_end;
1426e02119d5SChris Mason 	char *name;
1427e02119d5SChris Mason 	struct inode *inode;
1428e02119d5SChris Mason 	struct btrfs_key location;
1429e02119d5SChris Mason 
1430e02119d5SChris Mason again:
1431e02119d5SChris Mason 	eb = path->nodes[0];
1432e02119d5SChris Mason 	slot = path->slots[0];
1433e02119d5SChris Mason 	item_size = btrfs_item_size_nr(eb, slot);
1434e02119d5SChris Mason 	ptr = btrfs_item_ptr_offset(eb, slot);
1435e02119d5SChris Mason 	ptr_end = ptr + item_size;
1436e02119d5SChris Mason 	while (ptr < ptr_end) {
1437e02119d5SChris Mason 		di = (struct btrfs_dir_item *)ptr;
143822a94d44SJosef Bacik 		if (verify_dir_item(root, eb, di)) {
143922a94d44SJosef Bacik 			ret = -EIO;
144022a94d44SJosef Bacik 			goto out;
144122a94d44SJosef Bacik 		}
144222a94d44SJosef Bacik 
1443e02119d5SChris Mason 		name_len = btrfs_dir_name_len(eb, di);
1444e02119d5SChris Mason 		name = kmalloc(name_len, GFP_NOFS);
1445e02119d5SChris Mason 		if (!name) {
1446e02119d5SChris Mason 			ret = -ENOMEM;
1447e02119d5SChris Mason 			goto out;
1448e02119d5SChris Mason 		}
1449e02119d5SChris Mason 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
1450e02119d5SChris Mason 				  name_len);
1451e02119d5SChris Mason 		log_di = NULL;
145212fcfd22SChris Mason 		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1453e02119d5SChris Mason 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
1454e02119d5SChris Mason 						       dir_key->objectid,
1455e02119d5SChris Mason 						       name, name_len, 0);
145612fcfd22SChris Mason 		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1457e02119d5SChris Mason 			log_di = btrfs_lookup_dir_index_item(trans, log,
1458e02119d5SChris Mason 						     log_path,
1459e02119d5SChris Mason 						     dir_key->objectid,
1460e02119d5SChris Mason 						     dir_key->offset,
1461e02119d5SChris Mason 						     name, name_len, 0);
1462e02119d5SChris Mason 		}
1463c704005dSDavid Sterba 		if (IS_ERR_OR_NULL(log_di)) {
1464e02119d5SChris Mason 			btrfs_dir_item_key_to_cpu(eb, di, &location);
1465b3b4aa74SDavid Sterba 			btrfs_release_path(path);
1466b3b4aa74SDavid Sterba 			btrfs_release_path(log_path);
1467e02119d5SChris Mason 			inode = read_one_inode(root, location.objectid);
1468c00e9493STsutomu Itoh 			if (!inode) {
1469c00e9493STsutomu Itoh 				kfree(name);
1470c00e9493STsutomu Itoh 				return -EIO;
1471c00e9493STsutomu Itoh 			}
1472e02119d5SChris Mason 
1473e02119d5SChris Mason 			ret = link_to_fixup_dir(trans, root,
1474e02119d5SChris Mason 						path, location.objectid);
1475e02119d5SChris Mason 			BUG_ON(ret);
1476e02119d5SChris Mason 			btrfs_inc_nlink(inode);
1477e02119d5SChris Mason 			ret = btrfs_unlink_inode(trans, root, dir, inode,
1478e02119d5SChris Mason 						 name, name_len);
1479e02119d5SChris Mason 			BUG_ON(ret);
1480b6305567SChris Mason 
1481b6305567SChris Mason 			btrfs_run_delayed_items(trans, root);
1482b6305567SChris Mason 
1483e02119d5SChris Mason 			kfree(name);
1484e02119d5SChris Mason 			iput(inode);
1485e02119d5SChris Mason 
1486e02119d5SChris Mason 			/* there might still be more names under this key
1487e02119d5SChris Mason 			 * check and repeat if required
1488e02119d5SChris Mason 			 */
1489e02119d5SChris Mason 			ret = btrfs_search_slot(NULL, root, dir_key, path,
1490e02119d5SChris Mason 						0, 0);
1491e02119d5SChris Mason 			if (ret == 0)
1492e02119d5SChris Mason 				goto again;
1493e02119d5SChris Mason 			ret = 0;
1494e02119d5SChris Mason 			goto out;
1495e02119d5SChris Mason 		}
1496b3b4aa74SDavid Sterba 		btrfs_release_path(log_path);
1497e02119d5SChris Mason 		kfree(name);
1498e02119d5SChris Mason 
1499e02119d5SChris Mason 		ptr = (unsigned long)(di + 1);
1500e02119d5SChris Mason 		ptr += name_len;
1501e02119d5SChris Mason 	}
1502e02119d5SChris Mason 	ret = 0;
1503e02119d5SChris Mason out:
1504b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1505b3b4aa74SDavid Sterba 	btrfs_release_path(log_path);
1506e02119d5SChris Mason 	return ret;
1507e02119d5SChris Mason }
1508e02119d5SChris Mason 
1509e02119d5SChris Mason /*
1510e02119d5SChris Mason  * deletion replay happens before we copy any new directory items
1511e02119d5SChris Mason  * out of the log or out of backreferences from inodes.  It
1512e02119d5SChris Mason  * scans the log to find ranges of keys that log is authoritative for,
1513e02119d5SChris Mason  * and then scans the directory to find items in those ranges that are
1514e02119d5SChris Mason  * not present in the log.
1515e02119d5SChris Mason  *
1516e02119d5SChris Mason  * Anything we don't find in the log is unlinked and removed from the
1517e02119d5SChris Mason  * directory.
1518e02119d5SChris Mason  */
1519e02119d5SChris Mason static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1520e02119d5SChris Mason 				       struct btrfs_root *root,
1521e02119d5SChris Mason 				       struct btrfs_root *log,
1522e02119d5SChris Mason 				       struct btrfs_path *path,
152312fcfd22SChris Mason 				       u64 dirid, int del_all)
1524e02119d5SChris Mason {
1525e02119d5SChris Mason 	u64 range_start;
1526e02119d5SChris Mason 	u64 range_end;
1527e02119d5SChris Mason 	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1528e02119d5SChris Mason 	int ret = 0;
1529e02119d5SChris Mason 	struct btrfs_key dir_key;
1530e02119d5SChris Mason 	struct btrfs_key found_key;
1531e02119d5SChris Mason 	struct btrfs_path *log_path;
1532e02119d5SChris Mason 	struct inode *dir;
1533e02119d5SChris Mason 
1534e02119d5SChris Mason 	dir_key.objectid = dirid;
1535e02119d5SChris Mason 	dir_key.type = BTRFS_DIR_ITEM_KEY;
1536e02119d5SChris Mason 	log_path = btrfs_alloc_path();
1537e02119d5SChris Mason 	if (!log_path)
1538e02119d5SChris Mason 		return -ENOMEM;
1539e02119d5SChris Mason 
1540e02119d5SChris Mason 	dir = read_one_inode(root, dirid);
1541e02119d5SChris Mason 	/* it isn't an error if the inode isn't there, that can happen
1542e02119d5SChris Mason 	 * because we replay the deletes before we copy in the inode item
1543e02119d5SChris Mason 	 * from the log
1544e02119d5SChris Mason 	 */
1545e02119d5SChris Mason 	if (!dir) {
1546e02119d5SChris Mason 		btrfs_free_path(log_path);
1547e02119d5SChris Mason 		return 0;
1548e02119d5SChris Mason 	}
1549e02119d5SChris Mason again:
1550e02119d5SChris Mason 	range_start = 0;
1551e02119d5SChris Mason 	range_end = 0;
1552e02119d5SChris Mason 	while (1) {
155312fcfd22SChris Mason 		if (del_all)
155412fcfd22SChris Mason 			range_end = (u64)-1;
155512fcfd22SChris Mason 		else {
1556e02119d5SChris Mason 			ret = find_dir_range(log, path, dirid, key_type,
1557e02119d5SChris Mason 					     &range_start, &range_end);
1558e02119d5SChris Mason 			if (ret != 0)
1559e02119d5SChris Mason 				break;
156012fcfd22SChris Mason 		}
1561e02119d5SChris Mason 
1562e02119d5SChris Mason 		dir_key.offset = range_start;
1563e02119d5SChris Mason 		while (1) {
1564e02119d5SChris Mason 			int nritems;
1565e02119d5SChris Mason 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
1566e02119d5SChris Mason 						0, 0);
1567e02119d5SChris Mason 			if (ret < 0)
1568e02119d5SChris Mason 				goto out;
1569e02119d5SChris Mason 
1570e02119d5SChris Mason 			nritems = btrfs_header_nritems(path->nodes[0]);
1571e02119d5SChris Mason 			if (path->slots[0] >= nritems) {
1572e02119d5SChris Mason 				ret = btrfs_next_leaf(root, path);
1573e02119d5SChris Mason 				if (ret)
1574e02119d5SChris Mason 					break;
1575e02119d5SChris Mason 			}
1576e02119d5SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1577e02119d5SChris Mason 					      path->slots[0]);
1578e02119d5SChris Mason 			if (found_key.objectid != dirid ||
1579e02119d5SChris Mason 			    found_key.type != dir_key.type)
1580e02119d5SChris Mason 				goto next_type;
1581e02119d5SChris Mason 
1582e02119d5SChris Mason 			if (found_key.offset > range_end)
1583e02119d5SChris Mason 				break;
1584e02119d5SChris Mason 
1585e02119d5SChris Mason 			ret = check_item_in_log(trans, root, log, path,
158612fcfd22SChris Mason 						log_path, dir,
158712fcfd22SChris Mason 						&found_key);
1588e02119d5SChris Mason 			BUG_ON(ret);
1589e02119d5SChris Mason 			if (found_key.offset == (u64)-1)
1590e02119d5SChris Mason 				break;
1591e02119d5SChris Mason 			dir_key.offset = found_key.offset + 1;
1592e02119d5SChris Mason 		}
1593b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1594e02119d5SChris Mason 		if (range_end == (u64)-1)
1595e02119d5SChris Mason 			break;
1596e02119d5SChris Mason 		range_start = range_end + 1;
1597e02119d5SChris Mason 	}
1598e02119d5SChris Mason 
1599e02119d5SChris Mason next_type:
1600e02119d5SChris Mason 	ret = 0;
1601e02119d5SChris Mason 	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1602e02119d5SChris Mason 		key_type = BTRFS_DIR_LOG_INDEX_KEY;
1603e02119d5SChris Mason 		dir_key.type = BTRFS_DIR_INDEX_KEY;
1604b3b4aa74SDavid Sterba 		btrfs_release_path(path);
1605e02119d5SChris Mason 		goto again;
1606e02119d5SChris Mason 	}
1607e02119d5SChris Mason out:
1608b3b4aa74SDavid Sterba 	btrfs_release_path(path);
1609e02119d5SChris Mason 	btrfs_free_path(log_path);
1610e02119d5SChris Mason 	iput(dir);
1611e02119d5SChris Mason 	return ret;
1612e02119d5SChris Mason }
1613e02119d5SChris Mason 
1614e02119d5SChris Mason /*
1615e02119d5SChris Mason  * the process_func used to replay items from the log tree.  This
1616e02119d5SChris Mason  * gets called in two different stages.  The first stage just looks
1617e02119d5SChris Mason  * for inodes and makes sure they are all copied into the subvolume.
1618e02119d5SChris Mason  *
1619e02119d5SChris Mason  * The second stage copies all the other item types from the log into
1620e02119d5SChris Mason  * the subvolume.  The two stage approach is slower, but gets rid of
1621e02119d5SChris Mason  * lots of complexity around inodes referencing other inodes that exist
1622e02119d5SChris Mason  * only in the log (references come from either directory items or inode
1623e02119d5SChris Mason  * back refs).
1624e02119d5SChris Mason  */
1625e02119d5SChris Mason static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1626e02119d5SChris Mason 			     struct walk_control *wc, u64 gen)
1627e02119d5SChris Mason {
1628e02119d5SChris Mason 	int nritems;
1629e02119d5SChris Mason 	struct btrfs_path *path;
1630e02119d5SChris Mason 	struct btrfs_root *root = wc->replay_dest;
1631e02119d5SChris Mason 	struct btrfs_key key;
1632e02119d5SChris Mason 	int level;
1633e02119d5SChris Mason 	int i;
1634e02119d5SChris Mason 	int ret;
1635e02119d5SChris Mason 
1636018642a1STsutomu Itoh 	ret = btrfs_read_buffer(eb, gen);
1637018642a1STsutomu Itoh 	if (ret)
1638018642a1STsutomu Itoh 		return ret;
1639e02119d5SChris Mason 
1640e02119d5SChris Mason 	level = btrfs_header_level(eb);
1641e02119d5SChris Mason 
1642e02119d5SChris Mason 	if (level != 0)
1643e02119d5SChris Mason 		return 0;
1644e02119d5SChris Mason 
1645e02119d5SChris Mason 	path = btrfs_alloc_path();
16461e5063d0SMark Fasheh 	if (!path)
16471e5063d0SMark Fasheh 		return -ENOMEM;
1648e02119d5SChris Mason 
1649e02119d5SChris Mason 	nritems = btrfs_header_nritems(eb);
1650e02119d5SChris Mason 	for (i = 0; i < nritems; i++) {
1651e02119d5SChris Mason 		btrfs_item_key_to_cpu(eb, &key, i);
1652e02119d5SChris Mason 
1653e02119d5SChris Mason 		/* inode keys are done during the first stage */
1654e02119d5SChris Mason 		if (key.type == BTRFS_INODE_ITEM_KEY &&
1655e02119d5SChris Mason 		    wc->stage == LOG_WALK_REPLAY_INODES) {
1656e02119d5SChris Mason 			struct btrfs_inode_item *inode_item;
1657e02119d5SChris Mason 			u32 mode;
1658e02119d5SChris Mason 
1659e02119d5SChris Mason 			inode_item = btrfs_item_ptr(eb, i,
1660e02119d5SChris Mason 					    struct btrfs_inode_item);
1661e02119d5SChris Mason 			mode = btrfs_inode_mode(eb, inode_item);
1662e02119d5SChris Mason 			if (S_ISDIR(mode)) {
1663e02119d5SChris Mason 				ret = replay_dir_deletes(wc->trans,
166412fcfd22SChris Mason 					 root, log, path, key.objectid, 0);
1665e02119d5SChris Mason 				BUG_ON(ret);
1666e02119d5SChris Mason 			}
1667e02119d5SChris Mason 			ret = overwrite_item(wc->trans, root, path,
1668e02119d5SChris Mason 					     eb, i, &key);
1669e02119d5SChris Mason 			BUG_ON(ret);
1670e02119d5SChris Mason 
1671c71bf099SYan, Zheng 			/* for regular files, make sure corresponding
1672c71bf099SYan, Zheng 			 * orhpan item exist. extents past the new EOF
1673c71bf099SYan, Zheng 			 * will be truncated later by orphan cleanup.
1674e02119d5SChris Mason 			 */
1675e02119d5SChris Mason 			if (S_ISREG(mode)) {
1676c71bf099SYan, Zheng 				ret = insert_orphan_item(wc->trans, root,
1677e02119d5SChris Mason 							 key.objectid);
1678e02119d5SChris Mason 				BUG_ON(ret);
1679c71bf099SYan, Zheng 			}
1680a74ac322SChris Mason 
1681e02119d5SChris Mason 			ret = link_to_fixup_dir(wc->trans, root,
1682e02119d5SChris Mason 						path, key.objectid);
1683e02119d5SChris Mason 			BUG_ON(ret);
1684e02119d5SChris Mason 		}
1685e02119d5SChris Mason 		if (wc->stage < LOG_WALK_REPLAY_ALL)
1686e02119d5SChris Mason 			continue;
1687e02119d5SChris Mason 
1688e02119d5SChris Mason 		/* these keys are simply copied */
1689e02119d5SChris Mason 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
1690e02119d5SChris Mason 			ret = overwrite_item(wc->trans, root, path,
1691e02119d5SChris Mason 					     eb, i, &key);
1692e02119d5SChris Mason 			BUG_ON(ret);
1693e02119d5SChris Mason 		} else if (key.type == BTRFS_INODE_REF_KEY) {
1694e02119d5SChris Mason 			ret = add_inode_ref(wc->trans, root, log, path,
1695e02119d5SChris Mason 					    eb, i, &key);
1696e02119d5SChris Mason 			BUG_ON(ret && ret != -ENOENT);
1697e02119d5SChris Mason 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1698e02119d5SChris Mason 			ret = replay_one_extent(wc->trans, root, path,
1699e02119d5SChris Mason 						eb, i, &key);
1700e02119d5SChris Mason 			BUG_ON(ret);
1701e02119d5SChris Mason 		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
1702e02119d5SChris Mason 			   key.type == BTRFS_DIR_INDEX_KEY) {
1703e02119d5SChris Mason 			ret = replay_one_dir_item(wc->trans, root, path,
1704e02119d5SChris Mason 						  eb, i, &key);
1705e02119d5SChris Mason 			BUG_ON(ret);
1706e02119d5SChris Mason 		}
1707e02119d5SChris Mason 	}
1708e02119d5SChris Mason 	btrfs_free_path(path);
1709e02119d5SChris Mason 	return 0;
1710e02119d5SChris Mason }
1711e02119d5SChris Mason 
1712d397712bSChris Mason static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1713e02119d5SChris Mason 				   struct btrfs_root *root,
1714e02119d5SChris Mason 				   struct btrfs_path *path, int *level,
1715e02119d5SChris Mason 				   struct walk_control *wc)
1716e02119d5SChris Mason {
1717e02119d5SChris Mason 	u64 root_owner;
1718e02119d5SChris Mason 	u64 bytenr;
1719e02119d5SChris Mason 	u64 ptr_gen;
1720e02119d5SChris Mason 	struct extent_buffer *next;
1721e02119d5SChris Mason 	struct extent_buffer *cur;
1722e02119d5SChris Mason 	struct extent_buffer *parent;
1723e02119d5SChris Mason 	u32 blocksize;
1724e02119d5SChris Mason 	int ret = 0;
1725e02119d5SChris Mason 
1726e02119d5SChris Mason 	WARN_ON(*level < 0);
1727e02119d5SChris Mason 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
1728e02119d5SChris Mason 
1729e02119d5SChris Mason 	while (*level > 0) {
1730e02119d5SChris Mason 		WARN_ON(*level < 0);
1731e02119d5SChris Mason 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
1732e02119d5SChris Mason 		cur = path->nodes[*level];
1733e02119d5SChris Mason 
1734e02119d5SChris Mason 		if (btrfs_header_level(cur) != *level)
1735e02119d5SChris Mason 			WARN_ON(1);
1736e02119d5SChris Mason 
1737e02119d5SChris Mason 		if (path->slots[*level] >=
1738e02119d5SChris Mason 		    btrfs_header_nritems(cur))
1739e02119d5SChris Mason 			break;
1740e02119d5SChris Mason 
1741e02119d5SChris Mason 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1742e02119d5SChris Mason 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1743e02119d5SChris Mason 		blocksize = btrfs_level_size(root, *level - 1);
1744e02119d5SChris Mason 
1745e02119d5SChris Mason 		parent = path->nodes[*level];
1746e02119d5SChris Mason 		root_owner = btrfs_header_owner(parent);
1747e02119d5SChris Mason 
1748e02119d5SChris Mason 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
17492a29edc6Sliubo 		if (!next)
17502a29edc6Sliubo 			return -ENOMEM;
1751e02119d5SChris Mason 
17524a500fd1SYan, Zheng 		if (*level == 1) {
17531e5063d0SMark Fasheh 			ret = wc->process_func(root, next, wc, ptr_gen);
17541e5063d0SMark Fasheh 			if (ret)
17551e5063d0SMark Fasheh 				return ret;
1756e02119d5SChris Mason 
1757e02119d5SChris Mason 			path->slots[*level]++;
1758e02119d5SChris Mason 			if (wc->free) {
1759018642a1STsutomu Itoh 				ret = btrfs_read_buffer(next, ptr_gen);
1760018642a1STsutomu Itoh 				if (ret) {
1761018642a1STsutomu Itoh 					free_extent_buffer(next);
1762018642a1STsutomu Itoh 					return ret;
1763018642a1STsutomu Itoh 				}
1764e02119d5SChris Mason 
1765e02119d5SChris Mason 				btrfs_tree_lock(next);
1766b4ce94deSChris Mason 				btrfs_set_lock_blocking(next);
1767bd681513SChris Mason 				clean_tree_block(trans, root, next);
1768e02119d5SChris Mason 				btrfs_wait_tree_block_writeback(next);
1769e02119d5SChris Mason 				btrfs_tree_unlock(next);
1770e02119d5SChris Mason 
1771e02119d5SChris Mason 				WARN_ON(root_owner !=
1772e02119d5SChris Mason 					BTRFS_TREE_LOG_OBJECTID);
1773e688b725SChris Mason 				ret = btrfs_free_and_pin_reserved_extent(root,
1774d00aff00SChris Mason 							 bytenr, blocksize);
177579787eaaSJeff Mahoney 				BUG_ON(ret); /* -ENOMEM or logic errors */
1776e02119d5SChris Mason 			}
1777e02119d5SChris Mason 			free_extent_buffer(next);
1778e02119d5SChris Mason 			continue;
1779e02119d5SChris Mason 		}
1780018642a1STsutomu Itoh 		ret = btrfs_read_buffer(next, ptr_gen);
1781018642a1STsutomu Itoh 		if (ret) {
1782018642a1STsutomu Itoh 			free_extent_buffer(next);
1783018642a1STsutomu Itoh 			return ret;
1784018642a1STsutomu Itoh 		}
1785e02119d5SChris Mason 
1786e02119d5SChris Mason 		WARN_ON(*level <= 0);
1787e02119d5SChris Mason 		if (path->nodes[*level-1])
1788e02119d5SChris Mason 			free_extent_buffer(path->nodes[*level-1]);
1789e02119d5SChris Mason 		path->nodes[*level-1] = next;
1790e02119d5SChris Mason 		*level = btrfs_header_level(next);
1791e02119d5SChris Mason 		path->slots[*level] = 0;
1792e02119d5SChris Mason 		cond_resched();
1793e02119d5SChris Mason 	}
1794e02119d5SChris Mason 	WARN_ON(*level < 0);
1795e02119d5SChris Mason 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
1796e02119d5SChris Mason 
17974a500fd1SYan, Zheng 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1798e02119d5SChris Mason 
1799e02119d5SChris Mason 	cond_resched();
1800e02119d5SChris Mason 	return 0;
1801e02119d5SChris Mason }
1802e02119d5SChris Mason 
1803d397712bSChris Mason static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1804e02119d5SChris Mason 				 struct btrfs_root *root,
1805e02119d5SChris Mason 				 struct btrfs_path *path, int *level,
1806e02119d5SChris Mason 				 struct walk_control *wc)
1807e02119d5SChris Mason {
1808e02119d5SChris Mason 	u64 root_owner;
1809e02119d5SChris Mason 	int i;
1810e02119d5SChris Mason 	int slot;
1811e02119d5SChris Mason 	int ret;
1812e02119d5SChris Mason 
1813e02119d5SChris Mason 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1814e02119d5SChris Mason 		slot = path->slots[i];
18154a500fd1SYan, Zheng 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1816e02119d5SChris Mason 			path->slots[i]++;
1817e02119d5SChris Mason 			*level = i;
1818e02119d5SChris Mason 			WARN_ON(*level == 0);
1819e02119d5SChris Mason 			return 0;
1820e02119d5SChris Mason 		} else {
182131840ae1SZheng Yan 			struct extent_buffer *parent;
182231840ae1SZheng Yan 			if (path->nodes[*level] == root->node)
182331840ae1SZheng Yan 				parent = path->nodes[*level];
182431840ae1SZheng Yan 			else
182531840ae1SZheng Yan 				parent = path->nodes[*level + 1];
182631840ae1SZheng Yan 
182731840ae1SZheng Yan 			root_owner = btrfs_header_owner(parent);
18281e5063d0SMark Fasheh 			ret = wc->process_func(root, path->nodes[*level], wc,
1829e02119d5SChris Mason 				 btrfs_header_generation(path->nodes[*level]));
18301e5063d0SMark Fasheh 			if (ret)
18311e5063d0SMark Fasheh 				return ret;
18321e5063d0SMark Fasheh 
1833e02119d5SChris Mason 			if (wc->free) {
1834e02119d5SChris Mason 				struct extent_buffer *next;
1835e02119d5SChris Mason 
1836e02119d5SChris Mason 				next = path->nodes[*level];
1837e02119d5SChris Mason 
1838e02119d5SChris Mason 				btrfs_tree_lock(next);
1839b4ce94deSChris Mason 				btrfs_set_lock_blocking(next);
1840bd681513SChris Mason 				clean_tree_block(trans, root, next);
1841e02119d5SChris Mason 				btrfs_wait_tree_block_writeback(next);
1842e02119d5SChris Mason 				btrfs_tree_unlock(next);
1843e02119d5SChris Mason 
1844e02119d5SChris Mason 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1845e688b725SChris Mason 				ret = btrfs_free_and_pin_reserved_extent(root,
1846e02119d5SChris Mason 						path->nodes[*level]->start,
1847d00aff00SChris Mason 						path->nodes[*level]->len);
1848e02119d5SChris Mason 				BUG_ON(ret);
1849e02119d5SChris Mason 			}
1850e02119d5SChris Mason 			free_extent_buffer(path->nodes[*level]);
1851e02119d5SChris Mason 			path->nodes[*level] = NULL;
1852e02119d5SChris Mason 			*level = i + 1;
1853e02119d5SChris Mason 		}
1854e02119d5SChris Mason 	}
1855e02119d5SChris Mason 	return 1;
1856e02119d5SChris Mason }
1857e02119d5SChris Mason 
1858e02119d5SChris Mason /*
1859e02119d5SChris Mason  * drop the reference count on the tree rooted at 'snap'.  This traverses
1860e02119d5SChris Mason  * the tree freeing any blocks that have a ref count of zero after being
1861e02119d5SChris Mason  * decremented.
1862e02119d5SChris Mason  */
1863e02119d5SChris Mason static int walk_log_tree(struct btrfs_trans_handle *trans,
1864e02119d5SChris Mason 			 struct btrfs_root *log, struct walk_control *wc)
1865e02119d5SChris Mason {
1866e02119d5SChris Mason 	int ret = 0;
1867e02119d5SChris Mason 	int wret;
1868e02119d5SChris Mason 	int level;
1869e02119d5SChris Mason 	struct btrfs_path *path;
1870e02119d5SChris Mason 	int i;
1871e02119d5SChris Mason 	int orig_level;
1872e02119d5SChris Mason 
1873e02119d5SChris Mason 	path = btrfs_alloc_path();
1874db5b493aSTsutomu Itoh 	if (!path)
1875db5b493aSTsutomu Itoh 		return -ENOMEM;
1876e02119d5SChris Mason 
1877e02119d5SChris Mason 	level = btrfs_header_level(log->node);
1878e02119d5SChris Mason 	orig_level = level;
1879e02119d5SChris Mason 	path->nodes[level] = log->node;
1880e02119d5SChris Mason 	extent_buffer_get(log->node);
1881e02119d5SChris Mason 	path->slots[level] = 0;
1882e02119d5SChris Mason 
1883e02119d5SChris Mason 	while (1) {
1884e02119d5SChris Mason 		wret = walk_down_log_tree(trans, log, path, &level, wc);
1885e02119d5SChris Mason 		if (wret > 0)
1886e02119d5SChris Mason 			break;
188779787eaaSJeff Mahoney 		if (wret < 0) {
1888e02119d5SChris Mason 			ret = wret;
188979787eaaSJeff Mahoney 			goto out;
189079787eaaSJeff Mahoney 		}
1891e02119d5SChris Mason 
1892e02119d5SChris Mason 		wret = walk_up_log_tree(trans, log, path, &level, wc);
1893e02119d5SChris Mason 		if (wret > 0)
1894e02119d5SChris Mason 			break;
189579787eaaSJeff Mahoney 		if (wret < 0) {
1896e02119d5SChris Mason 			ret = wret;
189779787eaaSJeff Mahoney 			goto out;
189879787eaaSJeff Mahoney 		}
1899e02119d5SChris Mason 	}
1900e02119d5SChris Mason 
1901e02119d5SChris Mason 	/* was the root node processed? if not, catch it here */
1902e02119d5SChris Mason 	if (path->nodes[orig_level]) {
190379787eaaSJeff Mahoney 		ret = wc->process_func(log, path->nodes[orig_level], wc,
1904e02119d5SChris Mason 			 btrfs_header_generation(path->nodes[orig_level]));
190579787eaaSJeff Mahoney 		if (ret)
190679787eaaSJeff Mahoney 			goto out;
1907e02119d5SChris Mason 		if (wc->free) {
1908e02119d5SChris Mason 			struct extent_buffer *next;
1909e02119d5SChris Mason 
1910e02119d5SChris Mason 			next = path->nodes[orig_level];
1911e02119d5SChris Mason 
1912e02119d5SChris Mason 			btrfs_tree_lock(next);
1913b4ce94deSChris Mason 			btrfs_set_lock_blocking(next);
1914bd681513SChris Mason 			clean_tree_block(trans, log, next);
1915e02119d5SChris Mason 			btrfs_wait_tree_block_writeback(next);
1916e02119d5SChris Mason 			btrfs_tree_unlock(next);
1917e02119d5SChris Mason 
1918e02119d5SChris Mason 			WARN_ON(log->root_key.objectid !=
1919e02119d5SChris Mason 				BTRFS_TREE_LOG_OBJECTID);
1920e688b725SChris Mason 			ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1921d00aff00SChris Mason 							 next->len);
192279787eaaSJeff Mahoney 			BUG_ON(ret); /* -ENOMEM or logic errors */
1923e02119d5SChris Mason 		}
1924e02119d5SChris Mason 	}
1925e02119d5SChris Mason 
192679787eaaSJeff Mahoney out:
1927e02119d5SChris Mason 	for (i = 0; i <= orig_level; i++) {
1928e02119d5SChris Mason 		if (path->nodes[i]) {
1929e02119d5SChris Mason 			free_extent_buffer(path->nodes[i]);
1930e02119d5SChris Mason 			path->nodes[i] = NULL;
1931e02119d5SChris Mason 		}
1932e02119d5SChris Mason 	}
1933e02119d5SChris Mason 	btrfs_free_path(path);
1934e02119d5SChris Mason 	return ret;
1935e02119d5SChris Mason }
1936e02119d5SChris Mason 
19377237f183SYan Zheng /*
19387237f183SYan Zheng  * helper function to update the item for a given subvolumes log root
19397237f183SYan Zheng  * in the tree of log roots
19407237f183SYan Zheng  */
19417237f183SYan Zheng static int update_log_root(struct btrfs_trans_handle *trans,
19427237f183SYan Zheng 			   struct btrfs_root *log)
19437237f183SYan Zheng {
19447237f183SYan Zheng 	int ret;
19457237f183SYan Zheng 
19467237f183SYan Zheng 	if (log->log_transid == 1) {
19477237f183SYan Zheng 		/* insert root item on the first sync */
19487237f183SYan Zheng 		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
19497237f183SYan Zheng 				&log->root_key, &log->root_item);
19507237f183SYan Zheng 	} else {
19517237f183SYan Zheng 		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
19527237f183SYan Zheng 				&log->root_key, &log->root_item);
19537237f183SYan Zheng 	}
19547237f183SYan Zheng 	return ret;
19557237f183SYan Zheng }
19567237f183SYan Zheng 
/*
 * Sleep until the log commit for 'transid' on 'root' is no longer pending.
 *
 * The caller holds root->log_mutex; it is dropped around schedule() and
 * re-taken before the loop condition is evaluated, so the function also
 * returns with the mutex held.  Waiting stops as soon as any of these holds:
 *   - fs_info->last_trans_log_full_commit equals trans->transid,
 *   - root->log_transid has reached transid + 2,
 *   - root->log_commit[transid % 2] reads zero.
 *
 * Always returns 0.
 */
195712fcfd22SChris Mason static int wait_log_commit(struct btrfs_trans_handle *trans,
195812fcfd22SChris Mason 			   struct btrfs_root *root, unsigned long transid)
1959e02119d5SChris Mason {
1960e02119d5SChris Mason 	DEFINE_WAIT(wait);
19617237f183SYan Zheng 	int index = transid % 2;
1962e02119d5SChris Mason 
19637237f183SYan Zheng 	/*
19647237f183SYan Zheng 	 * we only allow two pending log transactions at a time,
19657237f183SYan Zheng 	 * so we know that if ours is more than 2 older than the
19667237f183SYan Zheng 	 * current transaction, we're done
19677237f183SYan Zheng 	 */
1968e02119d5SChris Mason 	do {
		/* queue ourselves before dropping the mutex and re-testing */
19697237f183SYan Zheng 		prepare_to_wait(&root->log_commit_wait[index],
19707237f183SYan Zheng 				&wait, TASK_UNINTERRUPTIBLE);
19717237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
197212fcfd22SChris Mason 
		/* same test as the loop condition, made without the mutex */
197312fcfd22SChris Mason 		if (root->fs_info->last_trans_log_full_commit !=
197412fcfd22SChris Mason 		    trans->transid && root->log_transid < transid + 2 &&
19757237f183SYan Zheng 		    atomic_read(&root->log_commit[index]))
1976e02119d5SChris Mason 			schedule();
197712fcfd22SChris Mason 
19787237f183SYan Zheng 		finish_wait(&root->log_commit_wait[index], &wait);
19797237f183SYan Zheng 		mutex_lock(&root->log_mutex);
19806dd70ce4SJan Kara 	} while (root->fs_info->last_trans_log_full_commit !=
19816dd70ce4SJan Kara 		 trans->transid && root->log_transid < transid + 2 &&
19827237f183SYan Zheng 		 atomic_read(&root->log_commit[index]));
19837237f183SYan Zheng 	return 0;
19847237f183SYan Zheng }
19857237f183SYan Zheng 
/*
 * Sleep until root->log_writers reads zero, or until
 * fs_info->last_trans_log_full_commit equals trans->transid.
 *
 * The caller holds root->log_mutex; it is dropped around schedule() and
 * held again on return.
 */
1986143bede5SJeff Mahoney static void wait_for_writer(struct btrfs_trans_handle *trans,
198712fcfd22SChris Mason 			    struct btrfs_root *root)
19887237f183SYan Zheng {
19897237f183SYan Zheng 	DEFINE_WAIT(wait);
19906dd70ce4SJan Kara 	while (root->fs_info->last_trans_log_full_commit !=
19916dd70ce4SJan Kara 	       trans->transid && atomic_read(&root->log_writers)) {
		/* queue ourselves before dropping the mutex and re-testing */
19927237f183SYan Zheng 		prepare_to_wait(&root->log_writer_wait,
19937237f183SYan Zheng 				&wait, TASK_UNINTERRUPTIBLE);
19947237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
199512fcfd22SChris Mason 		if (root->fs_info->last_trans_log_full_commit !=
199612fcfd22SChris Mason 		    trans->transid && atomic_read(&root->log_writers))
19977237f183SYan Zheng 			schedule();
19987237f183SYan Zheng 		mutex_lock(&root->log_mutex);
19997237f183SYan Zheng 		finish_wait(&root->log_writer_wait, &wait);
20007237f183SYan Zheng 	}
2001e02119d5SChris Mason }
2002e02119d5SChris Mason 
2003e02119d5SChris Mason /*
2004e02119d5SChris Mason  * btrfs_sync_log sends a given tree log down to the disk and
2005e02119d5SChris Mason  * updates the super blocks to record it.  When this call is done,
200612fcfd22SChris Mason  * you know that any inodes previously logged are safely on disk only
200712fcfd22SChris Mason  * if it returns 0.
200812fcfd22SChris Mason  *
200912fcfd22SChris Mason  * Any other return value means you need to call btrfs_commit_transaction.
201012fcfd22SChris Mason  * Some of the edge cases for fsyncing directories that have had unlinks
201112fcfd22SChris Mason  * or renames done in the past mean that sometimes the only safe
201212fcfd22SChris Mason  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
201312fcfd22SChris Mason  * that has happened.
 *
 * Outline: claim the commit slot log_commit[log_transid % 2] of 'root',
 * wait for the previous commit and for active log writers, start IO on the
 * log's marked extents, bump root->log_transid, update the log's root item
 * in the tree of log roots, then repeat the claim/wait protocol on the tree
 * of log roots, write it out and finally write the super block.
2014e02119d5SChris Mason  */
2015e02119d5SChris Mason int btrfs_sync_log(struct btrfs_trans_handle *trans,
2016e02119d5SChris Mason 		   struct btrfs_root *root)
2017e02119d5SChris Mason {
20187237f183SYan Zheng 	int index1;
20197237f183SYan Zheng 	int index2;
20208cef4e16SYan, Zheng 	int mark;
2021e02119d5SChris Mason 	int ret;
2022e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
20237237f183SYan Zheng 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
20248cef4e16SYan, Zheng 	unsigned long log_transid = 0;
2025e02119d5SChris Mason 
20267237f183SYan Zheng 	mutex_lock(&root->log_mutex);
20277237f183SYan Zheng 	index1 = root->log_transid % 2;
	/*
	 * the slot for this log transid is already claimed by another
	 * committer: wait for it to finish and report success
	 */
20287237f183SYan Zheng 	if (atomic_read(&root->log_commit[index1])) {
202912fcfd22SChris Mason 		wait_log_commit(trans, root, root->log_transid);
20307237f183SYan Zheng 		mutex_unlock(&root->log_mutex);
20317237f183SYan Zheng 		return 0;
2032e02119d5SChris Mason 	}
20337237f183SYan Zheng 	atomic_set(&root->log_commit[index1], 1);
20347237f183SYan Zheng 
20357237f183SYan Zheng 	/* wait for previous tree log sync to complete */
20367237f183SYan Zheng 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
203712fcfd22SChris Mason 		wait_log_commit(trans, root, root->log_transid - 1);
	/*
	 * wait for the log writers, and go around again for as long as
	 * log_batch changed while we were waiting
	 */
203886df7eb9SYan, Zheng 	while (1) {
2039*2ecb7923SMiao Xie 		int batch = atomic_read(&root->log_batch);
2040cd354ad6SChris Mason 		/* when we're on an ssd, just kick the log commit out */
2041cd354ad6SChris Mason 		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
20427237f183SYan Zheng 			mutex_unlock(&root->log_mutex);
2043e02119d5SChris Mason 			schedule_timeout_uninterruptible(1);
20447237f183SYan Zheng 			mutex_lock(&root->log_mutex);
204586df7eb9SYan, Zheng 		}
204612fcfd22SChris Mason 		wait_for_writer(trans, root);
2047*2ecb7923SMiao Xie 		if (batch == atomic_read(&root->log_batch))
2048e02119d5SChris Mason 			break;
2049e02119d5SChris Mason 	}
2050d0c803c4SChris Mason 
205112fcfd22SChris Mason 	/* bail out if we need to do a full commit */
205212fcfd22SChris Mason 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
205312fcfd22SChris Mason 		ret = -EAGAIN;
205412fcfd22SChris Mason 		mutex_unlock(&root->log_mutex);
205512fcfd22SChris Mason 		goto out;
205612fcfd22SChris Mason 	}
205712fcfd22SChris Mason 
	/* the extent bit used for this log's pages alternates with the transid */
20588cef4e16SYan, Zheng 	log_transid = root->log_transid;
20598cef4e16SYan, Zheng 	if (log_transid % 2 == 0)
20608cef4e16SYan, Zheng 		mark = EXTENT_DIRTY;
20618cef4e16SYan, Zheng 	else
20628cef4e16SYan, Zheng 		mark = EXTENT_NEW;
20638cef4e16SYan, Zheng 
2064690587d1SChris Mason 	/* we start IO on  all the marked extents here, but we don't actually
2065690587d1SChris Mason 	 * wait for them until later.
2066690587d1SChris Mason 	 */
20678cef4e16SYan, Zheng 	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
206879787eaaSJeff Mahoney 	if (ret) {
206979787eaaSJeff Mahoney 		btrfs_abort_transaction(trans, root, ret);
207079787eaaSJeff Mahoney 		mutex_unlock(&root->log_mutex);
207179787eaaSJeff Mahoney 		goto out;
207279787eaaSJeff Mahoney 	}
20737237f183SYan Zheng 
20745d4f98a2SYan Zheng 	btrfs_set_root_node(&log->root_item, log->node);
20757237f183SYan Zheng 
20767237f183SYan Zheng 	root->log_transid++;
20777237f183SYan Zheng 	log->log_transid = root->log_transid;
2078ff782e0aSJosef Bacik 	root->log_start_pid = 0;
20797237f183SYan Zheng 	smp_mb();
20807237f183SYan Zheng 	/*
20818cef4e16SYan, Zheng 	 * IO has been started, blocks of the log tree have WRITTEN flag set
20828cef4e16SYan, Zheng 	 * in their headers. new modifications of the log will be written to
20838cef4e16SYan, Zheng 	 * new positions. so it's safe to allow log writers to go in.
20847237f183SYan Zheng 	 */
20857237f183SYan Zheng 	mutex_unlock(&root->log_mutex);
20867237f183SYan Zheng 
	/* count ourselves as a writer of the log root tree during the update */
20877237f183SYan Zheng 	mutex_lock(&log_root_tree->log_mutex);
2088*2ecb7923SMiao Xie 	atomic_inc(&log_root_tree->log_batch);
20897237f183SYan Zheng 	atomic_inc(&log_root_tree->log_writers);
20907237f183SYan Zheng 	mutex_unlock(&log_root_tree->log_mutex);
20917237f183SYan Zheng 
20927237f183SYan Zheng 	ret = update_log_root(trans, log);
20937237f183SYan Zheng 
20947237f183SYan Zheng 	mutex_lock(&log_root_tree->log_mutex);
20957237f183SYan Zheng 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
20967237f183SYan Zheng 		smp_mb();
20977237f183SYan Zheng 		if (waitqueue_active(&log_root_tree->log_writer_wait))
20987237f183SYan Zheng 			wake_up(&log_root_tree->log_writer_wait);
20997237f183SYan Zheng 	}
21007237f183SYan Zheng 
	/*
	 * update_log_root() failed: anything but -ENOSPC aborts the
	 * transaction; -ENOSPC flags a full commit and returns -EAGAIN
	 */
21014a500fd1SYan, Zheng 	if (ret) {
210279787eaaSJeff Mahoney 		if (ret != -ENOSPC) {
210379787eaaSJeff Mahoney 			btrfs_abort_transaction(trans, root, ret);
210479787eaaSJeff Mahoney 			mutex_unlock(&log_root_tree->log_mutex);
210579787eaaSJeff Mahoney 			goto out;
210679787eaaSJeff Mahoney 		}
21074a500fd1SYan, Zheng 		root->fs_info->last_trans_log_full_commit = trans->transid;
21084a500fd1SYan, Zheng 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
21094a500fd1SYan, Zheng 		mutex_unlock(&log_root_tree->log_mutex);
21104a500fd1SYan, Zheng 		ret = -EAGAIN;
21114a500fd1SYan, Zheng 		goto out;
21124a500fd1SYan, Zheng 	}
21134a500fd1SYan, Zheng 
	/*
	 * same slot protocol as above, now on the tree of log roots: if its
	 * current commit is already claimed, finish our own IO, wait for
	 * that commit and return 0
	 */
21147237f183SYan Zheng 	index2 = log_root_tree->log_transid % 2;
21157237f183SYan Zheng 	if (atomic_read(&log_root_tree->log_commit[index2])) {
21168cef4e16SYan, Zheng 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
211712fcfd22SChris Mason 		wait_log_commit(trans, log_root_tree,
211812fcfd22SChris Mason 				log_root_tree->log_transid);
21197237f183SYan Zheng 		mutex_unlock(&log_root_tree->log_mutex);
2120b31eabd8SChris Mason 		ret = 0;
21217237f183SYan Zheng 		goto out;
21227237f183SYan Zheng 	}
21237237f183SYan Zheng 	atomic_set(&log_root_tree->log_commit[index2], 1);
21247237f183SYan Zheng 
212512fcfd22SChris Mason 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
212612fcfd22SChris Mason 		wait_log_commit(trans, log_root_tree,
212712fcfd22SChris Mason 				log_root_tree->log_transid - 1);
212812fcfd22SChris Mason 	}
21297237f183SYan Zheng 
213012fcfd22SChris Mason 	wait_for_writer(trans, log_root_tree);
213112fcfd22SChris Mason 
213212fcfd22SChris Mason 	/*
213312fcfd22SChris Mason 	 * now that we've moved on to the tree of log tree roots,
213412fcfd22SChris Mason 	 * check the full commit flag again
213512fcfd22SChris Mason 	 */
213612fcfd22SChris Mason 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
21378cef4e16SYan, Zheng 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
213812fcfd22SChris Mason 		mutex_unlock(&log_root_tree->log_mutex);
213912fcfd22SChris Mason 		ret = -EAGAIN;
214012fcfd22SChris Mason 		goto out_wake_log_root;
214112fcfd22SChris Mason 	}
21427237f183SYan Zheng 
	/* write out the tree of log roots, then wait for our own log's IO */
21437237f183SYan Zheng 	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
21448cef4e16SYan, Zheng 				&log_root_tree->dirty_log_pages,
21458cef4e16SYan, Zheng 				EXTENT_DIRTY | EXTENT_NEW);
214679787eaaSJeff Mahoney 	if (ret) {
214779787eaaSJeff Mahoney 		btrfs_abort_transaction(trans, root, ret);
214879787eaaSJeff Mahoney 		mutex_unlock(&log_root_tree->log_mutex);
214979787eaaSJeff Mahoney 		goto out_wake_log_root;
215079787eaaSJeff Mahoney 	}
21518cef4e16SYan, Zheng 	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2152e02119d5SChris Mason 
	/* point the super block copy at the current root of the log root tree */
21536c41761fSDavid Sterba 	btrfs_set_super_log_root(root->fs_info->super_for_commit,
21547237f183SYan Zheng 				log_root_tree->node->start);
21556c41761fSDavid Sterba 	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
21567237f183SYan Zheng 				btrfs_header_level(log_root_tree->node));
2157e02119d5SChris Mason 
21587237f183SYan Zheng 	log_root_tree->log_transid++;
2159e02119d5SChris Mason 	smp_mb();
21607237f183SYan Zheng 
21617237f183SYan Zheng 	mutex_unlock(&log_root_tree->log_mutex);
21627237f183SYan Zheng 
21637237f183SYan Zheng 	/*
21647237f183SYan Zheng 	 * nobody else is going to jump in and write the ctree
21657237f183SYan Zheng 	 * super here because the log_commit atomic below is protecting
21667237f183SYan Zheng 	 * us.  We must be called with a transaction handle pinning
21677237f183SYan Zheng 	 * the running transaction open, so a full commit can't hop
21687237f183SYan Zheng 	 * in and cause problems either.
21697237f183SYan Zheng 	 */
2170a2de733cSArne Jansen 	btrfs_scrub_pause_super(root);
	/* NOTE(review): the result of write_ctree_super() is not checked here */
21714722607dSChris Mason 	write_ctree_super(trans, root->fs_info->tree_root, 1);
2172a2de733cSArne Jansen 	btrfs_scrub_continue_super(root);
217312fcfd22SChris Mason 	ret = 0;
21747237f183SYan Zheng 
	/* remember the newest log transid this root has synced */
2175257c62e1SChris Mason 	mutex_lock(&root->log_mutex);
2176257c62e1SChris Mason 	if (root->last_log_commit < log_transid)
2177257c62e1SChris Mason 		root->last_log_commit = log_transid;
2178257c62e1SChris Mason 	mutex_unlock(&root->log_mutex);
2179257c62e1SChris Mason 
	/* release the log root tree's commit slot and wake anyone waiting on it */
218012fcfd22SChris Mason out_wake_log_root:
21817237f183SYan Zheng 	atomic_set(&log_root_tree->log_commit[index2], 0);
21827237f183SYan Zheng 	smp_mb();
21837237f183SYan Zheng 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
21847237f183SYan Zheng 		wake_up(&log_root_tree->log_commit_wait[index2]);
	/* release this root's commit slot and wake anyone waiting on it */
2185e02119d5SChris Mason out:
21867237f183SYan Zheng 	atomic_set(&root->log_commit[index1], 0);
21877237f183SYan Zheng 	smp_mb();
21887237f183SYan Zheng 	if (waitqueue_active(&root->log_commit_wait[index1]))
21897237f183SYan Zheng 		wake_up(&root->log_commit_wait[index1]);
2190b31eabd8SChris Mason 	return ret;
2191e02119d5SChris Mason }
2192e02119d5SChris Mason 
21934a500fd1SYan, Zheng static void free_log_tree(struct btrfs_trans_handle *trans,
21944a500fd1SYan, Zheng 			  struct btrfs_root *log)
2195e02119d5SChris Mason {
2196e02119d5SChris Mason 	int ret;
2197d0c803c4SChris Mason 	u64 start;
2198d0c803c4SChris Mason 	u64 end;
2199e02119d5SChris Mason 	struct walk_control wc = {
2200e02119d5SChris Mason 		.free = 1,
2201e02119d5SChris Mason 		.process_func = process_one_buffer
2202e02119d5SChris Mason 	};
2203e02119d5SChris Mason 
2204e02119d5SChris Mason 	ret = walk_log_tree(trans, log, &wc);
2205e02119d5SChris Mason 	BUG_ON(ret);
2206e02119d5SChris Mason 
2207d0c803c4SChris Mason 	while (1) {
2208d0c803c4SChris Mason 		ret = find_first_extent_bit(&log->dirty_log_pages,
22098cef4e16SYan, Zheng 				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
2210d0c803c4SChris Mason 		if (ret)
2211d0c803c4SChris Mason 			break;
2212d0c803c4SChris Mason 
22138cef4e16SYan, Zheng 		clear_extent_bits(&log->dirty_log_pages, start, end,
22148cef4e16SYan, Zheng 				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2215d0c803c4SChris Mason 	}
2216d0c803c4SChris Mason 
22177237f183SYan Zheng 	free_extent_buffer(log->node);
22187237f183SYan Zheng 	kfree(log);
22194a500fd1SYan, Zheng }
22204a500fd1SYan, Zheng 
22214a500fd1SYan, Zheng /*
22224a500fd1SYan, Zheng  * free all the extents used by the tree log.  This should be called
22234a500fd1SYan, Zheng  * at commit time of the full transaction
22244a500fd1SYan, Zheng  */
22254a500fd1SYan, Zheng int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
22264a500fd1SYan, Zheng {
22274a500fd1SYan, Zheng 	if (root->log_root) {
22284a500fd1SYan, Zheng 		free_log_tree(trans, root->log_root);
22294a500fd1SYan, Zheng 		root->log_root = NULL;
22304a500fd1SYan, Zheng 	}
22314a500fd1SYan, Zheng 	return 0;
22324a500fd1SYan, Zheng }
22334a500fd1SYan, Zheng 
22344a500fd1SYan, Zheng int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
22354a500fd1SYan, Zheng 			     struct btrfs_fs_info *fs_info)
22364a500fd1SYan, Zheng {
22374a500fd1SYan, Zheng 	if (fs_info->log_root_tree) {
22384a500fd1SYan, Zheng 		free_log_tree(trans, fs_info->log_root_tree);
22394a500fd1SYan, Zheng 		fs_info->log_root_tree = NULL;
22404a500fd1SYan, Zheng 	}
2241e02119d5SChris Mason 	return 0;
2242e02119d5SChris Mason }
2243e02119d5SChris Mason 
2244e02119d5SChris Mason /*
2245e02119d5SChris Mason  * If both a file and directory are logged, and unlinks or renames are
2246e02119d5SChris Mason  * mixed in, we have a few interesting corners:
2247e02119d5SChris Mason  *
2248e02119d5SChris Mason  * create file X in dir Y
2249e02119d5SChris Mason  * link file X to X.link in dir Y
2250e02119d5SChris Mason  * fsync file X
2251e02119d5SChris Mason  * unlink file X but leave X.link
2252e02119d5SChris Mason  * fsync dir Y
2253e02119d5SChris Mason  *
2254e02119d5SChris Mason  * After a crash we would expect only X.link to exist.  But file X
2255e02119d5SChris Mason  * didn't get fsync'd again so the log has back refs for X and X.link.
2256e02119d5SChris Mason  *
2257e02119d5SChris Mason  * We solve this by removing directory entries and inode backrefs from the
2258e02119d5SChris Mason  * log when a file that was logged in the current transaction is
2259e02119d5SChris Mason  * unlinked.  Any later fsync will include the updated log entries, and
2260e02119d5SChris Mason  * we'll be able to reconstruct the proper directory items from backrefs.
2261e02119d5SChris Mason  *
2262e02119d5SChris Mason  * This optimizations allows us to avoid relogging the entire inode
2263e02119d5SChris Mason  * or the entire directory.
2264e02119d5SChris Mason  */
2265e02119d5SChris Mason int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2266e02119d5SChris Mason 				 struct btrfs_root *root,
2267e02119d5SChris Mason 				 const char *name, int name_len,
2268e02119d5SChris Mason 				 struct inode *dir, u64 index)
2269e02119d5SChris Mason {
2270e02119d5SChris Mason 	struct btrfs_root *log;
2271e02119d5SChris Mason 	struct btrfs_dir_item *di;
2272e02119d5SChris Mason 	struct btrfs_path *path;
2273e02119d5SChris Mason 	int ret;
22744a500fd1SYan, Zheng 	int err = 0;
2275e02119d5SChris Mason 	int bytes_del = 0;
227633345d01SLi Zefan 	u64 dir_ino = btrfs_ino(dir);
2277e02119d5SChris Mason 
22783a5f1d45SChris Mason 	if (BTRFS_I(dir)->logged_trans < trans->transid)
22793a5f1d45SChris Mason 		return 0;
22803a5f1d45SChris Mason 
2281e02119d5SChris Mason 	ret = join_running_log_trans(root);
2282e02119d5SChris Mason 	if (ret)
2283e02119d5SChris Mason 		return 0;
2284e02119d5SChris Mason 
2285e02119d5SChris Mason 	mutex_lock(&BTRFS_I(dir)->log_mutex);
2286e02119d5SChris Mason 
2287e02119d5SChris Mason 	log = root->log_root;
2288e02119d5SChris Mason 	path = btrfs_alloc_path();
2289a62f44a5STsutomu Itoh 	if (!path) {
2290a62f44a5STsutomu Itoh 		err = -ENOMEM;
2291a62f44a5STsutomu Itoh 		goto out_unlock;
2292a62f44a5STsutomu Itoh 	}
22932a29edc6Sliubo 
229433345d01SLi Zefan 	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2295e02119d5SChris Mason 				   name, name_len, -1);
22964a500fd1SYan, Zheng 	if (IS_ERR(di)) {
22974a500fd1SYan, Zheng 		err = PTR_ERR(di);
22984a500fd1SYan, Zheng 		goto fail;
22994a500fd1SYan, Zheng 	}
23004a500fd1SYan, Zheng 	if (di) {
2301e02119d5SChris Mason 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
2302e02119d5SChris Mason 		bytes_del += name_len;
2303e02119d5SChris Mason 		BUG_ON(ret);
2304e02119d5SChris Mason 	}
2305b3b4aa74SDavid Sterba 	btrfs_release_path(path);
230633345d01SLi Zefan 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2307e02119d5SChris Mason 					 index, name, name_len, -1);
23084a500fd1SYan, Zheng 	if (IS_ERR(di)) {
23094a500fd1SYan, Zheng 		err = PTR_ERR(di);
23104a500fd1SYan, Zheng 		goto fail;
23114a500fd1SYan, Zheng 	}
23124a500fd1SYan, Zheng 	if (di) {
2313e02119d5SChris Mason 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
2314e02119d5SChris Mason 		bytes_del += name_len;
2315e02119d5SChris Mason 		BUG_ON(ret);
2316e02119d5SChris Mason 	}
2317e02119d5SChris Mason 
2318e02119d5SChris Mason 	/* update the directory size in the log to reflect the names
2319e02119d5SChris Mason 	 * we have removed
2320e02119d5SChris Mason 	 */
2321e02119d5SChris Mason 	if (bytes_del) {
2322e02119d5SChris Mason 		struct btrfs_key key;
2323e02119d5SChris Mason 
232433345d01SLi Zefan 		key.objectid = dir_ino;
2325e02119d5SChris Mason 		key.offset = 0;
2326e02119d5SChris Mason 		key.type = BTRFS_INODE_ITEM_KEY;
2327b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2328e02119d5SChris Mason 
2329e02119d5SChris Mason 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
23304a500fd1SYan, Zheng 		if (ret < 0) {
23314a500fd1SYan, Zheng 			err = ret;
23324a500fd1SYan, Zheng 			goto fail;
23334a500fd1SYan, Zheng 		}
2334e02119d5SChris Mason 		if (ret == 0) {
2335e02119d5SChris Mason 			struct btrfs_inode_item *item;
2336e02119d5SChris Mason 			u64 i_size;
2337e02119d5SChris Mason 
2338e02119d5SChris Mason 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2339e02119d5SChris Mason 					      struct btrfs_inode_item);
2340e02119d5SChris Mason 			i_size = btrfs_inode_size(path->nodes[0], item);
2341e02119d5SChris Mason 			if (i_size > bytes_del)
2342e02119d5SChris Mason 				i_size -= bytes_del;
2343e02119d5SChris Mason 			else
2344e02119d5SChris Mason 				i_size = 0;
2345e02119d5SChris Mason 			btrfs_set_inode_size(path->nodes[0], item, i_size);
2346e02119d5SChris Mason 			btrfs_mark_buffer_dirty(path->nodes[0]);
2347e02119d5SChris Mason 		} else
2348e02119d5SChris Mason 			ret = 0;
2349b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2350e02119d5SChris Mason 	}
23514a500fd1SYan, Zheng fail:
2352e02119d5SChris Mason 	btrfs_free_path(path);
2353a62f44a5STsutomu Itoh out_unlock:
2354e02119d5SChris Mason 	mutex_unlock(&BTRFS_I(dir)->log_mutex);
23554a500fd1SYan, Zheng 	if (ret == -ENOSPC) {
23564a500fd1SYan, Zheng 		root->fs_info->last_trans_log_full_commit = trans->transid;
23574a500fd1SYan, Zheng 		ret = 0;
235879787eaaSJeff Mahoney 	} else if (ret < 0)
235979787eaaSJeff Mahoney 		btrfs_abort_transaction(trans, root, ret);
236079787eaaSJeff Mahoney 
236112fcfd22SChris Mason 	btrfs_end_log_trans(root);
2362e02119d5SChris Mason 
2363411fc6bcSAndi Kleen 	return err;
2364e02119d5SChris Mason }
2365e02119d5SChris Mason 
2366e02119d5SChris Mason /* see comments for btrfs_del_dir_entries_in_log */
2367e02119d5SChris Mason int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2368e02119d5SChris Mason 			       struct btrfs_root *root,
2369e02119d5SChris Mason 			       const char *name, int name_len,
2370e02119d5SChris Mason 			       struct inode *inode, u64 dirid)
2371e02119d5SChris Mason {
2372e02119d5SChris Mason 	struct btrfs_root *log;
2373e02119d5SChris Mason 	u64 index;
2374e02119d5SChris Mason 	int ret;
2375e02119d5SChris Mason 
23763a5f1d45SChris Mason 	if (BTRFS_I(inode)->logged_trans < trans->transid)
23773a5f1d45SChris Mason 		return 0;
23783a5f1d45SChris Mason 
2379e02119d5SChris Mason 	ret = join_running_log_trans(root);
2380e02119d5SChris Mason 	if (ret)
2381e02119d5SChris Mason 		return 0;
2382e02119d5SChris Mason 	log = root->log_root;
2383e02119d5SChris Mason 	mutex_lock(&BTRFS_I(inode)->log_mutex);
2384e02119d5SChris Mason 
238533345d01SLi Zefan 	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2386e02119d5SChris Mason 				  dirid, &index);
2387e02119d5SChris Mason 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
23884a500fd1SYan, Zheng 	if (ret == -ENOSPC) {
23894a500fd1SYan, Zheng 		root->fs_info->last_trans_log_full_commit = trans->transid;
23904a500fd1SYan, Zheng 		ret = 0;
239179787eaaSJeff Mahoney 	} else if (ret < 0 && ret != -ENOENT)
239279787eaaSJeff Mahoney 		btrfs_abort_transaction(trans, root, ret);
239312fcfd22SChris Mason 	btrfs_end_log_trans(root);
2394e02119d5SChris Mason 
2395e02119d5SChris Mason 	return ret;
2396e02119d5SChris Mason }
2397e02119d5SChris Mason 
2398e02119d5SChris Mason /*
2399e02119d5SChris Mason  * creates a range item in the log for 'dirid'.  first_offset and
2400e02119d5SChris Mason  * last_offset tell us which parts of the key space the log should
2401e02119d5SChris Mason  * be considered authoritative for.
2402e02119d5SChris Mason  */
2403e02119d5SChris Mason static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2404e02119d5SChris Mason 				       struct btrfs_root *log,
2405e02119d5SChris Mason 				       struct btrfs_path *path,
2406e02119d5SChris Mason 				       int key_type, u64 dirid,
2407e02119d5SChris Mason 				       u64 first_offset, u64 last_offset)
2408e02119d5SChris Mason {
2409e02119d5SChris Mason 	int ret;
2410e02119d5SChris Mason 	struct btrfs_key key;
2411e02119d5SChris Mason 	struct btrfs_dir_log_item *item;
2412e02119d5SChris Mason 
2413e02119d5SChris Mason 	key.objectid = dirid;
2414e02119d5SChris Mason 	key.offset = first_offset;
2415e02119d5SChris Mason 	if (key_type == BTRFS_DIR_ITEM_KEY)
2416e02119d5SChris Mason 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
2417e02119d5SChris Mason 	else
2418e02119d5SChris Mason 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
2419e02119d5SChris Mason 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
24204a500fd1SYan, Zheng 	if (ret)
24214a500fd1SYan, Zheng 		return ret;
2422e02119d5SChris Mason 
2423e02119d5SChris Mason 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2424e02119d5SChris Mason 			      struct btrfs_dir_log_item);
2425e02119d5SChris Mason 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2426e02119d5SChris Mason 	btrfs_mark_buffer_dirty(path->nodes[0]);
2427b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2428e02119d5SChris Mason 	return 0;
2429e02119d5SChris Mason }
2430e02119d5SChris Mason 
2431e02119d5SChris Mason /*
2432e02119d5SChris Mason  * log all the items included in the current transaction for a given
2433e02119d5SChris Mason  * directory.  This also creates the range items in the log tree required
2434e02119d5SChris Mason  * to replay anything deleted before the fsync
2435e02119d5SChris Mason  */
2436e02119d5SChris Mason static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2437e02119d5SChris Mason 			  struct btrfs_root *root, struct inode *inode,
2438e02119d5SChris Mason 			  struct btrfs_path *path,
2439e02119d5SChris Mason 			  struct btrfs_path *dst_path, int key_type,
2440e02119d5SChris Mason 			  u64 min_offset, u64 *last_offset_ret)
2441e02119d5SChris Mason {
2442e02119d5SChris Mason 	struct btrfs_key min_key;
2443e02119d5SChris Mason 	struct btrfs_key max_key;
2444e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
2445e02119d5SChris Mason 	struct extent_buffer *src;
24464a500fd1SYan, Zheng 	int err = 0;
2447e02119d5SChris Mason 	int ret;
2448e02119d5SChris Mason 	int i;
2449e02119d5SChris Mason 	int nritems;
2450e02119d5SChris Mason 	u64 first_offset = min_offset;
2451e02119d5SChris Mason 	u64 last_offset = (u64)-1;
245233345d01SLi Zefan 	u64 ino = btrfs_ino(inode);
2453e02119d5SChris Mason 
2454e02119d5SChris Mason 	log = root->log_root;
245533345d01SLi Zefan 	max_key.objectid = ino;
2456e02119d5SChris Mason 	max_key.offset = (u64)-1;
2457e02119d5SChris Mason 	max_key.type = key_type;
2458e02119d5SChris Mason 
245933345d01SLi Zefan 	min_key.objectid = ino;
2460e02119d5SChris Mason 	min_key.type = key_type;
2461e02119d5SChris Mason 	min_key.offset = min_offset;
2462e02119d5SChris Mason 
2463e02119d5SChris Mason 	path->keep_locks = 1;
2464e02119d5SChris Mason 
2465e02119d5SChris Mason 	ret = btrfs_search_forward(root, &min_key, &max_key,
2466e02119d5SChris Mason 				   path, 0, trans->transid);
2467e02119d5SChris Mason 
2468e02119d5SChris Mason 	/*
2469e02119d5SChris Mason 	 * we didn't find anything from this transaction, see if there
2470e02119d5SChris Mason 	 * is anything at all
2471e02119d5SChris Mason 	 */
247233345d01SLi Zefan 	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
247333345d01SLi Zefan 		min_key.objectid = ino;
2474e02119d5SChris Mason 		min_key.type = key_type;
2475e02119d5SChris Mason 		min_key.offset = (u64)-1;
2476b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2477e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2478e02119d5SChris Mason 		if (ret < 0) {
2479b3b4aa74SDavid Sterba 			btrfs_release_path(path);
2480e02119d5SChris Mason 			return ret;
2481e02119d5SChris Mason 		}
248233345d01SLi Zefan 		ret = btrfs_previous_item(root, path, ino, key_type);
2483e02119d5SChris Mason 
2484e02119d5SChris Mason 		/* if ret == 0 there are items for this type,
2485e02119d5SChris Mason 		 * create a range to tell us the last key of this type.
2486e02119d5SChris Mason 		 * otherwise, there are no items in this directory after
2487e02119d5SChris Mason 		 * *min_offset, and we create a range to indicate that.
2488e02119d5SChris Mason 		 */
2489e02119d5SChris Mason 		if (ret == 0) {
2490e02119d5SChris Mason 			struct btrfs_key tmp;
2491e02119d5SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2492e02119d5SChris Mason 					      path->slots[0]);
2493d397712bSChris Mason 			if (key_type == tmp.type)
2494e02119d5SChris Mason 				first_offset = max(min_offset, tmp.offset) + 1;
2495e02119d5SChris Mason 		}
2496e02119d5SChris Mason 		goto done;
2497e02119d5SChris Mason 	}
2498e02119d5SChris Mason 
2499e02119d5SChris Mason 	/* go backward to find any previous key */
250033345d01SLi Zefan 	ret = btrfs_previous_item(root, path, ino, key_type);
2501e02119d5SChris Mason 	if (ret == 0) {
2502e02119d5SChris Mason 		struct btrfs_key tmp;
2503e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2504e02119d5SChris Mason 		if (key_type == tmp.type) {
2505e02119d5SChris Mason 			first_offset = tmp.offset;
2506e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path,
2507e02119d5SChris Mason 					     path->nodes[0], path->slots[0],
2508e02119d5SChris Mason 					     &tmp);
25094a500fd1SYan, Zheng 			if (ret) {
25104a500fd1SYan, Zheng 				err = ret;
25114a500fd1SYan, Zheng 				goto done;
25124a500fd1SYan, Zheng 			}
2513e02119d5SChris Mason 		}
2514e02119d5SChris Mason 	}
2515b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2516e02119d5SChris Mason 
2517e02119d5SChris Mason 	/* find the first key from this transaction again */
2518e02119d5SChris Mason 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2519e02119d5SChris Mason 	if (ret != 0) {
2520e02119d5SChris Mason 		WARN_ON(1);
2521e02119d5SChris Mason 		goto done;
2522e02119d5SChris Mason 	}
2523e02119d5SChris Mason 
2524e02119d5SChris Mason 	/*
2525e02119d5SChris Mason 	 * we have a block from this transaction, log every item in it
2526e02119d5SChris Mason 	 * from our directory
2527e02119d5SChris Mason 	 */
2528e02119d5SChris Mason 	while (1) {
2529e02119d5SChris Mason 		struct btrfs_key tmp;
2530e02119d5SChris Mason 		src = path->nodes[0];
2531e02119d5SChris Mason 		nritems = btrfs_header_nritems(src);
2532e02119d5SChris Mason 		for (i = path->slots[0]; i < nritems; i++) {
2533e02119d5SChris Mason 			btrfs_item_key_to_cpu(src, &min_key, i);
2534e02119d5SChris Mason 
253533345d01SLi Zefan 			if (min_key.objectid != ino || min_key.type != key_type)
2536e02119d5SChris Mason 				goto done;
2537e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path, src, i,
2538e02119d5SChris Mason 					     &min_key);
25394a500fd1SYan, Zheng 			if (ret) {
25404a500fd1SYan, Zheng 				err = ret;
25414a500fd1SYan, Zheng 				goto done;
25424a500fd1SYan, Zheng 			}
2543e02119d5SChris Mason 		}
2544e02119d5SChris Mason 		path->slots[0] = nritems;
2545e02119d5SChris Mason 
2546e02119d5SChris Mason 		/*
2547e02119d5SChris Mason 		 * look ahead to the next item and see if it is also
2548e02119d5SChris Mason 		 * from this directory and from this transaction
2549e02119d5SChris Mason 		 */
2550e02119d5SChris Mason 		ret = btrfs_next_leaf(root, path);
2551e02119d5SChris Mason 		if (ret == 1) {
2552e02119d5SChris Mason 			last_offset = (u64)-1;
2553e02119d5SChris Mason 			goto done;
2554e02119d5SChris Mason 		}
2555e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
255633345d01SLi Zefan 		if (tmp.objectid != ino || tmp.type != key_type) {
2557e02119d5SChris Mason 			last_offset = (u64)-1;
2558e02119d5SChris Mason 			goto done;
2559e02119d5SChris Mason 		}
2560e02119d5SChris Mason 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2561e02119d5SChris Mason 			ret = overwrite_item(trans, log, dst_path,
2562e02119d5SChris Mason 					     path->nodes[0], path->slots[0],
2563e02119d5SChris Mason 					     &tmp);
25644a500fd1SYan, Zheng 			if (ret)
25654a500fd1SYan, Zheng 				err = ret;
25664a500fd1SYan, Zheng 			else
2567e02119d5SChris Mason 				last_offset = tmp.offset;
2568e02119d5SChris Mason 			goto done;
2569e02119d5SChris Mason 		}
2570e02119d5SChris Mason 	}
2571e02119d5SChris Mason done:
2572b3b4aa74SDavid Sterba 	btrfs_release_path(path);
2573b3b4aa74SDavid Sterba 	btrfs_release_path(dst_path);
2574e02119d5SChris Mason 
25754a500fd1SYan, Zheng 	if (err == 0) {
25764a500fd1SYan, Zheng 		*last_offset_ret = last_offset;
25774a500fd1SYan, Zheng 		/*
25784a500fd1SYan, Zheng 		 * insert the log range keys to indicate where the log
25794a500fd1SYan, Zheng 		 * is valid
25804a500fd1SYan, Zheng 		 */
25814a500fd1SYan, Zheng 		ret = insert_dir_log_key(trans, log, path, key_type,
258233345d01SLi Zefan 					 ino, first_offset, last_offset);
25834a500fd1SYan, Zheng 		if (ret)
25844a500fd1SYan, Zheng 			err = ret;
25854a500fd1SYan, Zheng 	}
25864a500fd1SYan, Zheng 	return err;
2587e02119d5SChris Mason }
2588e02119d5SChris Mason 
2589e02119d5SChris Mason /*
2590e02119d5SChris Mason  * logging directories is very similar to logging inodes, We find all the items
2591e02119d5SChris Mason  * from the current transaction and write them to the log.
2592e02119d5SChris Mason  *
2593e02119d5SChris Mason  * The recovery code scans the directory in the subvolume, and if it finds a
2594e02119d5SChris Mason  * key in the range logged that is not present in the log tree, then it means
2595e02119d5SChris Mason  * that dir entry was unlinked during the transaction.
2596e02119d5SChris Mason  *
2597e02119d5SChris Mason  * In order for that scan to work, we must include one key smaller than
2598e02119d5SChris Mason  * the smallest logged by this transaction and one key larger than the largest
2599e02119d5SChris Mason  * key logged by this transaction.
2600e02119d5SChris Mason  */
2601e02119d5SChris Mason static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2602e02119d5SChris Mason 			  struct btrfs_root *root, struct inode *inode,
2603e02119d5SChris Mason 			  struct btrfs_path *path,
2604e02119d5SChris Mason 			  struct btrfs_path *dst_path)
2605e02119d5SChris Mason {
2606e02119d5SChris Mason 	u64 min_key;
2607e02119d5SChris Mason 	u64 max_key;
2608e02119d5SChris Mason 	int ret;
2609e02119d5SChris Mason 	int key_type = BTRFS_DIR_ITEM_KEY;
2610e02119d5SChris Mason 
2611e02119d5SChris Mason again:
2612e02119d5SChris Mason 	min_key = 0;
2613e02119d5SChris Mason 	max_key = 0;
2614e02119d5SChris Mason 	while (1) {
2615e02119d5SChris Mason 		ret = log_dir_items(trans, root, inode, path,
2616e02119d5SChris Mason 				    dst_path, key_type, min_key,
2617e02119d5SChris Mason 				    &max_key);
26184a500fd1SYan, Zheng 		if (ret)
26194a500fd1SYan, Zheng 			return ret;
2620e02119d5SChris Mason 		if (max_key == (u64)-1)
2621e02119d5SChris Mason 			break;
2622e02119d5SChris Mason 		min_key = max_key + 1;
2623e02119d5SChris Mason 	}
2624e02119d5SChris Mason 
2625e02119d5SChris Mason 	if (key_type == BTRFS_DIR_ITEM_KEY) {
2626e02119d5SChris Mason 		key_type = BTRFS_DIR_INDEX_KEY;
2627e02119d5SChris Mason 		goto again;
2628e02119d5SChris Mason 	}
2629e02119d5SChris Mason 	return 0;
2630e02119d5SChris Mason }
2631e02119d5SChris Mason 
2632e02119d5SChris Mason /*
2633e02119d5SChris Mason  * a helper function to drop items from the log before we relog an
2634e02119d5SChris Mason  * inode.  max_key_type indicates the highest item type to remove.
2635e02119d5SChris Mason  * This cannot be run for file data extents because it does not
2636e02119d5SChris Mason  * free the extents they point to.
2637e02119d5SChris Mason  */
2638e02119d5SChris Mason static int drop_objectid_items(struct btrfs_trans_handle *trans,
2639e02119d5SChris Mason 				  struct btrfs_root *log,
2640e02119d5SChris Mason 				  struct btrfs_path *path,
2641e02119d5SChris Mason 				  u64 objectid, int max_key_type)
2642e02119d5SChris Mason {
2643e02119d5SChris Mason 	int ret;
2644e02119d5SChris Mason 	struct btrfs_key key;
2645e02119d5SChris Mason 	struct btrfs_key found_key;
2646e02119d5SChris Mason 
2647e02119d5SChris Mason 	key.objectid = objectid;
2648e02119d5SChris Mason 	key.type = max_key_type;
2649e02119d5SChris Mason 	key.offset = (u64)-1;
2650e02119d5SChris Mason 
2651e02119d5SChris Mason 	while (1) {
2652e02119d5SChris Mason 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
26534a500fd1SYan, Zheng 		BUG_ON(ret == 0);
26544a500fd1SYan, Zheng 		if (ret < 0)
2655e02119d5SChris Mason 			break;
2656e02119d5SChris Mason 
2657e02119d5SChris Mason 		if (path->slots[0] == 0)
2658e02119d5SChris Mason 			break;
2659e02119d5SChris Mason 
2660e02119d5SChris Mason 		path->slots[0]--;
2661e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2662e02119d5SChris Mason 				      path->slots[0]);
2663e02119d5SChris Mason 
2664e02119d5SChris Mason 		if (found_key.objectid != objectid)
2665e02119d5SChris Mason 			break;
2666e02119d5SChris Mason 
2667e02119d5SChris Mason 		ret = btrfs_del_item(trans, log, path);
266865a246c5STsutomu Itoh 		if (ret)
266965a246c5STsutomu Itoh 			break;
2670b3b4aa74SDavid Sterba 		btrfs_release_path(path);
2671e02119d5SChris Mason 	}
2672b3b4aa74SDavid Sterba 	btrfs_release_path(path);
26735bdbeb21SJosef Bacik 	if (ret > 0)
26745bdbeb21SJosef Bacik 		ret = 0;
26754a500fd1SYan, Zheng 	return ret;
2676e02119d5SChris Mason }
2677e02119d5SChris Mason 
267831ff1cd2SChris Mason static noinline int copy_items(struct btrfs_trans_handle *trans,
2679d2794405SLiu Bo 			       struct inode *inode,
268031ff1cd2SChris Mason 			       struct btrfs_path *dst_path,
268131ff1cd2SChris Mason 			       struct extent_buffer *src,
268231ff1cd2SChris Mason 			       int start_slot, int nr, int inode_only)
268331ff1cd2SChris Mason {
268431ff1cd2SChris Mason 	unsigned long src_offset;
268531ff1cd2SChris Mason 	unsigned long dst_offset;
2686d2794405SLiu Bo 	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
268731ff1cd2SChris Mason 	struct btrfs_file_extent_item *extent;
268831ff1cd2SChris Mason 	struct btrfs_inode_item *inode_item;
268931ff1cd2SChris Mason 	int ret;
269031ff1cd2SChris Mason 	struct btrfs_key *ins_keys;
269131ff1cd2SChris Mason 	u32 *ins_sizes;
269231ff1cd2SChris Mason 	char *ins_data;
269331ff1cd2SChris Mason 	int i;
2694d20f7043SChris Mason 	struct list_head ordered_sums;
2695d2794405SLiu Bo 	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2696d20f7043SChris Mason 
2697d20f7043SChris Mason 	INIT_LIST_HEAD(&ordered_sums);
269831ff1cd2SChris Mason 
269931ff1cd2SChris Mason 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
270031ff1cd2SChris Mason 			   nr * sizeof(u32), GFP_NOFS);
27012a29edc6Sliubo 	if (!ins_data)
27022a29edc6Sliubo 		return -ENOMEM;
27032a29edc6Sliubo 
270431ff1cd2SChris Mason 	ins_sizes = (u32 *)ins_data;
270531ff1cd2SChris Mason 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
270631ff1cd2SChris Mason 
270731ff1cd2SChris Mason 	for (i = 0; i < nr; i++) {
270831ff1cd2SChris Mason 		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
270931ff1cd2SChris Mason 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
271031ff1cd2SChris Mason 	}
271131ff1cd2SChris Mason 	ret = btrfs_insert_empty_items(trans, log, dst_path,
271231ff1cd2SChris Mason 				       ins_keys, ins_sizes, nr);
27134a500fd1SYan, Zheng 	if (ret) {
27144a500fd1SYan, Zheng 		kfree(ins_data);
27154a500fd1SYan, Zheng 		return ret;
27164a500fd1SYan, Zheng 	}
271731ff1cd2SChris Mason 
27185d4f98a2SYan Zheng 	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
271931ff1cd2SChris Mason 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
272031ff1cd2SChris Mason 						   dst_path->slots[0]);
272131ff1cd2SChris Mason 
272231ff1cd2SChris Mason 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
272331ff1cd2SChris Mason 
272431ff1cd2SChris Mason 		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
272531ff1cd2SChris Mason 				   src_offset, ins_sizes[i]);
272631ff1cd2SChris Mason 
272731ff1cd2SChris Mason 		if (inode_only == LOG_INODE_EXISTS &&
272831ff1cd2SChris Mason 		    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
272931ff1cd2SChris Mason 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
273031ff1cd2SChris Mason 						    dst_path->slots[0],
273131ff1cd2SChris Mason 						    struct btrfs_inode_item);
273231ff1cd2SChris Mason 			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
273331ff1cd2SChris Mason 
273431ff1cd2SChris Mason 			/* set the generation to zero so the recover code
273531ff1cd2SChris Mason 			 * can tell the difference between an logging
273631ff1cd2SChris Mason 			 * just to say 'this inode exists' and a logging
273731ff1cd2SChris Mason 			 * to say 'update this inode with these values'
273831ff1cd2SChris Mason 			 */
273931ff1cd2SChris Mason 			btrfs_set_inode_generation(dst_path->nodes[0],
274031ff1cd2SChris Mason 						   inode_item, 0);
274131ff1cd2SChris Mason 		}
274231ff1cd2SChris Mason 		/* take a reference on file data extents so that truncates
274331ff1cd2SChris Mason 		 * or deletes of this inode don't have to relog the inode
274431ff1cd2SChris Mason 		 * again
274531ff1cd2SChris Mason 		 */
2746d2794405SLiu Bo 		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
2747d2794405SLiu Bo 		    !skip_csum) {
274831ff1cd2SChris Mason 			int found_type;
274931ff1cd2SChris Mason 			extent = btrfs_item_ptr(src, start_slot + i,
275031ff1cd2SChris Mason 						struct btrfs_file_extent_item);
275131ff1cd2SChris Mason 
27528e531cdfSliubo 			if (btrfs_file_extent_generation(src, extent) < trans->transid)
27538e531cdfSliubo 				continue;
27548e531cdfSliubo 
275531ff1cd2SChris Mason 			found_type = btrfs_file_extent_type(src, extent);
2756d899e052SYan Zheng 			if (found_type == BTRFS_FILE_EXTENT_REG ||
2757d899e052SYan Zheng 			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
27585d4f98a2SYan Zheng 				u64 ds, dl, cs, cl;
27595d4f98a2SYan Zheng 				ds = btrfs_file_extent_disk_bytenr(src,
276031ff1cd2SChris Mason 								extent);
27615d4f98a2SYan Zheng 				/* ds == 0 is a hole */
27625d4f98a2SYan Zheng 				if (ds == 0)
27635d4f98a2SYan Zheng 					continue;
27645d4f98a2SYan Zheng 
27655d4f98a2SYan Zheng 				dl = btrfs_file_extent_disk_num_bytes(src,
276631ff1cd2SChris Mason 								extent);
27675d4f98a2SYan Zheng 				cs = btrfs_file_extent_offset(src, extent);
27685d4f98a2SYan Zheng 				cl = btrfs_file_extent_num_bytes(src,
2769a419aef8SJoe Perches 								extent);
2770580afd76SChris Mason 				if (btrfs_file_extent_compression(src,
2771580afd76SChris Mason 								  extent)) {
2772580afd76SChris Mason 					cs = 0;
2773580afd76SChris Mason 					cl = dl;
2774580afd76SChris Mason 				}
27755d4f98a2SYan Zheng 
277607d400a6SYan Zheng 				ret = btrfs_lookup_csums_range(
2777d20f7043SChris Mason 						log->fs_info->csum_root,
277807d400a6SYan Zheng 						ds + cs, ds + cs + cl - 1,
2779a2de733cSArne Jansen 						&ordered_sums, 0);
2780d20f7043SChris Mason 				BUG_ON(ret);
278131ff1cd2SChris Mason 			}
278231ff1cd2SChris Mason 		}
278331ff1cd2SChris Mason 	}
278431ff1cd2SChris Mason 
278531ff1cd2SChris Mason 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2786b3b4aa74SDavid Sterba 	btrfs_release_path(dst_path);
278731ff1cd2SChris Mason 	kfree(ins_data);
2788d20f7043SChris Mason 
2789d20f7043SChris Mason 	/*
2790d20f7043SChris Mason 	 * we have to do this after the loop above to avoid changing the
2791d20f7043SChris Mason 	 * log tree while trying to change the log tree.
2792d20f7043SChris Mason 	 */
27934a500fd1SYan, Zheng 	ret = 0;
2794d20f7043SChris Mason 	while (!list_empty(&ordered_sums)) {
2795d20f7043SChris Mason 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2796d20f7043SChris Mason 						   struct btrfs_ordered_sum,
2797d20f7043SChris Mason 						   list);
27984a500fd1SYan, Zheng 		if (!ret)
2799d20f7043SChris Mason 			ret = btrfs_csum_file_blocks(trans, log, sums);
2800d20f7043SChris Mason 		list_del(&sums->list);
2801d20f7043SChris Mason 		kfree(sums);
2802d20f7043SChris Mason 	}
28034a500fd1SYan, Zheng 	return ret;
280431ff1cd2SChris Mason }
280531ff1cd2SChris Mason 
28065dc562c5SJosef Bacik static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
28075dc562c5SJosef Bacik {
28085dc562c5SJosef Bacik 	struct extent_map *em1, *em2;
28095dc562c5SJosef Bacik 
28105dc562c5SJosef Bacik 	em1 = list_entry(a, struct extent_map, list);
28115dc562c5SJosef Bacik 	em2 = list_entry(b, struct extent_map, list);
28125dc562c5SJosef Bacik 
28135dc562c5SJosef Bacik 	if (em1->start < em2->start)
28145dc562c5SJosef Bacik 		return -1;
28155dc562c5SJosef Bacik 	else if (em1->start > em2->start)
28165dc562c5SJosef Bacik 		return 1;
28175dc562c5SJosef Bacik 	return 0;
28185dc562c5SJosef Bacik }
28195dc562c5SJosef Bacik 
28205dc562c5SJosef Bacik struct log_args {
28215dc562c5SJosef Bacik 	struct extent_buffer *src;
28225dc562c5SJosef Bacik 	u64 next_offset;
28235dc562c5SJosef Bacik 	int start_slot;
28245dc562c5SJosef Bacik 	int nr;
28255dc562c5SJosef Bacik };
28265dc562c5SJosef Bacik 
28275dc562c5SJosef Bacik static int log_one_extent(struct btrfs_trans_handle *trans,
28285dc562c5SJosef Bacik 			  struct inode *inode, struct btrfs_root *root,
28295dc562c5SJosef Bacik 			  struct extent_map *em, struct btrfs_path *path,
28305dc562c5SJosef Bacik 			  struct btrfs_path *dst_path, struct log_args *args)
28315dc562c5SJosef Bacik {
28325dc562c5SJosef Bacik 	struct btrfs_root *log = root->log_root;
28335dc562c5SJosef Bacik 	struct btrfs_file_extent_item *fi;
28345dc562c5SJosef Bacik 	struct btrfs_key key;
28354e2f84e6SLiu Bo 	u64 start = em->mod_start;
28364e2f84e6SLiu Bo 	u64 len = em->mod_len;
28375dc562c5SJosef Bacik 	u64 num_bytes;
28385dc562c5SJosef Bacik 	int nritems;
28395dc562c5SJosef Bacik 	int ret;
28405dc562c5SJosef Bacik 
28415dc562c5SJosef Bacik 	if (BTRFS_I(inode)->logged_trans == trans->transid) {
28425dc562c5SJosef Bacik 		ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
28432aaa6655SJosef Bacik 					   start + len, NULL, 0);
28445dc562c5SJosef Bacik 		if (ret)
28455dc562c5SJosef Bacik 			return ret;
28465dc562c5SJosef Bacik 	}
28475dc562c5SJosef Bacik 
28485dc562c5SJosef Bacik 	while (len) {
28495dc562c5SJosef Bacik 		if (args->nr)
28505dc562c5SJosef Bacik 			goto next_slot;
28515dc562c5SJosef Bacik 		key.objectid = btrfs_ino(inode);
28525dc562c5SJosef Bacik 		key.type = BTRFS_EXTENT_DATA_KEY;
28535dc562c5SJosef Bacik 		key.offset = start;
28545dc562c5SJosef Bacik 
28555dc562c5SJosef Bacik 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
28565dc562c5SJosef Bacik 		if (ret < 0)
28575dc562c5SJosef Bacik 			return ret;
28585dc562c5SJosef Bacik 		if (ret) {
28595dc562c5SJosef Bacik 			/*
28605dc562c5SJosef Bacik 			 * This shouldn't happen, but it might so warn and
28615dc562c5SJosef Bacik 			 * return an error.
28625dc562c5SJosef Bacik 			 */
28635dc562c5SJosef Bacik 			WARN_ON(1);
28645dc562c5SJosef Bacik 			return -ENOENT;
28655dc562c5SJosef Bacik 		}
28665dc562c5SJosef Bacik 		args->src = path->nodes[0];
28675dc562c5SJosef Bacik next_slot:
28685dc562c5SJosef Bacik 		fi = btrfs_item_ptr(args->src, path->slots[0],
28695dc562c5SJosef Bacik 				    struct btrfs_file_extent_item);
28705dc562c5SJosef Bacik 		if (args->nr &&
28715dc562c5SJosef Bacik 		    args->start_slot + args->nr == path->slots[0]) {
28725dc562c5SJosef Bacik 			args->nr++;
28735dc562c5SJosef Bacik 		} else if (args->nr) {
2874d2794405SLiu Bo 			ret = copy_items(trans, inode, dst_path, args->src,
28755dc562c5SJosef Bacik 					 args->start_slot, args->nr,
28765dc562c5SJosef Bacik 					 LOG_INODE_ALL);
28775dc562c5SJosef Bacik 			if (ret)
28785dc562c5SJosef Bacik 				return ret;
28795dc562c5SJosef Bacik 			args->nr = 1;
28805dc562c5SJosef Bacik 			args->start_slot = path->slots[0];
28815dc562c5SJosef Bacik 		} else if (!args->nr) {
28825dc562c5SJosef Bacik 			args->nr = 1;
28835dc562c5SJosef Bacik 			args->start_slot = path->slots[0];
28845dc562c5SJosef Bacik 		}
28855dc562c5SJosef Bacik 		nritems = btrfs_header_nritems(path->nodes[0]);
28865dc562c5SJosef Bacik 		path->slots[0]++;
28875dc562c5SJosef Bacik 		num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
28885dc562c5SJosef Bacik 		if (len < num_bytes) {
28895dc562c5SJosef Bacik 			/* I _think_ this is ok, envision we write to a
28905dc562c5SJosef Bacik 			 * preallocated space that is adjacent to a previously
28915dc562c5SJosef Bacik 			 * written preallocated space that gets merged when we
28925dc562c5SJosef Bacik 			 * mark this preallocated space written.  If we do not
28935dc562c5SJosef Bacik 			 * have the adjacent extent in cache then when we copy
28945dc562c5SJosef Bacik 			 * this extent it could end up being larger than our EM
28955dc562c5SJosef Bacik 			 * thinks it is, which is a-ok, so just set len to 0.
28965dc562c5SJosef Bacik 			 */
28975dc562c5SJosef Bacik 			len = 0;
28985dc562c5SJosef Bacik 		} else {
28995dc562c5SJosef Bacik 			len -= num_bytes;
29005dc562c5SJosef Bacik 		}
29015dc562c5SJosef Bacik 		start += btrfs_file_extent_num_bytes(args->src, fi);
29025dc562c5SJosef Bacik 		args->next_offset = start;
29035dc562c5SJosef Bacik 
29045dc562c5SJosef Bacik 		if (path->slots[0] < nritems) {
29055dc562c5SJosef Bacik 			if (len)
29065dc562c5SJosef Bacik 				goto next_slot;
29075dc562c5SJosef Bacik 			break;
29085dc562c5SJosef Bacik 		}
29095dc562c5SJosef Bacik 
29105dc562c5SJosef Bacik 		if (args->nr) {
2911d2794405SLiu Bo 			ret = copy_items(trans, inode, dst_path, args->src,
29125dc562c5SJosef Bacik 					 args->start_slot, args->nr,
29135dc562c5SJosef Bacik 					 LOG_INODE_ALL);
29145dc562c5SJosef Bacik 			if (ret)
29155dc562c5SJosef Bacik 				return ret;
29165dc562c5SJosef Bacik 			args->nr = 0;
29175dc562c5SJosef Bacik 			btrfs_release_path(path);
29185dc562c5SJosef Bacik 		}
29195dc562c5SJosef Bacik 	}
29205dc562c5SJosef Bacik 
29215dc562c5SJosef Bacik 	return 0;
29225dc562c5SJosef Bacik }
29235dc562c5SJosef Bacik 
29245dc562c5SJosef Bacik static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
29255dc562c5SJosef Bacik 				     struct btrfs_root *root,
29265dc562c5SJosef Bacik 				     struct inode *inode,
29275dc562c5SJosef Bacik 				     struct btrfs_path *path,
29285dc562c5SJosef Bacik 				     struct btrfs_path *dst_path)
29295dc562c5SJosef Bacik {
29305dc562c5SJosef Bacik 	struct log_args args;
29315dc562c5SJosef Bacik 	struct extent_map *em, *n;
29325dc562c5SJosef Bacik 	struct list_head extents;
29335dc562c5SJosef Bacik 	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
29345dc562c5SJosef Bacik 	u64 test_gen;
29355dc562c5SJosef Bacik 	int ret = 0;
29365dc562c5SJosef Bacik 
29375dc562c5SJosef Bacik 	INIT_LIST_HEAD(&extents);
29385dc562c5SJosef Bacik 
29395dc562c5SJosef Bacik 	memset(&args, 0, sizeof(args));
29405dc562c5SJosef Bacik 
29415dc562c5SJosef Bacik 	write_lock(&tree->lock);
29425dc562c5SJosef Bacik 	test_gen = root->fs_info->last_trans_committed;
29435dc562c5SJosef Bacik 
29445dc562c5SJosef Bacik 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
29455dc562c5SJosef Bacik 		list_del_init(&em->list);
29465dc562c5SJosef Bacik 		if (em->generation <= test_gen)
29475dc562c5SJosef Bacik 			continue;
29485dc562c5SJosef Bacik 		list_add_tail(&em->list, &extents);
29495dc562c5SJosef Bacik 	}
29505dc562c5SJosef Bacik 
29515dc562c5SJosef Bacik 	list_sort(NULL, &extents, extent_cmp);
29525dc562c5SJosef Bacik 
29535dc562c5SJosef Bacik 	while (!list_empty(&extents)) {
29545dc562c5SJosef Bacik 		em = list_entry(extents.next, struct extent_map, list);
29555dc562c5SJosef Bacik 
29565dc562c5SJosef Bacik 		list_del_init(&em->list);
29575dc562c5SJosef Bacik 
29585dc562c5SJosef Bacik 		/*
29595dc562c5SJosef Bacik 		 * If we had an error we just need to delete everybody from our
29605dc562c5SJosef Bacik 		 * private list.
29615dc562c5SJosef Bacik 		 */
29625dc562c5SJosef Bacik 		if (ret)
29635dc562c5SJosef Bacik 			continue;
29645dc562c5SJosef Bacik 
29655dc562c5SJosef Bacik 		/*
29665dc562c5SJosef Bacik 		 * If the previous EM and the last extent we left off on aren't
29675dc562c5SJosef Bacik 		 * sequential then we need to copy the items we have and redo
29685dc562c5SJosef Bacik 		 * our search
29695dc562c5SJosef Bacik 		 */
29704e2f84e6SLiu Bo 		if (args.nr && em->mod_start != args.next_offset) {
2971d2794405SLiu Bo 			ret = copy_items(trans, inode, dst_path, args.src,
29725dc562c5SJosef Bacik 					 args.start_slot, args.nr,
29735dc562c5SJosef Bacik 					 LOG_INODE_ALL);
29745dc562c5SJosef Bacik 			if (ret)
29755dc562c5SJosef Bacik 				continue;
29765dc562c5SJosef Bacik 			btrfs_release_path(path);
29775dc562c5SJosef Bacik 			args.nr = 0;
29785dc562c5SJosef Bacik 		}
29795dc562c5SJosef Bacik 
29805dc562c5SJosef Bacik 		ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
29815dc562c5SJosef Bacik 	}
29825dc562c5SJosef Bacik 
29835dc562c5SJosef Bacik 	if (!ret && args.nr)
2984d2794405SLiu Bo 		ret = copy_items(trans, inode, dst_path, args.src,
29855dc562c5SJosef Bacik 				 args.start_slot, args.nr, LOG_INODE_ALL);
29865dc562c5SJosef Bacik 	btrfs_release_path(path);
29875dc562c5SJosef Bacik 	WARN_ON(!list_empty(&extents));
29885dc562c5SJosef Bacik 	write_unlock(&tree->lock);
29895dc562c5SJosef Bacik 	return ret;
29905dc562c5SJosef Bacik }
29915dc562c5SJosef Bacik 
2992e02119d5SChris Mason /* log a single inode in the tree log.
2993e02119d5SChris Mason  * At least one parent directory for this inode must exist in the tree
2994e02119d5SChris Mason  * or be logged already.
2995e02119d5SChris Mason  *
2996e02119d5SChris Mason  * Any items from this inode changed by the current transaction are copied
2997e02119d5SChris Mason  * to the log tree.  An extra reference is taken on any extents in this
2998e02119d5SChris Mason  * file, allowing us to avoid a whole pile of corner cases around logging
2999e02119d5SChris Mason  * blocks that have been removed from the tree.
3000e02119d5SChris Mason  *
3001e02119d5SChris Mason  * See LOG_INODE_ALL and related defines for a description of what inode_only
3002e02119d5SChris Mason  * does.
3003e02119d5SChris Mason  *
3004e02119d5SChris Mason  * This handles both files and directories.
3005e02119d5SChris Mason  */
300612fcfd22SChris Mason static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3007e02119d5SChris Mason 			     struct btrfs_root *root, struct inode *inode,
3008e02119d5SChris Mason 			     int inode_only)
3009e02119d5SChris Mason {
3010e02119d5SChris Mason 	struct btrfs_path *path;
3011e02119d5SChris Mason 	struct btrfs_path *dst_path;
3012e02119d5SChris Mason 	struct btrfs_key min_key;
3013e02119d5SChris Mason 	struct btrfs_key max_key;
3014e02119d5SChris Mason 	struct btrfs_root *log = root->log_root;
301531ff1cd2SChris Mason 	struct extent_buffer *src = NULL;
30164a500fd1SYan, Zheng 	int err = 0;
3017e02119d5SChris Mason 	int ret;
30183a5f1d45SChris Mason 	int nritems;
301931ff1cd2SChris Mason 	int ins_start_slot = 0;
302031ff1cd2SChris Mason 	int ins_nr;
30215dc562c5SJosef Bacik 	bool fast_search = false;
302233345d01SLi Zefan 	u64 ino = btrfs_ino(inode);
3023e02119d5SChris Mason 
3024e02119d5SChris Mason 	log = root->log_root;
3025e02119d5SChris Mason 
3026e02119d5SChris Mason 	path = btrfs_alloc_path();
30275df67083STsutomu Itoh 	if (!path)
30285df67083STsutomu Itoh 		return -ENOMEM;
3029e02119d5SChris Mason 	dst_path = btrfs_alloc_path();
30305df67083STsutomu Itoh 	if (!dst_path) {
30315df67083STsutomu Itoh 		btrfs_free_path(path);
30325df67083STsutomu Itoh 		return -ENOMEM;
30335df67083STsutomu Itoh 	}
3034e02119d5SChris Mason 
303533345d01SLi Zefan 	min_key.objectid = ino;
3036e02119d5SChris Mason 	min_key.type = BTRFS_INODE_ITEM_KEY;
3037e02119d5SChris Mason 	min_key.offset = 0;
3038e02119d5SChris Mason 
303933345d01SLi Zefan 	max_key.objectid = ino;
304012fcfd22SChris Mason 
304112fcfd22SChris Mason 
30425dc562c5SJosef Bacik 	/* today the code can only do partial logging of directories */
3043e02119d5SChris Mason 	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
3044e02119d5SChris Mason 		max_key.type = BTRFS_XATTR_ITEM_KEY;
3045e02119d5SChris Mason 	else
3046e02119d5SChris Mason 		max_key.type = (u8)-1;
3047e02119d5SChris Mason 	max_key.offset = (u64)-1;
3048e02119d5SChris Mason 
304916cdcec7SMiao Xie 	ret = btrfs_commit_inode_delayed_items(trans, inode);
305016cdcec7SMiao Xie 	if (ret) {
305116cdcec7SMiao Xie 		btrfs_free_path(path);
305216cdcec7SMiao Xie 		btrfs_free_path(dst_path);
305316cdcec7SMiao Xie 		return ret;
305416cdcec7SMiao Xie 	}
305516cdcec7SMiao Xie 
3056e02119d5SChris Mason 	mutex_lock(&BTRFS_I(inode)->log_mutex);
3057e02119d5SChris Mason 
3058e02119d5SChris Mason 	/*
3059e02119d5SChris Mason 	 * a brute force approach to making sure we get the most uptodate
3060e02119d5SChris Mason 	 * copies of everything.
3061e02119d5SChris Mason 	 */
3062e02119d5SChris Mason 	if (S_ISDIR(inode->i_mode)) {
3063e02119d5SChris Mason 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
3064e02119d5SChris Mason 
3065e02119d5SChris Mason 		if (inode_only == LOG_INODE_EXISTS)
3066e02119d5SChris Mason 			max_key_type = BTRFS_XATTR_ITEM_KEY;
306733345d01SLi Zefan 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
3068e02119d5SChris Mason 	} else {
30695dc562c5SJosef Bacik 		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
30705dc562c5SJosef Bacik 				       &BTRFS_I(inode)->runtime_flags)) {
30715dc562c5SJosef Bacik 			ret = btrfs_truncate_inode_items(trans, log,
30725dc562c5SJosef Bacik 							 inode, 0, 0);
30735dc562c5SJosef Bacik 		} else {
30745dc562c5SJosef Bacik 			fast_search = true;
30755dc562c5SJosef Bacik 			max_key.type = BTRFS_XATTR_ITEM_KEY;
30765dc562c5SJosef Bacik 			ret = drop_objectid_items(trans, log, path, ino,
30775dc562c5SJosef Bacik 						  BTRFS_XATTR_ITEM_KEY);
30785dc562c5SJosef Bacik 		}
3079e02119d5SChris Mason 	}
30804a500fd1SYan, Zheng 	if (ret) {
30814a500fd1SYan, Zheng 		err = ret;
30824a500fd1SYan, Zheng 		goto out_unlock;
30834a500fd1SYan, Zheng 	}
3084e02119d5SChris Mason 	path->keep_locks = 1;
3085e02119d5SChris Mason 
3086e02119d5SChris Mason 	while (1) {
308731ff1cd2SChris Mason 		ins_nr = 0;
3088e02119d5SChris Mason 		ret = btrfs_search_forward(root, &min_key, &max_key,
3089e02119d5SChris Mason 					   path, 0, trans->transid);
3090e02119d5SChris Mason 		if (ret != 0)
3091e02119d5SChris Mason 			break;
30923a5f1d45SChris Mason again:
309331ff1cd2SChris Mason 		/* note, ins_nr might be > 0 here, cleanup outside the loop */
309433345d01SLi Zefan 		if (min_key.objectid != ino)
3095e02119d5SChris Mason 			break;
3096e02119d5SChris Mason 		if (min_key.type > max_key.type)
3097e02119d5SChris Mason 			break;
309831ff1cd2SChris Mason 
3099e02119d5SChris Mason 		src = path->nodes[0];
310031ff1cd2SChris Mason 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
310131ff1cd2SChris Mason 			ins_nr++;
310231ff1cd2SChris Mason 			goto next_slot;
310331ff1cd2SChris Mason 		} else if (!ins_nr) {
310431ff1cd2SChris Mason 			ins_start_slot = path->slots[0];
310531ff1cd2SChris Mason 			ins_nr = 1;
310631ff1cd2SChris Mason 			goto next_slot;
3107e02119d5SChris Mason 		}
3108e02119d5SChris Mason 
3109d2794405SLiu Bo 		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
311031ff1cd2SChris Mason 				 ins_nr, inode_only);
31114a500fd1SYan, Zheng 		if (ret) {
31124a500fd1SYan, Zheng 			err = ret;
31134a500fd1SYan, Zheng 			goto out_unlock;
31144a500fd1SYan, Zheng 		}
311531ff1cd2SChris Mason 		ins_nr = 1;
311631ff1cd2SChris Mason 		ins_start_slot = path->slots[0];
311731ff1cd2SChris Mason next_slot:
3118e02119d5SChris Mason 
31193a5f1d45SChris Mason 		nritems = btrfs_header_nritems(path->nodes[0]);
31203a5f1d45SChris Mason 		path->slots[0]++;
31213a5f1d45SChris Mason 		if (path->slots[0] < nritems) {
31223a5f1d45SChris Mason 			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
31233a5f1d45SChris Mason 					      path->slots[0]);
31243a5f1d45SChris Mason 			goto again;
31253a5f1d45SChris Mason 		}
312631ff1cd2SChris Mason 		if (ins_nr) {
3127d2794405SLiu Bo 			ret = copy_items(trans, inode, dst_path, src,
312831ff1cd2SChris Mason 					 ins_start_slot,
312931ff1cd2SChris Mason 					 ins_nr, inode_only);
31304a500fd1SYan, Zheng 			if (ret) {
31314a500fd1SYan, Zheng 				err = ret;
31324a500fd1SYan, Zheng 				goto out_unlock;
31334a500fd1SYan, Zheng 			}
313431ff1cd2SChris Mason 			ins_nr = 0;
313531ff1cd2SChris Mason 		}
3136b3b4aa74SDavid Sterba 		btrfs_release_path(path);
31373a5f1d45SChris Mason 
3138e02119d5SChris Mason 		if (min_key.offset < (u64)-1)
3139e02119d5SChris Mason 			min_key.offset++;
3140e02119d5SChris Mason 		else if (min_key.type < (u8)-1)
3141e02119d5SChris Mason 			min_key.type++;
3142e02119d5SChris Mason 		else if (min_key.objectid < (u64)-1)
3143e02119d5SChris Mason 			min_key.objectid++;
3144e02119d5SChris Mason 		else
3145e02119d5SChris Mason 			break;
3146e02119d5SChris Mason 	}
314731ff1cd2SChris Mason 	if (ins_nr) {
3148d2794405SLiu Bo 		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
314931ff1cd2SChris Mason 				 ins_nr, inode_only);
31504a500fd1SYan, Zheng 		if (ret) {
31514a500fd1SYan, Zheng 			err = ret;
31524a500fd1SYan, Zheng 			goto out_unlock;
31534a500fd1SYan, Zheng 		}
315431ff1cd2SChris Mason 		ins_nr = 0;
315531ff1cd2SChris Mason 	}
31565dc562c5SJosef Bacik 
31575dc562c5SJosef Bacik 	if (fast_search) {
31585dc562c5SJosef Bacik 		btrfs_release_path(path);
31595dc562c5SJosef Bacik 		btrfs_release_path(dst_path);
31605dc562c5SJosef Bacik 		ret = btrfs_log_changed_extents(trans, root, inode, path,
31615dc562c5SJosef Bacik 						dst_path);
31625dc562c5SJosef Bacik 		if (ret) {
31635dc562c5SJosef Bacik 			err = ret;
31645dc562c5SJosef Bacik 			goto out_unlock;
31655dc562c5SJosef Bacik 		}
316606d3d22bSLiu Bo 	} else {
316706d3d22bSLiu Bo 		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
316806d3d22bSLiu Bo 		struct extent_map *em, *n;
316906d3d22bSLiu Bo 
317006d3d22bSLiu Bo 		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
317106d3d22bSLiu Bo 			list_del_init(&em->list);
31725dc562c5SJosef Bacik 	}
31735dc562c5SJosef Bacik 
31749623f9a3SChris Mason 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
3175b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3176b3b4aa74SDavid Sterba 		btrfs_release_path(dst_path);
3177e02119d5SChris Mason 		ret = log_directory_changes(trans, root, inode, path, dst_path);
31784a500fd1SYan, Zheng 		if (ret) {
31794a500fd1SYan, Zheng 			err = ret;
31804a500fd1SYan, Zheng 			goto out_unlock;
31814a500fd1SYan, Zheng 		}
3182e02119d5SChris Mason 	}
31833a5f1d45SChris Mason 	BTRFS_I(inode)->logged_trans = trans->transid;
318446d8bc34SLiu Bo 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
31854a500fd1SYan, Zheng out_unlock:
3186e02119d5SChris Mason 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
3187e02119d5SChris Mason 
3188e02119d5SChris Mason 	btrfs_free_path(path);
3189e02119d5SChris Mason 	btrfs_free_path(dst_path);
31904a500fd1SYan, Zheng 	return err;
3191e02119d5SChris Mason }
3192e02119d5SChris Mason 
319312fcfd22SChris Mason /*
319412fcfd22SChris Mason  * follow the dentry parent pointers up the chain and see if any
319512fcfd22SChris Mason  * of the directories in it require a full commit before they can
319612fcfd22SChris Mason  * be logged.  Returns zero if nothing special needs to be done or 1 if
319712fcfd22SChris Mason  * a full commit is required.
319812fcfd22SChris Mason  */
319912fcfd22SChris Mason static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
320012fcfd22SChris Mason 					       struct inode *inode,
320112fcfd22SChris Mason 					       struct dentry *parent,
320212fcfd22SChris Mason 					       struct super_block *sb,
320312fcfd22SChris Mason 					       u64 last_committed)
3204e02119d5SChris Mason {
320512fcfd22SChris Mason 	int ret = 0;
320612fcfd22SChris Mason 	struct btrfs_root *root;
32076a912213SJosef Bacik 	struct dentry *old_parent = NULL;
3208e02119d5SChris Mason 
3209af4176b4SChris Mason 	/*
3210af4176b4SChris Mason 	 * for regular files, if its inode is already on disk, we don't
3211af4176b4SChris Mason 	 * have to worry about the parents at all.  This is because
3212af4176b4SChris Mason 	 * we can use the last_unlink_trans field to record renames
3213af4176b4SChris Mason 	 * and other fun in this file.
3214af4176b4SChris Mason 	 */
3215af4176b4SChris Mason 	if (S_ISREG(inode->i_mode) &&
3216af4176b4SChris Mason 	    BTRFS_I(inode)->generation <= last_committed &&
3217af4176b4SChris Mason 	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
3218af4176b4SChris Mason 			goto out;
3219af4176b4SChris Mason 
322012fcfd22SChris Mason 	if (!S_ISDIR(inode->i_mode)) {
322112fcfd22SChris Mason 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
322212fcfd22SChris Mason 			goto out;
322312fcfd22SChris Mason 		inode = parent->d_inode;
322412fcfd22SChris Mason 	}
322512fcfd22SChris Mason 
322612fcfd22SChris Mason 	while (1) {
322712fcfd22SChris Mason 		BTRFS_I(inode)->logged_trans = trans->transid;
322812fcfd22SChris Mason 		smp_mb();
322912fcfd22SChris Mason 
323012fcfd22SChris Mason 		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
323112fcfd22SChris Mason 			root = BTRFS_I(inode)->root;
323212fcfd22SChris Mason 
323312fcfd22SChris Mason 			/*
323412fcfd22SChris Mason 			 * make sure any commits to the log are forced
323512fcfd22SChris Mason 			 * to be full commits
323612fcfd22SChris Mason 			 */
323712fcfd22SChris Mason 			root->fs_info->last_trans_log_full_commit =
323812fcfd22SChris Mason 				trans->transid;
323912fcfd22SChris Mason 			ret = 1;
324012fcfd22SChris Mason 			break;
324112fcfd22SChris Mason 		}
324212fcfd22SChris Mason 
324312fcfd22SChris Mason 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
324412fcfd22SChris Mason 			break;
324512fcfd22SChris Mason 
324676dda93cSYan, Zheng 		if (IS_ROOT(parent))
324712fcfd22SChris Mason 			break;
324812fcfd22SChris Mason 
32496a912213SJosef Bacik 		parent = dget_parent(parent);
32506a912213SJosef Bacik 		dput(old_parent);
32516a912213SJosef Bacik 		old_parent = parent;
325212fcfd22SChris Mason 		inode = parent->d_inode;
325312fcfd22SChris Mason 
325412fcfd22SChris Mason 	}
32556a912213SJosef Bacik 	dput(old_parent);
325612fcfd22SChris Mason out:
3257e02119d5SChris Mason 	return ret;
3258e02119d5SChris Mason }
3259e02119d5SChris Mason 
3260e02119d5SChris Mason /*
3261e02119d5SChris Mason  * helper function around btrfs_log_inode to make sure newly created
3262e02119d5SChris Mason  * parent directories also end up in the log.  A minimal inode and backref
3263e02119d5SChris Mason  * only logging is done of any parent directories that are older than
3264e02119d5SChris Mason  * the last committed transaction
3265e02119d5SChris Mason  */
326612fcfd22SChris Mason int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
326712fcfd22SChris Mason 		    struct btrfs_root *root, struct inode *inode,
326812fcfd22SChris Mason 		    struct dentry *parent, int exists_only)
3269e02119d5SChris Mason {
327012fcfd22SChris Mason 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
3271e02119d5SChris Mason 	struct super_block *sb;
32726a912213SJosef Bacik 	struct dentry *old_parent = NULL;
327312fcfd22SChris Mason 	int ret = 0;
327412fcfd22SChris Mason 	u64 last_committed = root->fs_info->last_trans_committed;
327512fcfd22SChris Mason 
327612fcfd22SChris Mason 	sb = inode->i_sb;
327712fcfd22SChris Mason 
32783a5e1404SSage Weil 	if (btrfs_test_opt(root, NOTREELOG)) {
32793a5e1404SSage Weil 		ret = 1;
32803a5e1404SSage Weil 		goto end_no_trans;
32813a5e1404SSage Weil 	}
32823a5e1404SSage Weil 
328312fcfd22SChris Mason 	if (root->fs_info->last_trans_log_full_commit >
328412fcfd22SChris Mason 	    root->fs_info->last_trans_committed) {
328512fcfd22SChris Mason 		ret = 1;
328612fcfd22SChris Mason 		goto end_no_trans;
328712fcfd22SChris Mason 	}
328812fcfd22SChris Mason 
328976dda93cSYan, Zheng 	if (root != BTRFS_I(inode)->root ||
329076dda93cSYan, Zheng 	    btrfs_root_refs(&root->root_item) == 0) {
329176dda93cSYan, Zheng 		ret = 1;
329276dda93cSYan, Zheng 		goto end_no_trans;
329376dda93cSYan, Zheng 	}
329476dda93cSYan, Zheng 
329512fcfd22SChris Mason 	ret = check_parent_dirs_for_sync(trans, inode, parent,
329612fcfd22SChris Mason 					 sb, last_committed);
329712fcfd22SChris Mason 	if (ret)
329812fcfd22SChris Mason 		goto end_no_trans;
3299e02119d5SChris Mason 
330022ee6985SJosef Bacik 	if (btrfs_inode_in_log(inode, trans->transid)) {
3301257c62e1SChris Mason 		ret = BTRFS_NO_LOG_SYNC;
3302257c62e1SChris Mason 		goto end_no_trans;
3303257c62e1SChris Mason 	}
3304257c62e1SChris Mason 
33054a500fd1SYan, Zheng 	ret = start_log_trans(trans, root);
33064a500fd1SYan, Zheng 	if (ret)
33074a500fd1SYan, Zheng 		goto end_trans;
330812fcfd22SChris Mason 
330912fcfd22SChris Mason 	ret = btrfs_log_inode(trans, root, inode, inode_only);
33104a500fd1SYan, Zheng 	if (ret)
33114a500fd1SYan, Zheng 		goto end_trans;
3312e02119d5SChris Mason 
3313af4176b4SChris Mason 	/*
3314af4176b4SChris Mason 	 * for regular files, if its inode is already on disk, we don't
3315af4176b4SChris Mason 	 * have to worry about the parents at all.  This is because
3316af4176b4SChris Mason 	 * we can use the last_unlink_trans field to record renames
3317af4176b4SChris Mason 	 * and other fun in this file.
3318af4176b4SChris Mason 	 */
3319af4176b4SChris Mason 	if (S_ISREG(inode->i_mode) &&
3320af4176b4SChris Mason 	    BTRFS_I(inode)->generation <= last_committed &&
33214a500fd1SYan, Zheng 	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
33224a500fd1SYan, Zheng 		ret = 0;
33234a500fd1SYan, Zheng 		goto end_trans;
33244a500fd1SYan, Zheng 	}
3325af4176b4SChris Mason 
3326af4176b4SChris Mason 	inode_only = LOG_INODE_EXISTS;
332712fcfd22SChris Mason 	while (1) {
332812fcfd22SChris Mason 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3329e02119d5SChris Mason 			break;
3330e02119d5SChris Mason 
333112fcfd22SChris Mason 		inode = parent->d_inode;
333276dda93cSYan, Zheng 		if (root != BTRFS_I(inode)->root)
333376dda93cSYan, Zheng 			break;
333476dda93cSYan, Zheng 
333512fcfd22SChris Mason 		if (BTRFS_I(inode)->generation >
333612fcfd22SChris Mason 		    root->fs_info->last_trans_committed) {
333712fcfd22SChris Mason 			ret = btrfs_log_inode(trans, root, inode, inode_only);
33384a500fd1SYan, Zheng 			if (ret)
33394a500fd1SYan, Zheng 				goto end_trans;
3340e02119d5SChris Mason 		}
334176dda93cSYan, Zheng 		if (IS_ROOT(parent))
334212fcfd22SChris Mason 			break;
334312fcfd22SChris Mason 
33446a912213SJosef Bacik 		parent = dget_parent(parent);
33456a912213SJosef Bacik 		dput(old_parent);
33466a912213SJosef Bacik 		old_parent = parent;
334712fcfd22SChris Mason 	}
334812fcfd22SChris Mason 	ret = 0;
33494a500fd1SYan, Zheng end_trans:
33506a912213SJosef Bacik 	dput(old_parent);
33514a500fd1SYan, Zheng 	if (ret < 0) {
33520fa83cdbSJosef Bacik 		WARN_ON(ret != -ENOSPC);
33534a500fd1SYan, Zheng 		root->fs_info->last_trans_log_full_commit = trans->transid;
33544a500fd1SYan, Zheng 		ret = 1;
33554a500fd1SYan, Zheng 	}
335612fcfd22SChris Mason 	btrfs_end_log_trans(root);
335712fcfd22SChris Mason end_no_trans:
335812fcfd22SChris Mason 	return ret;
3359e02119d5SChris Mason }
3360e02119d5SChris Mason 
3361e02119d5SChris Mason /*
3362e02119d5SChris Mason  * it is not safe to log dentry if the chunk root has added new
3363e02119d5SChris Mason  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
3364e02119d5SChris Mason  * If this returns 1, you must commit the transaction to safely get your
3365e02119d5SChris Mason  * data on disk.
3366e02119d5SChris Mason  */
3367e02119d5SChris Mason int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3368e02119d5SChris Mason 			  struct btrfs_root *root, struct dentry *dentry)
3369e02119d5SChris Mason {
33706a912213SJosef Bacik 	struct dentry *parent = dget_parent(dentry);
33716a912213SJosef Bacik 	int ret;
33726a912213SJosef Bacik 
33736a912213SJosef Bacik 	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
33746a912213SJosef Bacik 	dput(parent);
33756a912213SJosef Bacik 
33766a912213SJosef Bacik 	return ret;
3377e02119d5SChris Mason }
3378e02119d5SChris Mason 
3379e02119d5SChris Mason /*
3380e02119d5SChris Mason  * should be called during mount to recover any replay any log trees
3381e02119d5SChris Mason  * from the FS
3382e02119d5SChris Mason  */
3383e02119d5SChris Mason int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3384e02119d5SChris Mason {
3385e02119d5SChris Mason 	int ret;
3386e02119d5SChris Mason 	struct btrfs_path *path;
3387e02119d5SChris Mason 	struct btrfs_trans_handle *trans;
3388e02119d5SChris Mason 	struct btrfs_key key;
3389e02119d5SChris Mason 	struct btrfs_key found_key;
3390e02119d5SChris Mason 	struct btrfs_key tmp_key;
3391e02119d5SChris Mason 	struct btrfs_root *log;
3392e02119d5SChris Mason 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
3393e02119d5SChris Mason 	struct walk_control wc = {
3394e02119d5SChris Mason 		.process_func = process_one_buffer,
3395e02119d5SChris Mason 		.stage = 0,
3396e02119d5SChris Mason 	};
3397e02119d5SChris Mason 
3398e02119d5SChris Mason 	path = btrfs_alloc_path();
3399db5b493aSTsutomu Itoh 	if (!path)
3400db5b493aSTsutomu Itoh 		return -ENOMEM;
3401db5b493aSTsutomu Itoh 
3402db5b493aSTsutomu Itoh 	fs_info->log_root_recovering = 1;
3403e02119d5SChris Mason 
34044a500fd1SYan, Zheng 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
340579787eaaSJeff Mahoney 	if (IS_ERR(trans)) {
340679787eaaSJeff Mahoney 		ret = PTR_ERR(trans);
340779787eaaSJeff Mahoney 		goto error;
340879787eaaSJeff Mahoney 	}
3409e02119d5SChris Mason 
3410e02119d5SChris Mason 	wc.trans = trans;
3411e02119d5SChris Mason 	wc.pin = 1;
3412e02119d5SChris Mason 
3413db5b493aSTsutomu Itoh 	ret = walk_log_tree(trans, log_root_tree, &wc);
341479787eaaSJeff Mahoney 	if (ret) {
341579787eaaSJeff Mahoney 		btrfs_error(fs_info, ret, "Failed to pin buffers while "
341679787eaaSJeff Mahoney 			    "recovering log root tree.");
341779787eaaSJeff Mahoney 		goto error;
341879787eaaSJeff Mahoney 	}
3419e02119d5SChris Mason 
3420e02119d5SChris Mason again:
3421e02119d5SChris Mason 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
3422e02119d5SChris Mason 	key.offset = (u64)-1;
3423e02119d5SChris Mason 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
3424e02119d5SChris Mason 
3425e02119d5SChris Mason 	while (1) {
3426e02119d5SChris Mason 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
342779787eaaSJeff Mahoney 
342879787eaaSJeff Mahoney 		if (ret < 0) {
342979787eaaSJeff Mahoney 			btrfs_error(fs_info, ret,
343079787eaaSJeff Mahoney 				    "Couldn't find tree log root.");
343179787eaaSJeff Mahoney 			goto error;
343279787eaaSJeff Mahoney 		}
3433e02119d5SChris Mason 		if (ret > 0) {
3434e02119d5SChris Mason 			if (path->slots[0] == 0)
3435e02119d5SChris Mason 				break;
3436e02119d5SChris Mason 			path->slots[0]--;
3437e02119d5SChris Mason 		}
3438e02119d5SChris Mason 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3439e02119d5SChris Mason 				      path->slots[0]);
3440b3b4aa74SDavid Sterba 		btrfs_release_path(path);
3441e02119d5SChris Mason 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
3442e02119d5SChris Mason 			break;
3443e02119d5SChris Mason 
3444e02119d5SChris Mason 		log = btrfs_read_fs_root_no_radix(log_root_tree,
3445e02119d5SChris Mason 						  &found_key);
344679787eaaSJeff Mahoney 		if (IS_ERR(log)) {
344779787eaaSJeff Mahoney 			ret = PTR_ERR(log);
344879787eaaSJeff Mahoney 			btrfs_error(fs_info, ret,
344979787eaaSJeff Mahoney 				    "Couldn't read tree log root.");
345079787eaaSJeff Mahoney 			goto error;
345179787eaaSJeff Mahoney 		}
3452e02119d5SChris Mason 
3453e02119d5SChris Mason 		tmp_key.objectid = found_key.offset;
3454e02119d5SChris Mason 		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
3455e02119d5SChris Mason 		tmp_key.offset = (u64)-1;
3456e02119d5SChris Mason 
3457e02119d5SChris Mason 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
345879787eaaSJeff Mahoney 		if (IS_ERR(wc.replay_dest)) {
345979787eaaSJeff Mahoney 			ret = PTR_ERR(wc.replay_dest);
346079787eaaSJeff Mahoney 			btrfs_error(fs_info, ret, "Couldn't read target root "
346179787eaaSJeff Mahoney 				    "for tree log recovery.");
346279787eaaSJeff Mahoney 			goto error;
346379787eaaSJeff Mahoney 		}
3464e02119d5SChris Mason 
346507d400a6SYan Zheng 		wc.replay_dest->log_root = log;
34665d4f98a2SYan Zheng 		btrfs_record_root_in_trans(trans, wc.replay_dest);
3467e02119d5SChris Mason 		ret = walk_log_tree(trans, log, &wc);
3468e02119d5SChris Mason 		BUG_ON(ret);
3469e02119d5SChris Mason 
3470e02119d5SChris Mason 		if (wc.stage == LOG_WALK_REPLAY_ALL) {
3471e02119d5SChris Mason 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
3472e02119d5SChris Mason 						      path);
3473e02119d5SChris Mason 			BUG_ON(ret);
3474e02119d5SChris Mason 		}
3475e02119d5SChris Mason 
3476e02119d5SChris Mason 		key.offset = found_key.offset - 1;
347707d400a6SYan Zheng 		wc.replay_dest->log_root = NULL;
3478e02119d5SChris Mason 		free_extent_buffer(log->node);
3479b263c2c8SChris Mason 		free_extent_buffer(log->commit_root);
3480e02119d5SChris Mason 		kfree(log);
3481e02119d5SChris Mason 
3482e02119d5SChris Mason 		if (found_key.offset == 0)
3483e02119d5SChris Mason 			break;
3484e02119d5SChris Mason 	}
3485b3b4aa74SDavid Sterba 	btrfs_release_path(path);
3486e02119d5SChris Mason 
3487e02119d5SChris Mason 	/* step one is to pin it all, step two is to replay just inodes */
3488e02119d5SChris Mason 	if (wc.pin) {
3489e02119d5SChris Mason 		wc.pin = 0;
3490e02119d5SChris Mason 		wc.process_func = replay_one_buffer;
3491e02119d5SChris Mason 		wc.stage = LOG_WALK_REPLAY_INODES;
3492e02119d5SChris Mason 		goto again;
3493e02119d5SChris Mason 	}
3494e02119d5SChris Mason 	/* step three is to replay everything */
3495e02119d5SChris Mason 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
3496e02119d5SChris Mason 		wc.stage++;
3497e02119d5SChris Mason 		goto again;
3498e02119d5SChris Mason 	}
3499e02119d5SChris Mason 
3500e02119d5SChris Mason 	btrfs_free_path(path);
3501e02119d5SChris Mason 
3502e02119d5SChris Mason 	free_extent_buffer(log_root_tree->node);
3503e02119d5SChris Mason 	log_root_tree->log_root = NULL;
3504e02119d5SChris Mason 	fs_info->log_root_recovering = 0;
3505e02119d5SChris Mason 
3506e02119d5SChris Mason 	/* step 4: commit the transaction, which also unpins the blocks */
3507e02119d5SChris Mason 	btrfs_commit_transaction(trans, fs_info->tree_root);
3508e02119d5SChris Mason 
3509e02119d5SChris Mason 	kfree(log_root_tree);
3510e02119d5SChris Mason 	return 0;
351179787eaaSJeff Mahoney 
351279787eaaSJeff Mahoney error:
351379787eaaSJeff Mahoney 	btrfs_free_path(path);
351479787eaaSJeff Mahoney 	return ret;
3515e02119d5SChris Mason }
351612fcfd22SChris Mason 
351712fcfd22SChris Mason /*
351812fcfd22SChris Mason  * there are some corner cases where we want to force a full
351912fcfd22SChris Mason  * commit instead of allowing a directory to be logged.
352012fcfd22SChris Mason  *
352112fcfd22SChris Mason  * They revolve around files there were unlinked from the directory, and
352212fcfd22SChris Mason  * this function updates the parent directory so that a full commit is
352312fcfd22SChris Mason  * properly done if it is fsync'd later after the unlinks are done.
352412fcfd22SChris Mason  */
352512fcfd22SChris Mason void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
352612fcfd22SChris Mason 			     struct inode *dir, struct inode *inode,
352712fcfd22SChris Mason 			     int for_rename)
352812fcfd22SChris Mason {
352912fcfd22SChris Mason 	/*
3530af4176b4SChris Mason 	 * when we're logging a file, if it hasn't been renamed
3531af4176b4SChris Mason 	 * or unlinked, and its inode is fully committed on disk,
3532af4176b4SChris Mason 	 * we don't have to worry about walking up the directory chain
3533af4176b4SChris Mason 	 * to log its parents.
3534af4176b4SChris Mason 	 *
3535af4176b4SChris Mason 	 * So, we use the last_unlink_trans field to put this transid
3536af4176b4SChris Mason 	 * into the file.  When the file is logged we check it and
3537af4176b4SChris Mason 	 * don't log the parents if the file is fully on disk.
3538af4176b4SChris Mason 	 */
3539af4176b4SChris Mason 	if (S_ISREG(inode->i_mode))
3540af4176b4SChris Mason 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
3541af4176b4SChris Mason 
3542af4176b4SChris Mason 	/*
354312fcfd22SChris Mason 	 * if this directory was already logged any new
354412fcfd22SChris Mason 	 * names for this file/dir will get recorded
354512fcfd22SChris Mason 	 */
354612fcfd22SChris Mason 	smp_mb();
354712fcfd22SChris Mason 	if (BTRFS_I(dir)->logged_trans == trans->transid)
354812fcfd22SChris Mason 		return;
354912fcfd22SChris Mason 
355012fcfd22SChris Mason 	/*
355112fcfd22SChris Mason 	 * if the inode we're about to unlink was logged,
355212fcfd22SChris Mason 	 * the log will be properly updated for any new names
355312fcfd22SChris Mason 	 */
355412fcfd22SChris Mason 	if (BTRFS_I(inode)->logged_trans == trans->transid)
355512fcfd22SChris Mason 		return;
355612fcfd22SChris Mason 
355712fcfd22SChris Mason 	/*
355812fcfd22SChris Mason 	 * when renaming files across directories, if the directory
355912fcfd22SChris Mason 	 * there we're unlinking from gets fsync'd later on, there's
356012fcfd22SChris Mason 	 * no way to find the destination directory later and fsync it
356112fcfd22SChris Mason 	 * properly.  So, we have to be conservative and force commits
356212fcfd22SChris Mason 	 * so the new name gets discovered.
356312fcfd22SChris Mason 	 */
356412fcfd22SChris Mason 	if (for_rename)
356512fcfd22SChris Mason 		goto record;
356612fcfd22SChris Mason 
356712fcfd22SChris Mason 	/* we can safely do the unlink without any special recording */
356812fcfd22SChris Mason 	return;
356912fcfd22SChris Mason 
357012fcfd22SChris Mason record:
357112fcfd22SChris Mason 	BTRFS_I(dir)->last_unlink_trans = trans->transid;
357212fcfd22SChris Mason }
357312fcfd22SChris Mason 
357412fcfd22SChris Mason /*
357512fcfd22SChris Mason  * Call this after adding a new name for a file and it will properly
357612fcfd22SChris Mason  * update the log to reflect the new name.
357712fcfd22SChris Mason  *
357812fcfd22SChris Mason  * It will return zero if all goes well, and it will return 1 if a
357912fcfd22SChris Mason  * full transaction commit is required.
358012fcfd22SChris Mason  */
358112fcfd22SChris Mason int btrfs_log_new_name(struct btrfs_trans_handle *trans,
358212fcfd22SChris Mason 			struct inode *inode, struct inode *old_dir,
358312fcfd22SChris Mason 			struct dentry *parent)
358412fcfd22SChris Mason {
358512fcfd22SChris Mason 	struct btrfs_root * root = BTRFS_I(inode)->root;
358612fcfd22SChris Mason 
358712fcfd22SChris Mason 	/*
3588af4176b4SChris Mason 	 * this will force the logging code to walk the dentry chain
3589af4176b4SChris Mason 	 * up for the file
3590af4176b4SChris Mason 	 */
3591af4176b4SChris Mason 	if (S_ISREG(inode->i_mode))
3592af4176b4SChris Mason 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
3593af4176b4SChris Mason 
3594af4176b4SChris Mason 	/*
359512fcfd22SChris Mason 	 * if this inode hasn't been logged and directory we're renaming it
359612fcfd22SChris Mason 	 * from hasn't been logged, we don't need to log it
359712fcfd22SChris Mason 	 */
359812fcfd22SChris Mason 	if (BTRFS_I(inode)->logged_trans <=
359912fcfd22SChris Mason 	    root->fs_info->last_trans_committed &&
360012fcfd22SChris Mason 	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
360112fcfd22SChris Mason 		    root->fs_info->last_trans_committed))
360212fcfd22SChris Mason 		return 0;
360312fcfd22SChris Mason 
360412fcfd22SChris Mason 	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
360512fcfd22SChris Mason }
360612fcfd22SChris Mason 
3607