// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"

/*
 * Magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 * LOG_OTHER_INODE and LOG_OTHER_INODE_ALL mean we are logging an inode
 * other than the one fsync was called against, found while logging that
 * inode (a conflicting inode, for example); they log just enough to
 * recreate the inode, or everything, respectively, like the two modes
 * above
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit.
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one.
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on foo, only on its parent dir f1.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * Stages for the tree walking.  The first stage (0) is to only pin down
 * the blocks we find.  The second stage (1) is to make sure that all the
 * inodes we find in the log are created in the subvolume.
 *
 * A third stage replays directory index items, and the last stage deals
 * with directories and links and extents and all the other fun semantics.
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * Tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree, and once
 * to do all the other items.
 */

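/*
 * As an illustrative sketch (simplified; error handling and the full
 * transaction commit fallback are omitted), the fsync path reaches the
 * functions in this file roughly like this:
 *
 *	btrfs_sync_file()
 *	    btrfs_log_dentry_safe()
 *	        btrfs_log_inode_parent()
 *	            start_log_trans()
 *	            btrfs_log_inode()
 *	            btrfs_end_log_trans()
 *	    btrfs_sync_log()	// write the log tree to disk
 */
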
/*
 * Start a sub transaction and set up the log tree.  This increments the
 * log tree writer count to make anyone syncing the tree wait for us to
 * finish.
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees.  Force a full commit so that we
		 * don't mix nodes from multiple log transactions, which
		 * zoned sequential writing cannot tolerate.
		 */
		if (zoned && !created) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (!ctx->logging_new_name) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

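/*
 * A minimal usage sketch (illustrative only): every set of log tree
 * modifications is bracketed by start_log_trans()/btrfs_end_log_trans(),
 * so that btrfs_sync_log() can wait for all writers via root->log_writers:
 *
 *	ret = start_log_trans(trans, root, ctx);
 *	if (ret)
 *		return ret;	// may be BTRFS_LOG_FORCE_COMMIT
 *	ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
 *	btrfs_end_log_trans(root);
 */
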
/*
 * Returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction in progress.
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * The walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part.
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

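/*
 * Two typical configurations, as a sketch (based on how the walk is driven
 * elsewhere in this file, not a verbatim copy of the real call sites):
 *
 *	// log replay, stage 0: pin every extent the log tree references
 *	struct walk_control wc = {
 *		.pin = 1,
 *		.stage = LOG_WALK_PIN_ONLY,
 *		.process_func = process_one_buffer,
 *	};
 *
 *	// freeing a log tree at transaction commit time
 *	struct walk_control wc = {
 *		.free = 1,
 *		.process_func = process_one_buffer,
 *	};
 */
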
/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin) {
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);
		if (ret)
			return ret;

		if (btrfs_buffer_uptodate(eb, gen, 0) &&
		    btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
	}
	return ret;
}

static int do_overwrite_item(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct extent_buffer *eb, int slot,
			     struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* Our caller must have done a search for the key for us. */
	ASSERT(path->nodes[0] != NULL);

	/*
	 * And the slot must point to the exact key or the slot where the key
	 * should be (the first item with a key greater than 'key').
	 */
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
		struct btrfs_key found_key;

		btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
		ret = btrfs_comp_cpu_keys(&found_key, key);
		ASSERT(ret >= 0);
	} else {
		ret = 1;
	}

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * They have the same contents, so just return.  This saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync.
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/*
	 * Don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * Log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes.
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  struct extent_buffer *eb, int slot,
			  struct btrfs_key *key)
{
	int ret;

	/* Look for the key in the destination tree. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	return do_overwrite_item(trans, root, path, eb, slot, key);
}

/*
 * Simple helper to read an inode off the disk from a given root.
 * This can only be called for subvolume roots and not for the log.
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/*
 * Replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * First check to see if we already have this extent in the
	 * file.  This must be done before btrfs_drop_extents is run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
			btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * We already have a pointer to this exact extent,
		 * we don't have to do anything.
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				(unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the extent as dirty for qgroup accounting:
		 * we did a shallow copy of the file extent item and skipped
		 * the normal backref update, modifying the extent tree all by
		 * ourselves.  The owner of the file extent changed from the
		 * log tree (which doesn't affect qgroups) to the fs/file tree
		 * (which does).
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * Is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference.
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset, 0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				struct btrfs_root *csum_root;

				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				csum_root = btrfs_csum_root(fs_info,
							    sums->bytenr);
				if (!ret)
					ret = btrfs_del_csums(trans, csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
								     csum_root,
								     sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	iput(inode);
	return ret;
}

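/*
 * A worked example of the size math in replay_one_extent() (hypothetical
 * values): a BTRFS_FILE_EXTENT_REG item at file offset 4096 with num_bytes
 * 8192 gives extent_end = 4096 + 8192 = 12288 and nbytes = 8192 (nbytes is
 * forced to 0 when disk_bytenr == 0, i.e. for holes).  An inline extent
 * (always at file offset 0) with ram_bytes 100 on a fs with 4K sectors
 * gives extent_end = ALIGN(100, 4096) = 4096.
 */
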
static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const char *name,
				       int name_len)
{
	int ret;

	ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
	if (ret)
		return ret;
	/*
	 * Whenever we need to check if a name exists or not, we check the
	 * fs/subvolume tree. So after an unlink we must run delayed items, so
	 * that future checks for a name during log replay see that the name
	 * does not exist anymore.
	 */
	return btrfs_run_delayed_items(trans);
}

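/*
 * An illustrative sequence (hypothetical names) showing why running delayed
 * items above matters: directory entry deletions are queued as delayed
 * items, so without flushing them a later existence check during replay
 * could still find the stale entry in the fs/subvolume tree:
 *
 *	unlink_inode_for_log_replay(trans, dir, inode, "foo", 3);
 *	// without btrfs_run_delayed_items() this might still find "foo":
 *	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
 *				   "foo", 3, 0);
 */
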
/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
			name_len);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * Helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/*
		 * Are we trying to overwrite a back ref for the root
		 * directory?  If so, just jump out, we're done.
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/*
		 * Check all the names in this back reference to see if they
		 * are in the log.  If so, we allow them to stay; otherwise
		 * they must be unlinked as a conflict.
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = unlink_inode_for_log_replay(trans, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched the root tree and checked the
		 * corresponding ref; there is no need to check it again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = unlink_inode_for_log_replay(trans,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

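/*
 * For reference, the on-disk layouts the two helpers above parse (each item
 * body is a packed sequence of these records, the name bytes immediately
 * following each header struct):
 *
 *	BTRFS_INODE_REF_KEY (key offset = parent dir objectid):
 *	    [ struct btrfs_inode_ref | name bytes ] [ ... ] ...
 *
 *	BTRFS_INODE_EXTREF_KEY (key offset = hash of parent and name):
 *	    [ struct btrfs_inode_extref | name bytes ] [ ... ] ...
 */
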
/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists). For any name that is not in the inode reference item from the log
 * tree, do a proper unlink of that name (that is, remove its entry from the
 * inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}

static int add_link(struct btrfs_trans_handle *trans,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
					  name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * Replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * It is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest.
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index,
				   name, namelen);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * Look for a conflicting back reference in the
			 * metadata. If we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = unlink_inode_for_log_replay(trans,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
			if (ret)
				goto out;
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

1753 /*
1754  * There are a few corners where the link count of the file can't
1755  * be properly maintained during replay.  So, instead of adding
1756  * lots of complexity to the log code, we just scan the backrefs
1757  * for any file that has been through replay.
1758  *
1759  * The scan will update the link count on the inode to reflect the
1760  * number of back refs found.  If it goes down to zero, the iput
1761  * will free the inode.
1762  */
1763 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1764 					   struct btrfs_root *root,
1765 					   struct inode *inode)
1766 {
1767 	struct btrfs_path *path;
1768 	int ret;
1769 	u64 nlink = 0;
1770 	u64 ino = btrfs_ino(BTRFS_I(inode));
1771 
1772 	path = btrfs_alloc_path();
1773 	if (!path)
1774 		return -ENOMEM;
1775 
1776 	ret = count_inode_refs(root, BTRFS_I(inode), path);
1777 	if (ret < 0)
1778 		goto out;
1779 
1780 	nlink = ret;
1781 
1782 	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1783 	if (ret < 0)
1784 		goto out;
1785 
1786 	nlink += ret;
1787 
1788 	ret = 0;
1789 
1790 	if (nlink != inode->i_nlink) {
1791 		set_nlink(inode, nlink);
1792 		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1793 		if (ret)
1794 			goto out;
1795 	}
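	/*
	 * Reset the cached index count so it is looked up again on the next
	 * use, as replay may have changed the directory entries.
	 */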
1796 	BTRFS_I(inode)->index_cnt = (u64)-1;
1797 
1798 	if (inode->i_nlink == 0) {
1799 		if (S_ISDIR(inode->i_mode)) {
1800 			ret = replay_dir_deletes(trans, root, NULL, path,
1801 						 ino, 1);
1802 			if (ret)
1803 				goto out;
1804 		}
1805 		ret = btrfs_insert_orphan_item(trans, root, ino);
1806 		if (ret == -EEXIST)
1807 			ret = 0;
1808 	}
1809 
1810 out:
1811 	btrfs_free_path(path);
1812 	return ret;
1813 }
1814 
1815 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1816 					    struct btrfs_root *root,
1817 					    struct btrfs_path *path)
1818 {
1819 	int ret;
1820 	struct btrfs_key key;
1821 	struct inode *inode;
1822 
1823 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1824 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1825 	key.offset = (u64)-1;
1826 	while (1) {
1827 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1828 		if (ret < 0)
1829 			break;
1830 
1831 		if (ret == 1) {
1832 			ret = 0;
1833 			if (path->slots[0] == 0)
1834 				break;
1835 			path->slots[0]--;
1836 		}
1837 
1838 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1839 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1840 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1841 			break;
1842 
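		/*
		 * Delete the fixup item before processing the inode, since
		 * fixing up a directory below may insert new fixup items.
		 */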
1843 		ret = btrfs_del_item(trans, root, path);
1844 		if (ret)
1845 			break;
1846 
1847 		btrfs_release_path(path);
1848 		inode = read_one_inode(root, key.offset);
1849 		if (!inode) {
1850 			ret = -EIO;
1851 			break;
1852 		}
1853 
1854 		ret = fixup_inode_link_count(trans, root, inode);
1855 		iput(inode);
1856 		if (ret)
1857 			break;
1858 
1859 		/*
1860 		 * fixup on a directory may create new entries,
1861 		 * make sure we always look for the highest possible
1862 		 * offset
1863 		 */
1864 		key.offset = (u64)-1;
1865 	}
1866 	btrfs_release_path(path);
1867 	return ret;
1868 }
1869 
1870 
1871 /*
1872  * record a given inode in the fixup dir so we can check its link
1873  * count when replay is done.  The link count is incremented here
1874  * so the inode won't go away until we check it
1875  */
1876 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1877 				      struct btrfs_root *root,
1878 				      struct btrfs_path *path,
1879 				      u64 objectid)
1880 {
1881 	struct btrfs_key key;
1882 	int ret = 0;
1883 	struct inode *inode;
1884 
1885 	inode = read_one_inode(root, objectid);
1886 	if (!inode)
1887 		return -EIO;
1888 
1889 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1890 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1891 	key.offset = objectid;
1892 
1893 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1894 
1895 	btrfs_release_path(path);
1896 	if (ret == 0) {
1897 		if (!inode->i_nlink)
1898 			set_nlink(inode, 1);
1899 		else
1900 			inc_nlink(inode);
1901 		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1902 	} else if (ret == -EEXIST) {
1903 		ret = 0;
1904 	}
1905 	iput(inode);
1906 
1907 	return ret;
1908 }
1909 
1910 /*
1911  * when replaying the log for a directory, we only insert names
1912  * for inodes that actually exist.  This means an fsync on a directory
1913  * does not implicitly fsync all the new files in it
1914  */
1915 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1916 				    struct btrfs_root *root,
1917 				    u64 dirid, u64 index,
1918 				    char *name, int name_len,
1919 				    struct btrfs_key *location)
1920 {
1921 	struct inode *inode;
1922 	struct inode *dir;
1923 	int ret;
1924 
1925 	inode = read_one_inode(root, location->objectid);
1926 	if (!inode)
1927 		return -ENOENT;
1928 
1929 	dir = read_one_inode(root, dirid);
1930 	if (!dir) {
1931 		iput(inode);
1932 		return -EIO;
1933 	}
1934 
1935 	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1936 			name_len, 1, index);
1937 
1938 	/* FIXME, put inode into FIXUP list */
1939 
1940 	iput(inode);
1941 	iput(dir);
1942 	return ret;
1943 }
1944 
1945 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
1946 					struct btrfs_inode *dir,
1947 					struct btrfs_path *path,
1948 					struct btrfs_dir_item *dst_di,
1949 					const struct btrfs_key *log_key,
1950 					u8 log_type,
1951 					bool exists)
1952 {
1953 	struct btrfs_key found_key;
1954 
1955 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1956 	/* The existing dentry points to the same inode, don't delete it. */
1957 	if (found_key.objectid == log_key->objectid &&
1958 	    found_key.type == log_key->type &&
1959 	    found_key.offset == log_key->offset &&
1960 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type)
1961 		return 1;
1962 
1963 	/*
1964 	 * Don't drop the conflicting directory entry if the inode for the new
1965 	 * entry doesn't exist.
1966 	 */
1967 	if (!exists)
1968 		return 0;
1969 
1970 	return drop_one_dir_item(trans, path, dir, dst_di);
1971 }
1972 
1973 /*
1974  * take a single entry in a log directory item and replay it into
1975  * the subvolume.
1976  *
1977  * if a conflicting item exists in the subdirectory already,
1978  * the inode it points to is unlinked and put into the link count
1979  * fix up tree.
1980  *
1981  * If a name from the log points to a file or directory that does
1982  * not exist in the FS, it is skipped.  fsyncs on directories
1983  * do not force down inodes inside that directory, just changes to the
1984  * names or unlinks in a directory.
1985  *
1986  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1987  * non-existing inode) and 1 if the name was replayed.
1988  */
1989 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1990 				    struct btrfs_root *root,
1991 				    struct btrfs_path *path,
1992 				    struct extent_buffer *eb,
1993 				    struct btrfs_dir_item *di,
1994 				    struct btrfs_key *key)
1995 {
1996 	char *name;
1997 	int name_len;
1998 	struct btrfs_dir_item *dir_dst_di;
1999 	struct btrfs_dir_item *index_dst_di;
2000 	bool dir_dst_matches = false;
2001 	bool index_dst_matches = false;
2002 	struct btrfs_key log_key;
2003 	struct btrfs_key search_key;
2004 	struct inode *dir;
2005 	u8 log_type;
2006 	bool exists;
2007 	int ret;
2008 	bool update_size = true;
2009 	bool name_added = false;
2010 
2011 	dir = read_one_inode(root, key->objectid);
2012 	if (!dir)
2013 		return -EIO;
2014 
2015 	name_len = btrfs_dir_name_len(eb, di);
2016 	name = kmalloc(name_len, GFP_NOFS);
2017 	if (!name) {
2018 		ret = -ENOMEM;
2019 		goto out;
2020 	}
2021 
2022 	log_type = btrfs_dir_type(eb, di);
2023 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
2024 		   name_len);
2025 
2026 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
2027 	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
2028 	btrfs_release_path(path);
2029 	if (ret < 0)
2030 		goto out;
2031 	exists = (ret == 0);
2032 	ret = 0;
2033 
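	/*
	 * Check both the dir item (hashed by name) and the dir index item
	 * for a conflicting entry with the same name.
	 */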
2034 	dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
2035 					   name, name_len, 1);
2036 	if (IS_ERR(dir_dst_di)) {
2037 		ret = PTR_ERR(dir_dst_di);
2038 		goto out;
2039 	} else if (dir_dst_di) {
2040 		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
2041 						   dir_dst_di, &log_key, log_type,
2042 						   exists);
2043 		if (ret < 0)
2044 			goto out;
2045 		dir_dst_matches = (ret == 1);
2046 	}
2047 
2048 	btrfs_release_path(path);
2049 
2050 	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
2051 						   key->objectid, key->offset,
2052 						   name, name_len, 1);
2053 	if (IS_ERR(index_dst_di)) {
2054 		ret = PTR_ERR(index_dst_di);
2055 		goto out;
2056 	} else if (index_dst_di) {
2057 		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
2058 						   index_dst_di, &log_key,
2059 						   log_type, exists);
2060 		if (ret < 0)
2061 			goto out;
2062 		index_dst_matches = (ret == 1);
2063 	}
2064 
2065 	btrfs_release_path(path);
2066 
2067 	if (dir_dst_matches && index_dst_matches) {
2068 		ret = 0;
2069 		update_size = false;
2070 		goto out;
2071 	}
2072 
2073 	/*
2074 	 * Check if the inode reference exists in the log for the given name,
2075 	 * inode and parent inode
2076 	 */
2077 	search_key.objectid = log_key.objectid;
2078 	search_key.type = BTRFS_INODE_REF_KEY;
2079 	search_key.offset = key->objectid;
2080 	ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
2081 	if (ret < 0) {
2082 		goto out;
2083 	} else if (ret) {
2084 		/* The dentry will be added later. */
2085 		ret = 0;
2086 		update_size = false;
2087 		goto out;
2088 	}
2089 
2090 	search_key.objectid = log_key.objectid;
2091 	search_key.type = BTRFS_INODE_EXTREF_KEY;
2092 	search_key.offset = key->objectid;
2093 	ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
2094 			     name_len);
2095 	if (ret < 0) {
2096 		goto out;
2097 	} else if (ret) {
2098 		/* The dentry will be added later. */
2099 		ret = 0;
2100 		update_size = false;
2101 		goto out;
2102 	}
2103 	btrfs_release_path(path);
2104 	ret = insert_one_name(trans, root, key->objectid, key->offset,
2105 			      name, name_len, &log_key);
2106 	if (ret && ret != -ENOENT && ret != -EEXIST)
2107 		goto out;
2108 	if (!ret)
2109 		name_added = true;
2110 	update_size = false;
2111 	ret = 0;
2112 
2113 out:
2114 	if (!ret && update_size) {
2115 		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
2116 		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
2117 	}
2118 	kfree(name);
2119 	iput(dir);
2120 	if (!ret && name_added)
2121 		ret = 1;
2122 	return ret;
2123 }
2124 
2125 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
2126 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2127 					struct btrfs_root *root,
2128 					struct btrfs_path *path,
2129 					struct extent_buffer *eb, int slot,
2130 					struct btrfs_key *key)
2131 {
2132 	int ret;
2133 	struct btrfs_dir_item *di;
2134 
2135 	/* We only log dir index keys, which only contain a single dir item. */
2136 	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
2137 
2138 	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2139 	ret = replay_one_name(trans, root, path, eb, di, key);
2140 	if (ret < 0)
2141 		return ret;
2142 
2143 	/*
2144 	 * If this entry refers to a non-directory (directories cannot have a
2145 	 * link count > 1) and it was added in a transaction that was not
2146 	 * committed, make sure we fixup the link count of the inode the entry
2147 	 * points to. Otherwise something like the following would result in a
2148 	 * directory pointing to an inode with a wrong link count that does not
2149 	 * account for this dir entry:
2150 	 *
2151 	 * mkdir testdir
2152 	 * touch testdir/foo
2153 	 * touch testdir/bar
2154 	 * sync
2155 	 *
2156 	 * ln testdir/bar testdir/bar_link
2157 	 * ln testdir/foo testdir/foo_link
2158 	 * xfs_io -c "fsync" testdir/bar
2159 	 *
2160 	 * <power failure>
2161 	 *
2162 	 * mount fs, log replay happens
2163 	 *
2164 	 * File foo would remain with a link count of 1 when it has two entries
2165 	 * pointing to it in the directory testdir. This would make it impossible
2166 	 * to ever delete the parent directory as it would result in stale
2167 	 * dentries that can never be deleted.
2168 	 */
2169 	if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2170 		struct btrfs_path *fixup_path;
2171 		struct btrfs_key di_key;
2172 
2173 		fixup_path = btrfs_alloc_path();
2174 		if (!fixup_path)
2175 			return -ENOMEM;
2176 
2177 		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2178 		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2179 		btrfs_free_path(fixup_path);
2180 	}
2181 
2182 	return ret;
2183 }
2184 
2185 /*
2186  * directory replay has two parts.  There are the standard directory
2187  * items in the log copied from the subvolume, and range items
2188  * created in the log while the subvolume was logged.
2189  *
2190  * The range items tell us which parts of the key space the log
2191  * is authoritative for.  During replay, if a key in the subvolume
2192  * directory is in a logged range item, but not actually in the log,
2193  * that means it was deleted from the directory before the fsync
2194  * and should be removed.
2195  */
2196 static noinline int find_dir_range(struct btrfs_root *root,
2197 				   struct btrfs_path *path,
2198 				   u64 dirid,
2199 				   u64 *start_ret, u64 *end_ret)
2200 {
2201 	struct btrfs_key key;
2202 	u64 found_end;
2203 	struct btrfs_dir_log_item *item;
2204 	int ret;
2205 	int nritems;
2206 
2207 	if (*start_ret == (u64)-1)
2208 		return 1;
2209 
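	/*
	 * Each dir log item covers an index range starting at its key offset
	 * and ending at btrfs_dir_log_end(). Find the last item that starts
	 * at or before *start_ret.
	 */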
2210 	key.objectid = dirid;
2211 	key.type = BTRFS_DIR_LOG_INDEX_KEY;
2212 	key.offset = *start_ret;
2213 
2214 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2215 	if (ret < 0)
2216 		goto out;
2217 	if (ret > 0) {
2218 		if (path->slots[0] == 0)
2219 			goto out;
2220 		path->slots[0]--;
2221 	}
2222 	if (ret != 0)
2223 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2224 
2225 	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2226 		ret = 1;
2227 		goto next;
2228 	}
2229 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2230 			      struct btrfs_dir_log_item);
2231 	found_end = btrfs_dir_log_end(path->nodes[0], item);
2232 
2233 	if (*start_ret >= key.offset && *start_ret <= found_end) {
2234 		ret = 0;
2235 		*start_ret = key.offset;
2236 		*end_ret = found_end;
2237 		goto out;
2238 	}
2239 	ret = 1;
2240 next:
2241 	/* check the next slot in the tree to see if it is a valid item */
2242 	nritems = btrfs_header_nritems(path->nodes[0]);
2243 	path->slots[0]++;
2244 	if (path->slots[0] >= nritems) {
2245 		ret = btrfs_next_leaf(root, path);
2246 		if (ret)
2247 			goto out;
2248 	}
2249 
2250 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2251 
2252 	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2253 		ret = 1;
2254 		goto out;
2255 	}
2256 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2257 			      struct btrfs_dir_log_item);
2258 	found_end = btrfs_dir_log_end(path->nodes[0], item);
2259 	*start_ret = key.offset;
2260 	*end_ret = found_end;
2261 	ret = 0;
2262 out:
2263 	btrfs_release_path(path);
2264 	return ret;
2265 }
2266 
2267 /*
2268  * this looks for a given directory item in the log.  If the directory
2269  * item is not in the log, the item is removed and the inode it points
2270  * to is unlinked
2271  */
2272 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2273 				      struct btrfs_root *log,
2274 				      struct btrfs_path *path,
2275 				      struct btrfs_path *log_path,
2276 				      struct inode *dir,
2277 				      struct btrfs_key *dir_key)
2278 {
2279 	struct btrfs_root *root = BTRFS_I(dir)->root;
2280 	int ret;
2281 	struct extent_buffer *eb;
2282 	int slot;
2283 	struct btrfs_dir_item *di;
2284 	int name_len;
2285 	char *name;
2286 	struct inode *inode = NULL;
2287 	struct btrfs_key location;
2288 
2289 	/*
2290 	 * Currently we only log dir index keys. Even if we replay a log created
2291 	 * by an older kernel that logged both dir index and dir item keys, all
2292 	 * we need to do is process the dir index keys, we (and our caller) can
2293 	 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2294 	 */
2295 	ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
2296 
2297 	eb = path->nodes[0];
2298 	slot = path->slots[0];
2299 	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2300 	name_len = btrfs_dir_name_len(eb, di);
2301 	name = kmalloc(name_len, GFP_NOFS);
2302 	if (!name) {
2303 		ret = -ENOMEM;
2304 		goto out;
2305 	}
2306 
2307 	read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
2308 
2309 	if (log) {
2310 		struct btrfs_dir_item *log_di;
2311 
2312 		log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
2313 						     dir_key->objectid,
2314 						     dir_key->offset,
2315 						     name, name_len, 0);
2316 		if (IS_ERR(log_di)) {
2317 			ret = PTR_ERR(log_di);
2318 			goto out;
2319 		} else if (log_di) {
2320 			/* The dentry exists in the log, we have nothing to do. */
2321 			ret = 0;
2322 			goto out;
2323 		}
2324 	}
2325 
2326 	btrfs_dir_item_key_to_cpu(eb, di, &location);
2327 	btrfs_release_path(path);
2328 	btrfs_release_path(log_path);
2329 	inode = read_one_inode(root, location.objectid);
2330 	if (!inode) {
2331 		ret = -EIO;
2332 		goto out;
2333 	}
2334 
2335 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
2336 	if (ret)
2337 		goto out;
2338 
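	/*
	 * Bump the link count so the unlink below does not drop the inode's
	 * last reference; the final count is settled by the fixup stage.
	 */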
2339 	inc_nlink(inode);
2340 	ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
2341 					  name, name_len);
2342 	/*
2343 	 * Unlike dir item keys, dir index keys can only have one name (entry) in
2344 	 * them, as there are no key collisions since each key has a unique offset
2345 	 * (an index number), so we're done.
2346 	 */
2347 out:
2348 	btrfs_release_path(path);
2349 	btrfs_release_path(log_path);
2350 	kfree(name);
2351 	iput(inode);
2352 	return ret;
2353 }
2354 
2355 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2356 			      struct btrfs_root *root,
2357 			      struct btrfs_root *log,
2358 			      struct btrfs_path *path,
2359 			      const u64 ino)
2360 {
2361 	struct btrfs_key search_key;
2362 	struct btrfs_path *log_path;
2363 	int i;
2364 	int nritems;
2365 	int ret;
2366 
2367 	log_path = btrfs_alloc_path();
2368 	if (!log_path)
2369 		return -ENOMEM;
2370 
2371 	search_key.objectid = ino;
2372 	search_key.type = BTRFS_XATTR_ITEM_KEY;
2373 	search_key.offset = 0;
2374 again:
2375 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2376 	if (ret < 0)
2377 		goto out;
2378 process_leaf:
2379 	nritems = btrfs_header_nritems(path->nodes[0]);
2380 	for (i = path->slots[0]; i < nritems; i++) {
2381 		struct btrfs_key key;
2382 		struct btrfs_dir_item *di;
2383 		struct btrfs_dir_item *log_di;
2384 		u32 total_size;
2385 		u32 cur;
2386 
2387 		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2388 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2389 			ret = 0;
2390 			goto out;
2391 		}
2392 
2393 		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2394 		total_size = btrfs_item_size(path->nodes[0], i);
2395 		cur = 0;
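		/*
		 * An xattr item can pack several name/value pairs; check each
		 * one against the log tree.
		 */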
2396 		while (cur < total_size) {
2397 			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2398 			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2399 			u32 this_len = sizeof(*di) + name_len + data_len;
2400 			char *name;
2401 
2402 			name = kmalloc(name_len, GFP_NOFS);
2403 			if (!name) {
2404 				ret = -ENOMEM;
2405 				goto out;
2406 			}
2407 			read_extent_buffer(path->nodes[0], name,
2408 					   (unsigned long)(di + 1), name_len);
2409 
2410 			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2411 						    name, name_len, 0);
2412 			btrfs_release_path(log_path);
2413 			if (!log_di) {
2414 				/* Doesn't exist in log tree, so delete it. */
2415 				btrfs_release_path(path);
2416 				di = btrfs_lookup_xattr(trans, root, path, ino,
2417 							name, name_len, -1);
2418 				kfree(name);
2419 				if (IS_ERR(di)) {
2420 					ret = PTR_ERR(di);
2421 					goto out;
2422 				}
2423 				ASSERT(di);
2424 				ret = btrfs_delete_one_dir_name(trans, root,
2425 								path, di);
2426 				if (ret)
2427 					goto out;
2428 				btrfs_release_path(path);
2429 				search_key = key;
2430 				goto again;
2431 			}
2432 			kfree(name);
2433 			if (IS_ERR(log_di)) {
2434 				ret = PTR_ERR(log_di);
2435 				goto out;
2436 			}
2437 			cur += this_len;
2438 			di = (struct btrfs_dir_item *)((char *)di + this_len);
2439 		}
2440 	}
2441 	ret = btrfs_next_leaf(root, path);
2442 	if (ret > 0)
2443 		ret = 0;
2444 	else if (ret == 0)
2445 		goto process_leaf;
2446 out:
2447 	btrfs_free_path(log_path);
2448 	btrfs_release_path(path);
2449 	return ret;
2450 }
2451 
2452 
2453 /*
2454  * deletion replay happens before we copy any new directory items
2455  * out of the log or out of backreferences from inodes.  It
2456  * scans the log to find ranges of keys that the log is authoritative for,
2457  * and then scans the directory to find items in those ranges that are
2458  * not present in the log.
2459  *
2460  * Anything we don't find in the log is unlinked and removed from the
2461  * directory.
2462  */
2463 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2464 				       struct btrfs_root *root,
2465 				       struct btrfs_root *log,
2466 				       struct btrfs_path *path,
2467 				       u64 dirid, int del_all)
2468 {
2469 	u64 range_start;
2470 	u64 range_end;
2471 	int ret = 0;
2472 	struct btrfs_key dir_key;
2473 	struct btrfs_key found_key;
2474 	struct btrfs_path *log_path;
2475 	struct inode *dir;
2476 
2477 	dir_key.objectid = dirid;
2478 	dir_key.type = BTRFS_DIR_INDEX_KEY;
2479 	log_path = btrfs_alloc_path();
2480 	if (!log_path)
2481 		return -ENOMEM;
2482 
2483 	dir = read_one_inode(root, dirid);
2484 	/* it isn't an error if the inode isn't there, that can happen
2485 	 * because we replay the deletes before we copy in the inode item
2486 	 * from the log
2487 	 */
2488 	if (!dir) {
2489 		btrfs_free_path(log_path);
2490 		return 0;
2491 	}
2492 
2493 	range_start = 0;
2494 	range_end = 0;
2495 	while (1) {
2496 		if (del_all)
2497 			range_end = (u64)-1;
2498 		else {
2499 			ret = find_dir_range(log, path, dirid,
2500 					     &range_start, &range_end);
2501 			if (ret < 0)
2502 				goto out;
2503 			else if (ret > 0)
2504 				break;
2505 		}
2506 
2507 		dir_key.offset = range_start;
2508 		while (1) {
2509 			int nritems;
2510 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2511 						0, 0);
2512 			if (ret < 0)
2513 				goto out;
2514 
2515 			nritems = btrfs_header_nritems(path->nodes[0]);
2516 			if (path->slots[0] >= nritems) {
2517 				ret = btrfs_next_leaf(root, path);
2518 				if (ret == 1)
2519 					break;
2520 				else if (ret < 0)
2521 					goto out;
2522 			}
2523 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2524 					      path->slots[0]);
2525 			if (found_key.objectid != dirid ||
2526 			    found_key.type != dir_key.type) {
2527 				ret = 0;
2528 				goto out;
2529 			}
2530 
2531 			if (found_key.offset > range_end)
2532 				break;
2533 
2534 			ret = check_item_in_log(trans, log, path,
2535 						log_path, dir,
2536 						&found_key);
2537 			if (ret)
2538 				goto out;
2539 			if (found_key.offset == (u64)-1)
2540 				break;
2541 			dir_key.offset = found_key.offset + 1;
2542 		}
2543 		btrfs_release_path(path);
2544 		if (range_end == (u64)-1)
2545 			break;
2546 		range_start = range_end + 1;
2547 	}
2548 	ret = 0;
2549 out:
2550 	btrfs_release_path(path);
2551 	btrfs_free_path(log_path);
2552 	iput(dir);
2553 	return ret;
2554 }
2555 
2556 /*
2557  * the process_func used to replay items from the log tree.  This
2558  * gets called in two different stages.  The first stage just looks
2559  * for inodes and makes sure they are all copied into the subvolume.
2560  *
2561  * The second stage copies all the other item types from the log into
2562  * the subvolume.  The two stage approach is slower, but gets rid of
2563  * lots of complexity around inodes referencing other inodes that exist
2564  * only in the log (references come from either directory items or inode
2565  * back refs).
2566  */
2567 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2568 			     struct walk_control *wc, u64 gen, int level)
2569 {
2570 	int nritems;
2571 	struct btrfs_path *path;
2572 	struct btrfs_root *root = wc->replay_dest;
2573 	struct btrfs_key key;
2574 	int i;
2575 	int ret;
2576 
2577 	ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
2578 	if (ret)
2579 		return ret;
2580 
2581 	level = btrfs_header_level(eb);
2582 
2583 	if (level != 0)
2584 		return 0;
2585 
2586 	path = btrfs_alloc_path();
2587 	if (!path)
2588 		return -ENOMEM;
2589 
2590 	nritems = btrfs_header_nritems(eb);
2591 	for (i = 0; i < nritems; i++) {
2592 		btrfs_item_key_to_cpu(eb, &key, i);
2593 
2594 		/* inode keys are done during the first stage */
2595 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2596 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2597 			struct btrfs_inode_item *inode_item;
2598 			u32 mode;
2599 
2600 			inode_item = btrfs_item_ptr(eb, i,
2601 					    struct btrfs_inode_item);
2602 			/*
2603 			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2604 			 * and never got linked before the fsync, skip it, as
2605 			 * replaying it is pointless since it would be deleted
2606 			 * later. We skip logging tmpfiles, but it's always
2607 			 * possible we are replaying a log created with a kernel
2608 			 * that used to log tmpfiles.
2609 			 */
2610 			if (btrfs_inode_nlink(eb, inode_item) == 0) {
2611 				wc->ignore_cur_inode = true;
2612 				continue;
2613 			} else {
2614 				wc->ignore_cur_inode = false;
2615 			}
2616 			ret = replay_xattr_deletes(wc->trans, root, log,
2617 						   path, key.objectid);
2618 			if (ret)
2619 				break;
2620 			mode = btrfs_inode_mode(eb, inode_item);
2621 			if (S_ISDIR(mode)) {
2622 				ret = replay_dir_deletes(wc->trans,
2623 					 root, log, path, key.objectid, 0);
2624 				if (ret)
2625 					break;
2626 			}
2627 			ret = overwrite_item(wc->trans, root, path,
2628 					     eb, i, &key);
2629 			if (ret)
2630 				break;
2631 
2632 			/*
2633 			 * Before replaying extents, truncate the inode to its
2634 			 * size. We need to do it now and not after log replay
2635 			 * because before an fsync we can have prealloc extents
2636 			 * added beyond the inode's i_size. If we did it after,
2637 			 * through orphan cleanup for example, we would drop
2638 			 * those prealloc extents just after replaying them.
2639 			 */
2640 			if (S_ISREG(mode)) {
2641 				struct btrfs_drop_extents_args drop_args = { 0 };
2642 				struct inode *inode;
2643 				u64 from;
2644 
2645 				inode = read_one_inode(root, key.objectid);
2646 				if (!inode) {
2647 					ret = -EIO;
2648 					break;
2649 				}
2650 				from = ALIGN(i_size_read(inode),
2651 					     root->fs_info->sectorsize);
2652 				drop_args.start = from;
2653 				drop_args.end = (u64)-1;
2654 				drop_args.drop_cache = true;
2655 				ret = btrfs_drop_extents(wc->trans, root,
2656 							 BTRFS_I(inode),
2657 							 &drop_args);
2658 				if (!ret) {
2659 					inode_sub_bytes(inode,
2660 							drop_args.bytes_found);
2661 					/* Update the inode's nbytes. */
2662 					ret = btrfs_update_inode(wc->trans,
2663 							root, BTRFS_I(inode));
2664 				}
2665 				iput(inode);
2666 				if (ret)
2667 					break;
2668 			}
2669 
2670 			ret = link_to_fixup_dir(wc->trans, root,
2671 						path, key.objectid);
2672 			if (ret)
2673 				break;
2674 		}
2675 
2676 		if (wc->ignore_cur_inode)
2677 			continue;
2678 
2679 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2680 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2681 			ret = replay_one_dir_item(wc->trans, root, path,
2682 						  eb, i, &key);
2683 			if (ret)
2684 				break;
2685 		}
2686 
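		/* The remaining key types are replayed only in the last stage. */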
2687 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2688 			continue;
2689 
2690 		/* these keys are simply copied */
2691 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2692 			ret = overwrite_item(wc->trans, root, path,
2693 					     eb, i, &key);
2694 			if (ret)
2695 				break;
2696 		} else if (key.type == BTRFS_INODE_REF_KEY ||
2697 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2698 			ret = add_inode_ref(wc->trans, root, log, path,
2699 					    eb, i, &key);
2700 			if (ret && ret != -ENOENT)
2701 				break;
2702 			ret = 0;
2703 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2704 			ret = replay_one_extent(wc->trans, root, path,
2705 						eb, i, &key);
2706 			if (ret)
2707 				break;
2708 		}
2709 		/*
2710 		 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2711 		 * BTRFS_DIR_INDEX_KEY items which we use to derive the
2712 		 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2713 		 * older kernel with such keys, ignore them.
2714 		 */
2715 	}
2716 	btrfs_free_path(path);
2717 	return ret;
2718 }
2719 
2720 /*
2721  * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2722  */
2723 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2724 {
2725 	struct btrfs_block_group *cache;
2726 
2727 	cache = btrfs_lookup_block_group(fs_info, start);
2728 	if (!cache) {
2729 		btrfs_err(fs_info, "unable to find block group for %llu", start);
2730 		return;
2731 	}
2732 
2733 	spin_lock(&cache->space_info->lock);
2734 	spin_lock(&cache->lock);
2735 	cache->reserved -= fs_info->nodesize;
2736 	cache->space_info->bytes_reserved -= fs_info->nodesize;
2737 	spin_unlock(&cache->lock);
2738 	spin_unlock(&cache->space_info->lock);
2739 
2740 	btrfs_put_block_group(cache);
2741 }
2742 
2743 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2744 				   struct btrfs_root *root,
2745 				   struct btrfs_path *path, int *level,
2746 				   struct walk_control *wc)
2747 {
2748 	struct btrfs_fs_info *fs_info = root->fs_info;
2749 	u64 bytenr;
2750 	u64 ptr_gen;
2751 	struct extent_buffer *next;
2752 	struct extent_buffer *cur;
2753 	u32 blocksize;
2754 	int ret = 0;
2755 
2756 	while (*level > 0) {
2757 		struct btrfs_key first_key;
2758 
2759 		cur = path->nodes[*level];
2760 
2761 		WARN_ON(btrfs_header_level(cur) != *level);
2762 
2763 		if (path->slots[*level] >=
2764 		    btrfs_header_nritems(cur))
2765 			break;
2766 
2767 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2768 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2769 		btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2770 		blocksize = fs_info->nodesize;
2771 
2772 		next = btrfs_find_create_tree_block(fs_info, bytenr,
2773 						    btrfs_header_owner(cur),
2774 						    *level - 1);
2775 		if (IS_ERR(next))
2776 			return PTR_ERR(next);
2777 
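		/*
		 * At level 1 the children are leaves: run process_func on
		 * them and, when freeing the log, pin or unaccount their
		 * blocks instead of descending further.
		 */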
2778 		if (*level == 1) {
2779 			ret = wc->process_func(root, next, wc, ptr_gen,
2780 					       *level - 1);
2781 			if (ret) {
2782 				free_extent_buffer(next);
2783 				return ret;
2784 			}
2785 
2786 			path->slots[*level]++;
2787 			if (wc->free) {
2788 				ret = btrfs_read_extent_buffer(next, ptr_gen,
2789 							*level - 1, &first_key);
2790 				if (ret) {
2791 					free_extent_buffer(next);
2792 					return ret;
2793 				}
2794 
2795 				if (trans) {
2796 					btrfs_tree_lock(next);
2797 					btrfs_clean_tree_block(next);
2798 					btrfs_wait_tree_block_writeback(next);
2799 					btrfs_tree_unlock(next);
2800 					ret = btrfs_pin_reserved_extent(trans,
2801 							bytenr, blocksize);
2802 					if (ret) {
2803 						free_extent_buffer(next);
2804 						return ret;
2805 					}
2806 					btrfs_redirty_list_add(
2807 						trans->transaction, next);
2808 				} else {
2809 					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2810 						clear_extent_buffer_dirty(next);
2811 					unaccount_log_buffer(fs_info, bytenr);
2812 				}
2813 			}
2814 			free_extent_buffer(next);
2815 			continue;
2816 		}
2817 		ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
2818 		if (ret) {
2819 			free_extent_buffer(next);
2820 			return ret;
2821 		}
2822 
2823 		if (path->nodes[*level-1])
2824 			free_extent_buffer(path->nodes[*level-1]);
2825 		path->nodes[*level-1] = next;
2826 		*level = btrfs_header_level(next);
2827 		path->slots[*level] = 0;
2828 		cond_resched();
2829 	}
2830 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2831 
2832 	cond_resched();
2833 	return 0;
2834 }
2835 
2836 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2837 				 struct btrfs_root *root,
2838 				 struct btrfs_path *path, int *level,
2839 				 struct walk_control *wc)
2840 {
2841 	struct btrfs_fs_info *fs_info = root->fs_info;
2842 	int i;
2843 	int slot;
2844 	int ret;
2845 
2846 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2847 		slot = path->slots[i];
2848 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2849 			path->slots[i]++;
2850 			*level = i;
2851 			WARN_ON(*level == 0);
2852 			return 0;
2853 		} else {
2854 			ret = wc->process_func(root, path->nodes[*level], wc,
2855 				 btrfs_header_generation(path->nodes[*level]),
2856 				 *level);
2857 			if (ret)
2858 				return ret;
2859 
2860 			if (wc->free) {
2861 				struct extent_buffer *next;
2862 
2863 				next = path->nodes[*level];
2864 
2865 				if (trans) {
2866 					btrfs_tree_lock(next);
2867 					btrfs_clean_tree_block(next);
2868 					btrfs_wait_tree_block_writeback(next);
2869 					btrfs_tree_unlock(next);
2870 					ret = btrfs_pin_reserved_extent(trans,
2871 						     path->nodes[*level]->start,
2872 						     path->nodes[*level]->len);
2873 					if (ret)
2874 						return ret;
2875 					btrfs_redirty_list_add(trans->transaction,
2876 							       next);
2877 				} else {
2878 					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2879 						clear_extent_buffer_dirty(next);
2880 
2881 					unaccount_log_buffer(fs_info,
2882 						path->nodes[*level]->start);
2883 				}
2884 			}
2885 			free_extent_buffer(path->nodes[*level]);
2886 			path->nodes[*level] = NULL;
2887 			*level = i + 1;
2888 		}
2889 	}
2890 	return 1;
2891 }
2892 
2893 /*
2894  * drop the reference count on the tree rooted at 'log'.  This traverses
2895  * the tree freeing any blocks that have a ref count of zero after being
2896  * decremented.
2897  */
2898 static int walk_log_tree(struct btrfs_trans_handle *trans,
2899 			 struct btrfs_root *log, struct walk_control *wc)
2900 {
2901 	struct btrfs_fs_info *fs_info = log->fs_info;
2902 	int ret = 0;
2903 	int wret;
2904 	int level;
2905 	struct btrfs_path *path;
2906 	int orig_level;
2907 
2908 	path = btrfs_alloc_path();
2909 	if (!path)
2910 		return -ENOMEM;
2911 
2912 	level = btrfs_header_level(log->node);
2913 	orig_level = level;
2914 	path->nodes[level] = log->node;
2915 	atomic_inc(&log->node->refs);
2916 	path->slots[level] = 0;
2917 
2918 	while (1) {
2919 		wret = walk_down_log_tree(trans, log, path, &level, wc);
2920 		if (wret > 0)
2921 			break;
2922 		if (wret < 0) {
2923 			ret = wret;
2924 			goto out;
2925 		}
2926 
2927 		wret = walk_up_log_tree(trans, log, path, &level, wc);
2928 		if (wret > 0)
2929 			break;
2930 		if (wret < 0) {
2931 			ret = wret;
2932 			goto out;
2933 		}
2934 	}
2935 
2936 	/* was the root node processed? if not, catch it here */
2937 	if (path->nodes[orig_level]) {
2938 		ret = wc->process_func(log, path->nodes[orig_level], wc,
2939 			 btrfs_header_generation(path->nodes[orig_level]),
2940 			 orig_level);
2941 		if (ret)
2942 			goto out;
2943 		if (wc->free) {
2944 			struct extent_buffer *next;
2945 
2946 			next = path->nodes[orig_level];
2947 
2948 			if (trans) {
2949 				btrfs_tree_lock(next);
2950 				btrfs_clean_tree_block(next);
2951 				btrfs_wait_tree_block_writeback(next);
2952 				btrfs_tree_unlock(next);
2953 				ret = btrfs_pin_reserved_extent(trans,
2954 						next->start, next->len);
2955 				if (ret)
2956 					goto out;
2957 				btrfs_redirty_list_add(trans->transaction, next);
2958 			} else {
2959 				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2960 					clear_extent_buffer_dirty(next);
2961 				unaccount_log_buffer(fs_info, next->start);
2962 			}
2963 		}
2964 	}
2965 
2966 out:
2967 	btrfs_free_path(path);
2968 	return ret;
2969 }
2970 
2971 /*
2972  * helper function to update the item for a given subvolume's log root
2973  * in the tree of log roots
2974  */
2975 static int update_log_root(struct btrfs_trans_handle *trans,
2976 			   struct btrfs_root *log,
2977 			   struct btrfs_root_item *root_item)
2978 {
2979 	struct btrfs_fs_info *fs_info = log->fs_info;
2980 	int ret;
2981 
2982 	if (log->log_transid == 1) {
2983 		/* insert root item on the first sync */
2984 		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2985 				&log->root_key, root_item);
2986 	} else {
2987 		ret = btrfs_update_root(trans, fs_info->log_root_tree,
2988 				&log->root_key, root_item);
2989 	}
2990 	return ret;
2991 }
2992 
2993 static void wait_log_commit(struct btrfs_root *root, int transid)
2994 {
2995 	DEFINE_WAIT(wait);
2996 	int index = transid % 2;
2997 
2998 	/*
2999 	 * we only allow two pending log transactions at a time,
3000 	 * so we know that if ours is more than 2 older than the
3001 	 * current transaction, we're done
3002 	 */
3003 	for (;;) {
3004 		prepare_to_wait(&root->log_commit_wait[index],
3005 				&wait, TASK_UNINTERRUPTIBLE);
3006 
3007 		if (!(root->log_transid_committed < transid &&
3008 		      atomic_read(&root->log_commit[index])))
3009 			break;
3010 
3011 		mutex_unlock(&root->log_mutex);
3012 		schedule();
3013 		mutex_lock(&root->log_mutex);
3014 	}
3015 	finish_wait(&root->log_commit_wait[index], &wait);
3016 }
3017 
3018 static void wait_for_writer(struct btrfs_root *root)
3019 {
3020 	DEFINE_WAIT(wait);
3021 
3022 	for (;;) {
3023 		prepare_to_wait(&root->log_writer_wait, &wait,
3024 				TASK_UNINTERRUPTIBLE);
3025 		if (!atomic_read(&root->log_writers))
3026 			break;
3027 
3028 		mutex_unlock(&root->log_mutex);
3029 		schedule();
3030 		mutex_lock(&root->log_mutex);
3031 	}
3032 	finish_wait(&root->log_writer_wait, &wait);
3033 }
3034 
3035 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3036 					struct btrfs_log_ctx *ctx)
3037 {
3038 	mutex_lock(&root->log_mutex);
3039 	list_del_init(&ctx->list);
3040 	mutex_unlock(&root->log_mutex);
3041 }
3042 
3043 /*
3044  * Must be invoked with the log mutex held, or by a caller that is sure no
3045  * other task can access the list.
3046  */
3047 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3048 					     int index, int error)
3049 {
3050 	struct btrfs_log_ctx *ctx;
3051 	struct btrfs_log_ctx *safe;
3052 
3053 	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3054 		list_del_init(&ctx->list);
3055 		ctx->log_ret = error;
3056 	}
3057 }
3058 
3059 /*
3060  * btrfs_sync_log sends a given tree log down to the disk and
3061  * updates the super blocks to record it.  When this call is done,
3062  * you know that any inodes previously logged are safely on disk only
3063  * if it returns 0.
3064  *
3065  * Any other return value means you need to call btrfs_commit_transaction.
3066  * Some of the edge cases for fsyncing directories that have had unlinks
3067  * or renames done in the past mean that sometimes the only safe
3068  * fsync is to commit the whole FS.  When btrfs_sync_log returns BTRFS_LOG_FORCE_COMMIT,
3069  * that has happened.
3070  */
3071 int btrfs_sync_log(struct btrfs_trans_handle *trans,
3072 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3073 {
3074 	int index1;
3075 	int index2;
3076 	int mark;
3077 	int ret;
3078 	struct btrfs_fs_info *fs_info = root->fs_info;
3079 	struct btrfs_root *log = root->log_root;
3080 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3081 	struct btrfs_root_item new_root_item;
3082 	int log_transid = 0;
3083 	struct btrfs_log_ctx root_log_ctx;
3084 	struct blk_plug plug;
3085 	u64 log_root_start;
3086 	u64 log_root_level;
3087 
3088 	mutex_lock(&root->log_mutex);
3089 	log_transid = ctx->log_transid;
3090 	if (root->log_transid_committed >= log_transid) {
3091 		mutex_unlock(&root->log_mutex);
3092 		return ctx->log_ret;
3093 	}
3094 
3095 	index1 = log_transid % 2;
3096 	if (atomic_read(&root->log_commit[index1])) {
3097 		wait_log_commit(root, log_transid);
3098 		mutex_unlock(&root->log_mutex);
3099 		return ctx->log_ret;
3100 	}
3101 	ASSERT(log_transid == root->log_transid);
3102 	atomic_set(&root->log_commit[index1], 1);
3103 
3104 	/* wait for previous tree log sync to complete */
3105 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3106 		wait_log_commit(root, log_transid - 1);
3107 
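	/*
	 * Wait for concurrent writers: keep looping until log_batch stops
	 * changing while we wait for the current writers to finish.
	 */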
3108 	while (1) {
3109 		int batch = atomic_read(&root->log_batch);
3110 		/* when we're on an ssd, just kick the log commit out */
3111 		if (!btrfs_test_opt(fs_info, SSD) &&
3112 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3113 			mutex_unlock(&root->log_mutex);
3114 			schedule_timeout_uninterruptible(1);
3115 			mutex_lock(&root->log_mutex);
3116 		}
3117 		wait_for_writer(root);
3118 		if (batch == atomic_read(&root->log_batch))
3119 			break;
3120 	}
3121 
3122 	/* bail out if we need to do a full commit */
3123 	if (btrfs_need_log_full_commit(trans)) {
3124 		ret = BTRFS_LOG_FORCE_COMMIT;
3125 		mutex_unlock(&root->log_mutex);
3126 		goto out;
3127 	}
3128 
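	/*
	 * The two in-flight log transactions alternate between the
	 * EXTENT_DIRTY and EXTENT_NEW bits to track their dirty pages
	 * independently.
	 */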
3129 	if (log_transid % 2 == 0)
3130 		mark = EXTENT_DIRTY;
3131 	else
3132 		mark = EXTENT_NEW;
3133 
3134 	/* we start IO on all the marked extents here, but we don't actually
3135 	 * wait for them until later.
3136 	 */
3137 	blk_start_plug(&plug);
3138 	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3139 	/*
3140 	 * -EAGAIN happens when someone, e.g., a concurrent transaction
3141 	 *  commit, writes a dirty extent in this tree-log commit. This
3142 	 *  concurrent write can leave a hole in the range of extents being
3143 	 *  written out, and on a zoned filesystem, which requires sequential
3144 	 *  writing, we cannot proceed past the hole. We could bail out to a
3145 	 *  full commit here, but instead we continue, hoping the concurrent
3146 	 *  write fills the hole.
3147 	 */
3148 	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3149 		ret = 0;
3150 	if (ret) {
3151 		blk_finish_plug(&plug);
3152 		btrfs_abort_transaction(trans, ret);
3153 		btrfs_set_log_full_commit(trans);
3154 		mutex_unlock(&root->log_mutex);
3155 		goto out;
3156 	}
3157 
3158 	/*
3159 	 * We _must_ update under the root->log_mutex in order to make sure we
3160 	 * have a consistent view of the log root we are trying to commit at
3161 	 * this moment.
3162 	 *
3163 	 * We _must_ copy this into a local copy, because we are not holding the
3164 	 * log_root_tree->log_mutex yet.  This is important because when we
3165 	 * commit the log_root_tree we must have a consistent view of the
3166 	 * log_root_tree when we update the super block to point at the
3167 	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
3168 	 * with the commit and possibly point at the new block which we may not
3169 	 * have written out.
3170 	 */
3171 	btrfs_set_root_node(&log->root_item, log->node);
3172 	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3173 
3174 	root->log_transid++;
3175 	log->log_transid = root->log_transid;
3176 	root->log_start_pid = 0;
3177 	/*
3178 	 * IO has been started, blocks of the log tree have the WRITTEN flag set
3179 	 * in their headers. New modifications of the log will be written to
3180 	 * new positions, so it's safe to allow log writers to go in.
3181 	 */
3182 	mutex_unlock(&root->log_mutex);
3183 
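	/*
	 * On zoned filesystems, make sure the log root tree has a node
	 * allocated before we commit to it below.
	 */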
3184 	if (btrfs_is_zoned(fs_info)) {
3185 		mutex_lock(&fs_info->tree_root->log_mutex);
3186 		if (!log_root_tree->node) {
3187 			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3188 			if (ret) {
3189 				mutex_unlock(&fs_info->tree_root->log_mutex);
3190 				blk_finish_plug(&plug);
3191 				goto out;
3192 			}
3193 		}
3194 		mutex_unlock(&fs_info->tree_root->log_mutex);
3195 	}
3196 
3197 	btrfs_init_log_ctx(&root_log_ctx, NULL);
3198 
3199 	mutex_lock(&log_root_tree->log_mutex);
3200 
3201 	index2 = log_root_tree->log_transid % 2;
3202 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3203 	root_log_ctx.log_transid = log_root_tree->log_transid;
3204 
3205 	/*
3206 	 * Now we are safe to update the log_root_tree because we're under the
3207 	 * log_mutex, and we're a current writer so we're holding the commit
3208 	 * open until we drop the log_mutex.
3209 	 */
3210 	ret = update_log_root(trans, log, &new_root_item);
3211 	if (ret) {
3212 		if (!list_empty(&root_log_ctx.list))
3213 			list_del_init(&root_log_ctx.list);
3214 
3215 		blk_finish_plug(&plug);
3216 		btrfs_set_log_full_commit(trans);
3217 
3218 		if (ret != -ENOSPC) {
3219 			btrfs_abort_transaction(trans, ret);
3220 			mutex_unlock(&log_root_tree->log_mutex);
3221 			goto out;
3222 		}
3223 		btrfs_wait_tree_log_extents(log, mark);
3224 		mutex_unlock(&log_root_tree->log_mutex);
3225 		ret = BTRFS_LOG_FORCE_COMMIT;
3226 		goto out;
3227 	}
3228 
3229 	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3230 		blk_finish_plug(&plug);
3231 		list_del_init(&root_log_ctx.list);
3232 		mutex_unlock(&log_root_tree->log_mutex);
3233 		ret = root_log_ctx.log_ret;
3234 		goto out;
3235 	}
3236 
3237 	index2 = root_log_ctx.log_transid % 2;
3238 	if (atomic_read(&log_root_tree->log_commit[index2])) {
3239 		blk_finish_plug(&plug);
3240 		ret = btrfs_wait_tree_log_extents(log, mark);
3241 		wait_log_commit(log_root_tree,
3242 				root_log_ctx.log_transid);
3243 		mutex_unlock(&log_root_tree->log_mutex);
3244 		if (!ret)
3245 			ret = root_log_ctx.log_ret;
3246 		goto out;
3247 	}
3248 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3249 	atomic_set(&log_root_tree->log_commit[index2], 1);
3250 
3251 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3252 		wait_log_commit(log_root_tree,
3253 				root_log_ctx.log_transid - 1);
3254 	}
3255 
3256 	/*
3257 	 * now that we've moved on to the tree of log tree roots,
3258 	 * check the full commit flag again
3259 	 */
3260 	if (btrfs_need_log_full_commit(trans)) {
3261 		blk_finish_plug(&plug);
3262 		btrfs_wait_tree_log_extents(log, mark);
3263 		mutex_unlock(&log_root_tree->log_mutex);
3264 		ret = BTRFS_LOG_FORCE_COMMIT;
3265 		goto out_wake_log_root;
3266 	}
3267 
3268 	ret = btrfs_write_marked_extents(fs_info,
3269 					 &log_root_tree->dirty_log_pages,
3270 					 EXTENT_DIRTY | EXTENT_NEW);
3271 	blk_finish_plug(&plug);
3272 	/*
3273 	 * As described above, -EAGAIN indicates a hole in the extents. We
3274 	 * cannot wait for these write outs since waiting for them would
3275 	 * cause a deadlock. Bail out to the full commit instead.
3276 	 */
3277 	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3278 		btrfs_set_log_full_commit(trans);
3279 		btrfs_wait_tree_log_extents(log, mark);
3280 		mutex_unlock(&log_root_tree->log_mutex);
3281 		goto out_wake_log_root;
3282 	} else if (ret) {
3283 		btrfs_set_log_full_commit(trans);
3284 		btrfs_abort_transaction(trans, ret);
3285 		mutex_unlock(&log_root_tree->log_mutex);
3286 		goto out_wake_log_root;
3287 	}
3288 	ret = btrfs_wait_tree_log_extents(log, mark);
3289 	if (!ret)
3290 		ret = btrfs_wait_tree_log_extents(log_root_tree,
3291 						  EXTENT_NEW | EXTENT_DIRTY);
3292 	if (ret) {
3293 		btrfs_set_log_full_commit(trans);
3294 		mutex_unlock(&log_root_tree->log_mutex);
3295 		goto out_wake_log_root;
3296 	}
3297 
3298 	log_root_start = log_root_tree->node->start;
3299 	log_root_level = btrfs_header_level(log_root_tree->node);
3300 	log_root_tree->log_transid++;
3301 	mutex_unlock(&log_root_tree->log_mutex);
3302 
3303 	/*
3304 	 * Here we are guaranteed that nobody is going to write the superblock
3305 	 * for the current transaction before us, and that we do not write
3306 	 * our superblock before the previous transaction finishes its commit
3307 	 * and writes its superblock, because:
3308 	 *
3309 	 * 1) We are holding a handle on the current transaction, so nobody
3310 	 *    can commit it until we release the handle;
3311 	 *
3312 	 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3313 	 *    if the previous transaction is still committing, and hasn't yet
3314 	 *    written its superblock, we wait for it to do it, because a
3315 	 *    transaction commit acquires the tree_log_mutex when the commit
3316 	 *    begins and releases it only after writing its superblock.
3317 	 */
3318 	mutex_lock(&fs_info->tree_log_mutex);
3319 
3320 	/*
3321 	 * The previous transaction writeout phase could have failed, and thus
3322 	 * marked the fs in an error state.  We must not commit here, as we
3323 	 * could have updated our generation in the super_for_commit and
3324 	 * writing the super here would result in transid mismatches.  If there
3325 	 * is an error here just bail.
3326 	 */
3327 	if (BTRFS_FS_ERROR(fs_info)) {
3328 		ret = -EIO;
3329 		btrfs_set_log_full_commit(trans);
3330 		btrfs_abort_transaction(trans, ret);
3331 		mutex_unlock(&fs_info->tree_log_mutex);
3332 		goto out_wake_log_root;
3333 	}
3334 
3335 	btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3336 	btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3337 	ret = write_all_supers(fs_info, 1);
3338 	mutex_unlock(&fs_info->tree_log_mutex);
3339 	if (ret) {
3340 		btrfs_set_log_full_commit(trans);
3341 		btrfs_abort_transaction(trans, ret);
3342 		goto out_wake_log_root;
3343 	}
3344 
3345 	/*
3346 	 * We know there can only be one task here, since we have not yet set
3347 	 * root->log_commit[index1] to 0 and any task attempting to sync the
3348 	 * log must wait for the previous log transaction to commit if it's
3349 	 * still in progress or wait for the current log transaction commit if
3350 	 * someone else already started it. We use <= and not < because the
3351 	 * first log transaction has an ID of 0.
3352 	 */
3353 	ASSERT(root->last_log_commit <= log_transid);
3354 	root->last_log_commit = log_transid;
3355 
3356 out_wake_log_root:
3357 	mutex_lock(&log_root_tree->log_mutex);
3358 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3359 
3360 	log_root_tree->log_transid_committed++;
3361 	atomic_set(&log_root_tree->log_commit[index2], 0);
3362 	mutex_unlock(&log_root_tree->log_mutex);
3363 
3364 	/*
3365 	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3366 	 * all the updates above are seen by the woken threads. It might not be
3367 	 * necessary, but proving that seems to be hard.
3368 	 */
3369 	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3370 out:
3371 	mutex_lock(&root->log_mutex);
3372 	btrfs_remove_all_log_ctxs(root, index1, ret);
3373 	root->log_transid_committed++;
3374 	atomic_set(&root->log_commit[index1], 0);
3375 	mutex_unlock(&root->log_mutex);
3376 
3377 	/*
3378 	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3379 	 * all the updates above are seen by the woken threads. It might not be
3380 	 * necessary, but proving that seems to be hard.
3381 	 */
3382 	cond_wake_up(&root->log_commit_wait[index1]);
3383 	return ret;
3384 }
3385 
3386 static void free_log_tree(struct btrfs_trans_handle *trans,
3387 			  struct btrfs_root *log)
3388 {
3389 	int ret;
3390 	struct walk_control wc = {
3391 		.free = 1,
3392 		.process_func = process_one_buffer
3393 	};
3394 
3395 	if (log->node) {
3396 		ret = walk_log_tree(trans, log, &wc);
3397 		if (ret) {
3398 			/*
3399 			 * We weren't able to traverse the entire log tree, the
3400 			 * typical scenario is getting an -EIO when reading an
3401 			 * extent buffer of the tree, due to a previous failure
3402 			 * to write it back.
3403 			 */
3404 			set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3405 				&log->fs_info->fs_state);
3406 
3407 			/*
3408 			 * Some extent buffers of the log tree may still be dirty
3409 			 * and not yet written back to storage, because we may
3410 			 * have updates to a log tree without syncing a log tree,
3411 			 * such as during rename and link operations. So flush
3412 			 * them out and wait for their writeback to complete, so
3413 			 * that we properly cleanup their state and pages.
3414 			 */
3415 			btrfs_write_marked_extents(log->fs_info,
3416 						   &log->dirty_log_pages,
3417 						   EXTENT_DIRTY | EXTENT_NEW);
3418 			btrfs_wait_tree_log_extents(log,
3419 						    EXTENT_DIRTY | EXTENT_NEW);
3420 
3421 			if (trans)
3422 				btrfs_abort_transaction(trans, ret);
3423 			else
3424 				btrfs_handle_fs_error(log->fs_info, ret, NULL);
3425 		}
3426 	}
3427 
3428 	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3429 			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3430 	extent_io_tree_release(&log->log_csum_range);
3431 
3432 	btrfs_put_root(log);
3433 }
3434 
3435 /*
3436  * free all the extents used by the tree log.  This should be called
3437  * at commit time of the full transaction
3438  */
3439 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3440 {
3441 	if (root->log_root) {
3442 		free_log_tree(trans, root->log_root);
3443 		root->log_root = NULL;
3444 		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3445 	}
3446 	return 0;
3447 }
3448 
3449 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3450 			     struct btrfs_fs_info *fs_info)
3451 {
3452 	if (fs_info->log_root_tree) {
3453 		free_log_tree(trans, fs_info->log_root_tree);
3454 		fs_info->log_root_tree = NULL;
3455 		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3456 	}
3457 	return 0;
3458 }
3459 
3460 /*
3461  * Check if an inode was logged in the current transaction. This correctly deals
3462  * with the case where the inode was logged but has a logged_trans of 0, which
3463  * happens if the inode is evicted and loaded again, as logged_trans is an
3464  * in-memory only field (not persisted).
3465  *
3466  * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3467  * and < 0 on error.
3468  */
3469 static int inode_logged(struct btrfs_trans_handle *trans,
3470 			struct btrfs_inode *inode,
3471 			struct btrfs_path *path_in)
3472 {
3473 	struct btrfs_path *path = path_in;
3474 	struct btrfs_key key;
3475 	int ret;
3476 
3477 	if (inode->logged_trans == trans->transid)
3478 		return 1;
3479 
3480 	/*
3481 	 * If logged_trans is not 0, then we know the inode was not logged in
3482 	 * this transaction, so we can return 0 right away.
3483 	 */
3484 	if (inode->logged_trans > 0)
3485 		return 0;
3486 
3487 	/*
3488 	 * If no log tree was created for this root in this transaction, then
3489 	 * the inode can not have been logged in this transaction. In that case
3490 	 * set logged_trans to anything greater than 0 and less than the current
3491 	 * transaction's ID, to avoid the search below in a future call in case
3492 	 * a log tree gets created after this.
3493 	 */
3494 	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
3495 		inode->logged_trans = trans->transid - 1;
3496 		return 0;
3497 	}
3498 
3499 	/*
3500 	 * We have a log tree and the inode's logged_trans is 0. We can't tell
3501 	 * for sure if the inode was logged before in this transaction by looking
3502 	 * only at logged_trans. We could be pessimistic and assume it was, but
3503 	 * that can lead to unnecessarily logging an inode during rename and link
3504 	 * operations, and then further updating the log in followup rename and
3505 	 * link operations, specially if it's a directory, which adds latency
3506 	 * link operations, especially if it's a directory, which adds latency
3507 	 *
3508 	 * A logged_trans of 0 here can mean several things:
3509 	 *
3510 	 * 1) The inode was never logged since the filesystem was mounted, and may
3511 	 *    or may not have been evicted and loaded again;
3512 	 *
3513 	 * 2) The inode was logged in a previous transaction, then evicted and
3514 	 *    then loaded again;
3515 	 *
3516 	 * 3) The inode was logged in the current transaction, then evicted and
3517 	 *    then loaded again.
3518 	 *
3519 	 * For cases 1) and 2) we don't want to return true, but we need to detect
3520 	 * case 3) and return true. So we do a search in the log root for the inode
3521 	 * item.
3522 	 */
3523 	key.objectid = btrfs_ino(inode);
3524 	key.type = BTRFS_INODE_ITEM_KEY;
3525 	key.offset = 0;
3526 
3527 	if (!path) {
3528 		path = btrfs_alloc_path();
3529 		if (!path)
3530 			return -ENOMEM;
3531 	}
3532 
3533 	ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3534 
3535 	if (path_in)
3536 		btrfs_release_path(path);
3537 	else
3538 		btrfs_free_path(path);
3539 
3540 	/*
3541 	 * Logging an inode always results in logging its inode item. So if we
3542 	 * did not find the item we know the inode was not logged for sure.
3543 	 */
3544 	if (ret < 0) {
3545 		return ret;
3546 	} else if (ret > 0) {
3547 		/*
3548 		 * Set logged_trans to a value greater than 0 and less than the
3549 		 * current transaction to avoid doing the search in future calls.
3550 		 */
3551 		inode->logged_trans = trans->transid - 1;
3552 		return 0;
3553 	}
3554 
3555 	/*
3556 	 * The inode was logged in this transaction and then evicted. Set
3557 	 * logged_trans to the current transaction's ID, to avoid future tree
3558 	 * searches as long as the inode is not evicted again.
3559 	 */
3560 	inode->logged_trans = trans->transid;
3561 
3562 	/*
3563 	 * If it's a directory, then we must set last_dir_index_offset to the
3564 	 * maximum possible value, so that the next attempt to log the inode does
3565 	 * not skip checking if dir index keys found in modified subvolume tree
3566 	 * leaves have been logged before, otherwise it would result in attempts
3567 	 * to insert duplicate dir index keys in the log tree. This must be done
3568 	 * because last_dir_index_offset is an in-memory only field, not persisted
3569 	 * in the inode item or any other on-disk structure, so its value is lost
3570 	 * once the inode is evicted.
3571 	 */
3572 	if (S_ISDIR(inode->vfs_inode.i_mode))
3573 		inode->last_dir_index_offset = (u64)-1;
3574 
3575 	return 1;
3576 }
3577 
3578 /*
3579  * Delete a directory entry from the log if it exists.
3580  *
3581  * Returns < 0 on error
3582  *           1 if the entry does not exist
3583  *           0 if the entry existed and was successfully deleted
3584  */
3585 static int del_logged_dentry(struct btrfs_trans_handle *trans,
3586 			     struct btrfs_root *log,
3587 			     struct btrfs_path *path,
3588 			     u64 dir_ino,
3589 			     const char *name, int name_len,
3590 			     u64 index)
3591 {
3592 	struct btrfs_dir_item *di;
3593 
3594 	/*
3595 	 * We only log dir index items of a directory, so we don't need to look
3596 	 * for dir item keys.
3597 	 */
3598 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3599 					 index, name, name_len, -1);
3600 	if (IS_ERR(di))
3601 		return PTR_ERR(di);
3602 	else if (!di)
3603 		return 1;
3604 
3605 	/*
3606 	 * We do not need to update the size field of the directory's
3607 	 * inode item because on log replay we update the field to reflect
3608 	 * all existing entries in the directory (see overwrite_item()).
3609 	 */
3610 	return btrfs_delete_one_dir_name(trans, log, path, di);
3611 }
3612 
3613 /*
3614  * If both a file and directory are logged, and unlinks or renames are
3615  * mixed in, we have a few interesting corners:
3616  *
3617  * create file X in dir Y
3618  * link file X to X.link in dir Y
3619  * fsync file X
3620  * unlink file X but leave X.link
3621  * fsync dir Y
3622  *
3623  * After a crash we would expect only X.link to exist.  But file X
3624  * didn't get fsync'd again so the log has back refs for X and X.link.
3625  *
3626  * We solve this by removing directory entries and inode backrefs from the
3627  * log when a file that was logged in the current transaction is
3628  * unlinked.  Any later fsync will include the updated log entries, and
3629  * we'll be able to reconstruct the proper directory items from backrefs.
3630  *
3631  * This optimization allows us to avoid relogging the entire inode
3632  * or the entire directory.
3633  */
3634 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3635 				  struct btrfs_root *root,
3636 				  const char *name, int name_len,
3637 				  struct btrfs_inode *dir, u64 index)
3638 {
3639 	struct btrfs_path *path;
3640 	int ret;
3641 
3642 	ret = inode_logged(trans, dir, NULL);
3643 	if (ret == 0)
3644 		return;
3645 	else if (ret < 0) {
3646 		btrfs_set_log_full_commit(trans);
3647 		return;
3648 	}
3649 
3650 	ret = join_running_log_trans(root);
3651 	if (ret)
3652 		return;
3653 
3654 	mutex_lock(&dir->log_mutex);
3655 
3656 	path = btrfs_alloc_path();
3657 	if (!path) {
3658 		ret = -ENOMEM;
3659 		goto out_unlock;
3660 	}
3661 
3662 	ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
3663 				name, name_len, index);
3664 	btrfs_free_path(path);
3665 out_unlock:
3666 	mutex_unlock(&dir->log_mutex);
3667 	if (ret < 0)
3668 		btrfs_set_log_full_commit(trans);
3669 	btrfs_end_log_trans(root);
3670 }
3671 
3672 /* see comments for btrfs_del_dir_entries_in_log */
3673 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3674 				struct btrfs_root *root,
3675 				const char *name, int name_len,
3676 				struct btrfs_inode *inode, u64 dirid)
3677 {
3678 	struct btrfs_root *log;
3679 	u64 index;
3680 	int ret;
3681 
3682 	ret = inode_logged(trans, inode, NULL);
3683 	if (ret == 0)
3684 		return;
3685 	else if (ret < 0) {
3686 		btrfs_set_log_full_commit(trans);
3687 		return;
3688 	}
3689 
3690 	ret = join_running_log_trans(root);
3691 	if (ret)
3692 		return;
3693 	log = root->log_root;
3694 	mutex_lock(&inode->log_mutex);
3695 
3696 	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3697 				  dirid, &index);
3698 	mutex_unlock(&inode->log_mutex);
3699 	if (ret < 0 && ret != -ENOENT)
3700 		btrfs_set_log_full_commit(trans);
3701 	btrfs_end_log_trans(root);
3702 }
3703 
3704 /*
3705  * creates a range item in the log for 'dirid'.  first_offset and
3706  * last_offset tell us which parts of the key space the log should
3707  * be considered authoritative for.
3708  */
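/*
 * Illustrative example (made-up values): after logging dir index items with
 * offsets 10 to 14 for a directory 'dirid', a dir log item with key
 * (dirid, BTRFS_DIR_LOG_INDEX_KEY, 10) and dir_log_end set to 14 tells log
 * replay that the log is authoritative for index offsets [10, 14], so any
 * index key in that range missing from the log must be deleted on replay.
 */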
3709 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3710 				       struct btrfs_root *log,
3711 				       struct btrfs_path *path,
3712 				       u64 dirid,
3713 				       u64 first_offset, u64 last_offset)
3714 {
3715 	int ret;
3716 	struct btrfs_key key;
3717 	struct btrfs_dir_log_item *item;
3718 
3719 	key.objectid = dirid;
3720 	key.offset = first_offset;
3721 	key.type = BTRFS_DIR_LOG_INDEX_KEY;
3722 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3723 	/*
3724 	 * -EEXIST is fine and can happen sporadically when we are logging a
3725 	 * directory and have concurrent insertions in the subvolume's tree for
3726 	 * items from other inodes, which result in pushing off some dir items
3727 	 * from one leaf to another in order to accommodate the new items.
3728 	 * This results in logging the same dir index range key.
3729 	 */
3730 	if (ret && ret != -EEXIST)
3731 		return ret;
3732 
3733 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3734 			      struct btrfs_dir_log_item);
3735 	if (ret == -EEXIST) {
3736 		const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
3737 
3738 		/*
3739 		 * btrfs_del_dir_entries_in_log() might have been called during
3740 		 * an unlink between the initial insertion of this key and the
3741 		 * current update, or we might be logging a single entry deletion
3742 		 * during a rename, so set the new last_offset to the max value.
3743 		 */
3744 		last_offset = max(last_offset, curr_end);
3745 	}
3746 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3747 	btrfs_mark_buffer_dirty(path->nodes[0]);
3748 	btrfs_release_path(path);
3749 	return 0;
3750 }
3751 
3752 static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
3753 				 struct btrfs_root *log,
3754 				 struct extent_buffer *src,
3755 				 struct btrfs_path *dst_path,
3756 				 int start_slot,
3757 				 int count)
3758 {
3759 	char *ins_data = NULL;
3760 	struct btrfs_item_batch batch;
3761 	struct extent_buffer *dst;
3762 	unsigned long src_offset;
3763 	unsigned long dst_offset;
3764 	struct btrfs_key key;
3765 	u32 item_size;
3766 	int ret;
3767 	int i;
3768 
3769 	ASSERT(count > 0);
3770 	batch.nr = count;
3771 
3772 	if (count == 1) {
3773 		btrfs_item_key_to_cpu(src, &key, start_slot);
3774 		item_size = btrfs_item_size(src, start_slot);
3775 		batch.keys = &key;
3776 		batch.data_sizes = &item_size;
3777 		batch.total_data_size = item_size;
3778 	} else {
3779 		struct btrfs_key *ins_keys;
3780 		u32 *ins_sizes;
3781 
3782 		ins_data = kmalloc(count * sizeof(u32) +
3783 				   count * sizeof(struct btrfs_key), GFP_NOFS);
3784 		if (!ins_data)
3785 			return -ENOMEM;
3786 
3787 		ins_sizes = (u32 *)ins_data;
3788 		ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
3789 		batch.keys = ins_keys;
3790 		batch.data_sizes = ins_sizes;
3791 		batch.total_data_size = 0;
3792 
3793 		for (i = 0; i < count; i++) {
3794 			const int slot = start_slot + i;
3795 
3796 			btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
3797 			ins_sizes[i] = btrfs_item_size(src, slot);
3798 			batch.total_data_size += ins_sizes[i];
3799 		}
3800 	}
3801 
3802 	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3803 	if (ret)
3804 		goto out;
3805 
3806 	dst = dst_path->nodes[0];
3807 	/*
3808 	 * Copy all the items in bulk, in a single copy operation. Item data is
3809 	 * organized such that it's placed at the end of a leaf and from right
3810 	 * to left. For example, the data for the second item ends at an offset
3811 	 * that matches the offset where the data for the first item starts, the
3812 	 * data for the third item ends at an offset that matches the offset
3813 	 * where the data of the second item starts, and so on.
3814 	 * Therefore our source and destination start offsets for copy match the
3815 	 * offsets of the last items (highest slots).
3816 	 */
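	/*
	 * Illustrative leaf layout for a batch of 3 items (not to scale):
	 *
	 *   [header][item0][item1][item2] ...free space... [data2][data1][data0]
	 *
	 * The batch's data is one contiguous region whose lowest offset is the
	 * offset of the highest slot's data, hence the single copy below.
	 */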
3817 	dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3818 	src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3819 	copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3820 	btrfs_release_path(dst_path);
3821 out:
3822 	kfree(ins_data);
3823 
3824 	return ret;
3825 }
3826 
3827 static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3828 				  struct btrfs_inode *inode,
3829 				  struct btrfs_path *path,
3830 				  struct btrfs_path *dst_path,
3831 				  struct btrfs_log_ctx *ctx,
3832 				  u64 *last_old_dentry_offset)
3833 {
3834 	struct btrfs_root *log = inode->root->log_root;
3835 	struct extent_buffer *src = path->nodes[0];
3836 	const int nritems = btrfs_header_nritems(src);
3837 	const u64 ino = btrfs_ino(inode);
3838 	bool last_found = false;
3839 	int batch_start = 0;
3840 	int batch_size = 0;
3841 	int i;
3842 
3843 	for (i = path->slots[0]; i < nritems; i++) {
3844 		struct btrfs_dir_item *di;
3845 		struct btrfs_key key;
3846 		int ret;
3847 
3848 		btrfs_item_key_to_cpu(src, &key, i);
3849 
3850 		if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3851 			last_found = true;
3852 			break;
3853 		}
3854 
3855 		di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3856 		ctx->last_dir_item_offset = key.offset;
3857 
3858 		/*
3859 		 * Skip ranges of items that consist only of dir index keys created
3860 		 * in past transactions. However if we find a gap, we must log a
3861 		 * dir index range item for that gap, so that index keys in that
3862 		 * gap are deleted during log replay.
3863 		 */
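		/*
		 * Example with made-up offsets: if *last_old_dentry_offset is
		 * 100 and the next old dentry found has index offset 105, then
		 * index keys 101 to 104 were deleted, so we log a range item
		 * covering [101, 104] to get those deletions replayed.
		 */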
3864 		if (btrfs_dir_transid(src, di) < trans->transid) {
3865 			if (key.offset > *last_old_dentry_offset + 1) {
3866 				ret = insert_dir_log_key(trans, log, dst_path,
3867 						 ino, *last_old_dentry_offset + 1,
3868 						 key.offset - 1);
3869 				if (ret < 0)
3870 					return ret;
3871 			}
3872 
3873 			*last_old_dentry_offset = key.offset;
3874 			continue;
3875 		}
3876 		/*
3877 		 * We must make sure that when we log a directory entry, the
3878 		 * corresponding inode, after log replay, has a matching link
3879 		 * count. For example:
3880 		 *
3881 		 * touch foo
3882 		 * mkdir mydir
3883 		 * sync
3884 		 * ln foo mydir/bar
3885 		 * xfs_io -c "fsync" mydir
3886 		 * <crash>
3887 		 * <mount fs and log replay>
3888 		 *
3889 		 * Would result in an fsync log that, when replayed, our file inode
3890 		 * would have a link count of 1, but we get two directory entries
3891 		 * pointing to the same inode. After removing one of the names,
3892 		 * it would not be possible to remove the other name, which
3893 		 * always resulted in stale file handle errors, and it would not be
3894 		 * possible to rmdir the parent directory, since its i_size could
3895 		 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3896 		 * resulting in -ENOTEMPTY errors.
3897 		 */
3898 		if (!ctx->log_new_dentries) {
3899 			struct btrfs_key di_key;
3900 
3901 			btrfs_dir_item_key_to_cpu(src, di, &di_key);
3902 			if (di_key.type != BTRFS_ROOT_ITEM_KEY)
3903 				ctx->log_new_dentries = true;
3904 		}
3905 
3906 		if (!ctx->logged_before)
3907 			goto add_to_batch;
3908 
3909 		/*
3910 		 * If the inode was logged before and has logged dir index items,
3911 		 * we can skip checking if any item with a key offset larger than the
3912 		 * last one we logged is in the log tree, saving time and avoiding adding
3913 		 * contention on the log tree. We can only rely on the value of
3914 		 * last_dir_index_offset when we know for sure that the inode was
3915 		 * previously logged in the current transaction.
3916 		 */
3917 		if (key.offset > inode->last_dir_index_offset)
3918 			goto add_to_batch;
3919 		/*
3920 		 * Check if the key was already logged before. If not we can add
3921 		 * it to a batch for bulk insertion.
3922 		 */
3923 		ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
3924 		if (ret < 0) {
3925 			return ret;
3926 		} else if (ret > 0) {
3927 			btrfs_release_path(dst_path);
3928 			goto add_to_batch;
3929 		}
3930 
3931 		/*
3932 		 * Item exists in the log. Overwrite the item in the log if it
3933 		 * has different content or do nothing if it has exactly the same
3934 		 * content. And then flush the current batch if any - do it after
3935 		 * overwriting the current item, or we would deadlock otherwise,
3936 		 * since we are holding a path for the existing item.
3937 		 */
3938 		ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
3939 		if (ret < 0)
3940 			return ret;
3941 
3942 		if (batch_size > 0) {
3943 			ret = flush_dir_items_batch(trans, log, src, dst_path,
3944 						    batch_start, batch_size);
3945 			if (ret < 0)
3946 				return ret;
3947 			batch_size = 0;
3948 		}
3949 		continue;
3950 add_to_batch:
3951 		if (batch_size == 0)
3952 			batch_start = i;
3953 		batch_size++;
3954 	}
3955 
3956 	if (batch_size > 0) {
3957 		int ret;
3958 
3959 		ret = flush_dir_items_batch(trans, log, src, dst_path,
3960 					    batch_start, batch_size);
3961 		if (ret < 0)
3962 			return ret;
3963 	}
3964 
3965 	return last_found ? 1 : 0;
3966 }
3967 
3968 /*
3969  * log all the items included in the current transaction for a given
3970  * directory.  This also creates the range items in the log tree required
3971  * to replay anything deleted before the fsync
3972  */
3973 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3974 			  struct btrfs_inode *inode,
3975 			  struct btrfs_path *path,
3976 			  struct btrfs_path *dst_path,
3977 			  struct btrfs_log_ctx *ctx,
3978 			  u64 min_offset, u64 *last_offset_ret)
3979 {
3980 	struct btrfs_key min_key;
3981 	struct btrfs_root *root = inode->root;
3982 	struct btrfs_root *log = root->log_root;
3983 	int err = 0;
3984 	int ret;
3985 	u64 last_old_dentry_offset = min_offset - 1;
3986 	u64 last_offset = (u64)-1;
3987 	u64 ino = btrfs_ino(inode);
3988 
3989 	min_key.objectid = ino;
3990 	min_key.type = BTRFS_DIR_INDEX_KEY;
3991 	min_key.offset = min_offset;
3992 
3993 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3994 
3995 	/*
3996 	 * we didn't find anything from this transaction, see if there
3997 	 * is anything at all
3998 	 */
3999 	if (ret != 0 || min_key.objectid != ino ||
4000 	    min_key.type != BTRFS_DIR_INDEX_KEY) {
4001 		min_key.objectid = ino;
4002 		min_key.type = BTRFS_DIR_INDEX_KEY;
4003 		min_key.offset = (u64)-1;
4004 		btrfs_release_path(path);
4005 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
4006 		if (ret < 0) {
4007 			btrfs_release_path(path);
4008 			return ret;
4009 		}
4010 		ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
4011 
4012 		/* if ret == 0 there are items for this type,
4013 		 * create a range to tell us the last key of this type.
4014 		 * otherwise, there are no items in this directory after
4015 		 * *min_offset, and we create a range to indicate that.
4016 		 */
4017 		if (ret == 0) {
4018 			struct btrfs_key tmp;
4019 
4020 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
4021 					      path->slots[0]);
4022 			if (tmp.type == BTRFS_DIR_INDEX_KEY)
4023 				last_old_dentry_offset = tmp.offset;
4024 		}
4025 		goto done;
4026 	}
4027 
4028 	/* go backward to find any previous key */
4029 	ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
4030 	if (ret == 0) {
4031 		struct btrfs_key tmp;
4032 
4033 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
4034 		/*
4035 		 * The dir index key before the first one we found that needs to
4036 		 * be logged might be in a previous leaf, and there might be a
4037 		 * gap between these keys, meaning that we had deletions that
4038 		 * happened. So the key range item we log (key type
4039 		 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
4040 		 * previous key's offset plus 1, so that those deletes are replayed.
4041 		 */
4042 		if (tmp.type == BTRFS_DIR_INDEX_KEY)
4043 			last_old_dentry_offset = tmp.offset;
4044 	}
4045 	btrfs_release_path(path);
4046 
4047 	/*
4048 	 * Find the first key from this transaction again.  See the note for
4049 	 * log_new_dir_dentries: if we're logging a directory recursively we
4050 	 * won't be holding its i_mutex, which means we can modify the directory
4051 	 * while we're logging it.  If we remove an entry between our first
4052 	 * search and this search we'll not find the key again and can just
4053 	 * bail.
4054 	 */
4055 search:
4056 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
4057 	if (ret != 0)
4058 		goto done;
4059 
4060 	/*
4061 	 * we have a block from this transaction, log every item in it
4062 	 * from our directory
4063 	 */
4064 	while (1) {
4065 		ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
4066 					     &last_old_dentry_offset);
4067 		if (ret != 0) {
4068 			if (ret < 0)
4069 				err = ret;
4070 			goto done;
4071 		}
4072 		path->slots[0] = btrfs_header_nritems(path->nodes[0]);
4073 
4074 		/*
4075 		 * look ahead to the next item and see if it is also
4076 		 * from this directory and from this transaction
4077 		 */
4078 		ret = btrfs_next_leaf(root, path);
4079 		if (ret) {
4080 			if (ret == 1)
4081 				last_offset = (u64)-1;
4082 			else
4083 				err = ret;
4084 			goto done;
4085 		}
4086 		btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
4087 		if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
4088 			last_offset = (u64)-1;
4089 			goto done;
4090 		}
4091 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
4092 			/*
4093 			 * The next leaf was not changed in the current transaction
4094 			 * and has at least one dir index key.
4095 			 * We check for the next key because there might have been
4096 			 * one or more deletions between the last key we logged and
4097 			 * that next key. So the key range item we log (key type
4098 			 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
4099 			 * offset minus 1, so that those deletes are replayed.
4100 			 */
4101 			last_offset = min_key.offset - 1;
4102 			goto done;
4103 		}
4104 		if (need_resched()) {
4105 			btrfs_release_path(path);
4106 			cond_resched();
4107 			goto search;
4108 		}
4109 	}
4110 done:
4111 	btrfs_release_path(path);
4112 	btrfs_release_path(dst_path);
4113 
4114 	if (err == 0) {
4115 		*last_offset_ret = last_offset;
4116 		/*
4117 		 * In case the leaf was changed in the current transaction but
4118 		 * all its dir items are from a past transaction, the last item
4119 		 * in the leaf is a dir item and there's no gap between that last
4120 		 * dir item and the first one on the next leaf (which did not
4121 		 * change in the current transaction), then we don't need to log
4122 		 * a range, last_old_dentry_offset is == to last_offset.
4123 		 */
4124 		ASSERT(last_old_dentry_offset <= last_offset);
4125 		if (last_old_dentry_offset < last_offset) {
4126 			ret = insert_dir_log_key(trans, log, path, ino,
4127 						 last_old_dentry_offset + 1,
4128 						 last_offset);
4129 			if (ret)
4130 				err = ret;
4131 		}
4132 	}
4133 	return err;
4134 }
4135 
4136 /*
4137  * Logging directories is very similar to logging inodes: we find all the items
4138  * from the current transaction and write them to the log.
4139  *
4140  * The recovery code scans the directory in the subvolume, and if it finds a
4141  * key in the range logged that is not present in the log tree, then it means
4142  * that dir entry was unlinked during the transaction.
4143  *
4144  * In order for that scan to work, we must include one key smaller than
4145  * the smallest logged by this transaction and one key larger than the largest
4146  * key logged by this transaction.
4147  */
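/*
 * Illustrative flow (hypothetical offsets): a first log_dir_items() call may
 * stop with max_key == 1000, e.g. after reaching a leaf from a past
 * transaction, so the next iteration resumes at min_key == 1001; the loop
 * ends once max_key comes back as (u64)-1, meaning the whole dir index key
 * space has been covered.
 */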
4148 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4149 			  struct btrfs_inode *inode,
4150 			  struct btrfs_path *path,
4151 			  struct btrfs_path *dst_path,
4152 			  struct btrfs_log_ctx *ctx)
4153 {
4154 	u64 min_key;
4155 	u64 max_key;
4156 	int ret;
4157 
4158 	min_key = BTRFS_DIR_START_INDEX;
4159 	max_key = 0;
4160 	ctx->last_dir_item_offset = inode->last_dir_index_offset;
4161 
4162 	while (1) {
4163 		ret = log_dir_items(trans, inode, path, dst_path,
4164 				ctx, min_key, &max_key);
4165 		if (ret)
4166 			return ret;
4167 		if (max_key == (u64)-1)
4168 			break;
4169 		min_key = max_key + 1;
4170 	}
4171 
4172 	inode->last_dir_index_offset = ctx->last_dir_item_offset;
4173 
4174 	return 0;
4175 }
4176 
4177 /*
4178  * a helper function to drop items from the log before we relog an
4179  * inode.  max_key_type indicates the highest item type to remove.
4180  * This cannot be run for file data extents because it does not
4181  * free the extents they point to.
4182  */
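/*
 * For example, assuming the usual key type ordering, calling this with
 * max_key_type == BTRFS_XATTR_ITEM_KEY drops the logged inode item, inode
 * refs and xattr items, while leaving any logged dir and file extent items
 * in place, since those have higher key type values.
 */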
4183 static int drop_inode_items(struct btrfs_trans_handle *trans,
4184 				  struct btrfs_root *log,
4185 				  struct btrfs_path *path,
4186 				  struct btrfs_inode *inode,
4187 				  int max_key_type)
4188 {
4189 	int ret;
4190 	struct btrfs_key key;
4191 	struct btrfs_key found_key;
4192 	int start_slot;
4193 
4194 	key.objectid = btrfs_ino(inode);
4195 	key.type = max_key_type;
4196 	key.offset = (u64)-1;
4197 
4198 	while (1) {
4199 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4200 		BUG_ON(ret == 0); /* Logic error */
4201 		if (ret < 0)
4202 			break;
4203 
4204 		if (path->slots[0] == 0)
4205 			break;
4206 
4207 		path->slots[0]--;
4208 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4209 				      path->slots[0]);
4210 
4211 		if (found_key.objectid != key.objectid)
4212 			break;
4213 
4214 		found_key.offset = 0;
4215 		found_key.type = 0;
4216 		ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
4217 		if (ret < 0)
4218 			break;
4219 
4220 		ret = btrfs_del_items(trans, log, path, start_slot,
4221 				      path->slots[0] - start_slot + 1);
4222 		/*
4223 		 * If start slot isn't 0 then we don't need to re-search, we've
4224 		 * found the last guy with the objectid in this tree.
4225 		 */
4226 		if (ret || start_slot != 0)
4227 			break;
4228 		btrfs_release_path(path);
4229 	}
4230 	btrfs_release_path(path);
4231 	if (ret > 0)
4232 		ret = 0;
4233 	return ret;
4234 }
4235 
4236 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4237 				struct btrfs_root *log_root,
4238 				struct btrfs_inode *inode,
4239 				u64 new_size, u32 min_type)
4240 {
4241 	struct btrfs_truncate_control control = {
4242 		.new_size = new_size,
4243 		.ino = btrfs_ino(inode),
4244 		.min_type = min_type,
4245 		.skip_ref_updates = true,
4246 	};
4247 
4248 	return btrfs_truncate_inode_items(trans, log_root, &control);
4249 }
4250 
4251 static void fill_inode_item(struct btrfs_trans_handle *trans,
4252 			    struct extent_buffer *leaf,
4253 			    struct btrfs_inode_item *item,
4254 			    struct inode *inode, int log_inode_only,
4255 			    u64 logged_isize)
4256 {
4257 	struct btrfs_map_token token;
4258 	u64 flags;
4259 
4260 	btrfs_init_map_token(&token, leaf);
4261 
4262 	if (log_inode_only) {
4263 		/* set the generation to zero so the recovery code
4264 		 * can tell the difference between logging
4265 		 * just to say 'this inode exists' and logging
4266 		 * to say 'update this inode with these values'
4267 		 */
4268 		btrfs_set_token_inode_generation(&token, item, 0);
4269 		btrfs_set_token_inode_size(&token, item, logged_isize);
4270 	} else {
4271 		btrfs_set_token_inode_generation(&token, item,
4272 						 BTRFS_I(inode)->generation);
4273 		btrfs_set_token_inode_size(&token, item, inode->i_size);
4274 	}
4275 
4276 	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4277 	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4278 	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4279 	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4280 
4281 	btrfs_set_token_timespec_sec(&token, &item->atime,
4282 				     inode->i_atime.tv_sec);
4283 	btrfs_set_token_timespec_nsec(&token, &item->atime,
4284 				      inode->i_atime.tv_nsec);
4285 
4286 	btrfs_set_token_timespec_sec(&token, &item->mtime,
4287 				     inode->i_mtime.tv_sec);
4288 	btrfs_set_token_timespec_nsec(&token, &item->mtime,
4289 				      inode->i_mtime.tv_nsec);
4290 
4291 	btrfs_set_token_timespec_sec(&token, &item->ctime,
4292 				     inode->i_ctime.tv_sec);
4293 	btrfs_set_token_timespec_nsec(&token, &item->ctime,
4294 				      inode->i_ctime.tv_nsec);
4295 
4296 	/*
4297 	 * We do not need to set the nbytes field, in fact during a fast fsync
4298 	 * its value may not even be correct, since a fast fsync does not wait
4299 	 * for ordered extent completion, which is where we update nbytes, it
4300 	 * only waits for writeback to complete. During log replay as we find
4301 	 * file extent items and replay them, we adjust the nbytes field of the
4302 	 * inode item in subvolume tree as needed (see overwrite_item()).
4303 	 */
4304 
4305 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4306 	btrfs_set_token_inode_transid(&token, item, trans->transid);
4307 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4308 	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4309 					  BTRFS_I(inode)->ro_flags);
4310 	btrfs_set_token_inode_flags(&token, item, flags);
4311 	btrfs_set_token_inode_block_group(&token, item, 0);
4312 }
4313 
4314 static int log_inode_item(struct btrfs_trans_handle *trans,
4315 			  struct btrfs_root *log, struct btrfs_path *path,
4316 			  struct btrfs_inode *inode, bool inode_item_dropped)
4317 {
4318 	struct btrfs_inode_item *inode_item;
4319 	int ret;
4320 
4321 	/*
4322 	 * If we are doing a fast fsync and the inode was logged before in the
4323 	 * current transaction, then we know the inode was previously logged and
4324 	 * it exists in the log tree. For performance reasons, in this case use
4325 	 * btrfs_search_slot() directly with ins_len set to 0 so that we never
4326 	 * attempt a write lock on the leaf's parent, which adds unnecessary lock
4327 	 * contention in case there are concurrent fsyncs for other inodes of the
4328 	 * same subvolume. Using btrfs_insert_empty_item() when the inode item
4329 	 * already exists can also result in unnecessarily splitting a leaf.
4330 	 */
4331 	if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4332 		ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
4333 		ASSERT(ret <= 0);
4334 		if (ret > 0)
4335 			ret = -ENOENT;
4336 	} else {
4337 		/*
4338 		 * This means it is the first fsync in the current transaction,
4339 		 * so the inode item is not in the log and we need to insert it.
4340 		 * We can never get -EEXIST because we are only called for a fast
4341 		 * fsync and in case an inode eviction happens after the inode was
4342 		 * logged in the current transaction, when we load the inode
4343 		 * again we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4344 		 * flags and set ->logged_trans to 0.
4345 		 */
4346 		ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4347 					      sizeof(*inode_item));
4348 		ASSERT(ret != -EEXIST);
4349 	}
4350 	if (ret)
4351 		return ret;
4352 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4353 				    struct btrfs_inode_item);
4354 	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4355 			0, 0);
4356 	btrfs_release_path(path);
4357 	return 0;
4358 }
4359 
4360 static int log_csums(struct btrfs_trans_handle *trans,
4361 		     struct btrfs_inode *inode,
4362 		     struct btrfs_root *log_root,
4363 		     struct btrfs_ordered_sum *sums)
4364 {
4365 	const u64 lock_end = sums->bytenr + sums->len - 1;
4366 	struct extent_state *cached_state = NULL;
4367 	int ret;
4368 
4369 	/*
4370 	 * If this inode was not used for reflink operations in the current
4371 	 * transaction with new extents, then do the fast path, no need to
4372 	 * worry about logging checksum items with overlapping ranges.
4373 	 */
4374 	if (inode->last_reflink_trans < trans->transid)
4375 		return btrfs_csum_file_blocks(trans, log_root, sums);
4376 
4377 	/*
4378 	 * Serialize logging for checksums. This is to avoid racing with the
4379 	 * same checksum being logged by another task that is logging another
4380 	 * file which happens to refer to the same extent as well. Such races
4381 	 * can leave checksum items in the log with overlapping ranges.
4382 	 */
4383 	ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
4384 			       lock_end, &cached_state);
4385 	if (ret)
4386 		return ret;
4387 	/*
4388 	 * Due to extent cloning, we might have logged a csum item that covers a
4389 	 * subrange of a cloned extent, and later we can end up logging a csum
4390 	 * item for a larger subrange of the same extent or the entire range.
4391 	 * This would leave csum items in the log tree that cover the same range
4392 	 * and break the searches for checksums in the log tree, resulting in
4393 	 * some checksums missing in the fs/subvolume tree. So just delete (or
4394 	 * trim and adjust) any existing csum items in the log for this range.
4395 	 */
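	/*
	 * Example with made-up ranges: a previous fsync may have logged a csum
	 * item for bytes [X, X + 64K) of a cloned extent, while the current one
	 * covers [X, X + 128K) of the same extent; without the deletion below,
	 * both items would coexist and overlap in the log tree, breaking csum
	 * lookups during log replay.
	 */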
4396 	ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
4397 	if (!ret)
4398 		ret = btrfs_csum_file_blocks(trans, log_root, sums);
4399 
4400 	unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
4401 			     &cached_state);
4402 
4403 	return ret;
4404 }
4405 
4406 static noinline int copy_items(struct btrfs_trans_handle *trans,
4407 			       struct btrfs_inode *inode,
4408 			       struct btrfs_path *dst_path,
4409 			       struct btrfs_path *src_path,
4410 			       int start_slot, int nr, int inode_only,
4411 			       u64 logged_isize)
4412 {
4413 	struct btrfs_root *log = inode->root->log_root;
4414 	struct btrfs_file_extent_item *extent;
4415 	struct extent_buffer *src = src_path->nodes[0];
4416 	int ret = 0;
4417 	struct btrfs_key *ins_keys;
4418 	u32 *ins_sizes;
4419 	struct btrfs_item_batch batch;
4420 	char *ins_data;
4421 	int i;
4422 	int dst_index;
4423 	const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4424 	const u64 i_size = i_size_read(&inode->vfs_inode);
4425 
4426 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4427 			   nr * sizeof(u32), GFP_NOFS);
4428 	if (!ins_data)
4429 		return -ENOMEM;
4430 
4431 	ins_sizes = (u32 *)ins_data;
4432 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4433 	batch.keys = ins_keys;
4434 	batch.data_sizes = ins_sizes;
4435 	batch.total_data_size = 0;
4436 	batch.nr = 0;
4437 
4438 	dst_index = 0;
4439 	for (i = 0; i < nr; i++) {
4440 		const int src_slot = start_slot + i;
4441 		struct btrfs_root *csum_root;
4442 		struct btrfs_ordered_sum *sums;
4443 		struct btrfs_ordered_sum *sums_next;
4444 		LIST_HEAD(ordered_sums);
4445 		u64 disk_bytenr;
4446 		u64 disk_num_bytes;
4447 		u64 extent_offset;
4448 		u64 extent_num_bytes;
4449 		bool is_old_extent;
4450 
4451 		btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4452 
4453 		if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4454 			goto add_to_batch;
4455 
4456 		extent = btrfs_item_ptr(src, src_slot,
4457 					struct btrfs_file_extent_item);
4458 
4459 		is_old_extent = (btrfs_file_extent_generation(src, extent) <
4460 				 trans->transid);
4461 
4462 		/*
4463 		 * Don't copy extents from past generations. That would make us
4464 		 * log a lot more metadata for common cases like doing only a
4465 		 * few random writes into a file and then fsyncing it for the first
4466 		 * time or after the full sync flag is set on the inode. We can
4467 		 * get leaves full of extent items, most of which are from past
4468 		 * generations, so we can skip them - as long as the inode has
4469 		 * not been the target of a reflink operation in this transaction,
4470 		 * as in that case it might have had file extent items with old
4471 		 * generations copied into it. We also must always log prealloc
4472 		 * extents that start at or beyond eof, otherwise we would lose
4473 		 * them on log replay.
4474 		 */
4475 		if (is_old_extent &&
4476 		    ins_keys[dst_index].offset < i_size &&
4477 		    inode->last_reflink_trans < trans->transid)
4478 			continue;
4479 
4480 		if (skip_csum)
4481 			goto add_to_batch;
4482 
4483 		/* Only regular extents have checksums. */
4484 		if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4485 			goto add_to_batch;
4486 
4487 		/*
4488 		 * If it's an extent created in a past transaction, then its
4489 		 * checksums are already accessible from the committed csum tree,
4490 		 * no need to log them.
4491 		 */
4492 		if (is_old_extent)
4493 			goto add_to_batch;
4494 
4495 		disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4496 		/* If it's an explicit hole, there are no checksums. */
4497 		if (disk_bytenr == 0)
4498 			goto add_to_batch;
4499 
4500 		disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4501 
4502 		if (btrfs_file_extent_compression(src, extent)) {
4503 			extent_offset = 0;
4504 			extent_num_bytes = disk_num_bytes;
4505 		} else {
4506 			extent_offset = btrfs_file_extent_offset(src, extent);
4507 			extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4508 		}
4509 
4510 		csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4511 		disk_bytenr += extent_offset;
4512 		ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
4513 					       disk_bytenr + extent_num_bytes - 1,
4514 					       &ordered_sums, 0);
4515 		if (ret)
4516 			goto out;
4517 
4518 		list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4519 			if (!ret)
4520 				ret = log_csums(trans, inode, log, sums);
4521 			list_del(&sums->list);
4522 			kfree(sums);
4523 		}
4524 		if (ret)
4525 			goto out;
4526 
4527 add_to_batch:
4528 		ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4529 		batch.total_data_size += ins_sizes[dst_index];
4530 		batch.nr++;
4531 		dst_index++;
4532 	}
4533 
4534 	/*
4535 	 * We have a leaf full of old extent items that don't need to be logged,
4536 	 * so we don't need to do anything.
4537 	 */
4538 	if (batch.nr == 0)
4539 		goto out;
4540 
4541 	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4542 	if (ret)
4543 		goto out;
4544 
4545 	dst_index = 0;
4546 	for (i = 0; i < nr; i++) {
4547 		const int src_slot = start_slot + i;
4548 		const int dst_slot = dst_path->slots[0] + dst_index;
4549 		struct btrfs_key key;
4550 		unsigned long src_offset;
4551 		unsigned long dst_offset;
4552 
4553 		/*
4554 		 * We're done, all the remaining items in the source leaf
4555 		 * correspond to old file extent items.
4556 		 */
4557 		if (dst_index >= batch.nr)
4558 			break;
4559 
4560 		btrfs_item_key_to_cpu(src, &key, src_slot);
4561 
4562 		if (key.type != BTRFS_EXTENT_DATA_KEY)
4563 			goto copy_item;
4564 
4565 		extent = btrfs_item_ptr(src, src_slot,
4566 					struct btrfs_file_extent_item);
4567 
4568 		/* See the comment in the previous loop, same logic. */
4569 		if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4570 		    key.offset < i_size &&
4571 		    inode->last_reflink_trans < trans->transid)
4572 			continue;
4573 
4574 copy_item:
4575 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4576 		src_offset = btrfs_item_ptr_offset(src, src_slot);
4577 
4578 		if (key.type == BTRFS_INODE_ITEM_KEY) {
4579 			struct btrfs_inode_item *inode_item;
4580 
4581 			inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
4582 						    struct btrfs_inode_item);
4583 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
4584 					&inode->vfs_inode,
4585 					inode_only == LOG_INODE_EXISTS,
4586 					logged_isize);
4587 		} else {
4588 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4589 					   src_offset, ins_sizes[dst_index]);
4590 		}
4591 
4592 		dst_index++;
4593 	}
4594 
4595 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4596 	btrfs_release_path(dst_path);
4597 out:
4598 	kfree(ins_data);
4599 
4600 	return ret;
4601 }
4602 
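/*
 * Comparator for list_sort(): orders extent maps by file offset (em->start)
 * in ascending order, so extents are logged in increasing file position.
 */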
4603 static int extent_cmp(void *priv, const struct list_head *a,
4604 		      const struct list_head *b)
4605 {
4606 	const struct extent_map *em1, *em2;
4607 
4608 	em1 = list_entry(a, struct extent_map, list);
4609 	em2 = list_entry(b, struct extent_map, list);
4610 
4611 	if (em1->start < em2->start)
4612 		return -1;
4613 	else if (em1->start > em2->start)
4614 		return 1;
4615 	return 0;
4616 }
4617 
4618 static int log_extent_csums(struct btrfs_trans_handle *trans,
4619 			    struct btrfs_inode *inode,
4620 			    struct btrfs_root *log_root,
4621 			    const struct extent_map *em,
4622 			    struct btrfs_log_ctx *ctx)
4623 {
4624 	struct btrfs_ordered_extent *ordered;
4625 	struct btrfs_root *csum_root;
4626 	u64 csum_offset;
4627 	u64 csum_len;
4628 	u64 mod_start = em->mod_start;
4629 	u64 mod_len = em->mod_len;
4630 	LIST_HEAD(ordered_sums);
4631 	int ret = 0;
4632 
4633 	if (inode->flags & BTRFS_INODE_NODATASUM ||
4634 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4635 	    em->block_start == EXTENT_MAP_HOLE)
4636 		return 0;
4637 
4638 	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4639 		const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4640 		const u64 mod_end = mod_start + mod_len;
4641 		struct btrfs_ordered_sum *sums;
4642 
4643 		if (mod_len == 0)
4644 			break;
4645 
4646 		if (ordered_end <= mod_start)
4647 			continue;
4648 		if (mod_end <= ordered->file_offset)
4649 			break;
4650 
4651 		/*
4652 		 * We are going to copy all the csums on this ordered extent, so
4653 		 * go ahead and adjust mod_start and mod_len in case this ordered
4654 		 * extent has already been logged.
4655 		 */
4656 		if (ordered->file_offset > mod_start) {
4657 			if (ordered_end >= mod_end)
4658 				mod_len = ordered->file_offset - mod_start;
4659 			/*
4660 			 * If we have this case
4661 			 *
4662 			 * |--------- logged extent ---------|
4663 			 *       |----- ordered extent ----|
4664 			 *
4665 			 * Just don't mess with mod_start and mod_len, we'll
4666 			 * just end up logging more csums than we need and it
4667 			 * will be ok.
4668 			 */
4669 		} else {
4670 			if (ordered_end < mod_end) {
4671 				mod_len = mod_end - ordered_end;
4672 				mod_start = ordered_end;
4673 			} else {
4674 				mod_len = 0;
4675 			}
4676 		}
4677 
4678 		/*
4679 		 * To keep us from looping for the above case of an ordered
4680 		 * extent that falls inside of the logged extent.
4681 		 */
4682 		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4683 			continue;
4684 
4685 		list_for_each_entry(sums, &ordered->list, list) {
4686 			ret = log_csums(trans, inode, log_root, sums);
4687 			if (ret)
4688 				return ret;
4689 		}
4690 	}
4691 
4692 	/* We're done, found all csums in the ordered extents. */
4693 	if (mod_len == 0)
4694 		return 0;
4695 
4696 	/* If we're compressed we have to save the entire range of csums. */
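	/*
	 * (A compressed extent is read and checksummed as a whole on-disk
	 * extent, so all of its checksums are needed on replay, not just the
	 * ones for the modified subrange.)
	 */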
4697 	if (em->compress_type) {
4698 		csum_offset = 0;
4699 		csum_len = max(em->block_len, em->orig_block_len);
4700 	} else {
4701 		csum_offset = mod_start - em->start;
4702 		csum_len = mod_len;
4703 	}
4704 
4705 	/* block start is already adjusted for the file extent offset. */
4706 	csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
4707 	ret = btrfs_lookup_csums_range(csum_root,
4708 				       em->block_start + csum_offset,
4709 				       em->block_start + csum_offset +
4710 				       csum_len - 1, &ordered_sums, 0);
4711 	if (ret)
4712 		return ret;
4713 
4714 	while (!list_empty(&ordered_sums)) {
4715 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4716 						   struct btrfs_ordered_sum,
4717 						   list);
4718 		if (!ret)
4719 			ret = log_csums(trans, inode, log_root, sums);
4720 		list_del(&sums->list);
4721 		kfree(sums);
4722 	}
4723 
4724 	return ret;
4725 }
4726 
4727 static int log_one_extent(struct btrfs_trans_handle *trans,
4728 			  struct btrfs_inode *inode,
4729 			  const struct extent_map *em,
4730 			  struct btrfs_path *path,
4731 			  struct btrfs_log_ctx *ctx)
4732 {
4733 	struct btrfs_drop_extents_args drop_args = { 0 };
4734 	struct btrfs_root *log = inode->root->log_root;
4735 	struct btrfs_file_extent_item fi = { 0 };
4736 	struct extent_buffer *leaf;
4737 	struct btrfs_key key;
4738 	u64 extent_offset = em->start - em->orig_start;
4739 	u64 block_len;
4740 	int ret;
4741 
4742 	btrfs_set_stack_file_extent_generation(&fi, trans->transid);
4743 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4744 		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
4745 	else
4746 		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
4747 
4748 	block_len = max(em->block_len, em->orig_block_len);
4749 	if (em->compress_type != BTRFS_COMPRESS_NONE) {
4750 		btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
4751 		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4752 	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4753 		btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
4754 							extent_offset);
4755 		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4756 	}
4757 
4758 	btrfs_set_stack_file_extent_offset(&fi, extent_offset);
4759 	btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
4760 	btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
4761 	btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
4762 
4763 	ret = log_extent_csums(trans, inode, log, em, ctx);
4764 	if (ret)
4765 		return ret;
4766 
4767 	/*
4768 	 * If this is the first time we are logging the inode in the current
4769 	 * transaction, we can avoid btrfs_drop_extents(), which is expensive
4770 	 * because it does a deletion search, which always acquires write locks
4771 	 * for extent buffers at levels 2, 1 and 0. This not only wastes time
4772 	 * but also adds significant contention in a log tree, since log trees
4773 	 * are small, with a root at level 2 or 3 at most, due to their short
4774 	 * life span.
4775 	 */
4776 	if (ctx->logged_before) {
4777 		drop_args.path = path;
4778 		drop_args.start = em->start;
4779 		drop_args.end = em->start + em->len;
4780 		drop_args.replace_extent = true;
4781 		drop_args.extent_item_size = sizeof(fi);
4782 		ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4783 		if (ret)
4784 			return ret;
4785 	}
4786 
4787 	if (!drop_args.extent_inserted) {
4788 		key.objectid = btrfs_ino(inode);
4789 		key.type = BTRFS_EXTENT_DATA_KEY;
4790 		key.offset = em->start;
4791 
4792 		ret = btrfs_insert_empty_item(trans, log, path, &key,
4793 					      sizeof(fi));
4794 		if (ret)
4795 			return ret;
4796 	}
4797 	leaf = path->nodes[0];
4798 	write_extent_buffer(leaf, &fi,
4799 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4800 			    sizeof(fi));
4801 	btrfs_mark_buffer_dirty(leaf);
4802 
4803 	btrfs_release_path(path);
4804 
4805 	return ret;
4806 }
4807 
4808 /*
4809  * Log all prealloc extents beyond the inode's i_size to make sure we do not
4810  * lose them after doing a full/fast fsync and replaying the log. We scan the
4811  * subvolume's root instead of iterating the inode's extent map tree because
4812  * otherwise we can log incorrect extent items based on extent map conversion.
4813  * That can happen due to the fact that extent maps are merged when they
4814  * are not in the extent map tree's list of modified extents.
4815  */
4816 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4817 				      struct btrfs_inode *inode,
4818 				      struct btrfs_path *path)
4819 {
4820 	struct btrfs_root *root = inode->root;
4821 	struct btrfs_key key;
4822 	const u64 i_size = i_size_read(&inode->vfs_inode);
4823 	const u64 ino = btrfs_ino(inode);
4824 	struct btrfs_path *dst_path = NULL;
4825 	bool dropped_extents = false;
4826 	u64 truncate_offset = i_size;
4827 	struct extent_buffer *leaf;
4828 	int slot;
4829 	int ins_nr = 0;
4830 	int start_slot;
4831 	int ret;
4832 
4833 	if (!(inode->flags & BTRFS_INODE_PREALLOC))
4834 		return 0;
4835 
4836 	key.objectid = ino;
4837 	key.type = BTRFS_EXTENT_DATA_KEY;
4838 	key.offset = i_size;
4839 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4840 	if (ret < 0)
4841 		goto out;
4842 
4843 	/*
4844 	 * We must check if there is a prealloc extent that starts before the
4845 	 * i_size and crosses the i_size boundary. This is to ensure later we
4846 	 * truncate down to the end of that extent and not to the i_size, as
4847 	 * otherwise we end up losing part of the prealloc extent after a log
4848 	 * replay and with an implicit hole if there is another prealloc extent
4849 	 * that starts at an offset beyond i_size.
4850 	 */
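	/*
	 * Example with made-up values: if i_size is 100K and a prealloc extent
	 * covers the range [96K, 160K), truncating the log tree to 100K would
	 * lose the [100K, 160K) part of that extent on replay, so we must
	 * truncate to 160K (the extent's end) instead.
	 */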
4851 	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4852 	if (ret < 0)
4853 		goto out;
4854 
4855 	if (ret == 0) {
4856 		struct btrfs_file_extent_item *ei;
4857 
4858 		leaf = path->nodes[0];
4859 		slot = path->slots[0];
4860 		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4861 
4862 		if (btrfs_file_extent_type(leaf, ei) ==
4863 		    BTRFS_FILE_EXTENT_PREALLOC) {
4864 			u64 extent_end;
4865 
4866 			btrfs_item_key_to_cpu(leaf, &key, slot);
4867 			extent_end = key.offset +
4868 				btrfs_file_extent_num_bytes(leaf, ei);
4869 
4870 			if (extent_end > i_size)
4871 				truncate_offset = extent_end;
4872 		}
4873 	} else {
4874 		ret = 0;
4875 	}
4876 
4877 	while (true) {
4878 		leaf = path->nodes[0];
4879 		slot = path->slots[0];
4880 
4881 		if (slot >= btrfs_header_nritems(leaf)) {
4882 			if (ins_nr > 0) {
4883 				ret = copy_items(trans, inode, dst_path, path,
4884 						 start_slot, ins_nr, 1, 0);
4885 				if (ret < 0)
4886 					goto out;
4887 				ins_nr = 0;
4888 			}
4889 			ret = btrfs_next_leaf(root, path);
4890 			if (ret < 0)
4891 				goto out;
4892 			if (ret > 0) {
4893 				ret = 0;
4894 				break;
4895 			}
4896 			continue;
4897 		}
4898 
4899 		btrfs_item_key_to_cpu(leaf, &key, slot);
4900 		if (key.objectid > ino)
4901 			break;
4902 		if (WARN_ON_ONCE(key.objectid < ino) ||
4903 		    key.type < BTRFS_EXTENT_DATA_KEY ||
4904 		    key.offset < i_size) {
4905 			path->slots[0]++;
4906 			continue;
4907 		}
4908 		if (!dropped_extents) {
4909 			/*
4910 			 * Avoid logging extent items logged in past fsync calls
4911 			 * which would lead to duplicate keys in the log tree.
4912 			 */
4913 			ret = truncate_inode_items(trans, root->log_root, inode,
4914 						   truncate_offset,
4915 						   BTRFS_EXTENT_DATA_KEY);
4916 			if (ret)
4917 				goto out;
4918 			dropped_extents = true;
4919 		}
4920 		if (ins_nr == 0)
4921 			start_slot = slot;
4922 		ins_nr++;
4923 		path->slots[0]++;
4924 		if (!dst_path) {
4925 			dst_path = btrfs_alloc_path();
4926 			if (!dst_path) {
4927 				ret = -ENOMEM;
4928 				goto out;
4929 			}
4930 		}
4931 	}
4932 	if (ins_nr > 0)
4933 		ret = copy_items(trans, inode, dst_path, path,
4934 				 start_slot, ins_nr, 1, 0);
4935 out:
4936 	btrfs_release_path(path);
4937 	btrfs_free_path(dst_path);
4938 	return ret;
4939 }
4940 
4941 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4942 				     struct btrfs_inode *inode,
4943 				     struct btrfs_path *path,
4944 				     struct btrfs_log_ctx *ctx)
4945 {
4946 	struct btrfs_ordered_extent *ordered;
4947 	struct btrfs_ordered_extent *tmp;
4948 	struct extent_map *em, *n;
4949 	struct list_head extents;
4950 	struct extent_map_tree *tree = &inode->extent_tree;
4951 	int ret = 0;
4952 	int num = 0;
4953 
4954 	INIT_LIST_HEAD(&extents);
4955 
4956 	write_lock(&tree->lock);
4957 
4958 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4959 		list_del_init(&em->list);
4960 		/*
4961 		 * Just an arbitrary number. This can get really CPU intensive
4962 		 * once we start getting a lot of extents, and once we have that
4963 		 * many extents we just want to commit the transaction since it
4964 		 * will be faster.
4965 		 */
4966 		if (++num > 32768) {
4967 			list_del_init(&tree->modified_extents);
4968 			ret = -EFBIG;
4969 			goto process;
4970 		}
4971 
4972 		if (em->generation < trans->transid)
4973 			continue;
4974 
4975 		/* We log prealloc extents beyond eof later. */
4976 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4977 		    em->start >= i_size_read(&inode->vfs_inode))
4978 			continue;
4979 
4980 		/* Need a ref to keep it from getting evicted from cache */
4981 		refcount_inc(&em->refs);
4982 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4983 		list_add_tail(&em->list, &extents);
4984 		num++;
4985 	}
4986 
4987 	list_sort(NULL, &extents, extent_cmp);
4988 process:
4989 	while (!list_empty(&extents)) {
4990 		em = list_entry(extents.next, struct extent_map, list);
4991 
4992 		list_del_init(&em->list);
4993 
4994 		/*
4995 		 * If we had an error we just need to delete everybody from our
4996 		 * private list.
4997 		 */
4998 		if (ret) {
4999 			clear_em_logging(tree, em);
5000 			free_extent_map(em);
5001 			continue;
5002 		}
5003 
5004 		write_unlock(&tree->lock);
5005 
5006 		ret = log_one_extent(trans, inode, em, path, ctx);
5007 		write_lock(&tree->lock);
5008 		clear_em_logging(tree, em);
5009 		free_extent_map(em);
5010 	}
5011 	WARN_ON(!list_empty(&extents));
5012 	write_unlock(&tree->lock);
5013 
5014 	if (!ret)
5015 		ret = btrfs_log_prealloc_extents(trans, inode, path);
5016 	if (ret)
5017 		return ret;
5018 
5019 	/*
5020 	 * We have logged all extents successfully, now make sure the commit of
5021 	 * the current transaction waits for the ordered extents to complete
5022 	 * before it commits and wipes out the log trees, otherwise we would
5023 	 * lose data if an ordered extent completes after the transaction
5024 	 * commits and a power failure happens after the transaction commit.
5025 	 */
5026 	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
5027 		list_del_init(&ordered->log_list);
5028 		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
5029 
5030 		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5031 			spin_lock_irq(&inode->ordered_tree.lock);
5032 			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5033 				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
5034 				atomic_inc(&trans->transaction->pending_ordered);
5035 			}
5036 			spin_unlock_irq(&inode->ordered_tree.lock);
5037 		}
5038 		btrfs_put_ordered_extent(ordered);
5039 	}
5040 
5041 	return 0;
5042 }
5043 
5044 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
5045 			     struct btrfs_path *path, u64 *size_ret)
5046 {
5047 	struct btrfs_key key;
5048 	int ret;
5049 
5050 	key.objectid = btrfs_ino(inode);
5051 	key.type = BTRFS_INODE_ITEM_KEY;
5052 	key.offset = 0;
5053 
5054 	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
5055 	if (ret < 0) {
5056 		return ret;
5057 	} else if (ret > 0) {
5058 		*size_ret = 0;
5059 	} else {
5060 		struct btrfs_inode_item *item;
5061 
5062 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5063 				      struct btrfs_inode_item);
5064 		*size_ret = btrfs_inode_size(path->nodes[0], item);
5065 		/*
5066 		 * If the in-memory inode's i_size is smaller than the inode
5067 		 * size stored in the btree, return the inode's i_size, so
5068 		 * that we get a correct inode size after replaying the log
5069 		 * when before a power failure we had a shrinking truncate
5070 		 * followed by the addition of a new name (rename / new hard link).
5071 		 * Otherwise return the inode size from the btree, to avoid
5072 		 * data loss when replaying a log due to previously doing a
5073 		 * write that expands the inode's size and logging a new name
5074 		 * immediately after.
5075 		 */
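		/* In effect: *size_ret = min(logged inode size, current i_size). */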
5076 		if (*size_ret > inode->vfs_inode.i_size)
5077 			*size_ret = inode->vfs_inode.i_size;
5078 	}
5079 
5080 	btrfs_release_path(path);
5081 	return 0;
5082 }
5083 
5084 /*
5085  * At the moment we always log all xattrs. This is to figure out at log replay
5086  * time which xattrs must have their deletion replayed. If an xattr is missing
5087  * in the log tree and exists in the fs/subvol tree, we delete it. This is
5088  * because if an xattr is deleted, the inode is fsynced and a power failure
5089  * happens, causing the log to be replayed the next time the fs is mounted,
5090  * we want the xattr to not exist anymore (same behaviour as other filesystems
5091  * with a journal, ext3/4, xfs, f2fs, etc).
5092  */
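/*
 * Illustrative scenario (hypothetical commands):
 *
 * setfattr -n user.foo -v bar file
 * xfs_io -c fsync file
 * setfattr -x user.foo file
 * xfs_io -c fsync file
 * <power fail>
 * <mount fs and log replay>
 *
 * After replay, user.foo must be gone, which requires replay to notice that
 * the xattr is absent from the log while still present in the subvolume tree.
 */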
5093 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
5094 				struct btrfs_inode *inode,
5095 				struct btrfs_path *path,
5096 				struct btrfs_path *dst_path)
5097 {
5098 	struct btrfs_root *root = inode->root;
5099 	int ret;
5100 	struct btrfs_key key;
5101 	const u64 ino = btrfs_ino(inode);
5102 	int ins_nr = 0;
5103 	int start_slot = 0;
5104 	bool found_xattrs = false;
5105 
5106 	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5107 		return 0;
5108 
5109 	key.objectid = ino;
5110 	key.type = BTRFS_XATTR_ITEM_KEY;
5111 	key.offset = 0;
5112 
5113 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5114 	if (ret < 0)
5115 		return ret;
5116 
5117 	while (true) {
5118 		int slot = path->slots[0];
5119 		struct extent_buffer *leaf = path->nodes[0];
5120 		int nritems = btrfs_header_nritems(leaf);
5121 
5122 		if (slot >= nritems) {
5123 			if (ins_nr > 0) {
5124 				ret = copy_items(trans, inode, dst_path, path,
5125 						 start_slot, ins_nr, 1, 0);
5126 				if (ret < 0)
5127 					return ret;
5128 				ins_nr = 0;
5129 			}
5130 			ret = btrfs_next_leaf(root, path);
5131 			if (ret < 0)
5132 				return ret;
5133 			else if (ret > 0)
5134 				break;
5135 			continue;
5136 		}
5137 
5138 		btrfs_item_key_to_cpu(leaf, &key, slot);
5139 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5140 			break;
5141 
5142 		if (ins_nr == 0)
5143 			start_slot = slot;
5144 		ins_nr++;
5145 		path->slots[0]++;
5146 		found_xattrs = true;
5147 		cond_resched();
5148 	}
5149 	if (ins_nr > 0) {
5150 		ret = copy_items(trans, inode, dst_path, path,
5151 				 start_slot, ins_nr, 1, 0);
5152 		if (ret < 0)
5153 			return ret;
5154 	}
5155 
5156 	if (!found_xattrs)
5157 		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5158 
5159 	return 0;
5160 }
5161 
5162 /*
5163  * When using the NO_HOLES feature if we punched a hole that causes the
5164  * deletion of entire leaves or all the extent items of the first leaf (the one
5165  * that contains the inode item and references) we may end up not processing
5166  * any extents, because there are no leaves with a generation matching the
5167  * current transaction that have extent items for our inode. So we need to find
5168  * if any holes exist and then log them. We also need to log holes after any
5169  * truncate operation that changes the inode's size.
5170  */
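/*
 * Illustrative scenario (hypothetical sizes): with NO_HOLES enabled, punching
 * a hole over [0, 1M) can delete every extent item that the current
 * transaction touched, so a fast fsync would find no extent items to log and,
 * without the hole extent items inserted here, log replay would keep the old
 * extents instead of the hole.
 */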
5171 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5172 			   struct btrfs_inode *inode,
5173 			   struct btrfs_path *path)
5174 {
5175 	struct btrfs_root *root = inode->root;
5176 	struct btrfs_fs_info *fs_info = root->fs_info;
5177 	struct btrfs_key key;
5178 	const u64 ino = btrfs_ino(inode);
5179 	const u64 i_size = i_size_read(&inode->vfs_inode);
5180 	u64 prev_extent_end = 0;
5181 	int ret;
5182 
5183 	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5184 		return 0;
5185 
5186 	key.objectid = ino;
5187 	key.type = BTRFS_EXTENT_DATA_KEY;
5188 	key.offset = 0;
5189 
5190 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5191 	if (ret < 0)
5192 		return ret;
5193 
5194 	while (true) {
5195 		struct extent_buffer *leaf = path->nodes[0];
5196 
5197 		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5198 			ret = btrfs_next_leaf(root, path);
5199 			if (ret < 0)
5200 				return ret;
5201 			if (ret > 0) {
5202 				ret = 0;
5203 				break;
5204 			}
5205 			leaf = path->nodes[0];
5206 		}
5207 
5208 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5209 		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5210 			break;
5211 
5212 		/* We have a hole, log it. */
5213 		if (prev_extent_end < key.offset) {
5214 			const u64 hole_len = key.offset - prev_extent_end;
5215 
5216 			/*
5217 			 * Release the path to avoid deadlocks with other code
5218 			 * paths that search the root while holding locks on
5219 			 * leaves from the log root.
5220 			 */
5221 			btrfs_release_path(path);
5222 			ret = btrfs_insert_file_extent(trans, root->log_root,
5223 						       ino, prev_extent_end, 0,
5224 						       0, hole_len, 0, hole_len,
5225 						       0, 0, 0);
5226 			if (ret < 0)
5227 				return ret;
5228 
5229 			/*
5230 			 * Search for the same key again in the root. Since it's
5231 			 * an extent item and we are holding the inode lock, the
5232 			 * key must still exist. If it doesn't, just emit a
5233 			 * warning and return an error to fall back to a
5234 			 * transaction commit.
5235 			 */
5236 			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5237 			if (ret < 0)
5238 				return ret;
5239 			if (WARN_ON(ret > 0))
5240 				return -ENOENT;
5241 			leaf = path->nodes[0];
5242 		}
5243 
5244 		prev_extent_end = btrfs_file_extent_end(path);
5245 		path->slots[0]++;
5246 		cond_resched();
5247 	}
5248 
5249 	if (prev_extent_end < i_size) {
5250 		u64 hole_len;
5251 
5252 		btrfs_release_path(path);
5253 		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5254 		ret = btrfs_insert_file_extent(trans, root->log_root,
5255 					       ino, prev_extent_end, 0, 0,
5256 					       hole_len, 0, hole_len,
5257 					       0, 0, 0);
5258 		if (ret < 0)
5259 			return ret;
5260 	}
5261 
5262 	return 0;
5263 }
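/*
 * A worked example for the function above, assuming a 4K sector size: with
 * extents covering the ranges [0, 4K) and [8K, 12K) and an i_size of 15K,
 * the first iteration finds no hole (prev_extent_end and key.offset are both
 * 0), the second iteration logs a hole at offset 4K with a length of 4K, and
 * the check after the loop logs a tail hole at offset 12K with a length of
 * 4K, since the remaining 3K are rounded up to the sector size with ALIGN().
 */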
5264 
5265 /*
5266  * When we are logging a new inode X, check whether it has a reference that
5267  * matches a reference from some other inode Y created in a past transaction
5268  * and that was renamed in the current transaction. If we don't do this, then at
5269  * log replay time we can lose inode Y (and all its files if it's a directory):
5270  *
5271  * mkdir /mnt/x
5272  * echo "hello world" > /mnt/x/foobar
5273  * sync
5274  * mv /mnt/x /mnt/y
5275  * mkdir /mnt/x                 # or touch /mnt/x
5276  * xfs_io -c fsync /mnt/x
5277  * <power fail>
5278  * mount fs, trigger log replay
5279  *
5280  * After the log replay procedure, we would lose the first directory and all its
5281  * files (file foobar).
5282  * For the case where inode Y is not a directory we simply end up losing it:
5283  * For the case where inode Y is not a directory, we simply end up losing it:
5284  * echo "123" > /mnt/foo
5285  * sync
5286  * mv /mnt/foo /mnt/bar
5287  * echo "abc" > /mnt/foo
5288  * xfs_io -c fsync /mnt/foo
5289  * <power fail>
5290  *
5291  * We also need this for cases where a snapshot entry is replaced by some other
5292  * entry (file or directory) otherwise we end up with an unreplayable log due to
5293  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5294  * if it were a regular entry:
5295  *
5296  * mkdir /mnt/x
5297  * btrfs subvolume snapshot /mnt /mnt/x/snap
5298  * btrfs subvolume delete /mnt/x/snap
5299  * rmdir /mnt/x
5300  * mkdir /mnt/x
5301  * fsync /mnt/x or fsync some new file inside it
5302  * <power fail>
5303  *
5304  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5305  * the same transaction.
5306  */
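/*
 * Return convention: 1 means a conflicting inode was found, with *other_ino
 * and *other_parent filled in; 0 means no conflict was found; -EAGAIN means
 * the name resolves to an entry that is not a regular inode (e.g. a
 * subvolume/snapshot root item), in which case callers end up falling back
 * to a transaction commit; any other negative value is an error.
 */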
5307 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5308 					 const int slot,
5309 					 const struct btrfs_key *key,
5310 					 struct btrfs_inode *inode,
5311 					 u64 *other_ino, u64 *other_parent)
5312 {
5313 	int ret;
5314 	struct btrfs_path *search_path;
5315 	char *name = NULL;
5316 	u32 name_len = 0;
5317 	u32 item_size = btrfs_item_size(eb, slot);
5318 	u32 cur_offset = 0;
5319 	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5320 
5321 	search_path = btrfs_alloc_path();
5322 	if (!search_path)
5323 		return -ENOMEM;
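	/*
	 * Use the commit root for the searches below, so that we see the
	 * directory entries as they existed in the last committed
	 * transaction. Commit roots are never modified, which also makes
	 * it safe to skip taking tree locks.
	 */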
5324 	search_path->search_commit_root = 1;
5325 	search_path->skip_locking = 1;
5326 
5327 	while (cur_offset < item_size) {
5328 		u64 parent;
5329 		u32 this_name_len;
5330 		u32 this_len;
5331 		unsigned long name_ptr;
5332 		struct btrfs_dir_item *di;
5333 
5334 		if (key->type == BTRFS_INODE_REF_KEY) {
5335 			struct btrfs_inode_ref *iref;
5336 
5337 			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5338 			parent = key->offset;
5339 			this_name_len = btrfs_inode_ref_name_len(eb, iref);
5340 			name_ptr = (unsigned long)(iref + 1);
5341 			this_len = sizeof(*iref) + this_name_len;
5342 		} else {
5343 			struct btrfs_inode_extref *extref;
5344 
5345 			extref = (struct btrfs_inode_extref *)(ptr +
5346 							       cur_offset);
5347 			parent = btrfs_inode_extref_parent(eb, extref);
5348 			this_name_len = btrfs_inode_extref_name_len(eb, extref);
5349 			name_ptr = (unsigned long)&extref->name;
5350 			this_len = sizeof(*extref) + this_name_len;
5351 		}
5352 
5353 		if (this_name_len > name_len) {
5354 			char *new_name;
5355 
5356 			new_name = krealloc(name, this_name_len, GFP_NOFS);
5357 			if (!new_name) {
5358 				ret = -ENOMEM;
5359 				goto out;
5360 			}
5361 			name_len = this_name_len;
5362 			name = new_name;
5363 		}
5364 
5365 		read_extent_buffer(eb, name, name_ptr, this_name_len);
5366 		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5367 				parent, name, this_name_len, 0);
5368 		if (di && !IS_ERR(di)) {
5369 			struct btrfs_key di_key;
5370 
5371 			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5372 						  di, &di_key);
5373 			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5374 				if (di_key.objectid != key->objectid) {
5375 					ret = 1;
5376 					*other_ino = di_key.objectid;
5377 					*other_parent = parent;
5378 				} else {
5379 					ret = 0;
5380 				}
5381 			} else {
5382 				ret = -EAGAIN;
5383 			}
5384 			goto out;
5385 		} else if (IS_ERR(di)) {
5386 			ret = PTR_ERR(di);
5387 			goto out;
5388 		}
5389 		btrfs_release_path(search_path);
5390 
5391 		cur_offset += this_len;
5392 	}
5393 	ret = 0;
5394 out:
5395 	btrfs_free_path(search_path);
5396 	kfree(name);
5397 	return ret;
5398 }
5399 
5400 struct btrfs_ino_list {
5401 	u64 ino;
5402 	u64 parent;
5403 	struct list_head list;
5404 };
5405 
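/*
 * Log any inodes that conflict with one of the names (dentries) of the inode
 * we are logging and, transitively, any inodes that conflict with names of
 * those inodes. A local list is used as a queue: every conflicting inode
 * found while scanning the inode reference items of the current one is
 * appended to it and processed in a later iteration. Once an error happens,
 * the remaining elements are only drained and freed before returning.
 */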
5406 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5407 				  struct btrfs_root *root,
5408 				  struct btrfs_path *path,
5409 				  struct btrfs_log_ctx *ctx,
5410 				  u64 ino, u64 parent)
5411 {
5412 	struct btrfs_ino_list *ino_elem;
5413 	LIST_HEAD(inode_list);
5414 	int ret = 0;
5415 
5416 	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5417 	if (!ino_elem)
5418 		return -ENOMEM;
5419 	ino_elem->ino = ino;
5420 	ino_elem->parent = parent;
5421 	list_add_tail(&ino_elem->list, &inode_list);
5422 
5423 	while (!list_empty(&inode_list)) {
5424 		struct btrfs_fs_info *fs_info = root->fs_info;
5425 		struct btrfs_key key;
5426 		struct inode *inode;
5427 
5428 		ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
5429 					    list);
5430 		ino = ino_elem->ino;
5431 		parent = ino_elem->parent;
5432 		list_del(&ino_elem->list);
5433 		kfree(ino_elem);
5434 		if (ret)
5435 			continue;
5436 
5437 		btrfs_release_path(path);
5438 
5439 		inode = btrfs_iget(fs_info->sb, ino, root);
5440 		/*
5441 		 * If the other inode that had a conflicting dir entry was
5442 		 * deleted in the current transaction, we need to log its parent
5443 		 * directory.
5444 		 */
5445 		if (IS_ERR(inode)) {
5446 			ret = PTR_ERR(inode);
5447 			if (ret == -ENOENT) {
5448 				inode = btrfs_iget(fs_info->sb, parent, root);
5449 				if (IS_ERR(inode)) {
5450 					ret = PTR_ERR(inode);
5451 				} else {
5452 					ret = btrfs_log_inode(trans,
5453 						      BTRFS_I(inode),
5454 						      LOG_OTHER_INODE_ALL,
5455 						      ctx);
5456 					btrfs_add_delayed_iput(inode);
5457 				}
5458 			}
5459 			continue;
5460 		}
5461 		/*
5462 		 * If the inode was already logged skip it - otherwise we can
5463 		 * hit an infinite loop. Example:
5464 		 *
5465 		 * From the commit root (previous transaction) we have the
5466 		 * following inodes:
5467 		 *
5468 		 * inode 257 a directory
5469 		 * inode 258 with references "zz" and "zz_link" on inode 257
5470 		 * inode 259 with reference "a" on inode 257
5471 		 *
5472 		 * And in the current (uncommitted) transaction we have:
5473 		 *
5474 		 * inode 257 a directory, unchanged
5475 		 * inode 258 with references "a" and "a2" on inode 257
5476 		 * inode 259 with reference "zz_link" on inode 257
5477 		 * inode 261 with reference "zz" on inode 257
5478 		 *
5479 		 * When logging inode 261 the following infinite loop could
5480 		 * happen if we don't skip already logged inodes:
5481 		 *
5482 		 * - we detect inode 258 as a conflicting inode, with inode 261
5483 		 *   on reference "zz", and log it;
5484 		 *
5485 		 * - we detect inode 259 as a conflicting inode, with inode 258
5486 		 *   on reference "a", and log it;
5487 		 *
5488 		 * - we detect inode 258 as a conflicting inode, with inode 259
5489 		 *   on reference "zz_link", and log it - again! After this we
5490 		 *   repeat the above steps forever.
5491 		 */
5492 		spin_lock(&BTRFS_I(inode)->lock);
5493 		/*
5494 		 * Check the inode's logged_trans only instead of
5495 		 * btrfs_inode_in_log(). This is because the last_log_commit of
5496 		 * the inode is not updated when we only log that it exists (see
5497 		 * btrfs_log_inode()).
5498 		 */
5499 		if (BTRFS_I(inode)->logged_trans == trans->transid) {
5500 			spin_unlock(&BTRFS_I(inode)->lock);
5501 			btrfs_add_delayed_iput(inode);
5502 			continue;
5503 		}
5504 		spin_unlock(&BTRFS_I(inode)->lock);
5505 		/*
5506 		 * We are safe logging the other inode without acquiring its
5507 		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5508 		 * are safe against concurrent renames of the other inode as
5509 		 * well because during a rename we pin the log and update the
5510 		 * log with the new name before we unpin it.
5511 		 */
5512 		ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
5513 		if (ret) {
5514 			btrfs_add_delayed_iput(inode);
5515 			continue;
5516 		}
5517 
5518 		key.objectid = ino;
5519 		key.type = BTRFS_INODE_REF_KEY;
5520 		key.offset = 0;
5521 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5522 		if (ret < 0) {
5523 			btrfs_add_delayed_iput(inode);
5524 			continue;
5525 		}
5526 
5527 		while (true) {
5528 			struct extent_buffer *leaf = path->nodes[0];
5529 			int slot = path->slots[0];
5530 			u64 other_ino = 0;
5531 			u64 other_parent = 0;
5532 
5533 			if (slot >= btrfs_header_nritems(leaf)) {
5534 				ret = btrfs_next_leaf(root, path);
5535 				if (ret < 0) {
5536 					break;
5537 				} else if (ret > 0) {
5538 					ret = 0;
5539 					break;
5540 				}
5541 				continue;
5542 			}
5543 
5544 			btrfs_item_key_to_cpu(leaf, &key, slot);
5545 			if (key.objectid != ino ||
5546 			    (key.type != BTRFS_INODE_REF_KEY &&
5547 			     key.type != BTRFS_INODE_EXTREF_KEY)) {
5548 				ret = 0;
5549 				break;
5550 			}
5551 
5552 			ret = btrfs_check_ref_name_override(leaf, slot, &key,
5553 					BTRFS_I(inode), &other_ino,
5554 					&other_parent);
5555 			if (ret < 0)
5556 				break;
5557 			if (ret > 0) {
5558 				ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5559 				if (!ino_elem) {
5560 					ret = -ENOMEM;
5561 					break;
5562 				}
5563 				ino_elem->ino = other_ino;
5564 				ino_elem->parent = other_parent;
5565 				list_add_tail(&ino_elem->list, &inode_list);
5566 				ret = 0;
5567 			}
5568 			path->slots[0]++;
5569 		}
5570 		btrfs_add_delayed_iput(inode);
5571 	}
5572 
5573 	return ret;
5574 }
5575 
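/*
 * Copy all items of an inode that changed in the current transaction, in the
 * key range [*min_key, *max_key], from the subvolume tree to the log tree.
 * Contiguous runs of items in a leaf are accumulated (ins_start_slot and
 * ins_nr) and copied in batches with copy_items(). Xattr items are skipped
 * here, since they are logged separately by btrfs_log_all_xattrs(), and for
 * regular files logged in full mode the prealloc extents at and beyond the
 * inode's size are logged at the end with btrfs_log_prealloc_extents().
 */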
5576 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5577 				   struct btrfs_inode *inode,
5578 				   struct btrfs_key *min_key,
5579 				   const struct btrfs_key *max_key,
5580 				   struct btrfs_path *path,
5581 				   struct btrfs_path *dst_path,
5582 				   const u64 logged_isize,
5583 				   const bool recursive_logging,
5584 				   const int inode_only,
5585 				   struct btrfs_log_ctx *ctx,
5586 				   bool *need_log_inode_item)
5587 {
5588 	const u64 i_size = i_size_read(&inode->vfs_inode);
5589 	struct btrfs_root *root = inode->root;
5590 	int ins_start_slot = 0;
5591 	int ins_nr = 0;
5592 	int ret;
5593 
5594 	while (1) {
5595 		ret = btrfs_search_forward(root, min_key, path, trans->transid);
5596 		if (ret < 0)
5597 			return ret;
5598 		if (ret > 0) {
5599 			ret = 0;
5600 			break;
5601 		}
5602 again:
5603 		/* Note, ins_nr might be > 0 here, cleanup outside the loop */
5604 		if (min_key->objectid != max_key->objectid)
5605 			break;
5606 		if (min_key->type > max_key->type)
5607 			break;
5608 
5609 		if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5610 			*need_log_inode_item = false;
5611 		} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5612 			   min_key->offset >= i_size) {
5613 			/*
5614 			 * Extents at and beyond eof are logged with
5615 			 * btrfs_log_prealloc_extents().
5616 			 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5617 			 * and no keys greater than that, so bail out.
5618 			 */
5619 			break;
5620 		} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5621 			    min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5622 			   inode->generation == trans->transid &&
5623 			   !recursive_logging) {
5624 			u64 other_ino = 0;
5625 			u64 other_parent = 0;
5626 
5627 			ret = btrfs_check_ref_name_override(path->nodes[0],
5628 					path->slots[0], min_key, inode,
5629 					&other_ino, &other_parent);
5630 			if (ret < 0) {
5631 				return ret;
5632 			} else if (ret > 0 &&
5633 				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5634 				if (ins_nr > 0) {
5635 					ins_nr++;
5636 				} else {
5637 					ins_nr = 1;
5638 					ins_start_slot = path->slots[0];
5639 				}
5640 				ret = copy_items(trans, inode, dst_path, path,
5641 						 ins_start_slot, ins_nr,
5642 						 inode_only, logged_isize);
5643 				if (ret < 0)
5644 					return ret;
5645 				ins_nr = 0;
5646 
5647 				ret = log_conflicting_inodes(trans, root, path,
5648 						ctx, other_ino, other_parent);
5649 				if (ret)
5650 					return ret;
5651 				btrfs_release_path(path);
5652 				goto next_key;
5653 			}
5654 		} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5655 			/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5656 			if (ins_nr == 0)
5657 				goto next_slot;
5658 			ret = copy_items(trans, inode, dst_path, path,
5659 					 ins_start_slot,
5660 					 ins_nr, inode_only, logged_isize);
5661 			if (ret < 0)
5662 				return ret;
5663 			ins_nr = 0;
5664 			goto next_slot;
5665 		}
5666 
5667 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5668 			ins_nr++;
5669 			goto next_slot;
5670 		} else if (!ins_nr) {
5671 			ins_start_slot = path->slots[0];
5672 			ins_nr = 1;
5673 			goto next_slot;
5674 		}
5675 
5676 		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5677 				 ins_nr, inode_only, logged_isize);
5678 		if (ret < 0)
5679 			return ret;
5680 		ins_nr = 1;
5681 		ins_start_slot = path->slots[0];
5682 next_slot:
5683 		path->slots[0]++;
5684 		if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5685 			btrfs_item_key_to_cpu(path->nodes[0], min_key,
5686 					      path->slots[0]);
5687 			goto again;
5688 		}
5689 		if (ins_nr) {
5690 			ret = copy_items(trans, inode, dst_path, path,
5691 					 ins_start_slot, ins_nr, inode_only,
5692 					 logged_isize);
5693 			if (ret < 0)
5694 				return ret;
5695 			ins_nr = 0;
5696 		}
5697 		btrfs_release_path(path);
5698 next_key:
5699 		if (min_key->offset < (u64)-1) {
5700 			min_key->offset++;
5701 		} else if (min_key->type < max_key->type) {
5702 			min_key->type++;
5703 			min_key->offset = 0;
5704 		} else {
5705 			break;
5706 		}
5707 
5708 		/*
5709 		 * We may process many leaves full of items for our inode, so
5710 		 * avoid monopolizing a cpu for too long by rescheduling while
5711 		 * not holding locks on any tree.
5712 		 */
5713 		cond_resched();
5714 	}
5715 	if (ins_nr) {
5716 		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5717 				 ins_nr, inode_only, logged_isize);
5718 		if (ret)
5719 			return ret;
5720 	}
5721 
5722 	if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5723 		/*
5724 		 * Release the path because otherwise we might attempt to double
5725 		 * lock the same leaf with btrfs_log_prealloc_extents() below.
5726 		 */
5727 		btrfs_release_path(path);
5728 		ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5729 	}
5730 
5731 	return ret;
5732 }
5733 
5734 /* log a single inode in the tree log.
5735  * At least one parent directory for this inode must exist in the tree
5736  * or be logged already.
5737  *
5738  * Any items from this inode changed by the current transaction are copied
5739  * to the log tree.  An extra reference is taken on any extents in this
5740  * file, allowing us to avoid a whole pile of corner cases around logging
5741  * blocks that have been removed from the tree.
5742  *
5743  * See LOG_INODE_ALL and related defines for a description of what inode_only
5744  * does.
5745  *
5746  * This handles both files and directories.
5747  */
5748 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
5749 			   struct btrfs_inode *inode,
5750 			   int inode_only,
5751 			   struct btrfs_log_ctx *ctx)
5752 {
5753 	struct btrfs_path *path;
5754 	struct btrfs_path *dst_path;
5755 	struct btrfs_key min_key;
5756 	struct btrfs_key max_key;
5757 	struct btrfs_root *log = inode->root->log_root;
5758 	int ret;
5759 	bool fast_search = false;
5760 	u64 ino = btrfs_ino(inode);
5761 	struct extent_map_tree *em_tree = &inode->extent_tree;
5762 	u64 logged_isize = 0;
5763 	bool need_log_inode_item = true;
5764 	bool xattrs_logged = false;
5765 	bool recursive_logging = false;
5766 	bool inode_item_dropped = true;
5767 	const bool orig_logged_before = ctx->logged_before;
5768 
5769 	path = btrfs_alloc_path();
5770 	if (!path)
5771 		return -ENOMEM;
5772 	dst_path = btrfs_alloc_path();
5773 	if (!dst_path) {
5774 		btrfs_free_path(path);
5775 		return -ENOMEM;
5776 	}
5777 
5778 	min_key.objectid = ino;
5779 	min_key.type = BTRFS_INODE_ITEM_KEY;
5780 	min_key.offset = 0;
5781 
5782 	max_key.objectid = ino;
5783 
5784 
5785 	/* today the code can only do partial logging of directories */
5786 	if (S_ISDIR(inode->vfs_inode.i_mode) ||
5787 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5788 		       &inode->runtime_flags) &&
5789 	     inode_only >= LOG_INODE_EXISTS))
5790 		max_key.type = BTRFS_XATTR_ITEM_KEY;
5791 	else
5792 		max_key.type = (u8)-1;
5793 	max_key.offset = (u64)-1;
5794 
5795 	/*
5796 	 * Only run delayed items if we are a directory. We want to make sure
5797 	 * all directory indexes hit the fs/subvolume tree so we can find them
5798 	 * and figure out which index ranges have to be logged.
5799 	 */
5800 	if (S_ISDIR(inode->vfs_inode.i_mode)) {
5801 		ret = btrfs_commit_inode_delayed_items(trans, inode);
5802 		if (ret)
5803 			goto out;
5804 	}
5805 
5806 	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5807 		recursive_logging = true;
5808 		if (inode_only == LOG_OTHER_INODE)
5809 			inode_only = LOG_INODE_EXISTS;
5810 		else
5811 			inode_only = LOG_INODE_ALL;
5812 		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
5813 	} else {
5814 		mutex_lock(&inode->log_mutex);
5815 	}
5816 
5817 	/*
5818 	 * For symlinks, we must always log their content, which is stored in an
5819 	 * inline extent, otherwise we could end up with an empty symlink after
5820 	 * log replay, which is invalid on Linux (symlink(2) returns -ENOENT if
5821 	 * one attempts to create an empty symlink).
5822 	 * We don't need to worry about flushing delalloc, because we create the
5823 	 * inline extent when the symlink is created (we never have delalloc for
5824 	 * symlinks).
5825 	 */
5826 	if (S_ISLNK(inode->vfs_inode.i_mode))
5827 		inode_only = LOG_INODE_ALL;
5828 
5829 	/*
5830 	 * Before logging the inode item, cache the value returned by
5831 	 * inode_logged(), because after that we have the need to figure out if
5832 	 * the inode was previously logged in this transaction.
5833 	 */
5834 	ret = inode_logged(trans, inode, path);
5835 	if (ret < 0)
5836 		goto out_unlock;
5837 	ctx->logged_before = (ret == 1);
5838 	ret = 0;
5839 
5840 	/*
5841 	 * This is for cases where logging a directory could result in losing
5842 	 * a file after replaying the log. For example, if we move a file from a
5843 	 * directory A to a directory B, then fsync directory A, we have no way
5844 	 * to know the file was moved from A to B, so logging just A would
5845 	 * result in losing the file after a log replay.
5846 	 */
5847 	if (S_ISDIR(inode->vfs_inode.i_mode) &&
5848 	    inode_only == LOG_INODE_ALL &&
5849 	    inode->last_unlink_trans >= trans->transid) {
5850 		btrfs_set_log_full_commit(trans);
5851 		ret = BTRFS_LOG_FORCE_COMMIT;
5852 		goto out_unlock;
5853 	}
5854 
5855 	/*
5856 	 * a brute force approach to making sure we get the most up-to-date
5857 	 * copies of everything.
5858 	 */
5859 	if (S_ISDIR(inode->vfs_inode.i_mode)) {
5860 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5861 
5862 		clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
5863 		if (inode_only == LOG_INODE_EXISTS)
5864 			max_key_type = BTRFS_XATTR_ITEM_KEY;
5865 		if (ctx->logged_before)
5866 			ret = drop_inode_items(trans, log, path, inode,
5867 					       max_key_type);
5868 	} else {
5869 		if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
5870 			/*
5871 			 * Make sure the new inode item we write to the log has
5872 			 * the same isize as the current one (if it exists).
5873 			 * This is necessary to prevent data loss after log
5874 			 * replay, and also to prevent doing a wrong expanding
5875 			 * truncate - e.g. create a file, write 4K into offset
5876 			 * 0, fsync, write 4K into offset 4096, add hard link,
5877 			 * fsync some other file (to sync log), power fail - if
5878 			 * we use the inode's current i_size, after log replay
5879 			 * we get an 8Kb file, with the last 4Kb extent as a hole
5880 			 * (zeroes), as if an expanding truncate happened,
5881 			 * instead of getting a file of 4Kb only.
5882 			 */
5883 			ret = logged_inode_size(log, inode, path, &logged_isize);
5884 			if (ret)
5885 				goto out_unlock;
5886 		}
5887 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5888 			     &inode->runtime_flags)) {
5889 			if (inode_only == LOG_INODE_EXISTS) {
5890 				max_key.type = BTRFS_XATTR_ITEM_KEY;
5891 				if (ctx->logged_before)
5892 					ret = drop_inode_items(trans, log, path,
5893 							       inode, max_key.type);
5894 			} else {
5895 				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5896 					  &inode->runtime_flags);
5897 				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5898 					  &inode->runtime_flags);
5899 				if (ctx->logged_before)
5900 					ret = truncate_inode_items(trans, log,
5901 								   inode, 0, 0);
5902 			}
5903 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5904 					      &inode->runtime_flags) ||
5905 			   inode_only == LOG_INODE_EXISTS) {
5906 			if (inode_only == LOG_INODE_ALL)
5907 				fast_search = true;
5908 			max_key.type = BTRFS_XATTR_ITEM_KEY;
5909 			if (ctx->logged_before)
5910 				ret = drop_inode_items(trans, log, path, inode,
5911 						       max_key.type);
5912 		} else {
5913 			if (inode_only == LOG_INODE_ALL)
5914 				fast_search = true;
5915 			inode_item_dropped = false;
5916 			goto log_extents;
5917 		}
5918 
5919 	}
5920 	if (ret)
5921 		goto out_unlock;
5922 
5923 	ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5924 				      path, dst_path, logged_isize,
5925 				      recursive_logging, inode_only, ctx,
5926 				      &need_log_inode_item);
5927 	if (ret)
5928 		goto out_unlock;
5929 
5930 	btrfs_release_path(path);
5931 	btrfs_release_path(dst_path);
5932 	ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
5933 	if (ret)
5934 		goto out_unlock;
5935 	xattrs_logged = true;
5936 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5937 		btrfs_release_path(path);
5938 		btrfs_release_path(dst_path);
5939 		ret = btrfs_log_holes(trans, inode, path);
5940 		if (ret)
5941 			goto out_unlock;
5942 	}
5943 log_extents:
5944 	btrfs_release_path(path);
5945 	btrfs_release_path(dst_path);
5946 	if (need_log_inode_item) {
5947 		ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
5948 		if (ret)
5949 			goto out_unlock;
5950 		/*
5951 		 * If we are doing a fast fsync and the inode was logged before
5952 		 * in this transaction, we don't need to log the xattrs because
5953 		 * they were logged before. If xattrs were added, changed or
5954 		 * deleted since the last time we logged the inode, then we have
5955 		 * already logged them because the inode had the runtime flag
5956 		 * BTRFS_INODE_COPY_EVERYTHING set.
5957 		 */
5958 		if (!xattrs_logged && inode->logged_trans < trans->transid) {
5959 			ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
5960 			if (ret)
5961 				goto out_unlock;
5962 			btrfs_release_path(path);
5963 		}
5964 	}
5965 	if (fast_search) {
5966 		ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
5967 		if (ret)
5968 			goto out_unlock;
5969 	} else if (inode_only == LOG_INODE_ALL) {
5970 		struct extent_map *em, *n;
5971 
5972 		write_lock(&em_tree->lock);
5973 		list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
5974 			list_del_init(&em->list);
5975 		write_unlock(&em_tree->lock);
5976 	}
5977 
5978 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5979 		ret = log_directory_changes(trans, inode, path, dst_path, ctx);
5980 		if (ret)
5981 			goto out_unlock;
5982 	}
5983 
5984 	spin_lock(&inode->lock);
5985 	inode->logged_trans = trans->transid;
5986 	/*
5987 	 * Don't update last_log_commit if we logged that an inode exists.
5988 	 * We do this for three reasons:
5989 	 *
5990 	 * 1) We might have had buffered writes to this inode that were
5991 	 *    flushed and had their ordered extents completed in this
5992 	 *    transaction, but we did not previously log the inode with
5993 	 *    LOG_INODE_ALL. Later the inode was evicted and after that
5994 	 *    it was loaded again and this LOG_INODE_EXISTS log operation
5995 	 *    happened. We must make sure that if an explicit fsync against
5996 	 *    the inode is performed later, it logs the new extents, an
5997 	 *    updated inode item, etc, and syncs the log. The same logic
5998 	 *    applies to direct IO writes instead of buffered writes.
5999 	 *
6000 	 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
6001 	 *    is logged with an i_size of 0 or whatever value was logged
6002 	 *    before. If later the i_size of the inode is increased by a
6003 	 *    truncate operation, the log is synced through an fsync of
6004 	 *    some other inode and then finally an explicit fsync against
6005 	 *    this inode is made, we must make sure this fsync logs the
6006 	 *    inode with the new i_size, the hole between old i_size and
6007 	 *    the new i_size, and syncs the log.
6008 	 *
6009 	 * 3) If we are logging that an ancestor inode exists as part of
6010 	 *    logging a new name from a link or rename operation, don't update
6011 	 *    its last_log_commit - otherwise if an explicit fsync is made
6012 	 *    against an ancestor, the fsync considers the inode in the log
6013 	 *    and doesn't sync the log, resulting in the ancestor missing after
6014 	 *    a power failure unless the log was synced as part of an fsync
6015 	 *    against any other unrelated inode.
6016 	 */
6017 	if (inode_only != LOG_INODE_EXISTS)
6018 		inode->last_log_commit = inode->last_sub_trans;
6019 	spin_unlock(&inode->lock);
6020 
6021 	/*
6022 	 * Reset the last_reflink_trans so that the next fsync does not need to
6023 	 * go through the slower path when logging extents and their checksums.
6024 	 */
6025 	if (inode_only == LOG_INODE_ALL)
6026 		inode->last_reflink_trans = 0;
6027 
6028 out_unlock:
6029 	mutex_unlock(&inode->log_mutex);
6030 out:
6031 	btrfs_free_path(path);
6032 	btrfs_free_path(dst_path);
6033 
6034 	if (recursive_logging)
6035 		ctx->logged_before = orig_logged_before;
6036 
6037 	return ret;
6038 }
6039 
6040 /*
6041  * Check if we need to log an inode. This is used in contexts where, while
6042  * logging an inode, we need to log another inode (either that it exists or in
6043  * full mode). This is used instead of btrfs_inode_in_log() because the latter
6044  * requires the inode to be in the log and have the log transaction committed,
6045  * while here we do not care if the log transaction was already committed - our
6046  * caller will commit the log later - and we want to avoid logging an inode
6047  * multiple times when multiple tasks have joined the same log transaction.
6048  */
6049 static bool need_log_inode(struct btrfs_trans_handle *trans,
6050 			   struct btrfs_inode *inode)
6051 {
6052 	/*
6053 	 * If a directory was not modified, no dentries added or removed, we can
6054 	 * and should avoid logging it.
6055 	 */
6056 	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
6057 		return false;
6058 
6059 	/*
6060 	 * If this inode does not have new/updated/deleted xattrs since the last
6061 	 * time it was logged and is flagged as logged in the current transaction,
6062 	 * we can skip logging it. As for new/deleted names, those are updated in
6063 	 * the log by link/unlink/rename operations.
6064 	 * In case the inode was logged and then evicted and reloaded, its
6065 	 * logged_trans will be 0, in which case we have to fully log it since
6066 	 * logged_trans is a transient field, not persisted.
6067 	 */
6068 	if (inode->logged_trans == trans->transid &&
6069 	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
6070 		return false;
6071 
6072 	return true;
6073 }
6074 
6075 struct btrfs_dir_list {
6076 	u64 ino;
6077 	struct list_head list;
6078 };
6079 
6080 /*
6081  * Log the inodes of the new dentries of a directory. See log_dir_items() for
6082  * details about why it is needed.
6083  * This is a recursive operation - if an existing dentry corresponds to a
6084  * directory, that directory's new entries are logged too (same behaviour as
6085  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
6086  * the dentries point to, we do not lock their i_mutex; otherwise lockdep
6087  * complains about the following circular lock dependency / possible deadlock:
6088  *
6089  *        CPU0                                        CPU1
6090  *        ----                                        ----
6091  * lock(&type->i_mutex_dir_key#3/2);
6092  *                                            lock(sb_internal#2);
6093  *                                            lock(&type->i_mutex_dir_key#3/2);
6094  * lock(&sb->s_type->i_mutex_key#14);
6095  *
6096  * Where sb_internal is the lock (a counter that works as a lock) acquired by
6097  * sb_start_intwrite() in btrfs_start_transaction().
6098  * Not locking i_mutex of the inodes is still safe because:
6099  *
6100  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
6101  *    that while logging the inode new references (names) are added or removed
6102  *    from the inode, leaving the logged inode item with a link count that does
6103  *    not match the number of logged inode reference items. This is fine because
6104  *    at log replay time we compute the real number of links and correct the
6105  *    link count in the inode item (see replay_one_buffer() and
6106  *    link_to_fixup_dir());
6107  *
6108  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
6109  *    while logging the inode's items new index items (key type
6110  *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
6111  *    has a size that doesn't match the sum of the lengths of all the logged
6112  *    names - this is ok, not a problem, because at log replay time we set the
6113  *    directory's i_size to the correct value (see replay_one_name() and
6114  *    do_overwrite_item()).
6115  */
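/*
 * The traversal is iterative, with dir_list used as a queue of directory
 * inodes: whenever logging a directory inode sets ctx->log_new_dentries,
 * that directory is appended to the list and its own new dentries are
 * processed in a later iteration of the outer loop. Once an error happens,
 * the remaining list elements are only drained and freed.
 */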
6116 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
6117 				struct btrfs_root *root,
6118 				struct btrfs_inode *start_inode,
6119 				struct btrfs_log_ctx *ctx)
6120 {
6121 	struct btrfs_fs_info *fs_info = root->fs_info;
6122 	struct btrfs_path *path;
6123 	LIST_HEAD(dir_list);
6124 	struct btrfs_dir_list *dir_elem;
6125 	int ret = 0;
6126 
6127 	/*
6128 	 * If we are logging a new name, as part of a link or rename operation,
6129 	 * don't bother logging new dentries, as we just want to log the names
6130 	 * of an inode and that any new parents exist.
6131 	 */
6132 	if (ctx->logging_new_name)
6133 		return 0;
6134 
6135 	path = btrfs_alloc_path();
6136 	if (!path)
6137 		return -ENOMEM;
6138 
6139 	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
6140 	if (!dir_elem) {
6141 		btrfs_free_path(path);
6142 		return -ENOMEM;
6143 	}
6144 	dir_elem->ino = btrfs_ino(start_inode);
6145 	list_add_tail(&dir_elem->list, &dir_list);
6146 
6147 	while (!list_empty(&dir_list)) {
6148 		struct extent_buffer *leaf;
6149 		struct btrfs_key min_key;
6150 		int nritems;
6151 		int i;
6152 
6153 		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
6154 					    list);
6155 		if (ret)
6156 			goto next_dir_inode;
6157 
6158 		min_key.objectid = dir_elem->ino;
6159 		min_key.type = BTRFS_DIR_INDEX_KEY;
6160 		min_key.offset = 0;
6161 again:
6162 		btrfs_release_path(path);
6163 		ret = btrfs_search_forward(root, &min_key, path, trans->transid);
6164 		if (ret < 0) {
6165 			goto next_dir_inode;
6166 		} else if (ret > 0) {
6167 			ret = 0;
6168 			goto next_dir_inode;
6169 		}
6170 
6171 		leaf = path->nodes[0];
6172 		nritems = btrfs_header_nritems(leaf);
6173 		for (i = path->slots[0]; i < nritems; i++) {
6174 			struct btrfs_dir_item *di;
6175 			struct btrfs_key di_key;
6176 			struct inode *di_inode;
6177 			struct btrfs_dir_list *new_dir_elem;
6178 			int log_mode = LOG_INODE_EXISTS;
6179 			int type;
6180 
6181 			btrfs_item_key_to_cpu(leaf, &min_key, i);
6182 			if (min_key.objectid != dir_elem->ino ||
6183 			    min_key.type != BTRFS_DIR_INDEX_KEY)
6184 				goto next_dir_inode;
6185 
6186 			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
6187 			type = btrfs_dir_type(leaf, di);
6188 			if (btrfs_dir_transid(leaf, di) < trans->transid)
6189 				continue;
6190 			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
6191 			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
6192 				continue;
6193 
6194 			btrfs_release_path(path);
6195 			di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
6196 			if (IS_ERR(di_inode)) {
6197 				ret = PTR_ERR(di_inode);
6198 				goto next_dir_inode;
6199 			}
6200 
6201 			if (!need_log_inode(trans, BTRFS_I(di_inode))) {
6202 				btrfs_add_delayed_iput(di_inode);
6203 				break;
6204 			}
6205 
6206 			ctx->log_new_dentries = false;
6207 			if (type == BTRFS_FT_DIR)
6208 				log_mode = LOG_INODE_ALL;
6209 			ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
6210 					      log_mode, ctx);
6211 			btrfs_add_delayed_iput(di_inode);
6212 			if (ret)
6213 				goto next_dir_inode;
6214 			if (ctx->log_new_dentries) {
6215 				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
6216 						       GFP_NOFS);
6217 				if (!new_dir_elem) {
6218 					ret = -ENOMEM;
6219 					goto next_dir_inode;
6220 				}
6221 				new_dir_elem->ino = di_key.objectid;
6222 				list_add_tail(&new_dir_elem->list, &dir_list);
6223 			}
6224 			break;
6225 		}
6226 		if (min_key.offset < (u64)-1) {
6227 			min_key.offset++;
6228 			goto again;
6229 		}
6230 next_dir_inode:
6231 		list_del(&dir_elem->list);
6232 		kfree(dir_elem);
6233 	}
6234 
6235 	btrfs_free_path(path);
6236 	return ret;
6237 }
6238 
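/*
 * Log, in full mode, all the current parent directories of an inode, by
 * iterating its inode reference items from the commit root. This is used
 * when the inode was involved in unlinks or renames in the current
 * transaction, to make sure no parent directory is left with dangling index
 * entries after log replay (see the comments at btrfs_log_inode_parent()).
 */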
6239 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
6240 				 struct btrfs_inode *inode,
6241 				 struct btrfs_log_ctx *ctx)
6242 {
6243 	struct btrfs_fs_info *fs_info = trans->fs_info;
6244 	int ret;
6245 	struct btrfs_path *path;
6246 	struct btrfs_key key;
6247 	struct btrfs_root *root = inode->root;
6248 	const u64 ino = btrfs_ino(inode);
6249 
6250 	path = btrfs_alloc_path();
6251 	if (!path)
6252 		return -ENOMEM;
6253 	path->skip_locking = 1;
6254 	path->search_commit_root = 1;
6255 
6256 	key.objectid = ino;
6257 	key.type = BTRFS_INODE_REF_KEY;
6258 	key.offset = 0;
6259 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6260 	if (ret < 0)
6261 		goto out;
6262 
6263 	while (true) {
6264 		struct extent_buffer *leaf = path->nodes[0];
6265 		int slot = path->slots[0];
6266 		u32 cur_offset = 0;
6267 		u32 item_size;
6268 		unsigned long ptr;
6269 
6270 		if (slot >= btrfs_header_nritems(leaf)) {
6271 			ret = btrfs_next_leaf(root, path);
6272 			if (ret < 0)
6273 				goto out;
6274 			else if (ret > 0)
6275 				break;
6276 			continue;
6277 		}
6278 
6279 		btrfs_item_key_to_cpu(leaf, &key, slot);
6280 		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
6281 		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
6282 			break;
6283 
6284 		item_size = btrfs_item_size(leaf, slot);
6285 		ptr = btrfs_item_ptr_offset(leaf, slot);
6286 		while (cur_offset < item_size) {
6287 			struct btrfs_key inode_key;
6288 			struct inode *dir_inode;
6289 
6290 			inode_key.type = BTRFS_INODE_ITEM_KEY;
6291 			inode_key.offset = 0;
6292 
6293 			if (key.type == BTRFS_INODE_EXTREF_KEY) {
6294 				struct btrfs_inode_extref *extref;
6295 
6296 				extref = (struct btrfs_inode_extref *)
6297 					(ptr + cur_offset);
6298 				inode_key.objectid = btrfs_inode_extref_parent(
6299 					leaf, extref);
6300 				cur_offset += sizeof(*extref);
6301 				cur_offset += btrfs_inode_extref_name_len(leaf,
6302 					extref);
6303 			} else {
6304 				inode_key.objectid = key.offset;
6305 				cur_offset = item_size;
6306 			}
6307 
6308 			dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
6309 					       root);
6310 			/*
6311 			 * If the parent inode was deleted, return an error to
6312 			 * fall back to a transaction commit. This is to prevent
6313 			 * an inode that was moved from one parent A to a parent B,
6314 			 * then had its former parent A deleted and was fsync'd,
6315 			 * from existing at both parents after a log replay (with
6316 			 * the old parent still existing).
6317 			 * Example:
6318 			 *
6319 			 * mkdir /mnt/A
6320 			 * mkdir /mnt/B
6321 			 * touch /mnt/B/bar
6322 			 * sync
6323 			 * mv /mnt/B/bar /mnt/A/bar
6324 			 * mv -T /mnt/A /mnt/B
6325 			 * fsync /mnt/B/bar
6326 			 * <power fail>
6327 			 *
6328 			 * If we ignore the old parent B which got deleted,
6329 			 * after a log replay we would have file bar linked
6330 			 * at both parents and the old parent B would still
6331 			 * exist.
6332 			 */
6333 			if (IS_ERR(dir_inode)) {
6334 				ret = PTR_ERR(dir_inode);
6335 				goto out;
6336 			}
6337 
6338 			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
6339 				btrfs_add_delayed_iput(dir_inode);
6340 				continue;
6341 			}
6342 
6343 			ctx->log_new_dentries = false;
6344 			ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
6345 					      LOG_INODE_ALL, ctx);
6346 			if (!ret && ctx->log_new_dentries)
6347 				ret = log_new_dir_dentries(trans, root,
6348 						   BTRFS_I(dir_inode), ctx);
6349 			btrfs_add_delayed_iput(dir_inode);
6350 			if (ret)
6351 				goto out;
6352 		}
6353 		path->slots[0]++;
6354 	}
6355 	ret = 0;
6356 out:
6357 	btrfs_free_path(path);
6358 	return ret;
6359 }
6360 
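/*
 * Walk up from the directory identified by the inode ref key at
 * path->slots[0] to the root of the subvolume, logging with LOG_INODE_EXISTS
 * mode every ancestor directory that was created in the current transaction,
 * so that all new ancestors of the inode exist after log replay.
 */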
6361 static int log_new_ancestors(struct btrfs_trans_handle *trans,
6362 			     struct btrfs_root *root,
6363 			     struct btrfs_path *path,
6364 			     struct btrfs_log_ctx *ctx)
6365 {
6366 	struct btrfs_key found_key;
6367 
6368 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
6369 
6370 	while (true) {
6371 		struct btrfs_fs_info *fs_info = root->fs_info;
6372 		struct extent_buffer *leaf = path->nodes[0];
6373 		int slot = path->slots[0];
6374 		struct btrfs_key search_key;
6375 		struct inode *inode;
6376 		u64 ino;
6377 		int ret = 0;
6378 
6379 		btrfs_release_path(path);
6380 
6381 		ino = found_key.offset;
6382 
6383 		search_key.objectid = found_key.offset;
6384 		search_key.type = BTRFS_INODE_ITEM_KEY;
6385 		search_key.offset = 0;
6386 		inode = btrfs_iget(fs_info->sb, ino, root);
6387 		if (IS_ERR(inode))
6388 			return PTR_ERR(inode);
6389 
6390 		if (BTRFS_I(inode)->generation >= trans->transid &&
6391 		    need_log_inode(trans, BTRFS_I(inode)))
6392 			ret = btrfs_log_inode(trans, BTRFS_I(inode),
6393 					      LOG_INODE_EXISTS, ctx);
6394 		btrfs_add_delayed_iput(inode);
6395 		if (ret)
6396 			return ret;
6397 
6398 		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
6399 			break;
6400 
6401 		search_key.type = BTRFS_INODE_REF_KEY;
6402 		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6403 		if (ret < 0)
6404 			return ret;
6405 
6406 		leaf = path->nodes[0];
6407 		slot = path->slots[0];
6408 		if (slot >= btrfs_header_nritems(leaf)) {
6409 			ret = btrfs_next_leaf(root, path);
6410 			if (ret < 0)
6411 				return ret;
6412 			else if (ret > 0)
6413 				return -ENOENT;
6414 			leaf = path->nodes[0];
6415 			slot = path->slots[0];
6416 		}
6417 
6418 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6419 		if (found_key.objectid != search_key.objectid ||
6420 		    found_key.type != BTRFS_INODE_REF_KEY)
6421 			return -ENOENT;
6422 	}
6423 	return 0;
6424 }
6425 
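/*
 * Same purpose as log_new_ancestors(), but for inodes with a single hard
 * link: walk up the dentry chain with dget_parent() instead of searching the
 * fs/subvolume tree, and log with LOG_INODE_EXISTS mode every ancestor from
 * the same root that was created in the current transaction.
 */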
6426 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6427 				  struct btrfs_inode *inode,
6428 				  struct dentry *parent,
6429 				  struct btrfs_log_ctx *ctx)
6430 {
6431 	struct btrfs_root *root = inode->root;
6432 	struct dentry *old_parent = NULL;
6433 	struct super_block *sb = inode->vfs_inode.i_sb;
6434 	int ret = 0;
6435 
6436 	while (true) {
6437 		if (!parent || d_really_is_negative(parent) ||
6438 		    sb != parent->d_sb)
6439 			break;
6440 
6441 		inode = BTRFS_I(d_inode(parent));
6442 		if (root != inode->root)
6443 			break;
6444 
6445 		if (inode->generation >= trans->transid &&
6446 		    need_log_inode(trans, inode)) {
6447 			ret = btrfs_log_inode(trans, inode,
6448 					      LOG_INODE_EXISTS, ctx);
6449 			if (ret)
6450 				break;
6451 		}
6452 		if (IS_ROOT(parent))
6453 			break;
6454 
6455 		parent = dget_parent(parent);
6456 		dput(old_parent);
6457 		old_parent = parent;
6458 	}
6459 	dput(old_parent);
6460 
6461 	return ret;
6462 }
6463 
6464 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6465 				 struct btrfs_inode *inode,
6466 				 struct dentry *parent,
6467 				 struct btrfs_log_ctx *ctx)
6468 {
6469 	struct btrfs_root *root = inode->root;
6470 	const u64 ino = btrfs_ino(inode);
6471 	struct btrfs_path *path;
6472 	struct btrfs_key search_key;
6473 	int ret;
6474 
6475 	/*
6476 	 * For a single hard link case, go through a fast path that does not
6477 	 * need to iterate the fs/subvolume tree.
6478 	 */
6479 	if (inode->vfs_inode.i_nlink < 2)
6480 		return log_new_ancestors_fast(trans, inode, parent, ctx);
6481 
6482 	path = btrfs_alloc_path();
6483 	if (!path)
6484 		return -ENOMEM;
6485 
6486 	search_key.objectid = ino;
6487 	search_key.type = BTRFS_INODE_REF_KEY;
6488 	search_key.offset = 0;
6489 again:
6490 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6491 	if (ret < 0)
6492 		goto out;
6493 	if (ret == 0)
6494 		path->slots[0]++;
6495 
6496 	while (true) {
6497 		struct extent_buffer *leaf = path->nodes[0];
6498 		int slot = path->slots[0];
6499 		struct btrfs_key found_key;
6500 
6501 		if (slot >= btrfs_header_nritems(leaf)) {
6502 			ret = btrfs_next_leaf(root, path);
6503 			if (ret < 0)
6504 				goto out;
6505 			else if (ret > 0)
6506 				break;
6507 			continue;
6508 		}
6509 
6510 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6511 		if (found_key.objectid != ino ||
6512 		    found_key.type > BTRFS_INODE_EXTREF_KEY)
6513 			break;
6514 
6515 		/*
6516 		 * Don't deal with extended references because they are rare
6517 		 * cases and too complex to deal with (we would need to keep
6518 		 * track of which subitem we are processing for each item in
6519 		 * this loop, etc). So just return some error to fall back to
6520 		 * a transaction commit.
6521 		 */
6522 		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6523 			ret = -EMLINK;
6524 			goto out;
6525 		}
6526 
6527 		/*
6528 		 * Logging ancestors needs to do more searches on the fs/subvol
6529 		 * tree, so it releases the path as needed to avoid deadlocks.
6530 		 * Keep track of the last inode ref key and resume from that key
6531 		 * after logging all new ancestors for the current hard link.
6532 		 */
6533 		memcpy(&search_key, &found_key, sizeof(search_key));
6534 
6535 		ret = log_new_ancestors(trans, root, path, ctx);
6536 		if (ret)
6537 			goto out;
6538 		btrfs_release_path(path);
6539 		goto again;
6540 	}
6541 	ret = 0;
6542 out:
6543 	btrfs_free_path(path);
6544 	return ret;
6545 }
6546 
6547 /*
6548  * helper function around btrfs_log_inode() to make sure newly created
6549  * parent directories also end up in the log. Minimal logging, with just
6550  * the inode item and backrefs, is done for any parent directories that
6551  * are older than the last committed transaction.
6552  */
6553 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6554 				  struct btrfs_inode *inode,
6555 				  struct dentry *parent,
6556 				  int inode_only,
6557 				  struct btrfs_log_ctx *ctx)
6558 {
6559 	struct btrfs_root *root = inode->root;
6560 	struct btrfs_fs_info *fs_info = root->fs_info;
6561 	int ret = 0;
6562 	bool log_dentries = false;
6563 
6564 	if (btrfs_test_opt(fs_info, NOTREELOG)) {
6565 		ret = BTRFS_LOG_FORCE_COMMIT;
6566 		goto end_no_trans;
6567 	}
6568 
6569 	if (btrfs_root_refs(&root->root_item) == 0) {
6570 		ret = BTRFS_LOG_FORCE_COMMIT;
6571 		goto end_no_trans;
6572 	}
6573 
6574 	/*
6575 	 * Skip already logged inodes or inodes corresponding to tmpfiles
6576 	 * (since logging them is pointless, a link count of 0 means they
6577 	 * will never be accessible).
6578 	 */
6579 	if ((btrfs_inode_in_log(inode, trans->transid) &&
6580 	     list_empty(&ctx->ordered_extents)) ||
6581 	    inode->vfs_inode.i_nlink == 0) {
6582 		ret = BTRFS_NO_LOG_SYNC;
6583 		goto end_no_trans;
6584 	}
6585 
6586 	ret = start_log_trans(trans, root, ctx);
6587 	if (ret)
6588 		goto end_no_trans;
6589 
6590 	ret = btrfs_log_inode(trans, inode, inode_only, ctx);
6591 	if (ret)
6592 		goto end_trans;
6593 
6594 	/*
6595 	 * for a regular file, if its inode is already on disk, we don't
6596 	 * have to worry about the parents at all.  This is because
6597 	 * we can use the last_unlink_trans field to record renames
6598 	 * and other fun in this file.
6599 	 */
6600 	if (S_ISREG(inode->vfs_inode.i_mode) &&
6601 	    inode->generation < trans->transid &&
6602 	    inode->last_unlink_trans < trans->transid) {
6603 		ret = 0;
6604 		goto end_trans;
6605 	}
6606 
6607 	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
6608 		log_dentries = true;
6609 
6610 	/*
6611 	 * On unlink we must make sure all our current and old parent directory
6612 	 * inodes are fully logged. This is to prevent leaving dangling
6613 	 * directory index entries in directories that were our parents but are
6614 	 * not anymore. Not doing this results in old parent directory being
6615 	 * impossible to delete after log replay (rmdir will always fail with
6616 	 * error -ENOTEMPTY).
6617 	 *
6618 	 * Example 1:
6619 	 *
6620 	 * mkdir testdir
6621 	 * touch testdir/foo
6622 	 * ln testdir/foo testdir/bar
6623 	 * sync
6624 	 * unlink testdir/bar
6625 	 * xfs_io -c fsync testdir/foo
6626 	 * <power failure>
6627 	 * mount fs, triggers log replay
6628 	 *
6629 	 * If we don't log the parent directory (testdir), after log replay the
6630 	 * directory still has an entry pointing to the file inode using the bar
6631 	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
6632 	 * the file inode has a link count of 1.
6633 	 *
6634 	 * Example 2:
6635 	 *
6636 	 * mkdir testdir
6637 	 * touch foo
6638 	 * ln foo testdir/foo2
6639 	 * ln foo testdir/foo3
6640 	 * sync
6641 	 * unlink testdir/foo3
6642 	 * xfs_io -c fsync foo
6643 	 * <power failure>
6644 	 * mount fs, triggers log replay
6645 	 *
6646 	 * Similar to the first example, after log replay the parent directory
6647 	 * testdir still has an entry pointing to the file inode with name foo3
6648 	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
6649 	 * and has a link count of 2.
6650 	 */
6651 	if (inode->last_unlink_trans >= trans->transid) {
6652 		ret = btrfs_log_all_parents(trans, inode, ctx);
6653 		if (ret)
6654 			goto end_trans;
6655 	}
6656 
6657 	ret = log_all_new_ancestors(trans, inode, parent, ctx);
6658 	if (ret)
6659 		goto end_trans;
6660 
6661 	if (log_dentries)
6662 		ret = log_new_dir_dentries(trans, root, inode, ctx);
6663 	else
6664 		ret = 0;
6665 end_trans:
6666 	if (ret < 0) {
6667 		btrfs_set_log_full_commit(trans);
6668 		ret = BTRFS_LOG_FORCE_COMMIT;
6669 	}
6670 
6671 	if (ret)
6672 		btrfs_remove_log_ctx(root, ctx);
6673 	btrfs_end_log_trans(root);
6674 end_no_trans:
6675 	return ret;
6676 }
6677 
6678 /*
6679  * it is not safe to log a dentry if the chunk root has added new
6680  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
6681  * If this returns 1, you must commit the transaction to safely get your
6682  * data on disk.
6683  */
6684 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
6685 			  struct dentry *dentry,
6686 			  struct btrfs_log_ctx *ctx)
6687 {
6688 	struct dentry *parent = dget_parent(dentry);
6689 	int ret;
6690 
6691 	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
6692 				     LOG_INODE_ALL, ctx);
6693 	dput(parent);
6694 
6695 	return ret;
6696 }
6697 
6698 /*
6699  * should be called during mount to recover and replay any log trees
6700  * from the FS
6701  */
6702 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
6703 {
6704 	int ret;
6705 	struct btrfs_path *path;
6706 	struct btrfs_trans_handle *trans;
6707 	struct btrfs_key key;
6708 	struct btrfs_key found_key;
6709 	struct btrfs_root *log;
6710 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
6711 	struct walk_control wc = {
6712 		.process_func = process_one_buffer,
6713 		.stage = LOG_WALK_PIN_ONLY,
6714 	};
6715 
6716 	path = btrfs_alloc_path();
6717 	if (!path)
6718 		return -ENOMEM;
6719 
6720 	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6721 
6722 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
6723 	if (IS_ERR(trans)) {
6724 		ret = PTR_ERR(trans);
6725 		goto error;
6726 	}
6727 
6728 	wc.trans = trans;
6729 	wc.pin = 1;
6730 
6731 	ret = walk_log_tree(trans, log_root_tree, &wc);
6732 	if (ret) {
6733 		btrfs_abort_transaction(trans, ret);
6734 		goto error;
6735 	}
6736 
6737 again:
6738 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
6739 	key.offset = (u64)-1;
6740 	key.type = BTRFS_ROOT_ITEM_KEY;
6741 
6742 	while (1) {
6743 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
6744 
6745 		if (ret < 0) {
6746 			btrfs_abort_transaction(trans, ret);
6747 			goto error;
6748 		}
6749 		if (ret > 0) {
6750 			if (path->slots[0] == 0)
6751 				break;
6752 			path->slots[0]--;
6753 		}
6754 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
6755 				      path->slots[0]);
6756 		btrfs_release_path(path);
6757 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
6758 			break;
6759 
6760 		log = btrfs_read_tree_root(log_root_tree, &found_key);
6761 		if (IS_ERR(log)) {
6762 			ret = PTR_ERR(log);
6763 			btrfs_abort_transaction(trans, ret);
6764 			goto error;
6765 		}
6766 
6767 		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
6768 						   true);
6769 		if (IS_ERR(wc.replay_dest)) {
6770 			ret = PTR_ERR(wc.replay_dest);
6771 
6772 			/*
6773 			 * We didn't find the subvol, likely because it was
6774 			 * deleted.  This is ok, simply skip this log and go to
6775 			 * the next one.
6776 			 *
6777 			 * We need to exclude the root because we can't have
6778 			 * other log replays overwriting this log as we'll read
6779 			 * it back in a few more times.  This will keep our
6780 			 * block from being modified, and we'll just bail for
6781 			 * each subsequent pass.
6782 			 */
6783 			if (ret == -ENOENT)
6784 				ret = btrfs_pin_extent_for_log_replay(trans,
6785 							log->node->start,
6786 							log->node->len);
6787 			btrfs_put_root(log);
6788 
6789 			if (!ret)
6790 				goto next;
6791 			btrfs_abort_transaction(trans, ret);
6792 			goto error;
6793 		}
6794 
6795 		wc.replay_dest->log_root = log;
6796 		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
6797 		if (ret)
6798 			/* The loop needs to continue due to the root refs */
6799 			btrfs_abort_transaction(trans, ret);
6800 		else
6801 			ret = walk_log_tree(trans, log, &wc);
6802 
6803 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6804 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
6805 						      path);
6806 			if (ret)
6807 				btrfs_abort_transaction(trans, ret);
6808 		}
6809 
6810 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6811 			struct btrfs_root *root = wc.replay_dest;
6812 
6813 			btrfs_release_path(path);
6814 
6815 			/*
6816 			 * We have just replayed everything, and the highest
6817 			 * objectid of fs roots probably has changed in case
6818 			 * some inode_items got replayed.
6819 			 *
6820 			 * root->objectid_mutex is not acquired as log replay
6821 			 * could only happen during mount.
6822 			 */
6823 			ret = btrfs_init_root_free_objectid(root);
6824 			if (ret)
6825 				btrfs_abort_transaction(trans, ret);
6826 		}
6827 
6828 		wc.replay_dest->log_root = NULL;
6829 		btrfs_put_root(wc.replay_dest);
6830 		btrfs_put_root(log);
6831 
6832 		if (ret)
6833 			goto error;
6834 next:
6835 		if (found_key.offset == 0)
6836 			break;
6837 		key.offset = found_key.offset - 1;
6838 	}
6839 	btrfs_release_path(path);
6840 
6841 	/* step one is to pin it all, step two is to replay just inodes */
6842 	if (wc.pin) {
6843 		wc.pin = 0;
6844 		wc.process_func = replay_one_buffer;
6845 		wc.stage = LOG_WALK_REPLAY_INODES;
6846 		goto again;
6847 	}
6848 	/* step three is to replay everything */
6849 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
6850 		wc.stage++;
6851 		goto again;
6852 	}
6853 
6854 	btrfs_free_path(path);
6855 
6856 	/* step 4: commit the transaction, which also unpins the blocks */
6857 	ret = btrfs_commit_transaction(trans);
6858 	if (ret)
6859 		return ret;
6860 
6861 	log_root_tree->log_root = NULL;
6862 	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6863 	btrfs_put_root(log_root_tree);
6864 
6865 	return 0;
6866 error:
6867 	if (wc.trans)
6868 		btrfs_end_transaction(wc.trans);
6869 	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6870 	btrfs_free_path(path);
6871 	return ret;
6872 }
6873 
6874 /*
6875  * there are some corner cases where we want to force a full
6876  * commit instead of allowing a directory to be logged.
6877  *
6878  * They revolve around files that were unlinked from the directory, and
6879  * this function updates the parent directory so that a full commit is
6880  * properly done if it is fsync'd later after the unlinks are done.
6881  *
6882  * Must be called before the unlink operations (updates to the subvolume tree,
6883  * inodes, etc) are done.
6884  */
6885 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
6886 			     struct btrfs_inode *dir, struct btrfs_inode *inode,
6887 			     int for_rename)
6888 {
6889 	/*
6890 	 * when we're logging a file, if it hasn't been renamed
6891 	 * or unlinked, and its inode is fully committed on disk,
6892 	 * we don't have to worry about walking up the directory chain
6893 	 * to log its parents.
6894 	 *
6895 	 * So, we use the last_unlink_trans field to put this transid
6896 	 * into the file.  When the file is logged we check it and
6897 	 * don't log the parents if the file is fully on disk.
6898 	 */
6899 	mutex_lock(&inode->log_mutex);
6900 	inode->last_unlink_trans = trans->transid;
6901 	mutex_unlock(&inode->log_mutex);
6902 
6903 	/*
6904 	 * if this directory was already logged, any new
6905 	 * names for this file/dir will get recorded
6906 	 */
6907 	if (dir->logged_trans == trans->transid)
6908 		return;
6909 
	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (inode->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly.  So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}
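
/*
 * Illustration (an editorial sketch, not part of the original file): the
 * cross-directory rename case the helper above protects against looks like
 *
 *	mkdir dir1 dir2
 *	touch dir1/file
 *	sync
 *	mv dir1/file dir2/file
 *	fsync dir1
 *
 * Because for_rename is set when the old name is removed from dir1,
 * dir1->last_unlink_trans is set to the current transid, and a later fsync
 * of dir1 in the same transaction falls back to a full transaction commit,
 * so the new name under dir2 is not lost after a crash.
 */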

/*
 * Make sure that if someone attempts to fsync the parent directory of a deleted
 * snapshot, it ends up triggering a transaction commit. This guarantees that
 * after replaying the log tree of the parent directory's root we will not see
 * the snapshot anymore, and that at log replay time we will not see any log
 * tree corresponding to the deleted snapshot's root - if such a log tree
 * existed, it could be replayed after the log tree of the parent directory
 * (which replays the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and the tree of tree roots, etc) is done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}
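
/*
 * Illustration (an editorial sketch, not part of the original file): the
 * scenario the helper above covers is
 *
 *	btrfs subvolume snapshot /mnt /mnt/snap
 *	sync
 *	btrfs subvolume delete /mnt/snap
 *	fsync /mnt
 *
 * Recording the transid in the parent directory's last_unlink_trans makes
 * the later fsync fall back to a full transaction commit instead of writing
 * a log tree, so after a crash the snapshot delete is never partially
 * replayed.
 */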

/**
 * btrfs_log_new_name - update the log after adding a new name for an inode
 *
 * @trans:              Transaction handle.
 * @old_dentry:         The dentry associated with the old name and the old
 *                      parent directory.
 * @old_dir:            The inode of the previous parent directory for the case
 *                      of a rename. For a link operation, it must be NULL.
 * @old_dir_index:      The index number associated with the old name, meaningful
 *                      only for rename operations (when @old_dir is not NULL).
 *                      Ignored for link operations.
 * @parent:             The dentry associated with the directory under which the
 *                      new name is located.
 *
 * Call this after adding a new name for an inode, as a result of a link or
 * rename operation, and it will properly update the log to reflect the new name.
 */
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
			struct dentry *old_dentry, struct btrfs_inode *old_dir,
			u64 old_dir_index, struct dentry *parent)
{
	struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
	struct btrfs_root *root = inode->root;
	struct btrfs_log_ctx ctx;
	bool log_pinned = false;
	int ret;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (!S_ISDIR(inode->vfs_inode.i_mode))
		inode->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming it
	 * from hasn't been logged, we don't need to log it
	 */
	ret = inode_logged(trans, inode, NULL);
	if (ret < 0) {
		goto out;
	} else if (ret == 0) {
		if (!old_dir)
			return;
		/*
		 * If the inode was not logged and we are doing a rename
		 * (old_dir is not NULL), check if old_dir was logged - if it
		 * was not we can return and do nothing.
		 */
		ret = inode_logged(trans, old_dir, NULL);
		if (ret < 0)
			goto out;
		else if (ret == 0)
			return;
	}
	ret = 0;

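	/*
	 * Editorial example (not from the original source): for a plain link
	 * of an inode that was never logged in this transaction, the first
	 * inode_logged() call returns 0 and old_dir is NULL, so we returned
	 * above without touching the log - the new name will simply be picked
	 * up by the next fsync of the inode.
	 */
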
	/*
	 * If we are doing a rename (old_dir is not NULL) from a directory that
	 * was previously logged, make sure that on log replay we get the old
	 * dir entry deleted. This is needed because we will also log the new
	 * name of the renamed inode, so we need to make sure that after log
	 * replay we don't end up with both the new and old dir entries existing.
	 */
	if (old_dir && old_dir->logged_trans == trans->transid) {
		struct btrfs_root *log = old_dir->root->log_root;
		struct btrfs_path *path;

		ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);

		/*
		 * We have two inodes to update in the log, the old directory and
		 * the inode that got renamed, so we must pin the log to prevent
		 * anyone from syncing the log until we have updated both inodes
		 * in the log.
		 */
		ret = join_running_log_trans(root);
		/*
		 * At least one of the inodes was logged before, so this should
		 * not fail, but if it does, it's not serious, just bail out and
		 * mark the log for a full commit.
		 */
		if (WARN_ON_ONCE(ret < 0))
			goto out;
		log_pinned = true;

		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}

		/*
		 * Another task might be concurrently logging the old directory,
		 * since that can be triggered when logging another inode that
		 * had, or still has, a dentry in the old directory. We lock the
		 * old directory's log_mutex to ensure the deletion of the old
		 * name is persisted, because during directory logging we delete
		 * all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of the old
		 * name's dir index item is in the delayed items, so it could be
		 * missed by an in-progress directory logging.
		 */
		mutex_lock(&old_dir->log_mutex);
		ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
					old_dentry->d_name.name,
					old_dentry->d_name.len, old_dir_index);
		if (ret > 0) {
			/*
			 * The dentry does not exist in the log, so record its
			 * deletion.
			 */
			btrfs_release_path(path);
			ret = insert_dir_log_key(trans, log, path,
						 btrfs_ino(old_dir),
						 old_dir_index, old_dir_index);
		}
		mutex_unlock(&old_dir->log_mutex);

		btrfs_free_path(path);
		if (ret < 0)
			goto out;
	}
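
	/*
	 * Editorial note (not from the original source): the block above is
	 * what prevents, e.g., "mv dir/a dir/b" from leaving both dir/a and
	 * dir/b pointing at the same inode after a crash when dir was logged
	 * in this transaction. Either the old dentry is deleted from the log
	 * directly, or a dir log key covering exactly the range
	 * [old_dir_index, old_dir_index] is inserted, which tells replay that
	 * this index was deleted.
	 */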

	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
	ctx.logging_new_name = true;
	/*
	 * We don't care about the return value. If we fail to log the new name
	 * then we know the next attempt to sync the log will fall back to a full
	 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
	 * we don't need to worry about getting a log committed that has an
	 * inconsistent state after a rename operation.
	 */
	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
out:
	/*
	 * If an error happened, mark the log for a full commit because it's not
	 * consistent and up to date, or we couldn't find out if one of the
	 * inodes was logged before in this transaction. Do it before unpinning
	 * the log, to avoid any races with someone else trying to commit it.
	 */
	if (ret < 0)
		btrfs_set_log_full_commit(trans);
	if (log_pinned)
		btrfs_end_log_trans(root);
}
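
/*
 * Usage sketch (hypothetical caller, not from this file): a rename path
 * would typically pair the helpers in this file roughly like
 *
 *	btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
 *				BTRFS_I(d_inode(old_dentry)), 1);
 *	... remove the old name and insert the new one in the subvolume ...
 *	btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
 *			   old_dir_index, new_dentry->d_parent);
 *
 * with old_dir_index being the dir index number the old name had. For a
 * link operation, old_dir would instead be NULL and old_dir_index ignored.
 */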