xref: /openbmc/linux/fs/btrfs/tree-log.c (revision b9890054)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2008 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/blkdev.h>
9 #include <linux/list_sort.h>
10 #include <linux/iversion.h>
11 #include "misc.h"
12 #include "ctree.h"
13 #include "tree-log.h"
14 #include "disk-io.h"
15 #include "locking.h"
16 #include "print-tree.h"
17 #include "backref.h"
18 #include "compression.h"
19 #include "qgroup.h"
20 #include "inode-map.h"
21 
/*
 * Values accepted by the inode_only argument of btrfs_log_inode():
 *
 * LOG_INODE_ALL     - log everything
 * LOG_INODE_EXISTS  - log just enough to recreate the inode during log
 *                     replay
 *
 * LOG_OTHER_INODE and LOG_OTHER_INODE_ALL are not described here; check
 * their users for the exact semantics before relying on them.
 */
enum {
	LOG_INODE_ALL		= 0,
	LOG_INODE_EXISTS	= 1,
	LOG_OTHER_INODE		= 2,
	LOG_OTHER_INODE_ALL	= 3,
};
34 
35 /*
36  * directory trouble cases
37  *
38  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
39  * log, we must force a full commit before doing an fsync of the directory
40  * where the unlink was done.
41  * ---> record transid of last unlink/rename per directory
42  *
43  * mkdir foo/some_dir
44  * normal commit
45  * rename foo/some_dir foo2/some_dir
46  * mkdir foo/some_dir
47  * fsync foo/some_dir/some_file
48  *
49  * The fsync above will unlink the original some_dir without recording
50  * it in its new location (foo2).  After a crash, some_dir will be gone
51  * unless the fsync of some_file forces a full commit
52  *
53  * 2) we must log any new names for any file or dir that is in the fsync
54  * log. ---> check inode while renaming/linking.
55  *
56  * 2a) we must log any new names for any file or dir during rename
57  * when the directory they are being removed from was logged.
58  * ---> check inode and old parent dir during rename
59  *
60  *  2a is actually the more important variant.  Without the extra logging
61  *  a crash might unlink the old name without recreating the new one
62  *
63  * 3) after a crash, we must go through any directories with a link count
64  * of zero and redo the rm -rf
65  *
66  * mkdir f1/foo
67  * normal commit
68  * rm -rf f1/foo
69  * fsync(f1)
70  *
71  * The directory f1/foo was fully removed from the FS, but fsync was never
72  * called on f1/foo, only its parent dir.  After a crash the rm -rf must
73  * be replayed.  This must be able to recurse down the entire
74  * directory tree.  The inode link count fixup code takes care of the
75  * ugly details.
76  */
77 
/*
 * Stages of the log tree walk, listed in the order they are declared.
 *
 * LOG_WALK_PIN_ONLY (0) only pins down the blocks that are found.
 * LOG_WALK_REPLAY_INODES (1) makes sure that every inode found in the log
 * is created in the subvolume.
 *
 * The last stage deals with directories, links, extents and all the other
 * fun semantics.  LOG_WALK_REPLAY_DIR_INDEX is not described here; see the
 * replay code that checks for it.
 */
enum {
	LOG_WALK_PIN_ONLY		= 0,
	LOG_WALK_REPLAY_INODES		= 1,
	LOG_WALK_REPLAY_DIR_INDEX	= 2,
	LOG_WALK_REPLAY_ALL		= 3,
};
93 
94 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
95 			   struct btrfs_root *root, struct btrfs_inode *inode,
96 			   int inode_only,
97 			   const loff_t start,
98 			   const loff_t end,
99 			   struct btrfs_log_ctx *ctx);
100 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
101 			     struct btrfs_root *root,
102 			     struct btrfs_path *path, u64 objectid);
103 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
104 				       struct btrfs_root *root,
105 				       struct btrfs_root *log,
106 				       struct btrfs_path *path,
107 				       u64 dirid, int del_all);
108 
109 /*
110  * tree logging is a special write ahead log used to make sure that
111  * fsyncs and O_SYNCs can happen without doing full tree commits.
112  *
113  * Full tree commits are expensive because they require commonly
114  * modified blocks to be recowed, creating many dirty pages in the
115  * extent tree and a 4x-6x higher write load than ext3.
116  *
117  * Instead of doing a tree commit on every fsync, we use the
118  * key ranges and transaction ids to find items for a given file or directory
119  * that have changed in this transaction.  Those items are copied into
120  * a special tree (one per subvolume root), that tree is written to disk
121  * and then the fsync is considered complete.
122  *
123  * After a crash, items are copied out of the log-tree back into the
124  * subvolume tree.  Any file data extents found are recorded in the extent
125  * allocation tree, and the log-tree freed.
126  *
127  * The log tree is read three times: once to pin down all the extents it is
128  * using in ram, once to create all the inodes logged in the tree
129  * and once to do all the other items.
130  */
131 
132 /*
133  * start a sub transaction and setup the log tree
134  * this increments the log tree writer count to make the people
135  * syncing the tree wait for us to finish
136  */
137 static int start_log_trans(struct btrfs_trans_handle *trans,
138 			   struct btrfs_root *root,
139 			   struct btrfs_log_ctx *ctx)
140 {
141 	struct btrfs_fs_info *fs_info = root->fs_info;
142 	int ret = 0;
143 
144 	mutex_lock(&root->log_mutex);
145 
146 	if (root->log_root) {
147 		if (btrfs_need_log_full_commit(trans)) {
148 			ret = -EAGAIN;
149 			goto out;
150 		}
151 
152 		if (!root->log_start_pid) {
153 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
154 			root->log_start_pid = current->pid;
155 		} else if (root->log_start_pid != current->pid) {
156 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
157 		}
158 	} else {
159 		mutex_lock(&fs_info->tree_log_mutex);
160 		if (!fs_info->log_root_tree)
161 			ret = btrfs_init_log_root_tree(trans, fs_info);
162 		mutex_unlock(&fs_info->tree_log_mutex);
163 		if (ret)
164 			goto out;
165 
166 		ret = btrfs_add_log_tree(trans, root);
167 		if (ret)
168 			goto out;
169 
170 		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
171 		root->log_start_pid = current->pid;
172 	}
173 
174 	atomic_inc(&root->log_batch);
175 	atomic_inc(&root->log_writers);
176 	if (ctx) {
177 		int index = root->log_transid % 2;
178 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
179 		ctx->log_transid = root->log_transid;
180 	}
181 
182 out:
183 	mutex_unlock(&root->log_mutex);
184 	return ret;
185 }
186 
187 /*
188  * returns 0 if there was a log transaction running and we were able
189  * to join, or returns -ENOENT if there were not transactions
190  * in progress
191  */
192 static int join_running_log_trans(struct btrfs_root *root)
193 {
194 	int ret = -ENOENT;
195 
196 	mutex_lock(&root->log_mutex);
197 	if (root->log_root) {
198 		ret = 0;
199 		atomic_inc(&root->log_writers);
200 	}
201 	mutex_unlock(&root->log_mutex);
202 	return ret;
203 }
204 
205 /*
206  * This either makes the current running log transaction wait
207  * until you call btrfs_end_log_trans() or it makes any future
208  * log transactions wait until you call btrfs_end_log_trans()
209  */
210 void btrfs_pin_log_trans(struct btrfs_root *root)
211 {
212 	mutex_lock(&root->log_mutex);
213 	atomic_inc(&root->log_writers);
214 	mutex_unlock(&root->log_mutex);
215 }
216 
217 /*
218  * indicate we're done making changes to the log tree
219  * and wake up anyone waiting to do a sync
220  */
221 void btrfs_end_log_trans(struct btrfs_root *root)
222 {
223 	if (atomic_dec_and_test(&root->log_writers)) {
224 		/* atomic_dec_and_test implies a barrier */
225 		cond_wake_up_nomb(&root->log_writer_wait);
226 	}
227 }
228 
229 static int btrfs_write_tree_block(struct extent_buffer *buf)
230 {
231 	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
232 					buf->start + buf->len - 1);
233 }
234 
235 static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
236 {
237 	filemap_fdatawait_range(buf->pages[0]->mapping,
238 			        buf->start, buf->start + buf->len - 1);
239 }
240 
241 /*
242  * the walk control struct is used to pass state down the chain when
243  * processing the log tree.  The stage field tells us which part
244  * of the log tree processing we are currently doing.  The others
245  * are state fields used for that specific part
246  */
247 struct walk_control {
248 	/* should we free the extent on disk when done?  This is used
249 	 * at transaction commit time while freeing a log tree
250 	 */
251 	int free;
252 
253 	/* should we write out the extent buffer?  This is used
254 	 * while flushing the log tree to disk during a sync
255 	 */
256 	int write;
257 
258 	/* should we wait for the extent buffer io to finish?  Also used
259 	 * while flushing the log tree to disk for a sync
260 	 */
261 	int wait;
262 
263 	/* pin only walk, we record which extents on disk belong to the
264 	 * log trees
265 	 */
266 	int pin;
267 
268 	/* what stage of the replay code we're currently in */
269 	int stage;
270 
271 	/*
272 	 * Ignore any items from the inode currently being processed. Needs
273 	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
274 	 * the LOG_WALK_REPLAY_INODES stage.
275 	 */
276 	bool ignore_cur_inode;
277 
278 	/* the root we are currently replaying */
279 	struct btrfs_root *replay_dest;
280 
281 	/* the trans handle for the current replay */
282 	struct btrfs_trans_handle *trans;
283 
284 	/* the function that gets used to process blocks we find in the
285 	 * tree.  Note the extent_buffer might not be up to date when it is
286 	 * passed in, and it must be checked or read if you need the data
287 	 * inside it
288 	 */
289 	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
290 			    struct walk_control *wc, u64 gen, int level);
291 };
292 
293 /*
294  * process_func used to pin down extents, write them or wait on them
295  */
296 static int process_one_buffer(struct btrfs_root *log,
297 			      struct extent_buffer *eb,
298 			      struct walk_control *wc, u64 gen, int level)
299 {
300 	struct btrfs_fs_info *fs_info = log->fs_info;
301 	int ret = 0;
302 
303 	/*
304 	 * If this fs is mixed then we need to be able to process the leaves to
305 	 * pin down any logged extents, so we have to read the block.
306 	 */
307 	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
308 		ret = btrfs_read_buffer(eb, gen, level, NULL);
309 		if (ret)
310 			return ret;
311 	}
312 
313 	if (wc->pin)
314 		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
315 						      eb->len);
316 
317 	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
318 		if (wc->pin && btrfs_header_level(eb) == 0)
319 			ret = btrfs_exclude_logged_extents(eb);
320 		if (wc->write)
321 			btrfs_write_tree_block(eb);
322 		if (wc->wait)
323 			btrfs_wait_tree_block_writeback(eb);
324 	}
325 	return ret;
326 }
327 
328 /*
329  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
330  * to the src data we are copying out.
331  *
332  * root is the tree we are copying into, and path is a scratch
333  * path for use in this function (it should be released on entry and
334  * will be released on exit).
335  *
336  * If the key is already in the destination tree the existing item is
337  * overwritten.  If the existing item isn't big enough, it is extended.
338  * If it is too large, it is truncated.
339  *
340  * If the key isn't in the destination yet, a new item is inserted.
341  */
342 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
343 				   struct btrfs_root *root,
344 				   struct btrfs_path *path,
345 				   struct extent_buffer *eb, int slot,
346 				   struct btrfs_key *key)
347 {
348 	int ret;
349 	u32 item_size;
350 	u64 saved_i_size = 0;
351 	int save_old_i_size = 0;
352 	unsigned long src_ptr;
353 	unsigned long dst_ptr;
354 	int overwrite_root = 0;
355 	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
356 
357 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
358 		overwrite_root = 1;
359 
360 	item_size = btrfs_item_size_nr(eb, slot);
361 	src_ptr = btrfs_item_ptr_offset(eb, slot);
362 
363 	/* look for the key in the destination tree */
364 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
365 	if (ret < 0)
366 		return ret;
367 
368 	if (ret == 0) {
369 		char *src_copy;
370 		char *dst_copy;
371 		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
372 						  path->slots[0]);
373 		if (dst_size != item_size)
374 			goto insert;
375 
376 		if (item_size == 0) {
377 			btrfs_release_path(path);
378 			return 0;
379 		}
380 		dst_copy = kmalloc(item_size, GFP_NOFS);
381 		src_copy = kmalloc(item_size, GFP_NOFS);
382 		if (!dst_copy || !src_copy) {
383 			btrfs_release_path(path);
384 			kfree(dst_copy);
385 			kfree(src_copy);
386 			return -ENOMEM;
387 		}
388 
389 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
390 
391 		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
392 		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
393 				   item_size);
394 		ret = memcmp(dst_copy, src_copy, item_size);
395 
396 		kfree(dst_copy);
397 		kfree(src_copy);
398 		/*
399 		 * they have the same contents, just return, this saves
400 		 * us from cowing blocks in the destination tree and doing
401 		 * extra writes that may not have been done by a previous
402 		 * sync
403 		 */
404 		if (ret == 0) {
405 			btrfs_release_path(path);
406 			return 0;
407 		}
408 
409 		/*
410 		 * We need to load the old nbytes into the inode so when we
411 		 * replay the extents we've logged we get the right nbytes.
412 		 */
413 		if (inode_item) {
414 			struct btrfs_inode_item *item;
415 			u64 nbytes;
416 			u32 mode;
417 
418 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
419 					      struct btrfs_inode_item);
420 			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
421 			item = btrfs_item_ptr(eb, slot,
422 					      struct btrfs_inode_item);
423 			btrfs_set_inode_nbytes(eb, item, nbytes);
424 
425 			/*
426 			 * If this is a directory we need to reset the i_size to
427 			 * 0 so that we can set it up properly when replaying
428 			 * the rest of the items in this log.
429 			 */
430 			mode = btrfs_inode_mode(eb, item);
431 			if (S_ISDIR(mode))
432 				btrfs_set_inode_size(eb, item, 0);
433 		}
434 	} else if (inode_item) {
435 		struct btrfs_inode_item *item;
436 		u32 mode;
437 
438 		/*
439 		 * New inode, set nbytes to 0 so that the nbytes comes out
440 		 * properly when we replay the extents.
441 		 */
442 		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
443 		btrfs_set_inode_nbytes(eb, item, 0);
444 
445 		/*
446 		 * If this is a directory we need to reset the i_size to 0 so
447 		 * that we can set it up properly when replaying the rest of
448 		 * the items in this log.
449 		 */
450 		mode = btrfs_inode_mode(eb, item);
451 		if (S_ISDIR(mode))
452 			btrfs_set_inode_size(eb, item, 0);
453 	}
454 insert:
455 	btrfs_release_path(path);
456 	/* try to insert the key into the destination tree */
457 	path->skip_release_on_error = 1;
458 	ret = btrfs_insert_empty_item(trans, root, path,
459 				      key, item_size);
460 	path->skip_release_on_error = 0;
461 
462 	/* make sure any existing item is the correct size */
463 	if (ret == -EEXIST || ret == -EOVERFLOW) {
464 		u32 found_size;
465 		found_size = btrfs_item_size_nr(path->nodes[0],
466 						path->slots[0]);
467 		if (found_size > item_size)
468 			btrfs_truncate_item(path, item_size, 1);
469 		else if (found_size < item_size)
470 			btrfs_extend_item(path, item_size - found_size);
471 	} else if (ret) {
472 		return ret;
473 	}
474 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
475 					path->slots[0]);
476 
477 	/* don't overwrite an existing inode if the generation number
478 	 * was logged as zero.  This is done when the tree logging code
479 	 * is just logging an inode to make sure it exists after recovery.
480 	 *
481 	 * Also, don't overwrite i_size on directories during replay.
482 	 * log replay inserts and removes directory items based on the
483 	 * state of the tree found in the subvolume, and i_size is modified
484 	 * as it goes
485 	 */
486 	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
487 		struct btrfs_inode_item *src_item;
488 		struct btrfs_inode_item *dst_item;
489 
490 		src_item = (struct btrfs_inode_item *)src_ptr;
491 		dst_item = (struct btrfs_inode_item *)dst_ptr;
492 
493 		if (btrfs_inode_generation(eb, src_item) == 0) {
494 			struct extent_buffer *dst_eb = path->nodes[0];
495 			const u64 ino_size = btrfs_inode_size(eb, src_item);
496 
497 			/*
498 			 * For regular files an ino_size == 0 is used only when
499 			 * logging that an inode exists, as part of a directory
500 			 * fsync, and the inode wasn't fsynced before. In this
501 			 * case don't set the size of the inode in the fs/subvol
502 			 * tree, otherwise we would be throwing valid data away.
503 			 */
504 			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
505 			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
506 			    ino_size != 0) {
507 				struct btrfs_map_token token;
508 
509 				btrfs_init_map_token(&token, dst_eb);
510 				btrfs_set_token_inode_size(dst_eb, dst_item,
511 							   ino_size, &token);
512 			}
513 			goto no_copy;
514 		}
515 
516 		if (overwrite_root &&
517 		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
518 		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
519 			save_old_i_size = 1;
520 			saved_i_size = btrfs_inode_size(path->nodes[0],
521 							dst_item);
522 		}
523 	}
524 
525 	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
526 			   src_ptr, item_size);
527 
528 	if (save_old_i_size) {
529 		struct btrfs_inode_item *dst_item;
530 		dst_item = (struct btrfs_inode_item *)dst_ptr;
531 		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
532 	}
533 
534 	/* make sure the generation is filled in */
535 	if (key->type == BTRFS_INODE_ITEM_KEY) {
536 		struct btrfs_inode_item *dst_item;
537 		dst_item = (struct btrfs_inode_item *)dst_ptr;
538 		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
539 			btrfs_set_inode_generation(path->nodes[0], dst_item,
540 						   trans->transid);
541 		}
542 	}
543 no_copy:
544 	btrfs_mark_buffer_dirty(path->nodes[0]);
545 	btrfs_release_path(path);
546 	return 0;
547 }
548 
549 /*
550  * simple helper to read an inode off the disk from a given root
551  * This can only be called for subvolume roots and not for the log
552  */
553 static noinline struct inode *read_one_inode(struct btrfs_root *root,
554 					     u64 objectid)
555 {
556 	struct btrfs_key key;
557 	struct inode *inode;
558 
559 	key.objectid = objectid;
560 	key.type = BTRFS_INODE_ITEM_KEY;
561 	key.offset = 0;
562 	inode = btrfs_iget(root->fs_info->sb, &key, root);
563 	if (IS_ERR(inode))
564 		inode = NULL;
565 	return inode;
566 }
567 
568 /* replays a single extent in 'eb' at 'slot' with 'key' into the
569  * subvolume 'root'.  path is released on entry and should be released
570  * on exit.
571  *
572  * extents in the log tree have not been allocated out of the extent
573  * tree yet.  So, this completes the allocation, taking a reference
574  * as required if the extent already exists or creating a new extent
575  * if it isn't in the extent allocation tree yet.
576  *
577  * The extent is inserted into the file, dropping any existing extents
578  * from the file that overlap the new one.
579  */
580 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
581 				      struct btrfs_root *root,
582 				      struct btrfs_path *path,
583 				      struct extent_buffer *eb, int slot,
584 				      struct btrfs_key *key)
585 {
586 	struct btrfs_fs_info *fs_info = root->fs_info;
587 	int found_type;
588 	u64 extent_end;
589 	u64 start = key->offset;
590 	u64 nbytes = 0;
591 	struct btrfs_file_extent_item *item;
592 	struct inode *inode = NULL;
593 	unsigned long size;
594 	int ret = 0;
595 
596 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
597 	found_type = btrfs_file_extent_type(eb, item);
598 
599 	if (found_type == BTRFS_FILE_EXTENT_REG ||
600 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
601 		nbytes = btrfs_file_extent_num_bytes(eb, item);
602 		extent_end = start + nbytes;
603 
604 		/*
605 		 * We don't add to the inodes nbytes if we are prealloc or a
606 		 * hole.
607 		 */
608 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
609 			nbytes = 0;
610 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
611 		size = btrfs_file_extent_ram_bytes(eb, item);
612 		nbytes = btrfs_file_extent_ram_bytes(eb, item);
613 		extent_end = ALIGN(start + size,
614 				   fs_info->sectorsize);
615 	} else {
616 		ret = 0;
617 		goto out;
618 	}
619 
620 	inode = read_one_inode(root, key->objectid);
621 	if (!inode) {
622 		ret = -EIO;
623 		goto out;
624 	}
625 
626 	/*
627 	 * first check to see if we already have this extent in the
628 	 * file.  This must be done before the btrfs_drop_extents run
629 	 * so we don't try to drop this extent.
630 	 */
631 	ret = btrfs_lookup_file_extent(trans, root, path,
632 			btrfs_ino(BTRFS_I(inode)), start, 0);
633 
634 	if (ret == 0 &&
635 	    (found_type == BTRFS_FILE_EXTENT_REG ||
636 	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
637 		struct btrfs_file_extent_item cmp1;
638 		struct btrfs_file_extent_item cmp2;
639 		struct btrfs_file_extent_item *existing;
640 		struct extent_buffer *leaf;
641 
642 		leaf = path->nodes[0];
643 		existing = btrfs_item_ptr(leaf, path->slots[0],
644 					  struct btrfs_file_extent_item);
645 
646 		read_extent_buffer(eb, &cmp1, (unsigned long)item,
647 				   sizeof(cmp1));
648 		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
649 				   sizeof(cmp2));
650 
651 		/*
652 		 * we already have a pointer to this exact extent,
653 		 * we don't have to do anything
654 		 */
655 		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
656 			btrfs_release_path(path);
657 			goto out;
658 		}
659 	}
660 	btrfs_release_path(path);
661 
662 	/* drop any overlapping extents */
663 	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
664 	if (ret)
665 		goto out;
666 
667 	if (found_type == BTRFS_FILE_EXTENT_REG ||
668 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
669 		u64 offset;
670 		unsigned long dest_offset;
671 		struct btrfs_key ins;
672 
673 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
674 		    btrfs_fs_incompat(fs_info, NO_HOLES))
675 			goto update_inode;
676 
677 		ret = btrfs_insert_empty_item(trans, root, path, key,
678 					      sizeof(*item));
679 		if (ret)
680 			goto out;
681 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
682 						    path->slots[0]);
683 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
684 				(unsigned long)item,  sizeof(*item));
685 
686 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
687 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
688 		ins.type = BTRFS_EXTENT_ITEM_KEY;
689 		offset = key->offset - btrfs_file_extent_offset(eb, item);
690 
691 		/*
692 		 * Manually record dirty extent, as here we did a shallow
693 		 * file extent item copy and skip normal backref update,
694 		 * but modifying extent tree all by ourselves.
695 		 * So need to manually record dirty extent for qgroup,
696 		 * as the owner of the file extent changed from log tree
697 		 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
698 		 */
699 		ret = btrfs_qgroup_trace_extent(trans,
700 				btrfs_file_extent_disk_bytenr(eb, item),
701 				btrfs_file_extent_disk_num_bytes(eb, item),
702 				GFP_NOFS);
703 		if (ret < 0)
704 			goto out;
705 
706 		if (ins.objectid > 0) {
707 			struct btrfs_ref ref = { 0 };
708 			u64 csum_start;
709 			u64 csum_end;
710 			LIST_HEAD(ordered_sums);
711 
712 			/*
713 			 * is this extent already allocated in the extent
714 			 * allocation tree?  If so, just add a reference
715 			 */
716 			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
717 						ins.offset);
718 			if (ret == 0) {
719 				btrfs_init_generic_ref(&ref,
720 						BTRFS_ADD_DELAYED_REF,
721 						ins.objectid, ins.offset, 0);
722 				btrfs_init_data_ref(&ref,
723 						root->root_key.objectid,
724 						key->objectid, offset);
725 				ret = btrfs_inc_extent_ref(trans, &ref);
726 				if (ret)
727 					goto out;
728 			} else {
729 				/*
730 				 * insert the extent pointer in the extent
731 				 * allocation tree
732 				 */
733 				ret = btrfs_alloc_logged_file_extent(trans,
734 						root->root_key.objectid,
735 						key->objectid, offset, &ins);
736 				if (ret)
737 					goto out;
738 			}
739 			btrfs_release_path(path);
740 
741 			if (btrfs_file_extent_compression(eb, item)) {
742 				csum_start = ins.objectid;
743 				csum_end = csum_start + ins.offset;
744 			} else {
745 				csum_start = ins.objectid +
746 					btrfs_file_extent_offset(eb, item);
747 				csum_end = csum_start +
748 					btrfs_file_extent_num_bytes(eb, item);
749 			}
750 
751 			ret = btrfs_lookup_csums_range(root->log_root,
752 						csum_start, csum_end - 1,
753 						&ordered_sums, 0);
754 			if (ret)
755 				goto out;
756 			/*
757 			 * Now delete all existing cums in the csum root that
758 			 * cover our range. We do this because we can have an
759 			 * extent that is completely referenced by one file
760 			 * extent item and partially referenced by another
761 			 * file extent item (like after using the clone or
762 			 * extent_same ioctls). In this case if we end up doing
763 			 * the replay of the one that partially references the
764 			 * extent first, and we do not do the csum deletion
765 			 * below, we can get 2 csum items in the csum tree that
766 			 * overlap each other. For example, imagine our log has
767 			 * the two following file extent items:
768 			 *
769 			 * key (257 EXTENT_DATA 409600)
770 			 *     extent data disk byte 12845056 nr 102400
771 			 *     extent data offset 20480 nr 20480 ram 102400
772 			 *
773 			 * key (257 EXTENT_DATA 819200)
774 			 *     extent data disk byte 12845056 nr 102400
775 			 *     extent data offset 0 nr 102400 ram 102400
776 			 *
777 			 * Where the second one fully references the 100K extent
778 			 * that starts at disk byte 12845056, and the log tree
779 			 * has a single csum item that covers the entire range
780 			 * of the extent:
781 			 *
782 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
783 			 *
784 			 * After the first file extent item is replayed, the
785 			 * csum tree gets the following csum item:
786 			 *
787 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
788 			 *
789 			 * Which covers the 20K sub-range starting at offset 20K
790 			 * of our extent. Now when we replay the second file
791 			 * extent item, if we do not delete existing csum items
792 			 * that cover any of its blocks, we end up getting two
793 			 * csum items in our csum tree that overlap each other:
794 			 *
795 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
796 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
797 			 *
798 			 * Which is a problem, because after this anyone trying
799 			 * to lookup up for the checksum of any block of our
800 			 * extent starting at an offset of 40K or higher, will
801 			 * end up looking at the second csum item only, which
802 			 * does not contain the checksum for any block starting
803 			 * at offset 40K or higher of our extent.
804 			 */
805 			while (!list_empty(&ordered_sums)) {
806 				struct btrfs_ordered_sum *sums;
807 				sums = list_entry(ordered_sums.next,
808 						struct btrfs_ordered_sum,
809 						list);
810 				if (!ret)
811 					ret = btrfs_del_csums(trans, fs_info,
812 							      sums->bytenr,
813 							      sums->len);
814 				if (!ret)
815 					ret = btrfs_csum_file_blocks(trans,
816 						fs_info->csum_root, sums);
817 				list_del(&sums->list);
818 				kfree(sums);
819 			}
820 			if (ret)
821 				goto out;
822 		} else {
823 			btrfs_release_path(path);
824 		}
825 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
826 		/* inline extents are easy, we just overwrite them */
827 		ret = overwrite_item(trans, root, path, eb, slot, key);
828 		if (ret)
829 			goto out;
830 	}
831 
832 	inode_add_bytes(inode, nbytes);
833 update_inode:
834 	ret = btrfs_update_inode(trans, root, inode);
835 out:
836 	if (inode)
837 		iput(inode);
838 	return ret;
839 }
840 
841 /*
842  * when cleaning up conflicts between the directory names in the
843  * subvolume, directory names in the log and directory names in the
844  * inode back references, we may have to unlink inodes from directories.
845  *
846  * This is a helper function to do the unlink of a specific directory
847  * item
848  */
849 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
850 				      struct btrfs_root *root,
851 				      struct btrfs_path *path,
852 				      struct btrfs_inode *dir,
853 				      struct btrfs_dir_item *di)
854 {
855 	struct inode *inode;
856 	char *name;
857 	int name_len;
858 	struct extent_buffer *leaf;
859 	struct btrfs_key location;
860 	int ret;
861 
862 	leaf = path->nodes[0];
863 
864 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
865 	name_len = btrfs_dir_name_len(leaf, di);
866 	name = kmalloc(name_len, GFP_NOFS);
867 	if (!name)
868 		return -ENOMEM;
869 
870 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
871 	btrfs_release_path(path);
872 
873 	inode = read_one_inode(root, location.objectid);
874 	if (!inode) {
875 		ret = -EIO;
876 		goto out;
877 	}
878 
879 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
880 	if (ret)
881 		goto out;
882 
883 	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
884 			name_len);
885 	if (ret)
886 		goto out;
887 	else
888 		ret = btrfs_run_delayed_items(trans);
889 out:
890 	kfree(name);
891 	iput(inode);
892 	return ret;
893 }
894 
895 /*
896  * helper function to see if a given name and sequence number found
897  * in an inode back reference are already in a directory and correctly
898  * point to this inode
899  */
900 static noinline int inode_in_dir(struct btrfs_root *root,
901 				 struct btrfs_path *path,
902 				 u64 dirid, u64 objectid, u64 index,
903 				 const char *name, int name_len)
904 {
905 	struct btrfs_dir_item *di;
906 	struct btrfs_key location;
907 	int match = 0;
908 
909 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
910 					 index, name, name_len, 0);
911 	if (di && !IS_ERR(di)) {
912 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
913 		if (location.objectid != objectid)
914 			goto out;
915 	} else
916 		goto out;
917 	btrfs_release_path(path);
918 
919 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
920 	if (di && !IS_ERR(di)) {
921 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
922 		if (location.objectid != objectid)
923 			goto out;
924 	} else
925 		goto out;
926 	match = 1;
927 out:
928 	btrfs_release_path(path);
929 	return match;
930 }
931 
932 /*
933  * helper function to check a log tree for a named back reference in
934  * an inode.  This is used to decide if a back reference that is
935  * found in the subvolume conflicts with what we find in the log.
936  *
937  * inode backreferences may have multiple refs in a single item,
938  * during replay we process one reference at a time, and we don't
939  * want to delete valid links to a file from the subvolume if that
940  * link is also in the log.
941  */
942 static noinline int backref_in_log(struct btrfs_root *log,
943 				   struct btrfs_key *key,
944 				   u64 ref_objectid,
945 				   const char *name, int namelen)
946 {
947 	struct btrfs_path *path;
948 	int ret;
949 
950 	path = btrfs_alloc_path();
951 	if (!path)
952 		return -ENOMEM;
953 
954 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
955 	if (ret < 0) {
956 		goto out;
957 	} else if (ret == 1) {
958 		ret = 0;
959 		goto out;
960 	}
961 
962 	if (key->type == BTRFS_INODE_EXTREF_KEY)
963 		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
964 						       path->slots[0],
965 						       ref_objectid,
966 						       name, namelen);
967 	else
968 		ret = !!btrfs_find_name_in_backref(path->nodes[0],
969 						   path->slots[0],
970 						   name, namelen);
971 out:
972 	btrfs_free_path(path);
973 	return ret;
974 }
975 
976 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
977 				  struct btrfs_root *root,
978 				  struct btrfs_path *path,
979 				  struct btrfs_root *log_root,
980 				  struct btrfs_inode *dir,
981 				  struct btrfs_inode *inode,
982 				  u64 inode_objectid, u64 parent_objectid,
983 				  u64 ref_index, char *name, int namelen,
984 				  int *search_done)
985 {
986 	int ret;
987 	char *victim_name;
988 	int victim_name_len;
989 	struct extent_buffer *leaf;
990 	struct btrfs_dir_item *di;
991 	struct btrfs_key search_key;
992 	struct btrfs_inode_extref *extref;
993 
994 again:
995 	/* Search old style refs */
996 	search_key.objectid = inode_objectid;
997 	search_key.type = BTRFS_INODE_REF_KEY;
998 	search_key.offset = parent_objectid;
999 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1000 	if (ret == 0) {
1001 		struct btrfs_inode_ref *victim_ref;
1002 		unsigned long ptr;
1003 		unsigned long ptr_end;
1004 
1005 		leaf = path->nodes[0];
1006 
1007 		/* are we trying to overwrite a back ref for the root directory
1008 		 * if so, just jump out, we're done
1009 		 */
1010 		if (search_key.objectid == search_key.offset)
1011 			return 1;
1012 
1013 		/* check all the names in this back reference to see
1014 		 * if they are in the log.  if so, we allow them to stay
1015 		 * otherwise they must be unlinked as a conflict
1016 		 */
1017 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1018 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1019 		while (ptr < ptr_end) {
1020 			victim_ref = (struct btrfs_inode_ref *)ptr;
1021 			victim_name_len = btrfs_inode_ref_name_len(leaf,
1022 								   victim_ref);
1023 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
1024 			if (!victim_name)
1025 				return -ENOMEM;
1026 
1027 			read_extent_buffer(leaf, victim_name,
1028 					   (unsigned long)(victim_ref + 1),
1029 					   victim_name_len);
1030 
1031 			ret = backref_in_log(log_root, &search_key,
1032 					     parent_objectid, victim_name,
1033 					     victim_name_len);
1034 			if (ret < 0) {
1035 				kfree(victim_name);
1036 				return ret;
1037 			} else if (!ret) {
1038 				inc_nlink(&inode->vfs_inode);
1039 				btrfs_release_path(path);
1040 
1041 				ret = btrfs_unlink_inode(trans, root, dir, inode,
1042 						victim_name, victim_name_len);
1043 				kfree(victim_name);
1044 				if (ret)
1045 					return ret;
1046 				ret = btrfs_run_delayed_items(trans);
1047 				if (ret)
1048 					return ret;
1049 				*search_done = 1;
1050 				goto again;
1051 			}
1052 			kfree(victim_name);
1053 
1054 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1055 		}
1056 
1057 		/*
1058 		 * NOTE: we have searched root tree and checked the
1059 		 * corresponding ref, it does not need to check again.
1060 		 */
1061 		*search_done = 1;
1062 	}
1063 	btrfs_release_path(path);
1064 
1065 	/* Same search but for extended refs */
1066 	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1067 					   inode_objectid, parent_objectid, 0,
1068 					   0);
1069 	if (!IS_ERR_OR_NULL(extref)) {
1070 		u32 item_size;
1071 		u32 cur_offset = 0;
1072 		unsigned long base;
1073 		struct inode *victim_parent;
1074 
1075 		leaf = path->nodes[0];
1076 
1077 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1078 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1079 
1080 		while (cur_offset < item_size) {
1081 			extref = (struct btrfs_inode_extref *)(base + cur_offset);
1082 
1083 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1084 
1085 			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1086 				goto next;
1087 
1088 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
1089 			if (!victim_name)
1090 				return -ENOMEM;
1091 			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1092 					   victim_name_len);
1093 
1094 			search_key.objectid = inode_objectid;
1095 			search_key.type = BTRFS_INODE_EXTREF_KEY;
1096 			search_key.offset = btrfs_extref_hash(parent_objectid,
1097 							      victim_name,
1098 							      victim_name_len);
1099 			ret = backref_in_log(log_root, &search_key,
1100 					     parent_objectid, victim_name,
1101 					     victim_name_len);
1102 			if (ret < 0) {
1103 				return ret;
1104 			} else if (!ret) {
1105 				ret = -ENOENT;
1106 				victim_parent = read_one_inode(root,
1107 						parent_objectid);
1108 				if (victim_parent) {
1109 					inc_nlink(&inode->vfs_inode);
1110 					btrfs_release_path(path);
1111 
1112 					ret = btrfs_unlink_inode(trans, root,
1113 							BTRFS_I(victim_parent),
1114 							inode,
1115 							victim_name,
1116 							victim_name_len);
1117 					if (!ret)
1118 						ret = btrfs_run_delayed_items(
1119 								  trans);
1120 				}
1121 				iput(victim_parent);
1122 				kfree(victim_name);
1123 				if (ret)
1124 					return ret;
1125 				*search_done = 1;
1126 				goto again;
1127 			}
1128 			kfree(victim_name);
1129 next:
1130 			cur_offset += victim_name_len + sizeof(*extref);
1131 		}
1132 		*search_done = 1;
1133 	}
1134 	btrfs_release_path(path);
1135 
1136 	/* look for a conflicting sequence number */
1137 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1138 					 ref_index, name, namelen, 0);
1139 	if (di && !IS_ERR(di)) {
1140 		ret = drop_one_dir_item(trans, root, path, dir, di);
1141 		if (ret)
1142 			return ret;
1143 	}
1144 	btrfs_release_path(path);
1145 
1146 	/* look for a conflicting name */
1147 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1148 				   name, namelen, 0);
1149 	if (di && !IS_ERR(di)) {
1150 		ret = drop_one_dir_item(trans, root, path, dir, di);
1151 		if (ret)
1152 			return ret;
1153 	}
1154 	btrfs_release_path(path);
1155 
1156 	return 0;
1157 }
1158 
1159 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1160 			     u32 *namelen, char **name, u64 *index,
1161 			     u64 *parent_objectid)
1162 {
1163 	struct btrfs_inode_extref *extref;
1164 
1165 	extref = (struct btrfs_inode_extref *)ref_ptr;
1166 
1167 	*namelen = btrfs_inode_extref_name_len(eb, extref);
1168 	*name = kmalloc(*namelen, GFP_NOFS);
1169 	if (*name == NULL)
1170 		return -ENOMEM;
1171 
1172 	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1173 			   *namelen);
1174 
1175 	if (index)
1176 		*index = btrfs_inode_extref_index(eb, extref);
1177 	if (parent_objectid)
1178 		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1179 
1180 	return 0;
1181 }
1182 
1183 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1184 			  u32 *namelen, char **name, u64 *index)
1185 {
1186 	struct btrfs_inode_ref *ref;
1187 
1188 	ref = (struct btrfs_inode_ref *)ref_ptr;
1189 
1190 	*namelen = btrfs_inode_ref_name_len(eb, ref);
1191 	*name = kmalloc(*namelen, GFP_NOFS);
1192 	if (*name == NULL)
1193 		return -ENOMEM;
1194 
1195 	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1196 
1197 	if (index)
1198 		*index = btrfs_inode_ref_index(eb, ref);
1199 
1200 	return 0;
1201 }
1202 
1203 /*
1204  * Take an inode reference item from the log tree and iterate all names from the
1205  * inode reference item in the subvolume tree with the same key (if it exists).
1206  * For any name that is not in the inode reference item from the log tree, do a
1207  * proper unlink of that name (that is, remove its entry from the inode
1208  * reference item and both dir index keys).
1209  */
1210 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1211 				 struct btrfs_root *root,
1212 				 struct btrfs_path *path,
1213 				 struct btrfs_inode *inode,
1214 				 struct extent_buffer *log_eb,
1215 				 int log_slot,
1216 				 struct btrfs_key *key)
1217 {
1218 	int ret;
1219 	unsigned long ref_ptr;
1220 	unsigned long ref_end;
1221 	struct extent_buffer *eb;
1222 
1223 again:
1224 	btrfs_release_path(path);
1225 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1226 	if (ret > 0) {
1227 		ret = 0;
1228 		goto out;
1229 	}
1230 	if (ret < 0)
1231 		goto out;
1232 
1233 	eb = path->nodes[0];
1234 	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1235 	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
1236 	while (ref_ptr < ref_end) {
1237 		char *name = NULL;
1238 		int namelen;
1239 		u64 parent_id;
1240 
1241 		if (key->type == BTRFS_INODE_EXTREF_KEY) {
1242 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1243 						NULL, &parent_id);
1244 		} else {
1245 			parent_id = key->offset;
1246 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1247 					     NULL);
1248 		}
1249 		if (ret)
1250 			goto out;
1251 
1252 		if (key->type == BTRFS_INODE_EXTREF_KEY)
1253 			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1254 							       parent_id, name,
1255 							       namelen);
1256 		else
1257 			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
1258 							   name, namelen);
1259 
1260 		if (!ret) {
1261 			struct inode *dir;
1262 
1263 			btrfs_release_path(path);
1264 			dir = read_one_inode(root, parent_id);
1265 			if (!dir) {
1266 				ret = -ENOENT;
1267 				kfree(name);
1268 				goto out;
1269 			}
1270 			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
1271 						 inode, name, namelen);
1272 			kfree(name);
1273 			iput(dir);
1274 			if (ret)
1275 				goto out;
1276 			goto again;
1277 		}
1278 
1279 		kfree(name);
1280 		ref_ptr += namelen;
1281 		if (key->type == BTRFS_INODE_EXTREF_KEY)
1282 			ref_ptr += sizeof(struct btrfs_inode_extref);
1283 		else
1284 			ref_ptr += sizeof(struct btrfs_inode_ref);
1285 	}
1286 	ret = 0;
1287  out:
1288 	btrfs_release_path(path);
1289 	return ret;
1290 }
1291 
1292 static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
1293 				  const u8 ref_type, const char *name,
1294 				  const int namelen)
1295 {
1296 	struct btrfs_key key;
1297 	struct btrfs_path *path;
1298 	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
1299 	int ret;
1300 
1301 	path = btrfs_alloc_path();
1302 	if (!path)
1303 		return -ENOMEM;
1304 
1305 	key.objectid = btrfs_ino(BTRFS_I(inode));
1306 	key.type = ref_type;
1307 	if (key.type == BTRFS_INODE_REF_KEY)
1308 		key.offset = parent_id;
1309 	else
1310 		key.offset = btrfs_extref_hash(parent_id, name, namelen);
1311 
1312 	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
1313 	if (ret < 0)
1314 		goto out;
1315 	if (ret > 0) {
1316 		ret = 0;
1317 		goto out;
1318 	}
1319 	if (key.type == BTRFS_INODE_EXTREF_KEY)
1320 		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1321 				path->slots[0], parent_id, name, namelen);
1322 	else
1323 		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
1324 						   name, namelen);
1325 
1326 out:
1327 	btrfs_free_path(path);
1328 	return ret;
1329 }
1330 
1331 static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1332 		    struct inode *dir, struct inode *inode, const char *name,
1333 		    int namelen, u64 ref_index)
1334 {
1335 	struct btrfs_dir_item *dir_item;
1336 	struct btrfs_key key;
1337 	struct btrfs_path *path;
1338 	struct inode *other_inode = NULL;
1339 	int ret;
1340 
1341 	path = btrfs_alloc_path();
1342 	if (!path)
1343 		return -ENOMEM;
1344 
1345 	dir_item = btrfs_lookup_dir_item(NULL, root, path,
1346 					 btrfs_ino(BTRFS_I(dir)),
1347 					 name, namelen, 0);
1348 	if (!dir_item) {
1349 		btrfs_release_path(path);
1350 		goto add_link;
1351 	} else if (IS_ERR(dir_item)) {
1352 		ret = PTR_ERR(dir_item);
1353 		goto out;
1354 	}
1355 
1356 	/*
1357 	 * Our inode's dentry collides with the dentry of another inode which is
1358 	 * in the log but not yet processed since it has a higher inode number.
1359 	 * So delete that other dentry.
1360 	 */
1361 	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
1362 	btrfs_release_path(path);
1363 	other_inode = read_one_inode(root, key.objectid);
1364 	if (!other_inode) {
1365 		ret = -ENOENT;
1366 		goto out;
1367 	}
1368 	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
1369 				 name, namelen);
1370 	if (ret)
1371 		goto out;
1372 	/*
1373 	 * If we dropped the link count to 0, bump it so that later the iput()
1374 	 * on the inode will not free it. We will fixup the link count later.
1375 	 */
1376 	if (other_inode->i_nlink == 0)
1377 		inc_nlink(other_inode);
1378 
1379 	ret = btrfs_run_delayed_items(trans);
1380 	if (ret)
1381 		goto out;
1382 add_link:
1383 	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1384 			     name, namelen, 0, ref_index);
1385 out:
1386 	iput(other_inode);
1387 	btrfs_free_path(path);
1388 
1389 	return ret;
1390 }
1391 
1392 /*
1393  * replay one inode back reference item found in the log tree.
1394  * eb, slot and key refer to the buffer and key found in the log tree.
1395  * root is the destination we are replaying into, and path is for temp
1396  * use by this function.  (it should be released on return).
1397  */
1398 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1399 				  struct btrfs_root *root,
1400 				  struct btrfs_root *log,
1401 				  struct btrfs_path *path,
1402 				  struct extent_buffer *eb, int slot,
1403 				  struct btrfs_key *key)
1404 {
1405 	struct inode *dir = NULL;
1406 	struct inode *inode = NULL;
1407 	unsigned long ref_ptr;
1408 	unsigned long ref_end;
1409 	char *name = NULL;
1410 	int namelen;
1411 	int ret;
1412 	int search_done = 0;
1413 	int log_ref_ver = 0;
1414 	u64 parent_objectid;
1415 	u64 inode_objectid;
1416 	u64 ref_index = 0;
1417 	int ref_struct_size;
1418 
1419 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1420 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1421 
1422 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1423 		struct btrfs_inode_extref *r;
1424 
1425 		ref_struct_size = sizeof(struct btrfs_inode_extref);
1426 		log_ref_ver = 1;
1427 		r = (struct btrfs_inode_extref *)ref_ptr;
1428 		parent_objectid = btrfs_inode_extref_parent(eb, r);
1429 	} else {
1430 		ref_struct_size = sizeof(struct btrfs_inode_ref);
1431 		parent_objectid = key->offset;
1432 	}
1433 	inode_objectid = key->objectid;
1434 
1435 	/*
1436 	 * it is possible that we didn't log all the parent directories
1437 	 * for a given inode.  If we don't find the dir, just don't
1438 	 * copy the back ref in.  The link count fixup code will take
1439 	 * care of the rest
1440 	 */
1441 	dir = read_one_inode(root, parent_objectid);
1442 	if (!dir) {
1443 		ret = -ENOENT;
1444 		goto out;
1445 	}
1446 
1447 	inode = read_one_inode(root, inode_objectid);
1448 	if (!inode) {
1449 		ret = -EIO;
1450 		goto out;
1451 	}
1452 
1453 	while (ref_ptr < ref_end) {
1454 		if (log_ref_ver) {
1455 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1456 						&ref_index, &parent_objectid);
1457 			/*
1458 			 * parent object can change from one array
1459 			 * item to another.
1460 			 */
1461 			if (!dir)
1462 				dir = read_one_inode(root, parent_objectid);
1463 			if (!dir) {
1464 				ret = -ENOENT;
1465 				goto out;
1466 			}
1467 		} else {
1468 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1469 					     &ref_index);
1470 		}
1471 		if (ret)
1472 			goto out;
1473 
1474 		/* if we already have a perfect match, we're done */
1475 		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1476 					btrfs_ino(BTRFS_I(inode)), ref_index,
1477 					name, namelen)) {
1478 			/*
1479 			 * look for a conflicting back reference in the
1480 			 * metadata. if we find one we have to unlink that name
1481 			 * of the file before we add our new link.  Later on, we
1482 			 * overwrite any existing back reference, and we don't
1483 			 * want to create dangling pointers in the directory.
1484 			 */
1485 
1486 			if (!search_done) {
1487 				ret = __add_inode_ref(trans, root, path, log,
1488 						      BTRFS_I(dir),
1489 						      BTRFS_I(inode),
1490 						      inode_objectid,
1491 						      parent_objectid,
1492 						      ref_index, name, namelen,
1493 						      &search_done);
1494 				if (ret) {
1495 					if (ret == 1)
1496 						ret = 0;
1497 					goto out;
1498 				}
1499 			}
1500 
1501 			/*
1502 			 * If a reference item already exists for this inode
1503 			 * with the same parent and name, but different index,
1504 			 * drop it and the corresponding directory index entries
1505 			 * from the parent before adding the new reference item
1506 			 * and dir index entries, otherwise we would fail with
1507 			 * -EEXIST returned from btrfs_add_link() below.
1508 			 */
1509 			ret = btrfs_inode_ref_exists(inode, dir, key->type,
1510 						     name, namelen);
1511 			if (ret > 0) {
1512 				ret = btrfs_unlink_inode(trans, root,
1513 							 BTRFS_I(dir),
1514 							 BTRFS_I(inode),
1515 							 name, namelen);
1516 				/*
1517 				 * If we dropped the link count to 0, bump it so
1518 				 * that later the iput() on the inode will not
1519 				 * free it. We will fixup the link count later.
1520 				 */
1521 				if (!ret && inode->i_nlink == 0)
1522 					inc_nlink(inode);
1523 			}
1524 			if (ret < 0)
1525 				goto out;
1526 
1527 			/* insert our name */
1528 			ret = add_link(trans, root, dir, inode, name, namelen,
1529 				       ref_index);
1530 			if (ret)
1531 				goto out;
1532 
1533 			btrfs_update_inode(trans, root, inode);
1534 		}
1535 
1536 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1537 		kfree(name);
1538 		name = NULL;
1539 		if (log_ref_ver) {
1540 			iput(dir);
1541 			dir = NULL;
1542 		}
1543 	}
1544 
1545 	/*
1546 	 * Before we overwrite the inode reference item in the subvolume tree
1547 	 * with the item from the log tree, we must unlink all names from the
1548 	 * parent directory that are in the subvolume's tree inode reference
1549 	 * item, otherwise we end up with an inconsistent subvolume tree where
1550 	 * dir index entries exist for a name but there is no inode reference
1551 	 * item with the same name.
1552 	 */
1553 	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1554 				    key);
1555 	if (ret)
1556 		goto out;
1557 
1558 	/* finally write the back reference in the inode */
1559 	ret = overwrite_item(trans, root, path, eb, slot, key);
1560 out:
1561 	btrfs_release_path(path);
1562 	kfree(name);
1563 	iput(dir);
1564 	iput(inode);
1565 	return ret;
1566 }
1567 
1568 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1569 			      struct btrfs_root *root, u64 ino)
1570 {
1571 	int ret;
1572 
1573 	ret = btrfs_insert_orphan_item(trans, root, ino);
1574 	if (ret == -EEXIST)
1575 		ret = 0;
1576 
1577 	return ret;
1578 }
1579 
1580 static int count_inode_extrefs(struct btrfs_root *root,
1581 		struct btrfs_inode *inode, struct btrfs_path *path)
1582 {
1583 	int ret = 0;
1584 	int name_len;
1585 	unsigned int nlink = 0;
1586 	u32 item_size;
1587 	u32 cur_offset = 0;
1588 	u64 inode_objectid = btrfs_ino(inode);
1589 	u64 offset = 0;
1590 	unsigned long ptr;
1591 	struct btrfs_inode_extref *extref;
1592 	struct extent_buffer *leaf;
1593 
1594 	while (1) {
1595 		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1596 					    &extref, &offset);
1597 		if (ret)
1598 			break;
1599 
1600 		leaf = path->nodes[0];
1601 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1602 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1603 		cur_offset = 0;
1604 
1605 		while (cur_offset < item_size) {
1606 			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1607 			name_len = btrfs_inode_extref_name_len(leaf, extref);
1608 
1609 			nlink++;
1610 
1611 			cur_offset += name_len + sizeof(*extref);
1612 		}
1613 
1614 		offset++;
1615 		btrfs_release_path(path);
1616 	}
1617 	btrfs_release_path(path);
1618 
1619 	if (ret < 0 && ret != -ENOENT)
1620 		return ret;
1621 	return nlink;
1622 }
1623 
1624 static int count_inode_refs(struct btrfs_root *root,
1625 			struct btrfs_inode *inode, struct btrfs_path *path)
1626 {
1627 	int ret;
1628 	struct btrfs_key key;
1629 	unsigned int nlink = 0;
1630 	unsigned long ptr;
1631 	unsigned long ptr_end;
1632 	int name_len;
1633 	u64 ino = btrfs_ino(inode);
1634 
1635 	key.objectid = ino;
1636 	key.type = BTRFS_INODE_REF_KEY;
1637 	key.offset = (u64)-1;
1638 
1639 	while (1) {
1640 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1641 		if (ret < 0)
1642 			break;
1643 		if (ret > 0) {
1644 			if (path->slots[0] == 0)
1645 				break;
1646 			path->slots[0]--;
1647 		}
1648 process_slot:
1649 		btrfs_item_key_to_cpu(path->nodes[0], &key,
1650 				      path->slots[0]);
1651 		if (key.objectid != ino ||
1652 		    key.type != BTRFS_INODE_REF_KEY)
1653 			break;
1654 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1655 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1656 						   path->slots[0]);
1657 		while (ptr < ptr_end) {
1658 			struct btrfs_inode_ref *ref;
1659 
1660 			ref = (struct btrfs_inode_ref *)ptr;
1661 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1662 							    ref);
1663 			ptr = (unsigned long)(ref + 1) + name_len;
1664 			nlink++;
1665 		}
1666 
1667 		if (key.offset == 0)
1668 			break;
1669 		if (path->slots[0] > 0) {
1670 			path->slots[0]--;
1671 			goto process_slot;
1672 		}
1673 		key.offset--;
1674 		btrfs_release_path(path);
1675 	}
1676 	btrfs_release_path(path);
1677 
1678 	return nlink;
1679 }
1680 
1681 /*
1682  * There are a few corners where the link count of the file can't
1683  * be properly maintained during replay.  So, instead of adding
1684  * lots of complexity to the log code, we just scan the backrefs
1685  * for any file that has been through replay.
1686  *
1687  * The scan will update the link count on the inode to reflect the
1688  * number of back refs found.  If it goes down to zero, the iput
1689  * will free the inode.
1690  */
1691 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1692 					   struct btrfs_root *root,
1693 					   struct inode *inode)
1694 {
1695 	struct btrfs_path *path;
1696 	int ret;
1697 	u64 nlink = 0;
1698 	u64 ino = btrfs_ino(BTRFS_I(inode));
1699 
1700 	path = btrfs_alloc_path();
1701 	if (!path)
1702 		return -ENOMEM;
1703 
1704 	ret = count_inode_refs(root, BTRFS_I(inode), path);
1705 	if (ret < 0)
1706 		goto out;
1707 
1708 	nlink = ret;
1709 
1710 	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1711 	if (ret < 0)
1712 		goto out;
1713 
1714 	nlink += ret;
1715 
1716 	ret = 0;
1717 
1718 	if (nlink != inode->i_nlink) {
1719 		set_nlink(inode, nlink);
1720 		btrfs_update_inode(trans, root, inode);
1721 	}
1722 	BTRFS_I(inode)->index_cnt = (u64)-1;
1723 
1724 	if (inode->i_nlink == 0) {
1725 		if (S_ISDIR(inode->i_mode)) {
1726 			ret = replay_dir_deletes(trans, root, NULL, path,
1727 						 ino, 1);
1728 			if (ret)
1729 				goto out;
1730 		}
1731 		ret = insert_orphan_item(trans, root, ino);
1732 	}
1733 
1734 out:
1735 	btrfs_free_path(path);
1736 	return ret;
1737 }
1738 
1739 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1740 					    struct btrfs_root *root,
1741 					    struct btrfs_path *path)
1742 {
1743 	int ret;
1744 	struct btrfs_key key;
1745 	struct inode *inode;
1746 
1747 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1748 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1749 	key.offset = (u64)-1;
1750 	while (1) {
1751 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1752 		if (ret < 0)
1753 			break;
1754 
1755 		if (ret == 1) {
1756 			if (path->slots[0] == 0)
1757 				break;
1758 			path->slots[0]--;
1759 		}
1760 
1761 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1762 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1763 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1764 			break;
1765 
1766 		ret = btrfs_del_item(trans, root, path);
1767 		if (ret)
1768 			goto out;
1769 
1770 		btrfs_release_path(path);
1771 		inode = read_one_inode(root, key.offset);
1772 		if (!inode)
1773 			return -EIO;
1774 
1775 		ret = fixup_inode_link_count(trans, root, inode);
1776 		iput(inode);
1777 		if (ret)
1778 			goto out;
1779 
1780 		/*
1781 		 * fixup on a directory may create new entries,
1782 		 * make sure we always look for the highset possible
1783 		 * offset
1784 		 */
1785 		key.offset = (u64)-1;
1786 	}
1787 	ret = 0;
1788 out:
1789 	btrfs_release_path(path);
1790 	return ret;
1791 }
1792 
1793 
1794 /*
1795  * record a given inode in the fixup dir so we can check its link
1796  * count when replay is done.  The link count is incremented here
1797  * so the inode won't go away until we check it
1798  */
1799 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1800 				      struct btrfs_root *root,
1801 				      struct btrfs_path *path,
1802 				      u64 objectid)
1803 {
1804 	struct btrfs_key key;
1805 	int ret = 0;
1806 	struct inode *inode;
1807 
1808 	inode = read_one_inode(root, objectid);
1809 	if (!inode)
1810 		return -EIO;
1811 
1812 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1813 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1814 	key.offset = objectid;
1815 
1816 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1817 
1818 	btrfs_release_path(path);
1819 	if (ret == 0) {
1820 		if (!inode->i_nlink)
1821 			set_nlink(inode, 1);
1822 		else
1823 			inc_nlink(inode);
1824 		ret = btrfs_update_inode(trans, root, inode);
1825 	} else if (ret == -EEXIST) {
1826 		ret = 0;
1827 	} else {
1828 		BUG(); /* Logic Error */
1829 	}
1830 	iput(inode);
1831 
1832 	return ret;
1833 }
1834 
1835 /*
1836  * when replaying the log for a directory, we only insert names
1837  * for inodes that actually exist.  This means an fsync on a directory
1838  * does not implicitly fsync all the new files in it
1839  */
1840 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1841 				    struct btrfs_root *root,
1842 				    u64 dirid, u64 index,
1843 				    char *name, int name_len,
1844 				    struct btrfs_key *location)
1845 {
1846 	struct inode *inode;
1847 	struct inode *dir;
1848 	int ret;
1849 
1850 	inode = read_one_inode(root, location->objectid);
1851 	if (!inode)
1852 		return -ENOENT;
1853 
1854 	dir = read_one_inode(root, dirid);
1855 	if (!dir) {
1856 		iput(inode);
1857 		return -EIO;
1858 	}
1859 
1860 	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1861 			name_len, 1, index);
1862 
1863 	/* FIXME, put inode into FIXUP list */
1864 
1865 	iput(inode);
1866 	iput(dir);
1867 	return ret;
1868 }
1869 
1870 /*
1871  * take a single entry in a log directory item and replay it into
1872  * the subvolume.
1873  *
1874  * if a conflicting item exists in the subdirectory already,
1875  * the inode it points to is unlinked and put into the link count
1876  * fix up tree.
1877  *
1878  * If a name from the log points to a file or directory that does
1879  * not exist in the FS, it is skipped.  fsyncs on directories
1880  * do not force down inodes inside that directory, just changes to the
1881  * names or unlinks in a directory.
1882  *
1883  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1884  * non-existing inode) and 1 if the name was replayed.
1885  */
1886 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1887 				    struct btrfs_root *root,
1888 				    struct btrfs_path *path,
1889 				    struct extent_buffer *eb,
1890 				    struct btrfs_dir_item *di,
1891 				    struct btrfs_key *key)
1892 {
1893 	char *name;
1894 	int name_len;
1895 	struct btrfs_dir_item *dst_di;
1896 	struct btrfs_key found_key;
1897 	struct btrfs_key log_key;
1898 	struct inode *dir;
1899 	u8 log_type;
1900 	int exists;
1901 	int ret = 0;
1902 	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1903 	bool name_added = false;
1904 
1905 	dir = read_one_inode(root, key->objectid);
1906 	if (!dir)
1907 		return -EIO;
1908 
1909 	name_len = btrfs_dir_name_len(eb, di);
1910 	name = kmalloc(name_len, GFP_NOFS);
1911 	if (!name) {
1912 		ret = -ENOMEM;
1913 		goto out;
1914 	}
1915 
1916 	log_type = btrfs_dir_type(eb, di);
1917 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1918 		   name_len);
1919 
1920 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1921 	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1922 	if (exists == 0)
1923 		exists = 1;
1924 	else
1925 		exists = 0;
1926 	btrfs_release_path(path);
1927 
1928 	if (key->type == BTRFS_DIR_ITEM_KEY) {
1929 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1930 				       name, name_len, 1);
1931 	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1932 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1933 						     key->objectid,
1934 						     key->offset, name,
1935 						     name_len, 1);
1936 	} else {
1937 		/* Corruption */
1938 		ret = -EINVAL;
1939 		goto out;
1940 	}
1941 	if (IS_ERR_OR_NULL(dst_di)) {
1942 		/* we need a sequence number to insert, so we only
1943 		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1944 		 */
1945 		if (key->type != BTRFS_DIR_INDEX_KEY)
1946 			goto out;
1947 		goto insert;
1948 	}
1949 
1950 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1951 	/* the existing item matches the logged item */
1952 	if (found_key.objectid == log_key.objectid &&
1953 	    found_key.type == log_key.type &&
1954 	    found_key.offset == log_key.offset &&
1955 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1956 		update_size = false;
1957 		goto out;
1958 	}
1959 
1960 	/*
1961 	 * don't drop the conflicting directory entry if the inode
1962 	 * for the new entry doesn't exist
1963 	 */
1964 	if (!exists)
1965 		goto out;
1966 
1967 	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
1968 	if (ret)
1969 		goto out;
1970 
1971 	if (key->type == BTRFS_DIR_INDEX_KEY)
1972 		goto insert;
1973 out:
1974 	btrfs_release_path(path);
1975 	if (!ret && update_size) {
1976 		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
1977 		ret = btrfs_update_inode(trans, root, dir);
1978 	}
1979 	kfree(name);
1980 	iput(dir);
1981 	if (!ret && name_added)
1982 		ret = 1;
1983 	return ret;
1984 
1985 insert:
1986 	/*
1987 	 * Check if the inode reference exists in the log for the given name,
1988 	 * inode and parent inode
1989 	 */
1990 	found_key.objectid = log_key.objectid;
1991 	found_key.type = BTRFS_INODE_REF_KEY;
1992 	found_key.offset = key->objectid;
1993 	ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
1994 	if (ret < 0) {
1995 	        goto out;
1996 	} else if (ret) {
1997 	        /* The dentry will be added later. */
1998 	        ret = 0;
1999 	        update_size = false;
2000 	        goto out;
2001 	}
2002 
2003 	found_key.objectid = log_key.objectid;
2004 	found_key.type = BTRFS_INODE_EXTREF_KEY;
2005 	found_key.offset = key->objectid;
2006 	ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2007 			     name_len);
2008 	if (ret < 0) {
2009 		goto out;
2010 	} else if (ret) {
2011 		/* The dentry will be added later. */
2012 		ret = 0;
2013 		update_size = false;
2014 		goto out;
2015 	}
2016 	btrfs_release_path(path);
2017 	ret = insert_one_name(trans, root, key->objectid, key->offset,
2018 			      name, name_len, &log_key);
2019 	if (ret && ret != -ENOENT && ret != -EEXIST)
2020 		goto out;
2021 	if (!ret)
2022 		name_added = true;
2023 	update_size = false;
2024 	ret = 0;
2025 	goto out;
2026 }
2027 
2028 /*
2029  * find all the names in a directory item and reconcile them into
2030  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
2031  * one name in a directory item, but the same code gets used for
2032  * both directory index types
2033  */
2034 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2035 					struct btrfs_root *root,
2036 					struct btrfs_path *path,
2037 					struct extent_buffer *eb, int slot,
2038 					struct btrfs_key *key)
2039 {
2040 	int ret = 0;
2041 	u32 item_size = btrfs_item_size_nr(eb, slot);
2042 	struct btrfs_dir_item *di;
2043 	int name_len;
2044 	unsigned long ptr;
2045 	unsigned long ptr_end;
2046 	struct btrfs_path *fixup_path = NULL;
2047 
2048 	ptr = btrfs_item_ptr_offset(eb, slot);
2049 	ptr_end = ptr + item_size;
2050 	while (ptr < ptr_end) {
2051 		di = (struct btrfs_dir_item *)ptr;
2052 		name_len = btrfs_dir_name_len(eb, di);
2053 		ret = replay_one_name(trans, root, path, eb, di, key);
2054 		if (ret < 0)
2055 			break;
2056 		ptr = (unsigned long)(di + 1);
2057 		ptr += name_len;
2058 
2059 		/*
2060 		 * If this entry refers to a non-directory (directories can not
2061 		 * have a link count > 1) and it was added in the transaction
2062 		 * that was not committed, make sure we fixup the link count of
2063 		 * the inode it the entry points to. Otherwise something like
2064 		 * the following would result in a directory pointing to an
2065 		 * inode with a wrong link that does not account for this dir
2066 		 * entry:
2067 		 *
2068 		 * mkdir testdir
2069 		 * touch testdir/foo
2070 		 * touch testdir/bar
2071 		 * sync
2072 		 *
2073 		 * ln testdir/bar testdir/bar_link
2074 		 * ln testdir/foo testdir/foo_link
2075 		 * xfs_io -c "fsync" testdir/bar
2076 		 *
2077 		 * <power failure>
2078 		 *
2079 		 * mount fs, log replay happens
2080 		 *
2081 		 * File foo would remain with a link count of 1 when it has two
2082 		 * entries pointing to it in the directory testdir. This would
2083 		 * make it impossible to ever delete the parent directory has
2084 		 * it would result in stale dentries that can never be deleted.
2085 		 */
2086 		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2087 			struct btrfs_key di_key;
2088 
2089 			if (!fixup_path) {
2090 				fixup_path = btrfs_alloc_path();
2091 				if (!fixup_path) {
2092 					ret = -ENOMEM;
2093 					break;
2094 				}
2095 			}
2096 
2097 			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2098 			ret = link_to_fixup_dir(trans, root, fixup_path,
2099 						di_key.objectid);
2100 			if (ret)
2101 				break;
2102 		}
2103 		ret = 0;
2104 	}
2105 	btrfs_free_path(fixup_path);
2106 	return ret;
2107 }
2108 
2109 /*
2110  * directory replay has two parts.  There are the standard directory
2111  * items in the log copied from the subvolume, and range items
2112  * created in the log while the subvolume was logged.
2113  *
2114  * The range items tell us which parts of the key space the log
2115  * is authoritative for.  During replay, if a key in the subvolume
2116  * directory is in a logged range item, but not actually in the log
2117  * that means it was deleted from the directory before the fsync
2118  * and should be removed.
2119  */
2120 static noinline int find_dir_range(struct btrfs_root *root,
2121 				   struct btrfs_path *path,
2122 				   u64 dirid, int key_type,
2123 				   u64 *start_ret, u64 *end_ret)
2124 {
2125 	struct btrfs_key key;
2126 	u64 found_end;
2127 	struct btrfs_dir_log_item *item;
2128 	int ret;
2129 	int nritems;
2130 
2131 	if (*start_ret == (u64)-1)
2132 		return 1;
2133 
2134 	key.objectid = dirid;
2135 	key.type = key_type;
2136 	key.offset = *start_ret;
2137 
2138 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2139 	if (ret < 0)
2140 		goto out;
2141 	if (ret > 0) {
2142 		if (path->slots[0] == 0)
2143 			goto out;
2144 		path->slots[0]--;
2145 	}
2146 	if (ret != 0)
2147 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2148 
2149 	if (key.type != key_type || key.objectid != dirid) {
2150 		ret = 1;
2151 		goto next;
2152 	}
2153 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2154 			      struct btrfs_dir_log_item);
2155 	found_end = btrfs_dir_log_end(path->nodes[0], item);
2156 
2157 	if (*start_ret >= key.offset && *start_ret <= found_end) {
2158 		ret = 0;
2159 		*start_ret = key.offset;
2160 		*end_ret = found_end;
2161 		goto out;
2162 	}
2163 	ret = 1;
2164 next:
2165 	/* check the next slot in the tree to see if it is a valid item */
2166 	nritems = btrfs_header_nritems(path->nodes[0]);
2167 	path->slots[0]++;
2168 	if (path->slots[0] >= nritems) {
2169 		ret = btrfs_next_leaf(root, path);
2170 		if (ret)
2171 			goto out;
2172 	}
2173 
2174 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2175 
2176 	if (key.type != key_type || key.objectid != dirid) {
2177 		ret = 1;
2178 		goto out;
2179 	}
2180 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2181 			      struct btrfs_dir_log_item);
2182 	found_end = btrfs_dir_log_end(path->nodes[0], item);
2183 	*start_ret = key.offset;
2184 	*end_ret = found_end;
2185 	ret = 0;
2186 out:
2187 	btrfs_release_path(path);
2188 	return ret;
2189 }
2190 
2191 /*
2192  * this looks for a given directory item in the log.  If the directory
2193  * item is not in the log, the item is removed and the inode it points
2194  * to is unlinked
2195  */
2196 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2197 				      struct btrfs_root *root,
2198 				      struct btrfs_root *log,
2199 				      struct btrfs_path *path,
2200 				      struct btrfs_path *log_path,
2201 				      struct inode *dir,
2202 				      struct btrfs_key *dir_key)
2203 {
2204 	int ret;
2205 	struct extent_buffer *eb;
2206 	int slot;
2207 	u32 item_size;
2208 	struct btrfs_dir_item *di;
2209 	struct btrfs_dir_item *log_di;
2210 	int name_len;
2211 	unsigned long ptr;
2212 	unsigned long ptr_end;
2213 	char *name;
2214 	struct inode *inode;
2215 	struct btrfs_key location;
2216 
2217 again:
2218 	eb = path->nodes[0];
2219 	slot = path->slots[0];
2220 	item_size = btrfs_item_size_nr(eb, slot);
2221 	ptr = btrfs_item_ptr_offset(eb, slot);
2222 	ptr_end = ptr + item_size;
2223 	while (ptr < ptr_end) {
2224 		di = (struct btrfs_dir_item *)ptr;
2225 		name_len = btrfs_dir_name_len(eb, di);
2226 		name = kmalloc(name_len, GFP_NOFS);
2227 		if (!name) {
2228 			ret = -ENOMEM;
2229 			goto out;
2230 		}
2231 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
2232 				  name_len);
2233 		log_di = NULL;
2234 		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2235 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
2236 						       dir_key->objectid,
2237 						       name, name_len, 0);
2238 		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2239 			log_di = btrfs_lookup_dir_index_item(trans, log,
2240 						     log_path,
2241 						     dir_key->objectid,
2242 						     dir_key->offset,
2243 						     name, name_len, 0);
2244 		}
2245 		if (!log_di || log_di == ERR_PTR(-ENOENT)) {
2246 			btrfs_dir_item_key_to_cpu(eb, di, &location);
2247 			btrfs_release_path(path);
2248 			btrfs_release_path(log_path);
2249 			inode = read_one_inode(root, location.objectid);
2250 			if (!inode) {
2251 				kfree(name);
2252 				return -EIO;
2253 			}
2254 
2255 			ret = link_to_fixup_dir(trans, root,
2256 						path, location.objectid);
2257 			if (ret) {
2258 				kfree(name);
2259 				iput(inode);
2260 				goto out;
2261 			}
2262 
2263 			inc_nlink(inode);
2264 			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2265 					BTRFS_I(inode), name, name_len);
2266 			if (!ret)
2267 				ret = btrfs_run_delayed_items(trans);
2268 			kfree(name);
2269 			iput(inode);
2270 			if (ret)
2271 				goto out;
2272 
2273 			/* there might still be more names under this key
2274 			 * check and repeat if required
2275 			 */
2276 			ret = btrfs_search_slot(NULL, root, dir_key, path,
2277 						0, 0);
2278 			if (ret == 0)
2279 				goto again;
2280 			ret = 0;
2281 			goto out;
2282 		} else if (IS_ERR(log_di)) {
2283 			kfree(name);
2284 			return PTR_ERR(log_di);
2285 		}
2286 		btrfs_release_path(log_path);
2287 		kfree(name);
2288 
2289 		ptr = (unsigned long)(di + 1);
2290 		ptr += name_len;
2291 	}
2292 	ret = 0;
2293 out:
2294 	btrfs_release_path(path);
2295 	btrfs_release_path(log_path);
2296 	return ret;
2297 }
2298 
2299 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2300 			      struct btrfs_root *root,
2301 			      struct btrfs_root *log,
2302 			      struct btrfs_path *path,
2303 			      const u64 ino)
2304 {
2305 	struct btrfs_key search_key;
2306 	struct btrfs_path *log_path;
2307 	int i;
2308 	int nritems;
2309 	int ret;
2310 
2311 	log_path = btrfs_alloc_path();
2312 	if (!log_path)
2313 		return -ENOMEM;
2314 
2315 	search_key.objectid = ino;
2316 	search_key.type = BTRFS_XATTR_ITEM_KEY;
2317 	search_key.offset = 0;
2318 again:
2319 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2320 	if (ret < 0)
2321 		goto out;
2322 process_leaf:
2323 	nritems = btrfs_header_nritems(path->nodes[0]);
2324 	for (i = path->slots[0]; i < nritems; i++) {
2325 		struct btrfs_key key;
2326 		struct btrfs_dir_item *di;
2327 		struct btrfs_dir_item *log_di;
2328 		u32 total_size;
2329 		u32 cur;
2330 
2331 		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2332 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2333 			ret = 0;
2334 			goto out;
2335 		}
2336 
2337 		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2338 		total_size = btrfs_item_size_nr(path->nodes[0], i);
2339 		cur = 0;
2340 		while (cur < total_size) {
2341 			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2342 			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2343 			u32 this_len = sizeof(*di) + name_len + data_len;
2344 			char *name;
2345 
2346 			name = kmalloc(name_len, GFP_NOFS);
2347 			if (!name) {
2348 				ret = -ENOMEM;
2349 				goto out;
2350 			}
2351 			read_extent_buffer(path->nodes[0], name,
2352 					   (unsigned long)(di + 1), name_len);
2353 
2354 			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2355 						    name, name_len, 0);
2356 			btrfs_release_path(log_path);
2357 			if (!log_di) {
2358 				/* Doesn't exist in log tree, so delete it. */
2359 				btrfs_release_path(path);
2360 				di = btrfs_lookup_xattr(trans, root, path, ino,
2361 							name, name_len, -1);
2362 				kfree(name);
2363 				if (IS_ERR(di)) {
2364 					ret = PTR_ERR(di);
2365 					goto out;
2366 				}
2367 				ASSERT(di);
2368 				ret = btrfs_delete_one_dir_name(trans, root,
2369 								path, di);
2370 				if (ret)
2371 					goto out;
2372 				btrfs_release_path(path);
2373 				search_key = key;
2374 				goto again;
2375 			}
2376 			kfree(name);
2377 			if (IS_ERR(log_di)) {
2378 				ret = PTR_ERR(log_di);
2379 				goto out;
2380 			}
2381 			cur += this_len;
2382 			di = (struct btrfs_dir_item *)((char *)di + this_len);
2383 		}
2384 	}
2385 	ret = btrfs_next_leaf(root, path);
2386 	if (ret > 0)
2387 		ret = 0;
2388 	else if (ret == 0)
2389 		goto process_leaf;
2390 out:
2391 	btrfs_free_path(log_path);
2392 	btrfs_release_path(path);
2393 	return ret;
2394 }
2395 
2396 
2397 /*
2398  * deletion replay happens before we copy any new directory items
2399  * out of the log or out of backreferences from inodes.  It
2400  * scans the log to find ranges of keys that log is authoritative for,
2401  * and then scans the directory to find items in those ranges that are
2402  * not present in the log.
2403  *
2404  * Anything we don't find in the log is unlinked and removed from the
2405  * directory.
2406  */
2407 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2408 				       struct btrfs_root *root,
2409 				       struct btrfs_root *log,
2410 				       struct btrfs_path *path,
2411 				       u64 dirid, int del_all)
2412 {
2413 	u64 range_start;
2414 	u64 range_end;
2415 	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2416 	int ret = 0;
2417 	struct btrfs_key dir_key;
2418 	struct btrfs_key found_key;
2419 	struct btrfs_path *log_path;
2420 	struct inode *dir;
2421 
2422 	dir_key.objectid = dirid;
2423 	dir_key.type = BTRFS_DIR_ITEM_KEY;
2424 	log_path = btrfs_alloc_path();
2425 	if (!log_path)
2426 		return -ENOMEM;
2427 
2428 	dir = read_one_inode(root, dirid);
2429 	/* it isn't an error if the inode isn't there, that can happen
2430 	 * because we replay the deletes before we copy in the inode item
2431 	 * from the log
2432 	 */
2433 	if (!dir) {
2434 		btrfs_free_path(log_path);
2435 		return 0;
2436 	}
2437 again:
2438 	range_start = 0;
2439 	range_end = 0;
2440 	while (1) {
2441 		if (del_all)
2442 			range_end = (u64)-1;
2443 		else {
2444 			ret = find_dir_range(log, path, dirid, key_type,
2445 					     &range_start, &range_end);
2446 			if (ret != 0)
2447 				break;
2448 		}
2449 
2450 		dir_key.offset = range_start;
2451 		while (1) {
2452 			int nritems;
2453 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2454 						0, 0);
2455 			if (ret < 0)
2456 				goto out;
2457 
2458 			nritems = btrfs_header_nritems(path->nodes[0]);
2459 			if (path->slots[0] >= nritems) {
2460 				ret = btrfs_next_leaf(root, path);
2461 				if (ret == 1)
2462 					break;
2463 				else if (ret < 0)
2464 					goto out;
2465 			}
2466 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2467 					      path->slots[0]);
2468 			if (found_key.objectid != dirid ||
2469 			    found_key.type != dir_key.type)
2470 				goto next_type;
2471 
2472 			if (found_key.offset > range_end)
2473 				break;
2474 
2475 			ret = check_item_in_log(trans, root, log, path,
2476 						log_path, dir,
2477 						&found_key);
2478 			if (ret)
2479 				goto out;
2480 			if (found_key.offset == (u64)-1)
2481 				break;
2482 			dir_key.offset = found_key.offset + 1;
2483 		}
2484 		btrfs_release_path(path);
2485 		if (range_end == (u64)-1)
2486 			break;
2487 		range_start = range_end + 1;
2488 	}
2489 
2490 next_type:
2491 	ret = 0;
2492 	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2493 		key_type = BTRFS_DIR_LOG_INDEX_KEY;
2494 		dir_key.type = BTRFS_DIR_INDEX_KEY;
2495 		btrfs_release_path(path);
2496 		goto again;
2497 	}
2498 out:
2499 	btrfs_release_path(path);
2500 	btrfs_free_path(log_path);
2501 	iput(dir);
2502 	return ret;
2503 }
2504 
2505 /*
2506  * the process_func used to replay items from the log tree.  This
2507  * gets called in two different stages.  The first stage just looks
2508  * for inodes and makes sure they are all copied into the subvolume.
2509  *
2510  * The second stage copies all the other item types from the log into
2511  * the subvolume.  The two stage approach is slower, but gets rid of
2512  * lots of complexity around inodes referencing other inodes that exist
2513  * only in the log (references come from either directory items or inode
2514  * back refs).
2515  */
2516 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2517 			     struct walk_control *wc, u64 gen, int level)
2518 {
2519 	int nritems;
2520 	struct btrfs_path *path;
2521 	struct btrfs_root *root = wc->replay_dest;
2522 	struct btrfs_key key;
2523 	int i;
2524 	int ret;
2525 
2526 	ret = btrfs_read_buffer(eb, gen, level, NULL);
2527 	if (ret)
2528 		return ret;
2529 
2530 	level = btrfs_header_level(eb);
2531 
2532 	if (level != 0)
2533 		return 0;
2534 
2535 	path = btrfs_alloc_path();
2536 	if (!path)
2537 		return -ENOMEM;
2538 
2539 	nritems = btrfs_header_nritems(eb);
2540 	for (i = 0; i < nritems; i++) {
2541 		btrfs_item_key_to_cpu(eb, &key, i);
2542 
2543 		/* inode keys are done during the first stage */
2544 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2545 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2546 			struct btrfs_inode_item *inode_item;
2547 			u32 mode;
2548 
2549 			inode_item = btrfs_item_ptr(eb, i,
2550 					    struct btrfs_inode_item);
2551 			/*
2552 			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2553 			 * and never got linked before the fsync, skip it, as
2554 			 * replaying it is pointless since it would be deleted
2555 			 * later. We skip logging tmpfiles, but it's always
2556 			 * possible we are replaying a log created with a kernel
2557 			 * that used to log tmpfiles.
2558 			 */
2559 			if (btrfs_inode_nlink(eb, inode_item) == 0) {
2560 				wc->ignore_cur_inode = true;
2561 				continue;
2562 			} else {
2563 				wc->ignore_cur_inode = false;
2564 			}
2565 			ret = replay_xattr_deletes(wc->trans, root, log,
2566 						   path, key.objectid);
2567 			if (ret)
2568 				break;
2569 			mode = btrfs_inode_mode(eb, inode_item);
2570 			if (S_ISDIR(mode)) {
2571 				ret = replay_dir_deletes(wc->trans,
2572 					 root, log, path, key.objectid, 0);
2573 				if (ret)
2574 					break;
2575 			}
2576 			ret = overwrite_item(wc->trans, root, path,
2577 					     eb, i, &key);
2578 			if (ret)
2579 				break;
2580 
2581 			/*
2582 			 * Before replaying extents, truncate the inode to its
2583 			 * size. We need to do it now and not after log replay
2584 			 * because before an fsync we can have prealloc extents
2585 			 * added beyond the inode's i_size. If we did it after,
2586 			 * through orphan cleanup for example, we would drop
2587 			 * those prealloc extents just after replaying them.
2588 			 */
2589 			if (S_ISREG(mode)) {
2590 				struct inode *inode;
2591 				u64 from;
2592 
2593 				inode = read_one_inode(root, key.objectid);
2594 				if (!inode) {
2595 					ret = -EIO;
2596 					break;
2597 				}
2598 				from = ALIGN(i_size_read(inode),
2599 					     root->fs_info->sectorsize);
2600 				ret = btrfs_drop_extents(wc->trans, root, inode,
2601 							 from, (u64)-1, 1);
2602 				if (!ret) {
2603 					/* Update the inode's nbytes. */
2604 					ret = btrfs_update_inode(wc->trans,
2605 								 root, inode);
2606 				}
2607 				iput(inode);
2608 				if (ret)
2609 					break;
2610 			}
2611 
2612 			ret = link_to_fixup_dir(wc->trans, root,
2613 						path, key.objectid);
2614 			if (ret)
2615 				break;
2616 		}
2617 
2618 		if (wc->ignore_cur_inode)
2619 			continue;
2620 
2621 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2622 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2623 			ret = replay_one_dir_item(wc->trans, root, path,
2624 						  eb, i, &key);
2625 			if (ret)
2626 				break;
2627 		}
2628 
2629 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2630 			continue;
2631 
2632 		/* these keys are simply copied */
2633 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2634 			ret = overwrite_item(wc->trans, root, path,
2635 					     eb, i, &key);
2636 			if (ret)
2637 				break;
2638 		} else if (key.type == BTRFS_INODE_REF_KEY ||
2639 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2640 			ret = add_inode_ref(wc->trans, root, log, path,
2641 					    eb, i, &key);
2642 			if (ret && ret != -ENOENT)
2643 				break;
2644 			ret = 0;
2645 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2646 			ret = replay_one_extent(wc->trans, root, path,
2647 						eb, i, &key);
2648 			if (ret)
2649 				break;
2650 		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2651 			ret = replay_one_dir_item(wc->trans, root, path,
2652 						  eb, i, &key);
2653 			if (ret)
2654 				break;
2655 		}
2656 	}
2657 	btrfs_free_path(path);
2658 	return ret;
2659 }
2660 
2661 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2662 				   struct btrfs_root *root,
2663 				   struct btrfs_path *path, int *level,
2664 				   struct walk_control *wc)
2665 {
2666 	struct btrfs_fs_info *fs_info = root->fs_info;
2667 	u64 root_owner;
2668 	u64 bytenr;
2669 	u64 ptr_gen;
2670 	struct extent_buffer *next;
2671 	struct extent_buffer *cur;
2672 	struct extent_buffer *parent;
2673 	u32 blocksize;
2674 	int ret = 0;
2675 
2676 	WARN_ON(*level < 0);
2677 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2678 
2679 	while (*level > 0) {
2680 		struct btrfs_key first_key;
2681 
2682 		WARN_ON(*level < 0);
2683 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
2684 		cur = path->nodes[*level];
2685 
2686 		WARN_ON(btrfs_header_level(cur) != *level);
2687 
2688 		if (path->slots[*level] >=
2689 		    btrfs_header_nritems(cur))
2690 			break;
2691 
2692 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2693 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2694 		btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2695 		blocksize = fs_info->nodesize;
2696 
2697 		parent = path->nodes[*level];
2698 		root_owner = btrfs_header_owner(parent);
2699 
2700 		next = btrfs_find_create_tree_block(fs_info, bytenr);
2701 		if (IS_ERR(next))
2702 			return PTR_ERR(next);
2703 
2704 		if (*level == 1) {
2705 			ret = wc->process_func(root, next, wc, ptr_gen,
2706 					       *level - 1);
2707 			if (ret) {
2708 				free_extent_buffer(next);
2709 				return ret;
2710 			}
2711 
2712 			path->slots[*level]++;
2713 			if (wc->free) {
2714 				ret = btrfs_read_buffer(next, ptr_gen,
2715 							*level - 1, &first_key);
2716 				if (ret) {
2717 					free_extent_buffer(next);
2718 					return ret;
2719 				}
2720 
2721 				if (trans) {
2722 					btrfs_tree_lock(next);
2723 					btrfs_set_lock_blocking_write(next);
2724 					btrfs_clean_tree_block(next);
2725 					btrfs_wait_tree_block_writeback(next);
2726 					btrfs_tree_unlock(next);
2727 				} else {
2728 					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2729 						clear_extent_buffer_dirty(next);
2730 				}
2731 
2732 				WARN_ON(root_owner !=
2733 					BTRFS_TREE_LOG_OBJECTID);
2734 				ret = btrfs_free_and_pin_reserved_extent(
2735 							fs_info, bytenr,
2736 							blocksize);
2737 				if (ret) {
2738 					free_extent_buffer(next);
2739 					return ret;
2740 				}
2741 			}
2742 			free_extent_buffer(next);
2743 			continue;
2744 		}
2745 		ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2746 		if (ret) {
2747 			free_extent_buffer(next);
2748 			return ret;
2749 		}
2750 
2751 		WARN_ON(*level <= 0);
2752 		if (path->nodes[*level-1])
2753 			free_extent_buffer(path->nodes[*level-1]);
2754 		path->nodes[*level-1] = next;
2755 		*level = btrfs_header_level(next);
2756 		path->slots[*level] = 0;
2757 		cond_resched();
2758 	}
2759 	WARN_ON(*level < 0);
2760 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2761 
2762 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2763 
2764 	cond_resched();
2765 	return 0;
2766 }
2767 
2768 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2769 				 struct btrfs_root *root,
2770 				 struct btrfs_path *path, int *level,
2771 				 struct walk_control *wc)
2772 {
2773 	struct btrfs_fs_info *fs_info = root->fs_info;
2774 	u64 root_owner;
2775 	int i;
2776 	int slot;
2777 	int ret;
2778 
2779 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2780 		slot = path->slots[i];
2781 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2782 			path->slots[i]++;
2783 			*level = i;
2784 			WARN_ON(*level == 0);
2785 			return 0;
2786 		} else {
2787 			struct extent_buffer *parent;
2788 			if (path->nodes[*level] == root->node)
2789 				parent = path->nodes[*level];
2790 			else
2791 				parent = path->nodes[*level + 1];
2792 
2793 			root_owner = btrfs_header_owner(parent);
2794 			ret = wc->process_func(root, path->nodes[*level], wc,
2795 				 btrfs_header_generation(path->nodes[*level]),
2796 				 *level);
2797 			if (ret)
2798 				return ret;
2799 
2800 			if (wc->free) {
2801 				struct extent_buffer *next;
2802 
2803 				next = path->nodes[*level];
2804 
2805 				if (trans) {
2806 					btrfs_tree_lock(next);
2807 					btrfs_set_lock_blocking_write(next);
2808 					btrfs_clean_tree_block(next);
2809 					btrfs_wait_tree_block_writeback(next);
2810 					btrfs_tree_unlock(next);
2811 				} else {
2812 					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2813 						clear_extent_buffer_dirty(next);
2814 				}
2815 
2816 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2817 				ret = btrfs_free_and_pin_reserved_extent(
2818 						fs_info,
2819 						path->nodes[*level]->start,
2820 						path->nodes[*level]->len);
2821 				if (ret)
2822 					return ret;
2823 			}
2824 			free_extent_buffer(path->nodes[*level]);
2825 			path->nodes[*level] = NULL;
2826 			*level = i + 1;
2827 		}
2828 	}
2829 	return 1;
2830 }
2831 
2832 /*
2833  * drop the reference count on the tree rooted at 'snap'.  This traverses
2834  * the tree freeing any blocks that have a ref count of zero after being
2835  * decremented.
2836  */
2837 static int walk_log_tree(struct btrfs_trans_handle *trans,
2838 			 struct btrfs_root *log, struct walk_control *wc)
2839 {
2840 	struct btrfs_fs_info *fs_info = log->fs_info;
2841 	int ret = 0;
2842 	int wret;
2843 	int level;
2844 	struct btrfs_path *path;
2845 	int orig_level;
2846 
2847 	path = btrfs_alloc_path();
2848 	if (!path)
2849 		return -ENOMEM;
2850 
2851 	level = btrfs_header_level(log->node);
2852 	orig_level = level;
2853 	path->nodes[level] = log->node;
2854 	atomic_inc(&log->node->refs);
2855 	path->slots[level] = 0;
2856 
2857 	while (1) {
2858 		wret = walk_down_log_tree(trans, log, path, &level, wc);
2859 		if (wret > 0)
2860 			break;
2861 		if (wret < 0) {
2862 			ret = wret;
2863 			goto out;
2864 		}
2865 
2866 		wret = walk_up_log_tree(trans, log, path, &level, wc);
2867 		if (wret > 0)
2868 			break;
2869 		if (wret < 0) {
2870 			ret = wret;
2871 			goto out;
2872 		}
2873 	}
2874 
2875 	/* was the root node processed? if not, catch it here */
2876 	if (path->nodes[orig_level]) {
2877 		ret = wc->process_func(log, path->nodes[orig_level], wc,
2878 			 btrfs_header_generation(path->nodes[orig_level]),
2879 			 orig_level);
2880 		if (ret)
2881 			goto out;
2882 		if (wc->free) {
2883 			struct extent_buffer *next;
2884 
2885 			next = path->nodes[orig_level];
2886 
2887 			if (trans) {
2888 				btrfs_tree_lock(next);
2889 				btrfs_set_lock_blocking_write(next);
2890 				btrfs_clean_tree_block(next);
2891 				btrfs_wait_tree_block_writeback(next);
2892 				btrfs_tree_unlock(next);
2893 			} else {
2894 				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2895 					clear_extent_buffer_dirty(next);
2896 			}
2897 
2898 			WARN_ON(log->root_key.objectid !=
2899 				BTRFS_TREE_LOG_OBJECTID);
2900 			ret = btrfs_free_and_pin_reserved_extent(fs_info,
2901 							next->start, next->len);
2902 			if (ret)
2903 				goto out;
2904 		}
2905 	}
2906 
2907 out:
2908 	btrfs_free_path(path);
2909 	return ret;
2910 }
2911 
2912 /*
2913  * helper function to update the item for a given subvolumes log root
2914  * in the tree of log roots
2915  */
2916 static int update_log_root(struct btrfs_trans_handle *trans,
2917 			   struct btrfs_root *log,
2918 			   struct btrfs_root_item *root_item)
2919 {
2920 	struct btrfs_fs_info *fs_info = log->fs_info;
2921 	int ret;
2922 
2923 	if (log->log_transid == 1) {
2924 		/* insert root item on the first sync */
2925 		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2926 				&log->root_key, root_item);
2927 	} else {
2928 		ret = btrfs_update_root(trans, fs_info->log_root_tree,
2929 				&log->root_key, root_item);
2930 	}
2931 	return ret;
2932 }
2933 
2934 static void wait_log_commit(struct btrfs_root *root, int transid)
2935 {
2936 	DEFINE_WAIT(wait);
2937 	int index = transid % 2;
2938 
2939 	/*
2940 	 * we only allow two pending log transactions at a time,
2941 	 * so we know that if ours is more than 2 older than the
2942 	 * current transaction, we're done
2943 	 */
2944 	for (;;) {
2945 		prepare_to_wait(&root->log_commit_wait[index],
2946 				&wait, TASK_UNINTERRUPTIBLE);
2947 
2948 		if (!(root->log_transid_committed < transid &&
2949 		      atomic_read(&root->log_commit[index])))
2950 			break;
2951 
2952 		mutex_unlock(&root->log_mutex);
2953 		schedule();
2954 		mutex_lock(&root->log_mutex);
2955 	}
2956 	finish_wait(&root->log_commit_wait[index], &wait);
2957 }
2958 
2959 static void wait_for_writer(struct btrfs_root *root)
2960 {
2961 	DEFINE_WAIT(wait);
2962 
2963 	for (;;) {
2964 		prepare_to_wait(&root->log_writer_wait, &wait,
2965 				TASK_UNINTERRUPTIBLE);
2966 		if (!atomic_read(&root->log_writers))
2967 			break;
2968 
2969 		mutex_unlock(&root->log_mutex);
2970 		schedule();
2971 		mutex_lock(&root->log_mutex);
2972 	}
2973 	finish_wait(&root->log_writer_wait, &wait);
2974 }
2975 
2976 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2977 					struct btrfs_log_ctx *ctx)
2978 {
2979 	if (!ctx)
2980 		return;
2981 
2982 	mutex_lock(&root->log_mutex);
2983 	list_del_init(&ctx->list);
2984 	mutex_unlock(&root->log_mutex);
2985 }
2986 
2987 /*
2988  * Invoked in log mutex context, or be sure there is no other task which
2989  * can access the list.
2990  */
2991 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2992 					     int index, int error)
2993 {
2994 	struct btrfs_log_ctx *ctx;
2995 	struct btrfs_log_ctx *safe;
2996 
2997 	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2998 		list_del_init(&ctx->list);
2999 		ctx->log_ret = error;
3000 	}
3001 
3002 	INIT_LIST_HEAD(&root->log_ctxs[index]);
3003 }
3004 
3005 /*
 * btrfs_sync_log sends a given tree log down to the disk and
3007  * updates the super blocks to record it.  When this call is done,
3008  * you know that any inodes previously logged are safely on disk only
3009  * if it returns 0.
3010  *
3011  * Any other return value means you need to call btrfs_commit_transaction.
3012  * Some of the edge cases for fsyncing directories that have had unlinks
3013  * or renames done in the past mean that sometimes the only safe
3014  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
3015  * that has happened.
3016  */
3017 int btrfs_sync_log(struct btrfs_trans_handle *trans,
3018 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3019 {
3020 	int index1;
3021 	int index2;
3022 	int mark;
3023 	int ret;
3024 	struct btrfs_fs_info *fs_info = root->fs_info;
3025 	struct btrfs_root *log = root->log_root;
3026 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3027 	struct btrfs_root_item new_root_item;
3028 	int log_transid = 0;
3029 	struct btrfs_log_ctx root_log_ctx;
3030 	struct blk_plug plug;
3031 
3032 	mutex_lock(&root->log_mutex);
3033 	log_transid = ctx->log_transid;
3034 	if (root->log_transid_committed >= log_transid) {
3035 		mutex_unlock(&root->log_mutex);
3036 		return ctx->log_ret;
3037 	}
3038 
3039 	index1 = log_transid % 2;
3040 	if (atomic_read(&root->log_commit[index1])) {
3041 		wait_log_commit(root, log_transid);
3042 		mutex_unlock(&root->log_mutex);
3043 		return ctx->log_ret;
3044 	}
3045 	ASSERT(log_transid == root->log_transid);
3046 	atomic_set(&root->log_commit[index1], 1);
3047 
3048 	/* wait for previous tree log sync to complete */
3049 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3050 		wait_log_commit(root, log_transid - 1);
3051 
3052 	while (1) {
3053 		int batch = atomic_read(&root->log_batch);
3054 		/* when we're on an ssd, just kick the log commit out */
3055 		if (!btrfs_test_opt(fs_info, SSD) &&
3056 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3057 			mutex_unlock(&root->log_mutex);
3058 			schedule_timeout_uninterruptible(1);
3059 			mutex_lock(&root->log_mutex);
3060 		}
3061 		wait_for_writer(root);
3062 		if (batch == atomic_read(&root->log_batch))
3063 			break;
3064 	}
3065 
3066 	/* bail out if we need to do a full commit */
3067 	if (btrfs_need_log_full_commit(trans)) {
3068 		ret = -EAGAIN;
3069 		mutex_unlock(&root->log_mutex);
3070 		goto out;
3071 	}
3072 
3073 	if (log_transid % 2 == 0)
3074 		mark = EXTENT_DIRTY;
3075 	else
3076 		mark = EXTENT_NEW;
3077 
3078 	/* we start IO on  all the marked extents here, but we don't actually
3079 	 * wait for them until later.
3080 	 */
3081 	blk_start_plug(&plug);
3082 	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3083 	if (ret) {
3084 		blk_finish_plug(&plug);
3085 		btrfs_abort_transaction(trans, ret);
3086 		btrfs_set_log_full_commit(trans);
3087 		mutex_unlock(&root->log_mutex);
3088 		goto out;
3089 	}
3090 
3091 	/*
3092 	 * We _must_ update under the root->log_mutex in order to make sure we
3093 	 * have a consistent view of the log root we are trying to commit at
3094 	 * this moment.
3095 	 *
3096 	 * We _must_ copy this into a local copy, because we are not holding the
3097 	 * log_root_tree->log_mutex yet.  This is important because when we
3098 	 * commit the log_root_tree we must have a consistent view of the
3099 	 * log_root_tree when we update the super block to point at the
3100 	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
3101 	 * with the commit and possibly point at the new block which we may not
3102 	 * have written out.
3103 	 */
3104 	btrfs_set_root_node(&log->root_item, log->node);
3105 	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3106 
3107 	root->log_transid++;
3108 	log->log_transid = root->log_transid;
3109 	root->log_start_pid = 0;
3110 	/*
3111 	 * IO has been started, blocks of the log tree have WRITTEN flag set
3112 	 * in their headers. new modifications of the log will be written to
3113 	 * new positions. so it's safe to allow log writers to go in.
3114 	 */
3115 	mutex_unlock(&root->log_mutex);
3116 
3117 	btrfs_init_log_ctx(&root_log_ctx, NULL);
3118 
3119 	mutex_lock(&log_root_tree->log_mutex);
3120 	atomic_inc(&log_root_tree->log_batch);
3121 	atomic_inc(&log_root_tree->log_writers);
3122 
3123 	index2 = log_root_tree->log_transid % 2;
3124 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3125 	root_log_ctx.log_transid = log_root_tree->log_transid;
3126 
3127 	mutex_unlock(&log_root_tree->log_mutex);
3128 
3129 	mutex_lock(&log_root_tree->log_mutex);
3130 
3131 	/*
3132 	 * Now we are safe to update the log_root_tree because we're under the
3133 	 * log_mutex, and we're a current writer so we're holding the commit
3134 	 * open until we drop the log_mutex.
3135 	 */
3136 	ret = update_log_root(trans, log, &new_root_item);
3137 
3138 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
3139 		/* atomic_dec_and_test implies a barrier */
3140 		cond_wake_up_nomb(&log_root_tree->log_writer_wait);
3141 	}
3142 
3143 	if (ret) {
3144 		if (!list_empty(&root_log_ctx.list))
3145 			list_del_init(&root_log_ctx.list);
3146 
3147 		blk_finish_plug(&plug);
3148 		btrfs_set_log_full_commit(trans);
3149 
3150 		if (ret != -ENOSPC) {
3151 			btrfs_abort_transaction(trans, ret);
3152 			mutex_unlock(&log_root_tree->log_mutex);
3153 			goto out;
3154 		}
3155 		btrfs_wait_tree_log_extents(log, mark);
3156 		mutex_unlock(&log_root_tree->log_mutex);
3157 		ret = -EAGAIN;
3158 		goto out;
3159 	}
3160 
3161 	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3162 		blk_finish_plug(&plug);
3163 		list_del_init(&root_log_ctx.list);
3164 		mutex_unlock(&log_root_tree->log_mutex);
3165 		ret = root_log_ctx.log_ret;
3166 		goto out;
3167 	}
3168 
3169 	index2 = root_log_ctx.log_transid % 2;
3170 	if (atomic_read(&log_root_tree->log_commit[index2])) {
3171 		blk_finish_plug(&plug);
3172 		ret = btrfs_wait_tree_log_extents(log, mark);
3173 		wait_log_commit(log_root_tree,
3174 				root_log_ctx.log_transid);
3175 		mutex_unlock(&log_root_tree->log_mutex);
3176 		if (!ret)
3177 			ret = root_log_ctx.log_ret;
3178 		goto out;
3179 	}
3180 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3181 	atomic_set(&log_root_tree->log_commit[index2], 1);
3182 
3183 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3184 		wait_log_commit(log_root_tree,
3185 				root_log_ctx.log_transid - 1);
3186 	}
3187 
3188 	wait_for_writer(log_root_tree);
3189 
3190 	/*
3191 	 * now that we've moved on to the tree of log tree roots,
3192 	 * check the full commit flag again
3193 	 */
3194 	if (btrfs_need_log_full_commit(trans)) {
3195 		blk_finish_plug(&plug);
3196 		btrfs_wait_tree_log_extents(log, mark);
3197 		mutex_unlock(&log_root_tree->log_mutex);
3198 		ret = -EAGAIN;
3199 		goto out_wake_log_root;
3200 	}
3201 
3202 	ret = btrfs_write_marked_extents(fs_info,
3203 					 &log_root_tree->dirty_log_pages,
3204 					 EXTENT_DIRTY | EXTENT_NEW);
3205 	blk_finish_plug(&plug);
3206 	if (ret) {
3207 		btrfs_set_log_full_commit(trans);
3208 		btrfs_abort_transaction(trans, ret);
3209 		mutex_unlock(&log_root_tree->log_mutex);
3210 		goto out_wake_log_root;
3211 	}
3212 	ret = btrfs_wait_tree_log_extents(log, mark);
3213 	if (!ret)
3214 		ret = btrfs_wait_tree_log_extents(log_root_tree,
3215 						  EXTENT_NEW | EXTENT_DIRTY);
3216 	if (ret) {
3217 		btrfs_set_log_full_commit(trans);
3218 		mutex_unlock(&log_root_tree->log_mutex);
3219 		goto out_wake_log_root;
3220 	}
3221 
3222 	btrfs_set_super_log_root(fs_info->super_for_commit,
3223 				 log_root_tree->node->start);
3224 	btrfs_set_super_log_root_level(fs_info->super_for_commit,
3225 				       btrfs_header_level(log_root_tree->node));
3226 
3227 	log_root_tree->log_transid++;
3228 	mutex_unlock(&log_root_tree->log_mutex);
3229 
3230 	/*
3231 	 * Nobody else is going to jump in and write the ctree
3232 	 * super here because the log_commit atomic below is protecting
3233 	 * us.  We must be called with a transaction handle pinning
3234 	 * the running transaction open, so a full commit can't hop
3235 	 * in and cause problems either.
3236 	 */
3237 	ret = write_all_supers(fs_info, 1);
3238 	if (ret) {
3239 		btrfs_set_log_full_commit(trans);
3240 		btrfs_abort_transaction(trans, ret);
3241 		goto out_wake_log_root;
3242 	}
3243 
3244 	mutex_lock(&root->log_mutex);
3245 	if (root->last_log_commit < log_transid)
3246 		root->last_log_commit = log_transid;
3247 	mutex_unlock(&root->log_mutex);
3248 
3249 out_wake_log_root:
3250 	mutex_lock(&log_root_tree->log_mutex);
3251 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3252 
3253 	log_root_tree->log_transid_committed++;
3254 	atomic_set(&log_root_tree->log_commit[index2], 0);
3255 	mutex_unlock(&log_root_tree->log_mutex);
3256 
3257 	/*
3258 	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3259 	 * all the updates above are seen by the woken threads. It might not be
3260 	 * necessary, but proving that seems to be hard.
3261 	 */
3262 	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3263 out:
3264 	mutex_lock(&root->log_mutex);
3265 	btrfs_remove_all_log_ctxs(root, index1, ret);
3266 	root->log_transid_committed++;
3267 	atomic_set(&root->log_commit[index1], 0);
3268 	mutex_unlock(&root->log_mutex);
3269 
3270 	/*
3271 	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3272 	 * all the updates above are seen by the woken threads. It might not be
3273 	 * necessary, but proving that seems to be hard.
3274 	 */
3275 	cond_wake_up(&root->log_commit_wait[index1]);
3276 	return ret;
3277 }
3278 
3279 static void free_log_tree(struct btrfs_trans_handle *trans,
3280 			  struct btrfs_root *log)
3281 {
3282 	int ret;
3283 	struct walk_control wc = {
3284 		.free = 1,
3285 		.process_func = process_one_buffer
3286 	};
3287 
3288 	ret = walk_log_tree(trans, log, &wc);
3289 	if (ret) {
3290 		if (trans)
3291 			btrfs_abort_transaction(trans, ret);
3292 		else
3293 			btrfs_handle_fs_error(log->fs_info, ret, NULL);
3294 	}
3295 
3296 	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3297 			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3298 	free_extent_buffer(log->node);
3299 	kfree(log);
3300 }
3301 
3302 /*
3303  * free all the extents used by the tree log.  This should be called
3304  * at commit time of the full transaction
3305  */
3306 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3307 {
3308 	if (root->log_root) {
3309 		free_log_tree(trans, root->log_root);
3310 		root->log_root = NULL;
3311 	}
3312 	return 0;
3313 }
3314 
3315 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3316 			     struct btrfs_fs_info *fs_info)
3317 {
3318 	if (fs_info->log_root_tree) {
3319 		free_log_tree(trans, fs_info->log_root_tree);
3320 		fs_info->log_root_tree = NULL;
3321 	}
3322 	return 0;
3323 }
3324 
3325 /*
3326  * Check if an inode was logged in the current transaction. We can't always rely
3327  * on an inode's logged_trans value, because it's an in-memory only field and
3328  * therefore not persisted. This means that its value is lost if the inode gets
3329  * evicted and loaded again from disk (in which case it has a value of 0, and
3330  * certainly it is smaller then any possible transaction ID), when that happens
3331  * the full_sync flag is set in the inode's runtime flags, so on that case we
3332  * assume eviction happened and ignore the logged_trans value, assuming the
3333  * worst case, that the inode was logged before in the current transaction.
3334  */
3335 static bool inode_logged(struct btrfs_trans_handle *trans,
3336 			 struct btrfs_inode *inode)
3337 {
3338 	if (inode->logged_trans == trans->transid)
3339 		return true;
3340 
3341 	if (inode->last_trans == trans->transid &&
3342 	    test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3343 	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3344 		return true;
3345 
3346 	return false;
3347 }
3348 
3349 /*
3350  * If both a file and directory are logged, and unlinks or renames are
3351  * mixed in, we have a few interesting corners:
3352  *
3353  * create file X in dir Y
3354  * link file X to X.link in dir Y
3355  * fsync file X
3356  * unlink file X but leave X.link
3357  * fsync dir Y
3358  *
3359  * After a crash we would expect only X.link to exist.  But file X
3360  * didn't get fsync'd again so the log has back refs for X and X.link.
3361  *
3362  * We solve this by removing directory entries and inode backrefs from the
3363  * log when a file that was logged in the current transaction is
3364  * unlinked.  Any later fsync will include the updated log entries, and
3365  * we'll be able to reconstruct the proper directory items from backrefs.
3366  *
3367  * This optimizations allows us to avoid relogging the entire inode
3368  * or the entire directory.
3369  */
3370 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3371 				 struct btrfs_root *root,
3372 				 const char *name, int name_len,
3373 				 struct btrfs_inode *dir, u64 index)
3374 {
3375 	struct btrfs_root *log;
3376 	struct btrfs_dir_item *di;
3377 	struct btrfs_path *path;
3378 	int ret;
3379 	int err = 0;
3380 	int bytes_del = 0;
3381 	u64 dir_ino = btrfs_ino(dir);
3382 
3383 	if (!inode_logged(trans, dir))
3384 		return 0;
3385 
3386 	ret = join_running_log_trans(root);
3387 	if (ret)
3388 		return 0;
3389 
3390 	mutex_lock(&dir->log_mutex);
3391 
3392 	log = root->log_root;
3393 	path = btrfs_alloc_path();
3394 	if (!path) {
3395 		err = -ENOMEM;
3396 		goto out_unlock;
3397 	}
3398 
3399 	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3400 				   name, name_len, -1);
3401 	if (IS_ERR(di)) {
3402 		err = PTR_ERR(di);
3403 		goto fail;
3404 	}
3405 	if (di) {
3406 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3407 		bytes_del += name_len;
3408 		if (ret) {
3409 			err = ret;
3410 			goto fail;
3411 		}
3412 	}
3413 	btrfs_release_path(path);
3414 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3415 					 index, name, name_len, -1);
3416 	if (IS_ERR(di)) {
3417 		err = PTR_ERR(di);
3418 		goto fail;
3419 	}
3420 	if (di) {
3421 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3422 		bytes_del += name_len;
3423 		if (ret) {
3424 			err = ret;
3425 			goto fail;
3426 		}
3427 	}
3428 
3429 	/* update the directory size in the log to reflect the names
3430 	 * we have removed
3431 	 */
3432 	if (bytes_del) {
3433 		struct btrfs_key key;
3434 
3435 		key.objectid = dir_ino;
3436 		key.offset = 0;
3437 		key.type = BTRFS_INODE_ITEM_KEY;
3438 		btrfs_release_path(path);
3439 
3440 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3441 		if (ret < 0) {
3442 			err = ret;
3443 			goto fail;
3444 		}
3445 		if (ret == 0) {
3446 			struct btrfs_inode_item *item;
3447 			u64 i_size;
3448 
3449 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3450 					      struct btrfs_inode_item);
3451 			i_size = btrfs_inode_size(path->nodes[0], item);
3452 			if (i_size > bytes_del)
3453 				i_size -= bytes_del;
3454 			else
3455 				i_size = 0;
3456 			btrfs_set_inode_size(path->nodes[0], item, i_size);
3457 			btrfs_mark_buffer_dirty(path->nodes[0]);
3458 		} else
3459 			ret = 0;
3460 		btrfs_release_path(path);
3461 	}
3462 fail:
3463 	btrfs_free_path(path);
3464 out_unlock:
3465 	mutex_unlock(&dir->log_mutex);
3466 	if (ret == -ENOSPC) {
3467 		btrfs_set_log_full_commit(trans);
3468 		ret = 0;
3469 	} else if (ret < 0)
3470 		btrfs_abort_transaction(trans, ret);
3471 
3472 	btrfs_end_log_trans(root);
3473 
3474 	return err;
3475 }
3476 
3477 /* see comments for btrfs_del_dir_entries_in_log */
3478 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3479 			       struct btrfs_root *root,
3480 			       const char *name, int name_len,
3481 			       struct btrfs_inode *inode, u64 dirid)
3482 {
3483 	struct btrfs_root *log;
3484 	u64 index;
3485 	int ret;
3486 
3487 	if (!inode_logged(trans, inode))
3488 		return 0;
3489 
3490 	ret = join_running_log_trans(root);
3491 	if (ret)
3492 		return 0;
3493 	log = root->log_root;
3494 	mutex_lock(&inode->log_mutex);
3495 
3496 	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3497 				  dirid, &index);
3498 	mutex_unlock(&inode->log_mutex);
3499 	if (ret == -ENOSPC) {
3500 		btrfs_set_log_full_commit(trans);
3501 		ret = 0;
3502 	} else if (ret < 0 && ret != -ENOENT)
3503 		btrfs_abort_transaction(trans, ret);
3504 	btrfs_end_log_trans(root);
3505 
3506 	return ret;
3507 }
3508 
3509 /*
3510  * creates a range item in the log for 'dirid'.  first_offset and
3511  * last_offset tell us which parts of the key space the log should
3512  * be considered authoritative for.
3513  */
3514 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3515 				       struct btrfs_root *log,
3516 				       struct btrfs_path *path,
3517 				       int key_type, u64 dirid,
3518 				       u64 first_offset, u64 last_offset)
3519 {
3520 	int ret;
3521 	struct btrfs_key key;
3522 	struct btrfs_dir_log_item *item;
3523 
3524 	key.objectid = dirid;
3525 	key.offset = first_offset;
3526 	if (key_type == BTRFS_DIR_ITEM_KEY)
3527 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
3528 	else
3529 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
3530 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3531 	if (ret)
3532 		return ret;
3533 
3534 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3535 			      struct btrfs_dir_log_item);
3536 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3537 	btrfs_mark_buffer_dirty(path->nodes[0]);
3538 	btrfs_release_path(path);
3539 	return 0;
3540 }
3541 
3542 /*
3543  * log all the items included in the current transaction for a given
3544  * directory.  This also creates the range items in the log tree required
3545  * to replay anything deleted before the fsync
3546  */
3547 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3548 			  struct btrfs_root *root, struct btrfs_inode *inode,
3549 			  struct btrfs_path *path,
3550 			  struct btrfs_path *dst_path, int key_type,
3551 			  struct btrfs_log_ctx *ctx,
3552 			  u64 min_offset, u64 *last_offset_ret)
3553 {
3554 	struct btrfs_key min_key;
3555 	struct btrfs_root *log = root->log_root;
3556 	struct extent_buffer *src;
3557 	int err = 0;
3558 	int ret;
3559 	int i;
3560 	int nritems;
3561 	u64 first_offset = min_offset;
3562 	u64 last_offset = (u64)-1;
3563 	u64 ino = btrfs_ino(inode);
3564 
3565 	log = root->log_root;
3566 
3567 	min_key.objectid = ino;
3568 	min_key.type = key_type;
3569 	min_key.offset = min_offset;
3570 
3571 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3572 
3573 	/*
3574 	 * we didn't find anything from this transaction, see if there
3575 	 * is anything at all
3576 	 */
3577 	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3578 		min_key.objectid = ino;
3579 		min_key.type = key_type;
3580 		min_key.offset = (u64)-1;
3581 		btrfs_release_path(path);
3582 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3583 		if (ret < 0) {
3584 			btrfs_release_path(path);
3585 			return ret;
3586 		}
3587 		ret = btrfs_previous_item(root, path, ino, key_type);
3588 
3589 		/* if ret == 0 there are items for this type,
3590 		 * create a range to tell us the last key of this type.
3591 		 * otherwise, there are no items in this directory after
3592 		 * *min_offset, and we create a range to indicate that.
3593 		 */
3594 		if (ret == 0) {
3595 			struct btrfs_key tmp;
3596 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3597 					      path->slots[0]);
3598 			if (key_type == tmp.type)
3599 				first_offset = max(min_offset, tmp.offset) + 1;
3600 		}
3601 		goto done;
3602 	}
3603 
3604 	/* go backward to find any previous key */
3605 	ret = btrfs_previous_item(root, path, ino, key_type);
3606 	if (ret == 0) {
3607 		struct btrfs_key tmp;
3608 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3609 		if (key_type == tmp.type) {
3610 			first_offset = tmp.offset;
3611 			ret = overwrite_item(trans, log, dst_path,
3612 					     path->nodes[0], path->slots[0],
3613 					     &tmp);
3614 			if (ret) {
3615 				err = ret;
3616 				goto done;
3617 			}
3618 		}
3619 	}
3620 	btrfs_release_path(path);
3621 
3622 	/*
3623 	 * Find the first key from this transaction again.  See the note for
3624 	 * log_new_dir_dentries, if we're logging a directory recursively we
3625 	 * won't be holding its i_mutex, which means we can modify the directory
3626 	 * while we're logging it.  If we remove an entry between our first
3627 	 * search and this search we'll not find the key again and can just
3628 	 * bail.
3629 	 */
3630 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3631 	if (ret != 0)
3632 		goto done;
3633 
3634 	/*
3635 	 * we have a block from this transaction, log every item in it
3636 	 * from our directory
3637 	 */
3638 	while (1) {
3639 		struct btrfs_key tmp;
3640 		src = path->nodes[0];
3641 		nritems = btrfs_header_nritems(src);
3642 		for (i = path->slots[0]; i < nritems; i++) {
3643 			struct btrfs_dir_item *di;
3644 
3645 			btrfs_item_key_to_cpu(src, &min_key, i);
3646 
3647 			if (min_key.objectid != ino || min_key.type != key_type)
3648 				goto done;
3649 			ret = overwrite_item(trans, log, dst_path, src, i,
3650 					     &min_key);
3651 			if (ret) {
3652 				err = ret;
3653 				goto done;
3654 			}
3655 
3656 			/*
3657 			 * We must make sure that when we log a directory entry,
3658 			 * the corresponding inode, after log replay, has a
3659 			 * matching link count. For example:
3660 			 *
3661 			 * touch foo
3662 			 * mkdir mydir
3663 			 * sync
3664 			 * ln foo mydir/bar
3665 			 * xfs_io -c "fsync" mydir
3666 			 * <crash>
3667 			 * <mount fs and log replay>
3668 			 *
3669 			 * Would result in a fsync log that when replayed, our
3670 			 * file inode would have a link count of 1, but we get
3671 			 * two directory entries pointing to the same inode.
3672 			 * After removing one of the names, it would not be
3673 			 * possible to remove the other name, which resulted
3674 			 * always in stale file handle errors, and would not
3675 			 * be possible to rmdir the parent directory, since
3676 			 * its i_size could never decrement to the value
3677 			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3678 			 */
3679 			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3680 			btrfs_dir_item_key_to_cpu(src, di, &tmp);
3681 			if (ctx &&
3682 			    (btrfs_dir_transid(src, di) == trans->transid ||
3683 			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3684 			    tmp.type != BTRFS_ROOT_ITEM_KEY)
3685 				ctx->log_new_dentries = true;
3686 		}
3687 		path->slots[0] = nritems;
3688 
3689 		/*
3690 		 * look ahead to the next item and see if it is also
3691 		 * from this directory and from this transaction
3692 		 */
3693 		ret = btrfs_next_leaf(root, path);
3694 		if (ret) {
3695 			if (ret == 1)
3696 				last_offset = (u64)-1;
3697 			else
3698 				err = ret;
3699 			goto done;
3700 		}
3701 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3702 		if (tmp.objectid != ino || tmp.type != key_type) {
3703 			last_offset = (u64)-1;
3704 			goto done;
3705 		}
3706 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3707 			ret = overwrite_item(trans, log, dst_path,
3708 					     path->nodes[0], path->slots[0],
3709 					     &tmp);
3710 			if (ret)
3711 				err = ret;
3712 			else
3713 				last_offset = tmp.offset;
3714 			goto done;
3715 		}
3716 	}
3717 done:
3718 	btrfs_release_path(path);
3719 	btrfs_release_path(dst_path);
3720 
3721 	if (err == 0) {
3722 		*last_offset_ret = last_offset;
3723 		/*
3724 		 * insert the log range keys to indicate where the log
3725 		 * is valid
3726 		 */
3727 		ret = insert_dir_log_key(trans, log, path, key_type,
3728 					 ino, first_offset, last_offset);
3729 		if (ret)
3730 			err = ret;
3731 	}
3732 	return err;
3733 }
3734 
3735 /*
3736  * logging directories is very similar to logging inodes, We find all the items
3737  * from the current transaction and write them to the log.
3738  *
3739  * The recovery code scans the directory in the subvolume, and if it finds a
3740  * key in the range logged that is not present in the log tree, then it means
3741  * that dir entry was unlinked during the transaction.
3742  *
3743  * In order for that scan to work, we must include one key smaller than
3744  * the smallest logged by this transaction and one key larger than the largest
3745  * key logged by this transaction.
3746  */
3747 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3748 			  struct btrfs_root *root, struct btrfs_inode *inode,
3749 			  struct btrfs_path *path,
3750 			  struct btrfs_path *dst_path,
3751 			  struct btrfs_log_ctx *ctx)
3752 {
3753 	u64 min_key;
3754 	u64 max_key;
3755 	int ret;
3756 	int key_type = BTRFS_DIR_ITEM_KEY;
3757 
3758 again:
3759 	min_key = 0;
3760 	max_key = 0;
3761 	while (1) {
3762 		ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3763 				ctx, min_key, &max_key);
3764 		if (ret)
3765 			return ret;
3766 		if (max_key == (u64)-1)
3767 			break;
3768 		min_key = max_key + 1;
3769 	}
3770 
3771 	if (key_type == BTRFS_DIR_ITEM_KEY) {
3772 		key_type = BTRFS_DIR_INDEX_KEY;
3773 		goto again;
3774 	}
3775 	return 0;
3776 }
3777 
3778 /*
3779  * a helper function to drop items from the log before we relog an
3780  * inode.  max_key_type indicates the highest item type to remove.
3781  * This cannot be run for file data extents because it does not
3782  * free the extents they point to.
3783  */
3784 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3785 				  struct btrfs_root *log,
3786 				  struct btrfs_path *path,
3787 				  u64 objectid, int max_key_type)
3788 {
3789 	int ret;
3790 	struct btrfs_key key;
3791 	struct btrfs_key found_key;
3792 	int start_slot;
3793 
3794 	key.objectid = objectid;
3795 	key.type = max_key_type;
3796 	key.offset = (u64)-1;
3797 
3798 	while (1) {
3799 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3800 		BUG_ON(ret == 0); /* Logic error */
3801 		if (ret < 0)
3802 			break;
3803 
3804 		if (path->slots[0] == 0)
3805 			break;
3806 
3807 		path->slots[0]--;
3808 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3809 				      path->slots[0]);
3810 
3811 		if (found_key.objectid != objectid)
3812 			break;
3813 
3814 		found_key.offset = 0;
3815 		found_key.type = 0;
3816 		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3817 				       &start_slot);
3818 		if (ret < 0)
3819 			break;
3820 
3821 		ret = btrfs_del_items(trans, log, path, start_slot,
3822 				      path->slots[0] - start_slot + 1);
3823 		/*
3824 		 * If start slot isn't 0 then we don't need to re-search, we've
3825 		 * found the last guy with the objectid in this tree.
3826 		 */
3827 		if (ret || start_slot != 0)
3828 			break;
3829 		btrfs_release_path(path);
3830 	}
3831 	btrfs_release_path(path);
3832 	if (ret > 0)
3833 		ret = 0;
3834 	return ret;
3835 }
3836 
3837 static void fill_inode_item(struct btrfs_trans_handle *trans,
3838 			    struct extent_buffer *leaf,
3839 			    struct btrfs_inode_item *item,
3840 			    struct inode *inode, int log_inode_only,
3841 			    u64 logged_isize)
3842 {
3843 	struct btrfs_map_token token;
3844 
3845 	btrfs_init_map_token(&token, leaf);
3846 
3847 	if (log_inode_only) {
3848 		/* set the generation to zero so the recover code
3849 		 * can tell the difference between an logging
3850 		 * just to say 'this inode exists' and a logging
3851 		 * to say 'update this inode with these values'
3852 		 */
3853 		btrfs_set_token_inode_generation(leaf, item, 0, &token);
3854 		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3855 	} else {
3856 		btrfs_set_token_inode_generation(leaf, item,
3857 						 BTRFS_I(inode)->generation,
3858 						 &token);
3859 		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3860 	}
3861 
3862 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3863 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3864 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3865 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3866 
3867 	btrfs_set_token_timespec_sec(leaf, &item->atime,
3868 				     inode->i_atime.tv_sec, &token);
3869 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
3870 				      inode->i_atime.tv_nsec, &token);
3871 
3872 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
3873 				     inode->i_mtime.tv_sec, &token);
3874 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3875 				      inode->i_mtime.tv_nsec, &token);
3876 
3877 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
3878 				     inode->i_ctime.tv_sec, &token);
3879 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3880 				      inode->i_ctime.tv_nsec, &token);
3881 
3882 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3883 				     &token);
3884 
3885 	btrfs_set_token_inode_sequence(leaf, item,
3886 				       inode_peek_iversion(inode), &token);
3887 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3888 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3889 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3890 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3891 }
3892 
3893 static int log_inode_item(struct btrfs_trans_handle *trans,
3894 			  struct btrfs_root *log, struct btrfs_path *path,
3895 			  struct btrfs_inode *inode)
3896 {
3897 	struct btrfs_inode_item *inode_item;
3898 	int ret;
3899 
3900 	ret = btrfs_insert_empty_item(trans, log, path,
3901 				      &inode->location, sizeof(*inode_item));
3902 	if (ret && ret != -EEXIST)
3903 		return ret;
3904 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3905 				    struct btrfs_inode_item);
3906 	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3907 			0, 0);
3908 	btrfs_release_path(path);
3909 	return 0;
3910 }
3911 
3912 static noinline int copy_items(struct btrfs_trans_handle *trans,
3913 			       struct btrfs_inode *inode,
3914 			       struct btrfs_path *dst_path,
3915 			       struct btrfs_path *src_path, u64 *last_extent,
3916 			       int start_slot, int nr, int inode_only,
3917 			       u64 logged_isize)
3918 {
3919 	struct btrfs_fs_info *fs_info = trans->fs_info;
3920 	unsigned long src_offset;
3921 	unsigned long dst_offset;
3922 	struct btrfs_root *log = inode->root->log_root;
3923 	struct btrfs_file_extent_item *extent;
3924 	struct btrfs_inode_item *inode_item;
3925 	struct extent_buffer *src = src_path->nodes[0];
3926 	struct btrfs_key first_key, last_key, key;
3927 	int ret;
3928 	struct btrfs_key *ins_keys;
3929 	u32 *ins_sizes;
3930 	char *ins_data;
3931 	int i;
3932 	struct list_head ordered_sums;
3933 	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
3934 	bool has_extents = false;
3935 	bool need_find_last_extent = true;
3936 	bool done = false;
3937 
3938 	INIT_LIST_HEAD(&ordered_sums);
3939 
3940 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3941 			   nr * sizeof(u32), GFP_NOFS);
3942 	if (!ins_data)
3943 		return -ENOMEM;
3944 
3945 	first_key.objectid = (u64)-1;
3946 
3947 	ins_sizes = (u32 *)ins_data;
3948 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3949 
3950 	for (i = 0; i < nr; i++) {
3951 		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3952 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3953 	}
3954 	ret = btrfs_insert_empty_items(trans, log, dst_path,
3955 				       ins_keys, ins_sizes, nr);
3956 	if (ret) {
3957 		kfree(ins_data);
3958 		return ret;
3959 	}
3960 
3961 	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3962 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3963 						   dst_path->slots[0]);
3964 
3965 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3966 
3967 		if (i == nr - 1)
3968 			last_key = ins_keys[i];
3969 
3970 		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3971 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
3972 						    dst_path->slots[0],
3973 						    struct btrfs_inode_item);
3974 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
3975 					&inode->vfs_inode,
3976 					inode_only == LOG_INODE_EXISTS,
3977 					logged_isize);
3978 		} else {
3979 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3980 					   src_offset, ins_sizes[i]);
3981 		}
3982 
3983 		/*
3984 		 * We set need_find_last_extent here in case we know we were
3985 		 * processing other items and then walk into the first extent in
3986 		 * the inode.  If we don't hit an extent then nothing changes,
3987 		 * we'll do the last search the next time around.
3988 		 */
3989 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3990 			has_extents = true;
3991 			if (first_key.objectid == (u64)-1)
3992 				first_key = ins_keys[i];
3993 		} else {
3994 			need_find_last_extent = false;
3995 		}
3996 
3997 		/* take a reference on file data extents so that truncates
3998 		 * or deletes of this inode don't have to relog the inode
3999 		 * again
4000 		 */
4001 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
4002 		    !skip_csum) {
4003 			int found_type;
4004 			extent = btrfs_item_ptr(src, start_slot + i,
4005 						struct btrfs_file_extent_item);
4006 
4007 			if (btrfs_file_extent_generation(src, extent) < trans->transid)
4008 				continue;
4009 
4010 			found_type = btrfs_file_extent_type(src, extent);
4011 			if (found_type == BTRFS_FILE_EXTENT_REG) {
4012 				u64 ds, dl, cs, cl;
4013 				ds = btrfs_file_extent_disk_bytenr(src,
4014 								extent);
4015 				/* ds == 0 is a hole */
4016 				if (ds == 0)
4017 					continue;
4018 
4019 				dl = btrfs_file_extent_disk_num_bytes(src,
4020 								extent);
4021 				cs = btrfs_file_extent_offset(src, extent);
4022 				cl = btrfs_file_extent_num_bytes(src,
4023 								extent);
4024 				if (btrfs_file_extent_compression(src,
4025 								  extent)) {
4026 					cs = 0;
4027 					cl = dl;
4028 				}
4029 
4030 				ret = btrfs_lookup_csums_range(
4031 						fs_info->csum_root,
4032 						ds + cs, ds + cs + cl - 1,
4033 						&ordered_sums, 0);
4034 				if (ret) {
4035 					btrfs_release_path(dst_path);
4036 					kfree(ins_data);
4037 					return ret;
4038 				}
4039 			}
4040 		}
4041 	}
4042 
4043 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4044 	btrfs_release_path(dst_path);
4045 	kfree(ins_data);
4046 
4047 	/*
4048 	 * we have to do this after the loop above to avoid changing the
4049 	 * log tree while trying to change the log tree.
4050 	 */
4051 	ret = 0;
4052 	while (!list_empty(&ordered_sums)) {
4053 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4054 						   struct btrfs_ordered_sum,
4055 						   list);
4056 		if (!ret)
4057 			ret = btrfs_csum_file_blocks(trans, log, sums);
4058 		list_del(&sums->list);
4059 		kfree(sums);
4060 	}
4061 
4062 	if (!has_extents)
4063 		return ret;
4064 
4065 	if (need_find_last_extent && *last_extent == first_key.offset) {
4066 		/*
4067 		 * We don't have any leafs between our current one and the one
4068 		 * we processed before that can have file extent items for our
4069 		 * inode (and have a generation number smaller than our current
4070 		 * transaction id).
4071 		 */
4072 		need_find_last_extent = false;
4073 	}
4074 
4075 	/*
4076 	 * Because we use btrfs_search_forward we could skip leaves that were
4077 	 * not modified and then assume *last_extent is valid when it really
4078 	 * isn't.  So back up to the previous leaf and read the end of the last
4079 	 * extent before we go and fill in holes.
4080 	 */
4081 	if (need_find_last_extent) {
4082 		u64 len;
4083 
4084 		ret = btrfs_prev_leaf(inode->root, src_path);
4085 		if (ret < 0)
4086 			return ret;
4087 		if (ret)
4088 			goto fill_holes;
4089 		if (src_path->slots[0])
4090 			src_path->slots[0]--;
4091 		src = src_path->nodes[0];
4092 		btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
4093 		if (key.objectid != btrfs_ino(inode) ||
4094 		    key.type != BTRFS_EXTENT_DATA_KEY)
4095 			goto fill_holes;
4096 		extent = btrfs_item_ptr(src, src_path->slots[0],
4097 					struct btrfs_file_extent_item);
4098 		if (btrfs_file_extent_type(src, extent) ==
4099 		    BTRFS_FILE_EXTENT_INLINE) {
4100 			len = btrfs_file_extent_ram_bytes(src, extent);
4101 			*last_extent = ALIGN(key.offset + len,
4102 					     fs_info->sectorsize);
4103 		} else {
4104 			len = btrfs_file_extent_num_bytes(src, extent);
4105 			*last_extent = key.offset + len;
4106 		}
4107 	}
4108 fill_holes:
4109 	/* So we did prev_leaf, now we need to move to the next leaf, but a few
4110 	 * things could have happened
4111 	 *
4112 	 * 1) A merge could have happened, so we could currently be on a leaf
4113 	 * that holds what we were copying in the first place.
4114 	 * 2) A split could have happened, and now not all of the items we want
4115 	 * are on the same leaf.
4116 	 *
4117 	 * So we need to adjust how we search for holes, we need to drop the
4118 	 * path and re-search for the first extent key we found, and then walk
4119 	 * forward until we hit the last one we copied.
4120 	 */
4121 	if (need_find_last_extent) {
4122 		/* btrfs_prev_leaf could return 1 without releasing the path */
4123 		btrfs_release_path(src_path);
4124 		ret = btrfs_search_slot(NULL, inode->root, &first_key,
4125 				src_path, 0, 0);
4126 		if (ret < 0)
4127 			return ret;
4128 		ASSERT(ret == 0);
4129 		src = src_path->nodes[0];
4130 		i = src_path->slots[0];
4131 	} else {
4132 		i = start_slot;
4133 	}
4134 
4135 	/*
4136 	 * Ok so here we need to go through and fill in any holes we may have
4137 	 * to make sure that holes are punched for those areas in case they had
4138 	 * extents previously.
4139 	 */
4140 	while (!done) {
4141 		u64 offset, len;
4142 		u64 extent_end;
4143 
4144 		if (i >= btrfs_header_nritems(src_path->nodes[0])) {
4145 			ret = btrfs_next_leaf(inode->root, src_path);
4146 			if (ret < 0)
4147 				return ret;
4148 			ASSERT(ret == 0);
4149 			src = src_path->nodes[0];
4150 			i = 0;
4151 			need_find_last_extent = true;
4152 		}
4153 
4154 		btrfs_item_key_to_cpu(src, &key, i);
4155 		if (!btrfs_comp_cpu_keys(&key, &last_key))
4156 			done = true;
4157 		if (key.objectid != btrfs_ino(inode) ||
4158 		    key.type != BTRFS_EXTENT_DATA_KEY) {
4159 			i++;
4160 			continue;
4161 		}
4162 		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
4163 		if (btrfs_file_extent_type(src, extent) ==
4164 		    BTRFS_FILE_EXTENT_INLINE) {
4165 			len = btrfs_file_extent_ram_bytes(src, extent);
4166 			extent_end = ALIGN(key.offset + len,
4167 					   fs_info->sectorsize);
4168 		} else {
4169 			len = btrfs_file_extent_num_bytes(src, extent);
4170 			extent_end = key.offset + len;
4171 		}
4172 		i++;
4173 
4174 		if (*last_extent == key.offset) {
4175 			*last_extent = extent_end;
4176 			continue;
4177 		}
4178 		offset = *last_extent;
4179 		len = key.offset - *last_extent;
4180 		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
4181 				offset, 0, 0, len, 0, len, 0, 0, 0);
4182 		if (ret)
4183 			break;
4184 		*last_extent = extent_end;
4185 	}
4186 
4187 	/*
4188 	 * Check if there is a hole between the last extent found in our leaf
4189 	 * and the first extent in the next leaf. If there is one, we need to
4190 	 * log an explicit hole so that at replay time we can punch the hole.
4191 	 */
4192 	if (ret == 0 &&
4193 	    key.objectid == btrfs_ino(inode) &&
4194 	    key.type == BTRFS_EXTENT_DATA_KEY &&
4195 	    i == btrfs_header_nritems(src_path->nodes[0])) {
4196 		ret = btrfs_next_leaf(inode->root, src_path);
4197 		need_find_last_extent = true;
4198 		if (ret > 0) {
4199 			ret = 0;
4200 		} else if (ret == 0) {
4201 			btrfs_item_key_to_cpu(src_path->nodes[0], &key,
4202 					      src_path->slots[0]);
4203 			if (key.objectid == btrfs_ino(inode) &&
4204 			    key.type == BTRFS_EXTENT_DATA_KEY &&
4205 			    *last_extent < key.offset) {
4206 				const u64 len = key.offset - *last_extent;
4207 
4208 				ret = btrfs_insert_file_extent(trans, log,
4209 							       btrfs_ino(inode),
4210 							       *last_extent, 0,
4211 							       0, len, 0, len,
4212 							       0, 0, 0);
4213 				*last_extent += len;
4214 			}
4215 		}
4216 	}
4217 	/*
4218 	 * Need to let the callers know we dropped the path so they should
4219 	 * re-search.
4220 	 */
4221 	if (!ret && need_find_last_extent)
4222 		ret = 1;
4223 	return ret;
4224 }
4225 
4226 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
4227 {
4228 	struct extent_map *em1, *em2;
4229 
4230 	em1 = list_entry(a, struct extent_map, list);
4231 	em2 = list_entry(b, struct extent_map, list);
4232 
4233 	if (em1->start < em2->start)
4234 		return -1;
4235 	else if (em1->start > em2->start)
4236 		return 1;
4237 	return 0;
4238 }
4239 
4240 static int log_extent_csums(struct btrfs_trans_handle *trans,
4241 			    struct btrfs_inode *inode,
4242 			    struct btrfs_root *log_root,
4243 			    const struct extent_map *em)
4244 {
4245 	u64 csum_offset;
4246 	u64 csum_len;
4247 	LIST_HEAD(ordered_sums);
4248 	int ret = 0;
4249 
4250 	if (inode->flags & BTRFS_INODE_NODATASUM ||
4251 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4252 	    em->block_start == EXTENT_MAP_HOLE)
4253 		return 0;
4254 
4255 	/* If we're compressed we have to save the entire range of csums. */
4256 	if (em->compress_type) {
4257 		csum_offset = 0;
4258 		csum_len = max(em->block_len, em->orig_block_len);
4259 	} else {
4260 		csum_offset = em->mod_start - em->start;
4261 		csum_len = em->mod_len;
4262 	}
4263 
4264 	/* block start is already adjusted for the file extent offset. */
4265 	ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
4266 				       em->block_start + csum_offset,
4267 				       em->block_start + csum_offset +
4268 				       csum_len - 1, &ordered_sums, 0);
4269 	if (ret)
4270 		return ret;
4271 
4272 	while (!list_empty(&ordered_sums)) {
4273 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4274 						   struct btrfs_ordered_sum,
4275 						   list);
4276 		if (!ret)
4277 			ret = btrfs_csum_file_blocks(trans, log_root, sums);
4278 		list_del(&sums->list);
4279 		kfree(sums);
4280 	}
4281 
4282 	return ret;
4283 }
4284 
4285 static int log_one_extent(struct btrfs_trans_handle *trans,
4286 			  struct btrfs_inode *inode, struct btrfs_root *root,
4287 			  const struct extent_map *em,
4288 			  struct btrfs_path *path,
4289 			  struct btrfs_log_ctx *ctx)
4290 {
4291 	struct btrfs_root *log = root->log_root;
4292 	struct btrfs_file_extent_item *fi;
4293 	struct extent_buffer *leaf;
4294 	struct btrfs_map_token token;
4295 	struct btrfs_key key;
4296 	u64 extent_offset = em->start - em->orig_start;
4297 	u64 block_len;
4298 	int ret;
4299 	int extent_inserted = 0;
4300 
4301 	ret = log_extent_csums(trans, inode, log, em);
4302 	if (ret)
4303 		return ret;
4304 
4305 	ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
4306 				   em->start + em->len, NULL, 0, 1,
4307 				   sizeof(*fi), &extent_inserted);
4308 	if (ret)
4309 		return ret;
4310 
4311 	if (!extent_inserted) {
4312 		key.objectid = btrfs_ino(inode);
4313 		key.type = BTRFS_EXTENT_DATA_KEY;
4314 		key.offset = em->start;
4315 
4316 		ret = btrfs_insert_empty_item(trans, log, path, &key,
4317 					      sizeof(*fi));
4318 		if (ret)
4319 			return ret;
4320 	}
4321 	leaf = path->nodes[0];
4322 	btrfs_init_map_token(&token, leaf);
4323 	fi = btrfs_item_ptr(leaf, path->slots[0],
4324 			    struct btrfs_file_extent_item);
4325 
4326 	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
4327 					       &token);
4328 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4329 		btrfs_set_token_file_extent_type(leaf, fi,
4330 						 BTRFS_FILE_EXTENT_PREALLOC,
4331 						 &token);
4332 	else
4333 		btrfs_set_token_file_extent_type(leaf, fi,
4334 						 BTRFS_FILE_EXTENT_REG,
4335 						 &token);
4336 
4337 	block_len = max(em->block_len, em->orig_block_len);
4338 	if (em->compress_type != BTRFS_COMPRESS_NONE) {
4339 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4340 							em->block_start,
4341 							&token);
4342 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4343 							   &token);
4344 	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4345 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4346 							em->block_start -
4347 							extent_offset, &token);
4348 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4349 							   &token);
4350 	} else {
4351 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
4352 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
4353 							   &token);
4354 	}
4355 
4356 	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
4357 	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
4358 	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
4359 	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
4360 						&token);
4361 	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
4362 	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
4363 	btrfs_mark_buffer_dirty(leaf);
4364 
4365 	btrfs_release_path(path);
4366 
4367 	return ret;
4368 }
4369 
4370 /*
4371  * Log all prealloc extents beyond the inode's i_size to make sure we do not
4372  * lose them after doing a fast fsync and replaying the log. We scan the
4373  * subvolume's root instead of iterating the inode's extent map tree because
4374  * otherwise we can log incorrect extent items based on extent map conversion.
4375  * That can happen due to the fact that extent maps are merged when they
4376  * are not in the extent map tree's list of modified extents.
4377  */
4378 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4379 				      struct btrfs_inode *inode,
4380 				      struct btrfs_path *path)
4381 {
4382 	struct btrfs_root *root = inode->root;
4383 	struct btrfs_key key;
4384 	const u64 i_size = i_size_read(&inode->vfs_inode);
4385 	const u64 ino = btrfs_ino(inode);
4386 	struct btrfs_path *dst_path = NULL;
4387 	u64 last_extent = (u64)-1;
4388 	int ins_nr = 0;
4389 	int start_slot;
4390 	int ret;
4391 
4392 	if (!(inode->flags & BTRFS_INODE_PREALLOC))
4393 		return 0;
4394 
4395 	key.objectid = ino;
4396 	key.type = BTRFS_EXTENT_DATA_KEY;
4397 	key.offset = i_size;
4398 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4399 	if (ret < 0)
4400 		goto out;
4401 
4402 	while (true) {
4403 		struct extent_buffer *leaf = path->nodes[0];
4404 		int slot = path->slots[0];
4405 
4406 		if (slot >= btrfs_header_nritems(leaf)) {
4407 			if (ins_nr > 0) {
4408 				ret = copy_items(trans, inode, dst_path, path,
4409 						 &last_extent, start_slot,
4410 						 ins_nr, 1, 0);
4411 				if (ret < 0)
4412 					goto out;
4413 				ins_nr = 0;
4414 			}
4415 			ret = btrfs_next_leaf(root, path);
4416 			if (ret < 0)
4417 				goto out;
4418 			if (ret > 0) {
4419 				ret = 0;
4420 				break;
4421 			}
4422 			continue;
4423 		}
4424 
4425 		btrfs_item_key_to_cpu(leaf, &key, slot);
4426 		if (key.objectid > ino)
4427 			break;
4428 		if (WARN_ON_ONCE(key.objectid < ino) ||
4429 		    key.type < BTRFS_EXTENT_DATA_KEY ||
4430 		    key.offset < i_size) {
4431 			path->slots[0]++;
4432 			continue;
4433 		}
4434 		if (last_extent == (u64)-1) {
4435 			last_extent = key.offset;
4436 			/*
4437 			 * Avoid logging extent items logged in past fsync calls
4438 			 * and leading to duplicate keys in the log tree.
4439 			 */
4440 			do {
4441 				ret = btrfs_truncate_inode_items(trans,
4442 							 root->log_root,
4443 							 &inode->vfs_inode,
4444 							 i_size,
4445 							 BTRFS_EXTENT_DATA_KEY);
4446 			} while (ret == -EAGAIN);
4447 			if (ret)
4448 				goto out;
4449 		}
4450 		if (ins_nr == 0)
4451 			start_slot = slot;
4452 		ins_nr++;
4453 		path->slots[0]++;
4454 		if (!dst_path) {
4455 			dst_path = btrfs_alloc_path();
4456 			if (!dst_path) {
4457 				ret = -ENOMEM;
4458 				goto out;
4459 			}
4460 		}
4461 	}
4462 	if (ins_nr > 0) {
4463 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
4464 				 start_slot, ins_nr, 1, 0);
4465 		if (ret > 0)
4466 			ret = 0;
4467 	}
4468 out:
4469 	btrfs_release_path(path);
4470 	btrfs_free_path(dst_path);
4471 	return ret;
4472 }
4473 
4474 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4475 				     struct btrfs_root *root,
4476 				     struct btrfs_inode *inode,
4477 				     struct btrfs_path *path,
4478 				     struct btrfs_log_ctx *ctx,
4479 				     const u64 start,
4480 				     const u64 end)
4481 {
4482 	struct extent_map *em, *n;
4483 	struct list_head extents;
4484 	struct extent_map_tree *tree = &inode->extent_tree;
4485 	u64 test_gen;
4486 	int ret = 0;
4487 	int num = 0;
4488 
4489 	INIT_LIST_HEAD(&extents);
4490 
4491 	write_lock(&tree->lock);
4492 	test_gen = root->fs_info->last_trans_committed;
4493 
4494 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4495 		/*
4496 		 * Skip extents outside our logging range. It's important to do
4497 		 * it for correctness because if we don't ignore them, we may
4498 		 * log them before their ordered extent completes, and therefore
4499 		 * we could log them without logging their respective checksums
4500 		 * (the checksum items are added to the csum tree at the very
4501 		 * end of btrfs_finish_ordered_io()). Also leave such extents
4502 		 * outside of our range in the list, since we may have another
4503 		 * ranged fsync in the near future that needs them. If an extent
4504 		 * outside our range corresponds to a hole, log it to avoid
4505 		 * leaving gaps between extents (fsck will complain when we are
4506 		 * not using the NO_HOLES feature).
4507 		 */
4508 		if ((em->start > end || em->start + em->len <= start) &&
4509 		    em->block_start != EXTENT_MAP_HOLE)
4510 			continue;
4511 
4512 		list_del_init(&em->list);
4513 		/*
4514 		 * Just an arbitrary number, this can be really CPU intensive
4515 		 * once we start getting a lot of extents, and really once we
4516 		 * have a bunch of extents we just want to commit since it will
4517 		 * be faster.
4518 		 */
4519 		if (++num > 32768) {
4520 			list_del_init(&tree->modified_extents);
4521 			ret = -EFBIG;
4522 			goto process;
4523 		}
4524 
4525 		if (em->generation <= test_gen)
4526 			continue;
4527 
4528 		/* We log prealloc extents beyond eof later. */
4529 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4530 		    em->start >= i_size_read(&inode->vfs_inode))
4531 			continue;
4532 
4533 		/* Need a ref to keep it from getting evicted from cache */
4534 		refcount_inc(&em->refs);
4535 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4536 		list_add_tail(&em->list, &extents);
4537 		num++;
4538 	}
4539 
4540 	list_sort(NULL, &extents, extent_cmp);
4541 process:
4542 	while (!list_empty(&extents)) {
4543 		em = list_entry(extents.next, struct extent_map, list);
4544 
4545 		list_del_init(&em->list);
4546 
4547 		/*
4548 		 * If we had an error we just need to delete everybody from our
4549 		 * private list.
4550 		 */
4551 		if (ret) {
4552 			clear_em_logging(tree, em);
4553 			free_extent_map(em);
4554 			continue;
4555 		}
4556 
4557 		write_unlock(&tree->lock);
4558 
4559 		ret = log_one_extent(trans, inode, root, em, path, ctx);
4560 		write_lock(&tree->lock);
4561 		clear_em_logging(tree, em);
4562 		free_extent_map(em);
4563 	}
4564 	WARN_ON(!list_empty(&extents));
4565 	write_unlock(&tree->lock);
4566 
4567 	btrfs_release_path(path);
4568 	if (!ret)
4569 		ret = btrfs_log_prealloc_extents(trans, inode, path);
4570 
4571 	return ret;
4572 }
4573 
4574 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4575 			     struct btrfs_path *path, u64 *size_ret)
4576 {
4577 	struct btrfs_key key;
4578 	int ret;
4579 
4580 	key.objectid = btrfs_ino(inode);
4581 	key.type = BTRFS_INODE_ITEM_KEY;
4582 	key.offset = 0;
4583 
4584 	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4585 	if (ret < 0) {
4586 		return ret;
4587 	} else if (ret > 0) {
4588 		*size_ret = 0;
4589 	} else {
4590 		struct btrfs_inode_item *item;
4591 
4592 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4593 				      struct btrfs_inode_item);
4594 		*size_ret = btrfs_inode_size(path->nodes[0], item);
4595 		/*
4596 		 * If the in-memory inode's i_size is smaller then the inode
4597 		 * size stored in the btree, return the inode's i_size, so
4598 		 * that we get a correct inode size after replaying the log
4599 		 * when before a power failure we had a shrinking truncate
4600 		 * followed by addition of a new name (rename / new hard link).
4601 		 * Otherwise return the inode size from the btree, to avoid
4602 		 * data loss when replaying a log due to previously doing a
4603 		 * write that expands the inode's size and logging a new name
4604 		 * immediately after.
4605 		 */
4606 		if (*size_ret > inode->vfs_inode.i_size)
4607 			*size_ret = inode->vfs_inode.i_size;
4608 	}
4609 
4610 	btrfs_release_path(path);
4611 	return 0;
4612 }
4613 
4614 /*
4615  * At the moment we always log all xattrs. This is to figure out at log replay
4616  * time which xattrs must have their deletion replayed. If a xattr is missing
4617  * in the log tree and exists in the fs/subvol tree, we delete it. This is
4618  * because if a xattr is deleted, the inode is fsynced and a power failure
4619  * happens, causing the log to be replayed the next time the fs is mounted,
4620  * we want the xattr to not exist anymore (same behaviour as other filesystems
4621  * with a journal, ext3/4, xfs, f2fs, etc).
4622  */
4623 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4624 				struct btrfs_root *root,
4625 				struct btrfs_inode *inode,
4626 				struct btrfs_path *path,
4627 				struct btrfs_path *dst_path)
4628 {
4629 	int ret;
4630 	struct btrfs_key key;
4631 	const u64 ino = btrfs_ino(inode);
4632 	int ins_nr = 0;
4633 	int start_slot = 0;
4634 
4635 	key.objectid = ino;
4636 	key.type = BTRFS_XATTR_ITEM_KEY;
4637 	key.offset = 0;
4638 
4639 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4640 	if (ret < 0)
4641 		return ret;
4642 
4643 	while (true) {
4644 		int slot = path->slots[0];
4645 		struct extent_buffer *leaf = path->nodes[0];
4646 		int nritems = btrfs_header_nritems(leaf);
4647 
4648 		if (slot >= nritems) {
4649 			if (ins_nr > 0) {
4650 				u64 last_extent = 0;
4651 
4652 				ret = copy_items(trans, inode, dst_path, path,
4653 						 &last_extent, start_slot,
4654 						 ins_nr, 1, 0);
4655 				/* can't be 1, extent items aren't processed */
4656 				ASSERT(ret <= 0);
4657 				if (ret < 0)
4658 					return ret;
4659 				ins_nr = 0;
4660 			}
4661 			ret = btrfs_next_leaf(root, path);
4662 			if (ret < 0)
4663 				return ret;
4664 			else if (ret > 0)
4665 				break;
4666 			continue;
4667 		}
4668 
4669 		btrfs_item_key_to_cpu(leaf, &key, slot);
4670 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4671 			break;
4672 
4673 		if (ins_nr == 0)
4674 			start_slot = slot;
4675 		ins_nr++;
4676 		path->slots[0]++;
4677 		cond_resched();
4678 	}
4679 	if (ins_nr > 0) {
4680 		u64 last_extent = 0;
4681 
4682 		ret = copy_items(trans, inode, dst_path, path,
4683 				 &last_extent, start_slot,
4684 				 ins_nr, 1, 0);
4685 		/* can't be 1, extent items aren't processed */
4686 		ASSERT(ret <= 0);
4687 		if (ret < 0)
4688 			return ret;
4689 	}
4690 
4691 	return 0;
4692 }
4693 
4694 /*
4695  * If the no holes feature is enabled we need to make sure any hole between the
4696  * last extent and the i_size of our inode is explicitly marked in the log. This
4697  * is to make sure that doing something like:
4698  *
4699  *      1) create file with 128Kb of data
4700  *      2) truncate file to 64Kb
4701  *      3) truncate file to 256Kb
4702  *      4) fsync file
4703  *      5) <crash/power failure>
4704  *      6) mount fs and trigger log replay
4705  *
4706  * Will give us a file with a size of 256Kb, the first 64Kb of data match what
4707  * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4708  * file correspond to a hole. The presence of explicit holes in a log tree is
4709  * what guarantees that log replay will remove/adjust file extent items in the
4710  * fs/subvol tree.
4711  *
4712  * Here we do not need to care about holes between extents, that is already done
4713  * by copy_items(). We also only need to do this in the full sync path, where we
4714  * lookup for extents from the fs/subvol tree only. In the fast path case, we
4715  * lookup the list of modified extent maps and if any represents a hole, we
4716  * insert a corresponding extent representing a hole in the log tree.
4717  */
4718 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4719 				   struct btrfs_root *root,
4720 				   struct btrfs_inode *inode,
4721 				   struct btrfs_path *path)
4722 {
4723 	struct btrfs_fs_info *fs_info = root->fs_info;
4724 	int ret;
4725 	struct btrfs_key key;
4726 	u64 hole_start;
4727 	u64 hole_size;
4728 	struct extent_buffer *leaf;
4729 	struct btrfs_root *log = root->log_root;
4730 	const u64 ino = btrfs_ino(inode);
4731 	const u64 i_size = i_size_read(&inode->vfs_inode);
4732 
4733 	if (!btrfs_fs_incompat(fs_info, NO_HOLES))
4734 		return 0;
4735 
4736 	key.objectid = ino;
4737 	key.type = BTRFS_EXTENT_DATA_KEY;
4738 	key.offset = (u64)-1;
4739 
4740 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4741 	ASSERT(ret != 0);
4742 	if (ret < 0)
4743 		return ret;
4744 
4745 	ASSERT(path->slots[0] > 0);
4746 	path->slots[0]--;
4747 	leaf = path->nodes[0];
4748 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4749 
4750 	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4751 		/* inode does not have any extents */
4752 		hole_start = 0;
4753 		hole_size = i_size;
4754 	} else {
4755 		struct btrfs_file_extent_item *extent;
4756 		u64 len;
4757 
4758 		/*
4759 		 * If there's an extent beyond i_size, an explicit hole was
4760 		 * already inserted by copy_items().
4761 		 */
4762 		if (key.offset >= i_size)
4763 			return 0;
4764 
4765 		extent = btrfs_item_ptr(leaf, path->slots[0],
4766 					struct btrfs_file_extent_item);
4767 
4768 		if (btrfs_file_extent_type(leaf, extent) ==
4769 		    BTRFS_FILE_EXTENT_INLINE)
4770 			return 0;
4771 
4772 		len = btrfs_file_extent_num_bytes(leaf, extent);
4773 		/* Last extent goes beyond i_size, no need to log a hole. */
4774 		if (key.offset + len > i_size)
4775 			return 0;
4776 		hole_start = key.offset + len;
4777 		hole_size = i_size - hole_start;
4778 	}
4779 	btrfs_release_path(path);
4780 
4781 	/* Last extent ends at i_size. */
4782 	if (hole_size == 0)
4783 		return 0;
4784 
4785 	hole_size = ALIGN(hole_size, fs_info->sectorsize);
4786 	ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4787 				       hole_size, 0, hole_size, 0, 0, 0);
4788 	return ret;
4789 }
4790 
4791 /*
4792  * When we are logging a new inode X, check if it doesn't have a reference that
4793  * matches the reference from some other inode Y created in a past transaction
4794  * and that was renamed in the current transaction. If we don't do this, then at
4795  * log replay time we can lose inode Y (and all its files if it's a directory):
4796  *
4797  * mkdir /mnt/x
4798  * echo "hello world" > /mnt/x/foobar
4799  * sync
4800  * mv /mnt/x /mnt/y
4801  * mkdir /mnt/x                 # or touch /mnt/x
4802  * xfs_io -c fsync /mnt/x
4803  * <power fail>
4804  * mount fs, trigger log replay
4805  *
4806  * After the log replay procedure, we would lose the first directory and all its
4807  * files (file foobar).
4808  * For the case where inode Y is not a directory we simply end up losing it:
4809  *
4810  * echo "123" > /mnt/foo
4811  * sync
4812  * mv /mnt/foo /mnt/bar
4813  * echo "abc" > /mnt/foo
4814  * xfs_io -c fsync /mnt/foo
4815  * <power fail>
4816  *
4817  * We also need this for cases where a snapshot entry is replaced by some other
4818  * entry (file or directory) otherwise we end up with an unreplayable log due to
4819  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4820  * if it were a regular entry:
4821  *
4822  * mkdir /mnt/x
4823  * btrfs subvolume snapshot /mnt /mnt/x/snap
4824  * btrfs subvolume delete /mnt/x/snap
4825  * rmdir /mnt/x
4826  * mkdir /mnt/x
4827  * fsync /mnt/x or fsync some new file inside it
4828  * <power fail>
4829  *
4830  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4831  * the same transaction.
4832  */
4833 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4834 					 const int slot,
4835 					 const struct btrfs_key *key,
4836 					 struct btrfs_inode *inode,
4837 					 u64 *other_ino, u64 *other_parent)
4838 {
4839 	int ret;
4840 	struct btrfs_path *search_path;
4841 	char *name = NULL;
4842 	u32 name_len = 0;
4843 	u32 item_size = btrfs_item_size_nr(eb, slot);
4844 	u32 cur_offset = 0;
4845 	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4846 
4847 	search_path = btrfs_alloc_path();
4848 	if (!search_path)
4849 		return -ENOMEM;
4850 	search_path->search_commit_root = 1;
4851 	search_path->skip_locking = 1;
4852 
4853 	while (cur_offset < item_size) {
4854 		u64 parent;
4855 		u32 this_name_len;
4856 		u32 this_len;
4857 		unsigned long name_ptr;
4858 		struct btrfs_dir_item *di;
4859 
4860 		if (key->type == BTRFS_INODE_REF_KEY) {
4861 			struct btrfs_inode_ref *iref;
4862 
4863 			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4864 			parent = key->offset;
4865 			this_name_len = btrfs_inode_ref_name_len(eb, iref);
4866 			name_ptr = (unsigned long)(iref + 1);
4867 			this_len = sizeof(*iref) + this_name_len;
4868 		} else {
4869 			struct btrfs_inode_extref *extref;
4870 
4871 			extref = (struct btrfs_inode_extref *)(ptr +
4872 							       cur_offset);
4873 			parent = btrfs_inode_extref_parent(eb, extref);
4874 			this_name_len = btrfs_inode_extref_name_len(eb, extref);
4875 			name_ptr = (unsigned long)&extref->name;
4876 			this_len = sizeof(*extref) + this_name_len;
4877 		}
4878 
4879 		if (this_name_len > name_len) {
4880 			char *new_name;
4881 
4882 			new_name = krealloc(name, this_name_len, GFP_NOFS);
4883 			if (!new_name) {
4884 				ret = -ENOMEM;
4885 				goto out;
4886 			}
4887 			name_len = this_name_len;
4888 			name = new_name;
4889 		}
4890 
4891 		read_extent_buffer(eb, name, name_ptr, this_name_len);
4892 		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4893 				parent, name, this_name_len, 0);
4894 		if (di && !IS_ERR(di)) {
4895 			struct btrfs_key di_key;
4896 
4897 			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4898 						  di, &di_key);
4899 			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4900 				if (di_key.objectid != key->objectid) {
4901 					ret = 1;
4902 					*other_ino = di_key.objectid;
4903 					*other_parent = parent;
4904 				} else {
4905 					ret = 0;
4906 				}
4907 			} else {
4908 				ret = -EAGAIN;
4909 			}
4910 			goto out;
4911 		} else if (IS_ERR(di)) {
4912 			ret = PTR_ERR(di);
4913 			goto out;
4914 		}
4915 		btrfs_release_path(search_path);
4916 
4917 		cur_offset += this_len;
4918 	}
4919 	ret = 0;
4920 out:
4921 	btrfs_free_path(search_path);
4922 	kfree(name);
4923 	return ret;
4924 }
4925 
4926 struct btrfs_ino_list {
4927 	u64 ino;
4928 	u64 parent;
4929 	struct list_head list;
4930 };
4931 
4932 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
4933 				  struct btrfs_root *root,
4934 				  struct btrfs_path *path,
4935 				  struct btrfs_log_ctx *ctx,
4936 				  u64 ino, u64 parent)
4937 {
4938 	struct btrfs_ino_list *ino_elem;
4939 	LIST_HEAD(inode_list);
4940 	int ret = 0;
4941 
4942 	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4943 	if (!ino_elem)
4944 		return -ENOMEM;
4945 	ino_elem->ino = ino;
4946 	ino_elem->parent = parent;
4947 	list_add_tail(&ino_elem->list, &inode_list);
4948 
4949 	while (!list_empty(&inode_list)) {
4950 		struct btrfs_fs_info *fs_info = root->fs_info;
4951 		struct btrfs_key key;
4952 		struct inode *inode;
4953 
4954 		ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
4955 					    list);
4956 		ino = ino_elem->ino;
4957 		parent = ino_elem->parent;
4958 		list_del(&ino_elem->list);
4959 		kfree(ino_elem);
4960 		if (ret)
4961 			continue;
4962 
4963 		btrfs_release_path(path);
4964 
4965 		key.objectid = ino;
4966 		key.type = BTRFS_INODE_ITEM_KEY;
4967 		key.offset = 0;
4968 		inode = btrfs_iget(fs_info->sb, &key, root);
4969 		/*
4970 		 * If the other inode that had a conflicting dir entry was
4971 		 * deleted in the current transaction, we need to log its parent
4972 		 * directory.
4973 		 */
4974 		if (IS_ERR(inode)) {
4975 			ret = PTR_ERR(inode);
4976 			if (ret == -ENOENT) {
4977 				key.objectid = parent;
4978 				inode = btrfs_iget(fs_info->sb, &key, root);
4979 				if (IS_ERR(inode)) {
4980 					ret = PTR_ERR(inode);
4981 				} else {
4982 					ret = btrfs_log_inode(trans, root,
4983 						      BTRFS_I(inode),
4984 						      LOG_OTHER_INODE_ALL,
4985 						      0, LLONG_MAX, ctx);
4986 					btrfs_add_delayed_iput(inode);
4987 				}
4988 			}
4989 			continue;
4990 		}
4991 		/*
4992 		 * We are safe logging the other inode without acquiring its
4993 		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
4994 		 * are safe against concurrent renames of the other inode as
4995 		 * well because during a rename we pin the log and update the
4996 		 * log with the new name before we unpin it.
4997 		 */
4998 		ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
4999 				      LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
5000 		if (ret) {
5001 			btrfs_add_delayed_iput(inode);
5002 			continue;
5003 		}
5004 
5005 		key.objectid = ino;
5006 		key.type = BTRFS_INODE_REF_KEY;
5007 		key.offset = 0;
5008 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5009 		if (ret < 0) {
5010 			btrfs_add_delayed_iput(inode);
5011 			continue;
5012 		}
5013 
5014 		while (true) {
5015 			struct extent_buffer *leaf = path->nodes[0];
5016 			int slot = path->slots[0];
5017 			u64 other_ino = 0;
5018 			u64 other_parent = 0;
5019 
5020 			if (slot >= btrfs_header_nritems(leaf)) {
5021 				ret = btrfs_next_leaf(root, path);
5022 				if (ret < 0) {
5023 					break;
5024 				} else if (ret > 0) {
5025 					ret = 0;
5026 					break;
5027 				}
5028 				continue;
5029 			}
5030 
5031 			btrfs_item_key_to_cpu(leaf, &key, slot);
5032 			if (key.objectid != ino ||
5033 			    (key.type != BTRFS_INODE_REF_KEY &&
5034 			     key.type != BTRFS_INODE_EXTREF_KEY)) {
5035 				ret = 0;
5036 				break;
5037 			}
5038 
5039 			ret = btrfs_check_ref_name_override(leaf, slot, &key,
5040 					BTRFS_I(inode), &other_ino,
5041 					&other_parent);
5042 			if (ret < 0)
5043 				break;
5044 			if (ret > 0) {
5045 				ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5046 				if (!ino_elem) {
5047 					ret = -ENOMEM;
5048 					break;
5049 				}
5050 				ino_elem->ino = other_ino;
5051 				ino_elem->parent = other_parent;
5052 				list_add_tail(&ino_elem->list, &inode_list);
5053 				ret = 0;
5054 			}
5055 			path->slots[0]++;
5056 		}
5057 		btrfs_add_delayed_iput(inode);
5058 	}
5059 
5060 	return ret;
5061 }
5062 
5063 /* log a single inode in the tree log.
5064  * At least one parent directory for this inode must exist in the tree
5065  * or be logged already.
5066  *
5067  * Any items from this inode changed by the current transaction are copied
5068  * to the log tree.  An extra reference is taken on any extents in this
5069  * file, allowing us to avoid a whole pile of corner cases around logging
5070  * blocks that have been removed from the tree.
5071  *
5072  * See LOG_INODE_ALL and related defines for a description of what inode_only
5073  * does.
5074  *
5075  * This handles both files and directories.
5076  */
5077 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
5078 			   struct btrfs_root *root, struct btrfs_inode *inode,
5079 			   int inode_only,
5080 			   const loff_t start,
5081 			   const loff_t end,
5082 			   struct btrfs_log_ctx *ctx)
5083 {
5084 	struct btrfs_fs_info *fs_info = root->fs_info;
5085 	struct btrfs_path *path;
5086 	struct btrfs_path *dst_path;
5087 	struct btrfs_key min_key;
5088 	struct btrfs_key max_key;
5089 	struct btrfs_root *log = root->log_root;
5090 	u64 last_extent = 0;
5091 	int err = 0;
5092 	int ret;
5093 	int nritems;
5094 	int ins_start_slot = 0;
5095 	int ins_nr;
5096 	bool fast_search = false;
5097 	u64 ino = btrfs_ino(inode);
5098 	struct extent_map_tree *em_tree = &inode->extent_tree;
5099 	u64 logged_isize = 0;
5100 	bool need_log_inode_item = true;
5101 	bool xattrs_logged = false;
5102 	bool recursive_logging = false;
5103 
5104 	path = btrfs_alloc_path();
5105 	if (!path)
5106 		return -ENOMEM;
5107 	dst_path = btrfs_alloc_path();
5108 	if (!dst_path) {
5109 		btrfs_free_path(path);
5110 		return -ENOMEM;
5111 	}
5112 
5113 	min_key.objectid = ino;
5114 	min_key.type = BTRFS_INODE_ITEM_KEY;
5115 	min_key.offset = 0;
5116 
5117 	max_key.objectid = ino;
5118 
5119 
5120 	/* today the code can only do partial logging of directories */
5121 	if (S_ISDIR(inode->vfs_inode.i_mode) ||
5122 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5123 		       &inode->runtime_flags) &&
5124 	     inode_only >= LOG_INODE_EXISTS))
5125 		max_key.type = BTRFS_XATTR_ITEM_KEY;
5126 	else
5127 		max_key.type = (u8)-1;
5128 	max_key.offset = (u64)-1;
5129 
5130 	/*
5131 	 * Only run delayed items if we are a dir or a new file.
5132 	 * Otherwise commit the delayed inode only, which is needed in
5133 	 * order for the log replay code to mark inodes for link count
5134 	 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
5135 	 */
5136 	if (S_ISDIR(inode->vfs_inode.i_mode) ||
5137 	    inode->generation > fs_info->last_trans_committed)
5138 		ret = btrfs_commit_inode_delayed_items(trans, inode);
5139 	else
5140 		ret = btrfs_commit_inode_delayed_inode(inode);
5141 
5142 	if (ret) {
5143 		btrfs_free_path(path);
5144 		btrfs_free_path(dst_path);
5145 		return ret;
5146 	}
5147 
5148 	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5149 		recursive_logging = true;
5150 		if (inode_only == LOG_OTHER_INODE)
5151 			inode_only = LOG_INODE_EXISTS;
5152 		else
5153 			inode_only = LOG_INODE_ALL;
5154 		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
5155 	} else {
5156 		mutex_lock(&inode->log_mutex);
5157 	}
5158 
5159 	/*
5160 	 * a brute force approach to making sure we get the most uptodate
5161 	 * copies of everything.
5162 	 */
5163 	if (S_ISDIR(inode->vfs_inode.i_mode)) {
5164 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5165 
5166 		if (inode_only == LOG_INODE_EXISTS)
5167 			max_key_type = BTRFS_XATTR_ITEM_KEY;
5168 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
5169 	} else {
5170 		if (inode_only == LOG_INODE_EXISTS) {
5171 			/*
5172 			 * Make sure the new inode item we write to the log has
5173 			 * the same isize as the current one (if it exists).
5174 			 * This is necessary to prevent data loss after log
5175 			 * replay, and also to prevent doing a wrong expanding
5176 			 * truncate - for e.g. create file, write 4K into offset
5177 			 * 0, fsync, write 4K into offset 4096, add hard link,
5178 			 * fsync some other file (to sync log), power fail - if
5179 			 * we use the inode's current i_size, after log replay
5180 			 * we get a 8Kb file, with the last 4Kb extent as a hole
5181 			 * (zeroes), as if an expanding truncate happened,
5182 			 * instead of getting a file of 4Kb only.
5183 			 */
5184 			err = logged_inode_size(log, inode, path, &logged_isize);
5185 			if (err)
5186 				goto out_unlock;
5187 		}
5188 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5189 			     &inode->runtime_flags)) {
5190 			if (inode_only == LOG_INODE_EXISTS) {
5191 				max_key.type = BTRFS_XATTR_ITEM_KEY;
5192 				ret = drop_objectid_items(trans, log, path, ino,
5193 							  max_key.type);
5194 			} else {
5195 				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5196 					  &inode->runtime_flags);
5197 				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5198 					  &inode->runtime_flags);
5199 				while(1) {
5200 					ret = btrfs_truncate_inode_items(trans,
5201 						log, &inode->vfs_inode, 0, 0);
5202 					if (ret != -EAGAIN)
5203 						break;
5204 				}
5205 			}
5206 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5207 					      &inode->runtime_flags) ||
5208 			   inode_only == LOG_INODE_EXISTS) {
5209 			if (inode_only == LOG_INODE_ALL)
5210 				fast_search = true;
5211 			max_key.type = BTRFS_XATTR_ITEM_KEY;
5212 			ret = drop_objectid_items(trans, log, path, ino,
5213 						  max_key.type);
5214 		} else {
5215 			if (inode_only == LOG_INODE_ALL)
5216 				fast_search = true;
5217 			goto log_extents;
5218 		}
5219 
5220 	}
5221 	if (ret) {
5222 		err = ret;
5223 		goto out_unlock;
5224 	}
5225 
5226 	while (1) {
5227 		ins_nr = 0;
5228 		ret = btrfs_search_forward(root, &min_key,
5229 					   path, trans->transid);
5230 		if (ret < 0) {
5231 			err = ret;
5232 			goto out_unlock;
5233 		}
5234 		if (ret != 0)
5235 			break;
5236 again:
5237 		/* note, ins_nr might be > 0 here, cleanup outside the loop */
5238 		if (min_key.objectid != ino)
5239 			break;
5240 		if (min_key.type > max_key.type)
5241 			break;
5242 
5243 		if (min_key.type == BTRFS_INODE_ITEM_KEY)
5244 			need_log_inode_item = false;
5245 
5246 		if ((min_key.type == BTRFS_INODE_REF_KEY ||
5247 		     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
5248 		    inode->generation == trans->transid &&
5249 		    !recursive_logging) {
5250 			u64 other_ino = 0;
5251 			u64 other_parent = 0;
5252 
5253 			ret = btrfs_check_ref_name_override(path->nodes[0],
5254 					path->slots[0], &min_key, inode,
5255 					&other_ino, &other_parent);
5256 			if (ret < 0) {
5257 				err = ret;
5258 				goto out_unlock;
5259 			} else if (ret > 0 && ctx &&
5260 				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5261 				if (ins_nr > 0) {
5262 					ins_nr++;
5263 				} else {
5264 					ins_nr = 1;
5265 					ins_start_slot = path->slots[0];
5266 				}
5267 				ret = copy_items(trans, inode, dst_path, path,
5268 						 &last_extent, ins_start_slot,
5269 						 ins_nr, inode_only,
5270 						 logged_isize);
5271 				if (ret < 0) {
5272 					err = ret;
5273 					goto out_unlock;
5274 				}
5275 				ins_nr = 0;
5276 
5277 				err = log_conflicting_inodes(trans, root, path,
5278 						ctx, other_ino, other_parent);
5279 				if (err)
5280 					goto out_unlock;
5281 				btrfs_release_path(path);
5282 				goto next_key;
5283 			}
5284 		}
5285 
5286 		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5287 		if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
5288 			if (ins_nr == 0)
5289 				goto next_slot;
5290 			ret = copy_items(trans, inode, dst_path, path,
5291 					 &last_extent, ins_start_slot,
5292 					 ins_nr, inode_only, logged_isize);
5293 			if (ret < 0) {
5294 				err = ret;
5295 				goto out_unlock;
5296 			}
5297 			ins_nr = 0;
5298 			if (ret) {
5299 				btrfs_release_path(path);
5300 				continue;
5301 			}
5302 			goto next_slot;
5303 		}
5304 
5305 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5306 			ins_nr++;
5307 			goto next_slot;
5308 		} else if (!ins_nr) {
5309 			ins_start_slot = path->slots[0];
5310 			ins_nr = 1;
5311 			goto next_slot;
5312 		}
5313 
5314 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
5315 				 ins_start_slot, ins_nr, inode_only,
5316 				 logged_isize);
5317 		if (ret < 0) {
5318 			err = ret;
5319 			goto out_unlock;
5320 		}
5321 		if (ret) {
5322 			ins_nr = 0;
5323 			btrfs_release_path(path);
5324 			continue;
5325 		}
5326 		ins_nr = 1;
5327 		ins_start_slot = path->slots[0];
5328 next_slot:
5329 
5330 		nritems = btrfs_header_nritems(path->nodes[0]);
5331 		path->slots[0]++;
5332 		if (path->slots[0] < nritems) {
5333 			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
5334 					      path->slots[0]);
5335 			goto again;
5336 		}
5337 		if (ins_nr) {
5338 			ret = copy_items(trans, inode, dst_path, path,
5339 					 &last_extent, ins_start_slot,
5340 					 ins_nr, inode_only, logged_isize);
5341 			if (ret < 0) {
5342 				err = ret;
5343 				goto out_unlock;
5344 			}
5345 			ret = 0;
5346 			ins_nr = 0;
5347 		}
5348 		btrfs_release_path(path);
5349 next_key:
5350 		if (min_key.offset < (u64)-1) {
5351 			min_key.offset++;
5352 		} else if (min_key.type < max_key.type) {
5353 			min_key.type++;
5354 			min_key.offset = 0;
5355 		} else {
5356 			break;
5357 		}
5358 	}
5359 	if (ins_nr) {
5360 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
5361 				 ins_start_slot, ins_nr, inode_only,
5362 				 logged_isize);
5363 		if (ret < 0) {
5364 			err = ret;
5365 			goto out_unlock;
5366 		}
5367 		ret = 0;
5368 		ins_nr = 0;
5369 	}
5370 
5371 	btrfs_release_path(path);
5372 	btrfs_release_path(dst_path);
5373 	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
5374 	if (err)
5375 		goto out_unlock;
5376 	xattrs_logged = true;
5377 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5378 		btrfs_release_path(path);
5379 		btrfs_release_path(dst_path);
5380 		err = btrfs_log_trailing_hole(trans, root, inode, path);
5381 		if (err)
5382 			goto out_unlock;
5383 	}
5384 log_extents:
5385 	btrfs_release_path(path);
5386 	btrfs_release_path(dst_path);
5387 	if (need_log_inode_item) {
5388 		err = log_inode_item(trans, log, dst_path, inode);
5389 		if (!err && !xattrs_logged) {
5390 			err = btrfs_log_all_xattrs(trans, root, inode, path,
5391 						   dst_path);
5392 			btrfs_release_path(path);
5393 		}
5394 		if (err)
5395 			goto out_unlock;
5396 	}
5397 	if (fast_search) {
5398 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
5399 						ctx, start, end);
5400 		if (ret) {
5401 			err = ret;
5402 			goto out_unlock;
5403 		}
5404 	} else if (inode_only == LOG_INODE_ALL) {
5405 		struct extent_map *em, *n;
5406 
5407 		write_lock(&em_tree->lock);
5408 		/*
5409 		 * We can't just remove every em if we're called for a ranged
5410 		 * fsync - that is, one that doesn't cover the whole possible
5411 		 * file range (0 to LLONG_MAX). This is because we can have
5412 		 * em's that fall outside the range we're logging and therefore
5413 		 * their ordered operations haven't completed yet
5414 		 * (btrfs_finish_ordered_io() not invoked yet). This means we
5415 		 * didn't get their respective file extent item in the fs/subvol
5416 		 * tree yet, and need to let the next fast fsync (one which
5417 		 * consults the list of modified extent maps) find the em so
5418 		 * that it logs a matching file extent item and waits for the
5419 		 * respective ordered operation to complete (if it's still
5420 		 * running).
5421 		 *
5422 		 * Removing every em outside the range we're logging would make
5423 		 * the next fast fsync not log their matching file extent items,
5424 		 * therefore making us lose data after a log replay.
5425 		 */
5426 		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
5427 					 list) {
5428 			const u64 mod_end = em->mod_start + em->mod_len - 1;
5429 
5430 			if (em->mod_start >= start && mod_end <= end)
5431 				list_del_init(&em->list);
5432 		}
5433 		write_unlock(&em_tree->lock);
5434 	}
5435 
5436 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5437 		ret = log_directory_changes(trans, root, inode, path, dst_path,
5438 					ctx);
5439 		if (ret) {
5440 			err = ret;
5441 			goto out_unlock;
5442 		}
5443 	}
5444 
5445 	/*
5446 	 * Don't update last_log_commit if we logged that an inode exists after
5447 	 * it was loaded to memory (full_sync bit set).
5448 	 * This is to prevent data loss when we do a write to the inode, then
5449 	 * the inode gets evicted after all delalloc was flushed, then we log
5450 	 * it exists (due to a rename for example) and then fsync it. This last
5451 	 * fsync would do nothing (not logging the extents previously written).
5452 	 */
5453 	spin_lock(&inode->lock);
5454 	inode->logged_trans = trans->transid;
5455 	if (inode_only != LOG_INODE_EXISTS ||
5456 	    !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5457 		inode->last_log_commit = inode->last_sub_trans;
5458 	spin_unlock(&inode->lock);
5459 out_unlock:
5460 	mutex_unlock(&inode->log_mutex);
5461 
5462 	btrfs_free_path(path);
5463 	btrfs_free_path(dst_path);
5464 	return err;
5465 }
5466 
5467 /*
5468  * Check if we must fallback to a transaction commit when logging an inode.
5469  * This must be called after logging the inode and is used only in the context
5470  * when fsyncing an inode requires the need to log some other inode - in which
5471  * case we can't lock the i_mutex of each other inode we need to log as that
5472  * can lead to deadlocks with concurrent fsync against other inodes (as we can
5473  * log inodes up or down in the hierarchy) or rename operations for example. So
5474  * we take the log_mutex of the inode after we have logged it and then check for
5475  * its last_unlink_trans value - this is safe because any task setting
5476  * last_unlink_trans must take the log_mutex and it must do this before it does
5477  * the actual unlink operation, so if we do this check before a concurrent task
5478  * sets last_unlink_trans it means we've logged a consistent version/state of
5479  * all the inode items, otherwise we are not sure and must do a transaction
5480  * commit (the concurrent task might have only updated last_unlink_trans before
5481  * we logged the inode or it might have also done the unlink).
5482  */
5483 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5484 					  struct btrfs_inode *inode)
5485 {
5486 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
5487 	bool ret = false;
5488 
5489 	mutex_lock(&inode->log_mutex);
5490 	if (inode->last_unlink_trans > fs_info->last_trans_committed) {
5491 		/*
5492 		 * Make sure any commits to the log are forced to be full
5493 		 * commits.
5494 		 */
5495 		btrfs_set_log_full_commit(trans);
5496 		ret = true;
5497 	}
5498 	mutex_unlock(&inode->log_mutex);
5499 
5500 	return ret;
5501 }
5502 
5503 /*
5504  * follow the dentry parent pointers up the chain and see if any
5505  * of the directories in it require a full commit before they can
5506  * be logged.  Returns zero if nothing special needs to be done or 1 if
5507  * a full commit is required.
5508  */
5509 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
5510 					       struct btrfs_inode *inode,
5511 					       struct dentry *parent,
5512 					       struct super_block *sb,
5513 					       u64 last_committed)
5514 {
5515 	int ret = 0;
5516 	struct dentry *old_parent = NULL;
5517 
5518 	/*
5519 	 * for regular files, if its inode is already on disk, we don't
5520 	 * have to worry about the parents at all.  This is because
5521 	 * we can use the last_unlink_trans field to record renames
5522 	 * and other fun in this file.
5523 	 */
5524 	if (S_ISREG(inode->vfs_inode.i_mode) &&
5525 	    inode->generation <= last_committed &&
5526 	    inode->last_unlink_trans <= last_committed)
5527 		goto out;
5528 
5529 	if (!S_ISDIR(inode->vfs_inode.i_mode)) {
5530 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5531 			goto out;
5532 		inode = BTRFS_I(d_inode(parent));
5533 	}
5534 
5535 	while (1) {
5536 		if (btrfs_must_commit_transaction(trans, inode)) {
5537 			ret = 1;
5538 			break;
5539 		}
5540 
5541 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5542 			break;
5543 
5544 		if (IS_ROOT(parent)) {
5545 			inode = BTRFS_I(d_inode(parent));
5546 			if (btrfs_must_commit_transaction(trans, inode))
5547 				ret = 1;
5548 			break;
5549 		}
5550 
5551 		parent = dget_parent(parent);
5552 		dput(old_parent);
5553 		old_parent = parent;
5554 		inode = BTRFS_I(d_inode(parent));
5555 
5556 	}
5557 	dput(old_parent);
5558 out:
5559 	return ret;
5560 }
5561 
5562 struct btrfs_dir_list {
5563 	u64 ino;
5564 	struct list_head list;
5565 };
5566 
5567 /*
5568  * Log the inodes of the new dentries of a directory. See log_dir_items() for
5569  * details about the why it is needed.
5570  * This is a recursive operation - if an existing dentry corresponds to a
5571  * directory, that directory's new entries are logged too (same behaviour as
5572  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5573  * the dentries point to we do not lock their i_mutex, otherwise lockdep
5574  * complains about the following circular lock dependency / possible deadlock:
5575  *
5576  *        CPU0                                        CPU1
5577  *        ----                                        ----
5578  * lock(&type->i_mutex_dir_key#3/2);
5579  *                                            lock(sb_internal#2);
5580  *                                            lock(&type->i_mutex_dir_key#3/2);
5581  * lock(&sb->s_type->i_mutex_key#14);
5582  *
5583  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5584  * sb_start_intwrite() in btrfs_start_transaction().
5585  * Not locking i_mutex of the inodes is still safe because:
5586  *
5587  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5588  *    that while logging the inode new references (names) are added or removed
5589  *    from the inode, leaving the logged inode item with a link count that does
5590  *    not match the number of logged inode reference items. This is fine because
5591  *    at log replay time we compute the real number of links and correct the
5592  *    link count in the inode item (see replay_one_buffer() and
5593  *    link_to_fixup_dir());
5594  *
5595  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5596  *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5597  *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5598  *    has a size that doesn't match the sum of the lengths of all the logged
5599  *    names. This does not result in a problem because if a dir_item key is
5600  *    logged but its matching dir_index key is not logged, at log replay time we
5601  *    don't use it to replay the respective name (see replay_one_name()). On the
5602  *    other hand if only the dir_index key ends up being logged, the respective
5603  *    name is added to the fs/subvol tree with both the dir_item and dir_index
5604  *    keys created (see replay_one_name()).
5605  *    The directory's inode item with a wrong i_size is not a problem as well,
5606  *    since we don't use it at log replay time to set the i_size in the inode
5607  *    item of the fs/subvol tree (see overwrite_item()).
5608  */
5609 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5610 				struct btrfs_root *root,
5611 				struct btrfs_inode *start_inode,
5612 				struct btrfs_log_ctx *ctx)
5613 {
5614 	struct btrfs_fs_info *fs_info = root->fs_info;
5615 	struct btrfs_root *log = root->log_root;
5616 	struct btrfs_path *path;
5617 	LIST_HEAD(dir_list);
5618 	struct btrfs_dir_list *dir_elem;
5619 	int ret = 0;
5620 
5621 	path = btrfs_alloc_path();
5622 	if (!path)
5623 		return -ENOMEM;
5624 
5625 	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5626 	if (!dir_elem) {
5627 		btrfs_free_path(path);
5628 		return -ENOMEM;
5629 	}
5630 	dir_elem->ino = btrfs_ino(start_inode);
5631 	list_add_tail(&dir_elem->list, &dir_list);
5632 
5633 	while (!list_empty(&dir_list)) {
5634 		struct extent_buffer *leaf;
5635 		struct btrfs_key min_key;
5636 		int nritems;
5637 		int i;
5638 
5639 		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5640 					    list);
5641 		if (ret)
5642 			goto next_dir_inode;
5643 
5644 		min_key.objectid = dir_elem->ino;
5645 		min_key.type = BTRFS_DIR_ITEM_KEY;
5646 		min_key.offset = 0;
5647 again:
5648 		btrfs_release_path(path);
5649 		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5650 		if (ret < 0) {
5651 			goto next_dir_inode;
5652 		} else if (ret > 0) {
5653 			ret = 0;
5654 			goto next_dir_inode;
5655 		}
5656 
5657 process_leaf:
5658 		leaf = path->nodes[0];
5659 		nritems = btrfs_header_nritems(leaf);
5660 		for (i = path->slots[0]; i < nritems; i++) {
5661 			struct btrfs_dir_item *di;
5662 			struct btrfs_key di_key;
5663 			struct inode *di_inode;
5664 			struct btrfs_dir_list *new_dir_elem;
5665 			int log_mode = LOG_INODE_EXISTS;
5666 			int type;
5667 
5668 			btrfs_item_key_to_cpu(leaf, &min_key, i);
5669 			if (min_key.objectid != dir_elem->ino ||
5670 			    min_key.type != BTRFS_DIR_ITEM_KEY)
5671 				goto next_dir_inode;
5672 
5673 			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5674 			type = btrfs_dir_type(leaf, di);
5675 			if (btrfs_dir_transid(leaf, di) < trans->transid &&
5676 			    type != BTRFS_FT_DIR)
5677 				continue;
5678 			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5679 			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5680 				continue;
5681 
5682 			btrfs_release_path(path);
5683 			di_inode = btrfs_iget(fs_info->sb, &di_key, root);
5684 			if (IS_ERR(di_inode)) {
5685 				ret = PTR_ERR(di_inode);
5686 				goto next_dir_inode;
5687 			}
5688 
5689 			if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
5690 				btrfs_add_delayed_iput(di_inode);
5691 				break;
5692 			}
5693 
5694 			ctx->log_new_dentries = false;
5695 			if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
5696 				log_mode = LOG_INODE_ALL;
5697 			ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5698 					      log_mode, 0, LLONG_MAX, ctx);
5699 			if (!ret &&
5700 			    btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
5701 				ret = 1;
5702 			btrfs_add_delayed_iput(di_inode);
5703 			if (ret)
5704 				goto next_dir_inode;
5705 			if (ctx->log_new_dentries) {
5706 				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5707 						       GFP_NOFS);
5708 				if (!new_dir_elem) {
5709 					ret = -ENOMEM;
5710 					goto next_dir_inode;
5711 				}
5712 				new_dir_elem->ino = di_key.objectid;
5713 				list_add_tail(&new_dir_elem->list, &dir_list);
5714 			}
5715 			break;
5716 		}
5717 		if (i == nritems) {
5718 			ret = btrfs_next_leaf(log, path);
5719 			if (ret < 0) {
5720 				goto next_dir_inode;
5721 			} else if (ret > 0) {
5722 				ret = 0;
5723 				goto next_dir_inode;
5724 			}
5725 			goto process_leaf;
5726 		}
5727 		if (min_key.offset < (u64)-1) {
5728 			min_key.offset++;
5729 			goto again;
5730 		}
5731 next_dir_inode:
5732 		list_del(&dir_elem->list);
5733 		kfree(dir_elem);
5734 	}
5735 
5736 	btrfs_free_path(path);
5737 	return ret;
5738 }
5739 
5740 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5741 				 struct btrfs_inode *inode,
5742 				 struct btrfs_log_ctx *ctx)
5743 {
5744 	struct btrfs_fs_info *fs_info = trans->fs_info;
5745 	int ret;
5746 	struct btrfs_path *path;
5747 	struct btrfs_key key;
5748 	struct btrfs_root *root = inode->root;
5749 	const u64 ino = btrfs_ino(inode);
5750 
5751 	path = btrfs_alloc_path();
5752 	if (!path)
5753 		return -ENOMEM;
5754 	path->skip_locking = 1;
5755 	path->search_commit_root = 1;
5756 
5757 	key.objectid = ino;
5758 	key.type = BTRFS_INODE_REF_KEY;
5759 	key.offset = 0;
5760 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5761 	if (ret < 0)
5762 		goto out;
5763 
5764 	while (true) {
5765 		struct extent_buffer *leaf = path->nodes[0];
5766 		int slot = path->slots[0];
5767 		u32 cur_offset = 0;
5768 		u32 item_size;
5769 		unsigned long ptr;
5770 
5771 		if (slot >= btrfs_header_nritems(leaf)) {
5772 			ret = btrfs_next_leaf(root, path);
5773 			if (ret < 0)
5774 				goto out;
5775 			else if (ret > 0)
5776 				break;
5777 			continue;
5778 		}
5779 
5780 		btrfs_item_key_to_cpu(leaf, &key, slot);
5781 		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5782 		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5783 			break;
5784 
5785 		item_size = btrfs_item_size_nr(leaf, slot);
5786 		ptr = btrfs_item_ptr_offset(leaf, slot);
5787 		while (cur_offset < item_size) {
5788 			struct btrfs_key inode_key;
5789 			struct inode *dir_inode;
5790 
5791 			inode_key.type = BTRFS_INODE_ITEM_KEY;
5792 			inode_key.offset = 0;
5793 
5794 			if (key.type == BTRFS_INODE_EXTREF_KEY) {
5795 				struct btrfs_inode_extref *extref;
5796 
5797 				extref = (struct btrfs_inode_extref *)
5798 					(ptr + cur_offset);
5799 				inode_key.objectid = btrfs_inode_extref_parent(
5800 					leaf, extref);
5801 				cur_offset += sizeof(*extref);
5802 				cur_offset += btrfs_inode_extref_name_len(leaf,
5803 					extref);
5804 			} else {
5805 				inode_key.objectid = key.offset;
5806 				cur_offset = item_size;
5807 			}
5808 
5809 			dir_inode = btrfs_iget(fs_info->sb, &inode_key, root);
5810 			/*
5811 			 * If the parent inode was deleted, return an error to
5812 			 * fallback to a transaction commit. This is to prevent
5813 			 * getting an inode that was moved from one parent A to
5814 			 * a parent B, got its former parent A deleted and then
5815 			 * it got fsync'ed, from existing at both parents after
5816 			 * a log replay (and the old parent still existing).
5817 			 * Example:
5818 			 *
5819 			 * mkdir /mnt/A
5820 			 * mkdir /mnt/B
5821 			 * touch /mnt/B/bar
5822 			 * sync
5823 			 * mv /mnt/B/bar /mnt/A/bar
5824 			 * mv -T /mnt/A /mnt/B
5825 			 * fsync /mnt/B/bar
5826 			 * <power fail>
5827 			 *
5828 			 * If we ignore the old parent B which got deleted,
5829 			 * after a log replay we would have file bar linked
5830 			 * at both parents and the old parent B would still
5831 			 * exist.
5832 			 */
5833 			if (IS_ERR(dir_inode)) {
5834 				ret = PTR_ERR(dir_inode);
5835 				goto out;
5836 			}
5837 
5838 			if (ctx)
5839 				ctx->log_new_dentries = false;
5840 			ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5841 					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5842 			if (!ret &&
5843 			    btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
5844 				ret = 1;
5845 			if (!ret && ctx && ctx->log_new_dentries)
5846 				ret = log_new_dir_dentries(trans, root,
5847 						   BTRFS_I(dir_inode), ctx);
5848 			btrfs_add_delayed_iput(dir_inode);
5849 			if (ret)
5850 				goto out;
5851 		}
5852 		path->slots[0]++;
5853 	}
5854 	ret = 0;
5855 out:
5856 	btrfs_free_path(path);
5857 	return ret;
5858 }
5859 
5860 static int log_new_ancestors(struct btrfs_trans_handle *trans,
5861 			     struct btrfs_root *root,
5862 			     struct btrfs_path *path,
5863 			     struct btrfs_log_ctx *ctx)
5864 {
5865 	struct btrfs_key found_key;
5866 
5867 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5868 
5869 	while (true) {
5870 		struct btrfs_fs_info *fs_info = root->fs_info;
5871 		const u64 last_committed = fs_info->last_trans_committed;
5872 		struct extent_buffer *leaf = path->nodes[0];
5873 		int slot = path->slots[0];
5874 		struct btrfs_key search_key;
5875 		struct inode *inode;
5876 		int ret = 0;
5877 
5878 		btrfs_release_path(path);
5879 
5880 		search_key.objectid = found_key.offset;
5881 		search_key.type = BTRFS_INODE_ITEM_KEY;
5882 		search_key.offset = 0;
5883 		inode = btrfs_iget(fs_info->sb, &search_key, root);
5884 		if (IS_ERR(inode))
5885 			return PTR_ERR(inode);
5886 
5887 		if (BTRFS_I(inode)->generation > last_committed)
5888 			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5889 					      LOG_INODE_EXISTS,
5890 					      0, LLONG_MAX, ctx);
5891 		btrfs_add_delayed_iput(inode);
5892 		if (ret)
5893 			return ret;
5894 
5895 		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5896 			break;
5897 
5898 		search_key.type = BTRFS_INODE_REF_KEY;
5899 		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5900 		if (ret < 0)
5901 			return ret;
5902 
5903 		leaf = path->nodes[0];
5904 		slot = path->slots[0];
5905 		if (slot >= btrfs_header_nritems(leaf)) {
5906 			ret = btrfs_next_leaf(root, path);
5907 			if (ret < 0)
5908 				return ret;
5909 			else if (ret > 0)
5910 				return -ENOENT;
5911 			leaf = path->nodes[0];
5912 			slot = path->slots[0];
5913 		}
5914 
5915 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5916 		if (found_key.objectid != search_key.objectid ||
5917 		    found_key.type != BTRFS_INODE_REF_KEY)
5918 			return -ENOENT;
5919 	}
5920 	return 0;
5921 }
5922 
5923 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
5924 				  struct btrfs_inode *inode,
5925 				  struct dentry *parent,
5926 				  struct btrfs_log_ctx *ctx)
5927 {
5928 	struct btrfs_root *root = inode->root;
5929 	struct btrfs_fs_info *fs_info = root->fs_info;
5930 	struct dentry *old_parent = NULL;
5931 	struct super_block *sb = inode->vfs_inode.i_sb;
5932 	int ret = 0;
5933 
5934 	while (true) {
5935 		if (!parent || d_really_is_negative(parent) ||
5936 		    sb != parent->d_sb)
5937 			break;
5938 
5939 		inode = BTRFS_I(d_inode(parent));
5940 		if (root != inode->root)
5941 			break;
5942 
5943 		if (inode->generation > fs_info->last_trans_committed) {
5944 			ret = btrfs_log_inode(trans, root, inode,
5945 					LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
5946 			if (ret)
5947 				break;
5948 		}
5949 		if (IS_ROOT(parent))
5950 			break;
5951 
5952 		parent = dget_parent(parent);
5953 		dput(old_parent);
5954 		old_parent = parent;
5955 	}
5956 	dput(old_parent);
5957 
5958 	return ret;
5959 }
5960 
5961 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
5962 				 struct btrfs_inode *inode,
5963 				 struct dentry *parent,
5964 				 struct btrfs_log_ctx *ctx)
5965 {
5966 	struct btrfs_root *root = inode->root;
5967 	const u64 ino = btrfs_ino(inode);
5968 	struct btrfs_path *path;
5969 	struct btrfs_key search_key;
5970 	int ret;
5971 
5972 	/*
5973 	 * For a single hard link case, go through a fast path that does not
5974 	 * need to iterate the fs/subvolume tree.
5975 	 */
5976 	if (inode->vfs_inode.i_nlink < 2)
5977 		return log_new_ancestors_fast(trans, inode, parent, ctx);
5978 
5979 	path = btrfs_alloc_path();
5980 	if (!path)
5981 		return -ENOMEM;
5982 
5983 	search_key.objectid = ino;
5984 	search_key.type = BTRFS_INODE_REF_KEY;
5985 	search_key.offset = 0;
5986 again:
5987 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5988 	if (ret < 0)
5989 		goto out;
5990 	if (ret == 0)
5991 		path->slots[0]++;
5992 
5993 	while (true) {
5994 		struct extent_buffer *leaf = path->nodes[0];
5995 		int slot = path->slots[0];
5996 		struct btrfs_key found_key;
5997 
5998 		if (slot >= btrfs_header_nritems(leaf)) {
5999 			ret = btrfs_next_leaf(root, path);
6000 			if (ret < 0)
6001 				goto out;
6002 			else if (ret > 0)
6003 				break;
6004 			continue;
6005 		}
6006 
6007 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6008 		if (found_key.objectid != ino ||
6009 		    found_key.type > BTRFS_INODE_EXTREF_KEY)
6010 			break;
6011 
6012 		/*
6013 		 * Don't deal with extended references because they are rare
6014 		 * cases and too complex to deal with (we would need to keep
6015 		 * track of which subitem we are processing for each item in
6016 		 * this loop, etc). So just return some error to fallback to
6017 		 * a transaction commit.
6018 		 */
6019 		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6020 			ret = -EMLINK;
6021 			goto out;
6022 		}
6023 
6024 		/*
6025 		 * Logging ancestors needs to do more searches on the fs/subvol
6026 		 * tree, so it releases the path as needed to avoid deadlocks.
6027 		 * Keep track of the last inode ref key and resume from that key
6028 		 * after logging all new ancestors for the current hard link.
6029 		 */
6030 		memcpy(&search_key, &found_key, sizeof(search_key));
6031 
6032 		ret = log_new_ancestors(trans, root, path, ctx);
6033 		if (ret)
6034 			goto out;
6035 		btrfs_release_path(path);
6036 		goto again;
6037 	}
6038 	ret = 0;
6039 out:
6040 	btrfs_free_path(path);
6041 	return ret;
6042 }
6043 
6044 /*
6045  * helper function around btrfs_log_inode to make sure newly created
6046  * parent directories also end up in the log.  A minimal inode and backref
6047  * only logging is done of any parent directories that are older than
6048  * the last committed transaction
6049  */
6050 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6051 				  struct btrfs_inode *inode,
6052 				  struct dentry *parent,
6053 				  const loff_t start,
6054 				  const loff_t end,
6055 				  int inode_only,
6056 				  struct btrfs_log_ctx *ctx)
6057 {
6058 	struct btrfs_root *root = inode->root;
6059 	struct btrfs_fs_info *fs_info = root->fs_info;
6060 	struct super_block *sb;
6061 	int ret = 0;
6062 	u64 last_committed = fs_info->last_trans_committed;
6063 	bool log_dentries = false;
6064 
6065 	sb = inode->vfs_inode.i_sb;
6066 
6067 	if (btrfs_test_opt(fs_info, NOTREELOG)) {
6068 		ret = 1;
6069 		goto end_no_trans;
6070 	}
6071 
6072 	/*
6073 	 * The prev transaction commit doesn't complete, we need do
6074 	 * full commit by ourselves.
6075 	 */
6076 	if (fs_info->last_trans_log_full_commit >
6077 	    fs_info->last_trans_committed) {
6078 		ret = 1;
6079 		goto end_no_trans;
6080 	}
6081 
6082 	if (btrfs_root_refs(&root->root_item) == 0) {
6083 		ret = 1;
6084 		goto end_no_trans;
6085 	}
6086 
6087 	ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
6088 			last_committed);
6089 	if (ret)
6090 		goto end_no_trans;
6091 
6092 	/*
6093 	 * Skip already logged inodes or inodes corresponding to tmpfiles
6094 	 * (since logging them is pointless, a link count of 0 means they
6095 	 * will never be accessible).
6096 	 */
6097 	if (btrfs_inode_in_log(inode, trans->transid) ||
6098 	    inode->vfs_inode.i_nlink == 0) {
6099 		ret = BTRFS_NO_LOG_SYNC;
6100 		goto end_no_trans;
6101 	}
6102 
6103 	ret = start_log_trans(trans, root, ctx);
6104 	if (ret)
6105 		goto end_no_trans;
6106 
6107 	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
6108 	if (ret)
6109 		goto end_trans;
6110 
6111 	/*
6112 	 * for regular files, if its inode is already on disk, we don't
6113 	 * have to worry about the parents at all.  This is because
6114 	 * we can use the last_unlink_trans field to record renames
6115 	 * and other fun in this file.
6116 	 */
6117 	if (S_ISREG(inode->vfs_inode.i_mode) &&
6118 	    inode->generation <= last_committed &&
6119 	    inode->last_unlink_trans <= last_committed) {
6120 		ret = 0;
6121 		goto end_trans;
6122 	}
6123 
6124 	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
6125 		log_dentries = true;
6126 
6127 	/*
6128 	 * On unlink we must make sure all our current and old parent directory
6129 	 * inodes are fully logged. This is to prevent leaving dangling
6130 	 * directory index entries in directories that were our parents but are
6131 	 * not anymore. Not doing this results in old parent directory being
6132 	 * impossible to delete after log replay (rmdir will always fail with
6133 	 * error -ENOTEMPTY).
6134 	 *
6135 	 * Example 1:
6136 	 *
6137 	 * mkdir testdir
6138 	 * touch testdir/foo
6139 	 * ln testdir/foo testdir/bar
6140 	 * sync
6141 	 * unlink testdir/bar
6142 	 * xfs_io -c fsync testdir/foo
6143 	 * <power failure>
6144 	 * mount fs, triggers log replay
6145 	 *
6146 	 * If we don't log the parent directory (testdir), after log replay the
6147 	 * directory still has an entry pointing to the file inode using the bar
6148 	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
6149 	 * the file inode has a link count of 1.
6150 	 *
6151 	 * Example 2:
6152 	 *
6153 	 * mkdir testdir
6154 	 * touch foo
6155 	 * ln foo testdir/foo2
6156 	 * ln foo testdir/foo3
6157 	 * sync
6158 	 * unlink testdir/foo3
6159 	 * xfs_io -c fsync foo
6160 	 * <power failure>
6161 	 * mount fs, triggers log replay
6162 	 *
6163 	 * Similar as the first example, after log replay the parent directory
6164 	 * testdir still has an entry pointing to the inode file with name foo3
6165 	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
6166 	 * and has a link count of 2.
6167 	 */
6168 	if (inode->last_unlink_trans > last_committed) {
6169 		ret = btrfs_log_all_parents(trans, inode, ctx);
6170 		if (ret)
6171 			goto end_trans;
6172 	}
6173 
6174 	ret = log_all_new_ancestors(trans, inode, parent, ctx);
6175 	if (ret)
6176 		goto end_trans;
6177 
6178 	if (log_dentries)
6179 		ret = log_new_dir_dentries(trans, root, inode, ctx);
6180 	else
6181 		ret = 0;
6182 end_trans:
6183 	if (ret < 0) {
6184 		btrfs_set_log_full_commit(trans);
6185 		ret = 1;
6186 	}
6187 
6188 	if (ret)
6189 		btrfs_remove_log_ctx(root, ctx);
6190 	btrfs_end_log_trans(root);
6191 end_no_trans:
6192 	return ret;
6193 }
6194 
6195 /*
6196  * it is not safe to log dentry if the chunk root has added new
6197  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
6198  * If this returns 1, you must commit the transaction to safely get your
6199  * data on disk.
6200  */
6201 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
6202 			  struct dentry *dentry,
6203 			  const loff_t start,
6204 			  const loff_t end,
6205 			  struct btrfs_log_ctx *ctx)
6206 {
6207 	struct dentry *parent = dget_parent(dentry);
6208 	int ret;
6209 
6210 	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
6211 				     start, end, LOG_INODE_ALL, ctx);
6212 	dput(parent);
6213 
6214 	return ret;
6215 }
6216 
6217 /*
6218  * should be called during mount to recover any replay any log trees
6219  * from the FS
6220  */
6221 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
6222 {
6223 	int ret;
6224 	struct btrfs_path *path;
6225 	struct btrfs_trans_handle *trans;
6226 	struct btrfs_key key;
6227 	struct btrfs_key found_key;
6228 	struct btrfs_key tmp_key;
6229 	struct btrfs_root *log;
6230 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
6231 	struct walk_control wc = {
6232 		.process_func = process_one_buffer,
6233 		.stage = LOG_WALK_PIN_ONLY,
6234 	};
6235 
6236 	path = btrfs_alloc_path();
6237 	if (!path)
6238 		return -ENOMEM;
6239 
6240 	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6241 
6242 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
6243 	if (IS_ERR(trans)) {
6244 		ret = PTR_ERR(trans);
6245 		goto error;
6246 	}
6247 
6248 	wc.trans = trans;
6249 	wc.pin = 1;
6250 
6251 	ret = walk_log_tree(trans, log_root_tree, &wc);
6252 	if (ret) {
6253 		btrfs_handle_fs_error(fs_info, ret,
6254 			"Failed to pin buffers while recovering log root tree.");
6255 		goto error;
6256 	}
6257 
6258 again:
6259 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
6260 	key.offset = (u64)-1;
6261 	key.type = BTRFS_ROOT_ITEM_KEY;
6262 
6263 	while (1) {
6264 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
6265 
6266 		if (ret < 0) {
6267 			btrfs_handle_fs_error(fs_info, ret,
6268 				    "Couldn't find tree log root.");
6269 			goto error;
6270 		}
6271 		if (ret > 0) {
6272 			if (path->slots[0] == 0)
6273 				break;
6274 			path->slots[0]--;
6275 		}
6276 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
6277 				      path->slots[0]);
6278 		btrfs_release_path(path);
6279 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
6280 			break;
6281 
6282 		log = btrfs_read_fs_root(log_root_tree, &found_key);
6283 		if (IS_ERR(log)) {
6284 			ret = PTR_ERR(log);
6285 			btrfs_handle_fs_error(fs_info, ret,
6286 				    "Couldn't read tree log root.");
6287 			goto error;
6288 		}
6289 
6290 		tmp_key.objectid = found_key.offset;
6291 		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
6292 		tmp_key.offset = (u64)-1;
6293 
6294 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
6295 		if (IS_ERR(wc.replay_dest)) {
6296 			ret = PTR_ERR(wc.replay_dest);
6297 			free_extent_buffer(log->node);
6298 			free_extent_buffer(log->commit_root);
6299 			kfree(log);
6300 			btrfs_handle_fs_error(fs_info, ret,
6301 				"Couldn't read target root for tree log recovery.");
6302 			goto error;
6303 		}
6304 
6305 		wc.replay_dest->log_root = log;
6306 		btrfs_record_root_in_trans(trans, wc.replay_dest);
6307 		ret = walk_log_tree(trans, log, &wc);
6308 
6309 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6310 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
6311 						      path);
6312 		}
6313 
6314 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6315 			struct btrfs_root *root = wc.replay_dest;
6316 
6317 			btrfs_release_path(path);
6318 
6319 			/*
6320 			 * We have just replayed everything, and the highest
6321 			 * objectid of fs roots probably has changed in case
6322 			 * some inode_item's got replayed.
6323 			 *
6324 			 * root->objectid_mutex is not acquired as log replay
6325 			 * could only happen during mount.
6326 			 */
6327 			ret = btrfs_find_highest_objectid(root,
6328 						  &root->highest_objectid);
6329 		}
6330 
6331 		key.offset = found_key.offset - 1;
6332 		wc.replay_dest->log_root = NULL;
6333 		free_extent_buffer(log->node);
6334 		free_extent_buffer(log->commit_root);
6335 		kfree(log);
6336 
6337 		if (ret)
6338 			goto error;
6339 
6340 		if (found_key.offset == 0)
6341 			break;
6342 	}
6343 	btrfs_release_path(path);
6344 
6345 	/* step one is to pin it all, step two is to replay just inodes */
6346 	if (wc.pin) {
6347 		wc.pin = 0;
6348 		wc.process_func = replay_one_buffer;
6349 		wc.stage = LOG_WALK_REPLAY_INODES;
6350 		goto again;
6351 	}
6352 	/* step three is to replay everything */
6353 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
6354 		wc.stage++;
6355 		goto again;
6356 	}
6357 
6358 	btrfs_free_path(path);
6359 
6360 	/* step 4: commit the transaction, which also unpins the blocks */
6361 	ret = btrfs_commit_transaction(trans);
6362 	if (ret)
6363 		return ret;
6364 
6365 	free_extent_buffer(log_root_tree->node);
6366 	log_root_tree->log_root = NULL;
6367 	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6368 	kfree(log_root_tree);
6369 
6370 	return 0;
6371 error:
6372 	if (wc.trans)
6373 		btrfs_end_transaction(wc.trans);
6374 	btrfs_free_path(path);
6375 	return ret;
6376 }
6377 
6378 /*
6379  * there are some corner cases where we want to force a full
6380  * commit instead of allowing a directory to be logged.
6381  *
6382  * They revolve around files there were unlinked from the directory, and
6383  * this function updates the parent directory so that a full commit is
6384  * properly done if it is fsync'd later after the unlinks are done.
6385  *
6386  * Must be called before the unlink operations (updates to the subvolume tree,
6387  * inodes, etc) are done.
6388  */
6389 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
6390 			     struct btrfs_inode *dir, struct btrfs_inode *inode,
6391 			     int for_rename)
6392 {
6393 	/*
6394 	 * when we're logging a file, if it hasn't been renamed
6395 	 * or unlinked, and its inode is fully committed on disk,
6396 	 * we don't have to worry about walking up the directory chain
6397 	 * to log its parents.
6398 	 *
6399 	 * So, we use the last_unlink_trans field to put this transid
6400 	 * into the file.  When the file is logged we check it and
6401 	 * don't log the parents if the file is fully on disk.
6402 	 */
6403 	mutex_lock(&inode->log_mutex);
6404 	inode->last_unlink_trans = trans->transid;
6405 	mutex_unlock(&inode->log_mutex);
6406 
6407 	/*
6408 	 * if this directory was already logged any new
6409 	 * names for this file/dir will get recorded
6410 	 */
6411 	if (dir->logged_trans == trans->transid)
6412 		return;
6413 
6414 	/*
6415 	 * if the inode we're about to unlink was logged,
6416 	 * the log will be properly updated for any new names
6417 	 */
6418 	if (inode->logged_trans == trans->transid)
6419 		return;
6420 
6421 	/*
6422 	 * when renaming files across directories, if the directory
6423 	 * there we're unlinking from gets fsync'd later on, there's
6424 	 * no way to find the destination directory later and fsync it
6425 	 * properly.  So, we have to be conservative and force commits
6426 	 * so the new name gets discovered.
6427 	 */
6428 	if (for_rename)
6429 		goto record;
6430 
6431 	/* we can safely do the unlink without any special recording */
6432 	return;
6433 
6434 record:
6435 	mutex_lock(&dir->log_mutex);
6436 	dir->last_unlink_trans = trans->transid;
6437 	mutex_unlock(&dir->log_mutex);
6438 }
6439 
6440 /*
6441  * Make sure that if someone attempts to fsync the parent directory of a deleted
6442  * snapshot, it ends up triggering a transaction commit. This is to guarantee
6443  * that after replaying the log tree of the parent directory's root we will not
6444  * see the snapshot anymore and at log replay time we will not see any log tree
6445  * corresponding to the deleted snapshot's root, which could lead to replaying
6446  * it after replaying the log tree of the parent directory (which would replay
6447  * the snapshot delete operation).
6448  *
6449  * Must be called before the actual snapshot destroy operation (updates to the
6450  * parent root and tree of tree roots trees, etc) are done.
6451  */
6452 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
6453 				   struct btrfs_inode *dir)
6454 {
6455 	mutex_lock(&dir->log_mutex);
6456 	dir->last_unlink_trans = trans->transid;
6457 	mutex_unlock(&dir->log_mutex);
6458 }
6459 
6460 /*
6461  * Call this after adding a new name for a file and it will properly
6462  * update the log to reflect the new name.
6463  *
6464  * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
6465  * true (because it's not used).
6466  *
6467  * Return value depends on whether @sync_log is true or false.
6468  * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
6469  *            committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
6470  *            otherwise.
6471  * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
6472  *             to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
6473  *             or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
6474  *             committed (without attempting to sync the log).
6475  */
6476 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
6477 			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
6478 			struct dentry *parent,
6479 			bool sync_log, struct btrfs_log_ctx *ctx)
6480 {
6481 	struct btrfs_fs_info *fs_info = trans->fs_info;
6482 	int ret;
6483 
6484 	/*
6485 	 * this will force the logging code to walk the dentry chain
6486 	 * up for the file
6487 	 */
6488 	if (!S_ISDIR(inode->vfs_inode.i_mode))
6489 		inode->last_unlink_trans = trans->transid;
6490 
6491 	/*
6492 	 * if this inode hasn't been logged and directory we're renaming it
6493 	 * from hasn't been logged, we don't need to log it
6494 	 */
6495 	if (inode->logged_trans <= fs_info->last_trans_committed &&
6496 	    (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
6497 		return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
6498 			BTRFS_DONT_NEED_LOG_SYNC;
6499 
6500 	if (sync_log) {
6501 		struct btrfs_log_ctx ctx2;
6502 
6503 		btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
6504 		ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
6505 					     LOG_INODE_EXISTS, &ctx2);
6506 		if (ret == BTRFS_NO_LOG_SYNC)
6507 			return BTRFS_DONT_NEED_TRANS_COMMIT;
6508 		else if (ret)
6509 			return BTRFS_NEED_TRANS_COMMIT;
6510 
6511 		ret = btrfs_sync_log(trans, inode->root, &ctx2);
6512 		if (ret)
6513 			return BTRFS_NEED_TRANS_COMMIT;
6514 		return BTRFS_DONT_NEED_TRANS_COMMIT;
6515 	}
6516 
6517 	ASSERT(ctx);
6518 	ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
6519 				     LOG_INODE_EXISTS, ctx);
6520 	if (ret == BTRFS_NO_LOG_SYNC)
6521 		return BTRFS_DONT_NEED_LOG_SYNC;
6522 	else if (ret)
6523 		return BTRFS_NEED_TRANS_COMMIT;
6524 
6525 	return BTRFS_NEED_LOG_SYNC;
6526 }
6527 
6528