xref: /openbmc/linux/fs/btrfs/tree-log.c (revision 160b8e75)
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02110-1301, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/blkdev.h>
22 #include <linux/list_sort.h>
23 #include <linux/iversion.h>
24 #include "tree-log.h"
25 #include "disk-io.h"
26 #include "locking.h"
27 #include "print-tree.h"
28 #include "backref.h"
29 #include "hash.h"
30 #include "compression.h"
31 #include "qgroup.h"
32 
33 /* magic values for the inode_only field in btrfs_log_inode:
34  *
35  * LOG_INODE_ALL means to log everything
36  * LOG_INODE_EXISTS means to log just enough to recreate the inode
37  * during log replay
 * LOG_OTHER_INODE means we are logging some inode other than the one the
 * fsync was made against; it behaves like LOG_INODE_EXISTS but takes the
 * inode's log_mutex with a nested lock class
38  */
39 #define LOG_INODE_ALL 0
40 #define LOG_INODE_EXISTS 1
41 #define LOG_OTHER_INODE 2
42 
43 /*
44  * directory trouble cases
45  *
46  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
47  * log, we must force a full commit before doing an fsync of the directory
48  * where the unlink was done.
49  * ---> record transid of last unlink/rename per directory
50  *
51  * mkdir foo/some_dir
52  * normal commit
53  * rename foo/some_dir foo2/some_dir
54  * mkdir foo/some_dir
55  * fsync foo/some_dir/some_file
56  *
57  * The fsync above will unlink the original some_dir without recording
58  * it in its new location (foo2).  After a crash, some_dir will be gone
59  * unless the fsync of some_file forces a full commit
60  *
61  * 2) we must log any new names for any file or dir that is in the fsync
62  * log. ---> check inode while renaming/linking.
63  *
64  * 2a) we must log any new names for any file or dir during rename
65  * when the directory they are being removed from was logged.
66  * ---> check inode and old parent dir during rename
67  *
68  *  2a is actually the more important variant.  Without the extra logging
69  *  a crash might unlink the old name without recreating the new one
70  *
71  * 3) after a crash, we must go through any directories with a link count
72  * of zero and redo the rm -rf
73  *
74  * mkdir f1/foo
75  * normal commit
76  * rm -rf f1/foo
77  * fsync(f1)
78  *
79  * The directory f1 was fully removed from the FS, but fsync was never
80  * called on f1, only its parent dir.  After a crash the rm -rf must
81  * be replayed.  This must be able to recurse down the entire
82  * directory tree.  The inode link count fixup code takes care of the
83  * ugly details.
84  */
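
/*
 * An illustrative sequence for case 2a (added for clarity; the names and
 * exact commands are hypothetical):
 *
 * mkdir dir1
 * touch dir1/file
 * fsync(dir1)               <- dir1 is now in the fsync log
 * mv dir1/file dir2/file    <- old parent dir1 was logged
 * fsync(dir1)
 *
 * Because dir1 was logged, the rename must log file's new name in dir2
 * as well; otherwise a crash after the last fsync could replay the
 * unlink of the old name without recreating the new one.
 */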
85 
86 /*
87  * stage (0) is to only pin down the blocks we find,
88  * stage (0) is to only pin down the blocks we find
89  * the second stage (1) is to make sure that all the inodes
90  * we find in the log are created in the subvolume.
91  *
92  * The last stage is to deal with directories and links and extents
93  * and all the other fun semantics
94  */
95 #define LOG_WALK_PIN_ONLY 0
96 #define LOG_WALK_REPLAY_INODES 1
97 #define LOG_WALK_REPLAY_DIR_INDEX 2
98 #define LOG_WALK_REPLAY_ALL 3
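
/*
 * A sketch of how replay drives these stages (simplified; see
 * btrfs_recover_log_trees() for the authoritative flow): each log tree
 * is walked once per stage, in increasing order, roughly
 *
 *	for (stage = LOG_WALK_PIN_ONLY; stage <= LOG_WALK_REPLAY_ALL; stage++)
 *		walk_log_tree(...);	\/\* with wc.stage == stage \*\/
 */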
99 
100 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
101 			   struct btrfs_root *root, struct btrfs_inode *inode,
102 			   int inode_only,
103 			   const loff_t start,
104 			   const loff_t end,
105 			   struct btrfs_log_ctx *ctx);
106 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
107 			     struct btrfs_root *root,
108 			     struct btrfs_path *path, u64 objectid);
109 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
110 				       struct btrfs_root *root,
111 				       struct btrfs_root *log,
112 				       struct btrfs_path *path,
113 				       u64 dirid, int del_all);
114 
115 /*
116  * tree logging is a special write ahead log used to make sure that
117  * fsyncs and O_SYNCs can happen without doing full tree commits.
118  *
119  * Full tree commits are expensive because they require commonly
120  * modified blocks to be recowed, creating many dirty pages in the
121  * extent tree and a 4x-6x higher write load than ext3.
122  *
123  * Instead of doing a tree commit on every fsync, we use the
124  * key ranges and transaction ids to find items for a given file or directory
125  * that have changed in this transaction.  Those items are copied into
126  * a special tree (one per subvolume root), that tree is written to disk
127  * and then the fsync is considered complete.
128  *
129  * After a crash, items are copied out of the log-tree back into the
130  * subvolume tree.  Any file data extents found are recorded in the extent
131  * allocation tree, and the log-tree freed.
132  *
133  * The log tree is read three times: once to pin down all the extents it is
134  * using in ram, once to create all the inodes logged in the tree
135  * and once to do all the other items.
136  */
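
/*
 * In outline (a simplified sketch of the call path): an fsync reaches
 * btrfs_sync_file(), which logs the inode through btrfs_log_dentry_safe()
 * and btrfs_log_inode_parent() (ending up in btrfs_log_inode() in this
 * file), and then writes the log tree out with btrfs_sync_log().
 */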
137 
138 /*
139  * start a sub transaction and setup the log tree
140  * this increments the log tree writer count to make the people
141  * syncing the tree wait for us to finish
142  */
143 static int start_log_trans(struct btrfs_trans_handle *trans,
144 			   struct btrfs_root *root,
145 			   struct btrfs_log_ctx *ctx)
146 {
147 	struct btrfs_fs_info *fs_info = root->fs_info;
148 	int ret = 0;
149 
150 	mutex_lock(&root->log_mutex);
151 
152 	if (root->log_root) {
153 		if (btrfs_need_log_full_commit(fs_info, trans)) {
154 			ret = -EAGAIN;
155 			goto out;
156 		}
157 
158 		if (!root->log_start_pid) {
159 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
160 			root->log_start_pid = current->pid;
161 		} else if (root->log_start_pid != current->pid) {
162 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
163 		}
164 	} else {
165 		mutex_lock(&fs_info->tree_log_mutex);
166 		if (!fs_info->log_root_tree)
167 			ret = btrfs_init_log_root_tree(trans, fs_info);
168 		mutex_unlock(&fs_info->tree_log_mutex);
169 		if (ret)
170 			goto out;
171 
172 		ret = btrfs_add_log_tree(trans, root);
173 		if (ret)
174 			goto out;
175 
176 		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
177 		root->log_start_pid = current->pid;
178 	}
179 
180 	atomic_inc(&root->log_batch);
181 	atomic_inc(&root->log_writers);
182 	if (ctx) {
183 		int index = root->log_transid % 2;
184 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
185 		ctx->log_transid = root->log_transid;
186 	}
187 
188 out:
189 	mutex_unlock(&root->log_mutex);
190 	return ret;
191 }
192 
193 /*
194  * returns 0 if there was a log transaction running and we were able
195  * to join, or returns -ENOENT if there were no transactions
196  * in progress
197  */
198 static int join_running_log_trans(struct btrfs_root *root)
199 {
200 	int ret = -ENOENT;
201 
202 	smp_mb();
203 	if (!root->log_root)
204 		return -ENOENT;
205 
206 	mutex_lock(&root->log_mutex);
207 	if (root->log_root) {
208 		ret = 0;
209 		atomic_inc(&root->log_writers);
210 	}
211 	mutex_unlock(&root->log_mutex);
212 	return ret;
213 }
214 
215 /*
216  * This either makes the current running log transaction wait
217  * until you call btrfs_end_log_trans() or it makes any future
218  * log transactions wait until you call btrfs_end_log_trans()
219  */
220 int btrfs_pin_log_trans(struct btrfs_root *root)
221 {
222 	int ret = -ENOENT;
223 
224 	mutex_lock(&root->log_mutex);
225 	atomic_inc(&root->log_writers);
226 	mutex_unlock(&root->log_mutex);
227 	return ret;
228 }
229 
230 /*
231  * indicate we're done making changes to the log tree
232  * and wake up anyone waiting to do a sync
233  */
234 void btrfs_end_log_trans(struct btrfs_root *root)
235 {
236 	if (atomic_dec_and_test(&root->log_writers)) {
237 		/*
238 		 * Implicit memory barrier after atomic_dec_and_test
239 		 */
240 		if (waitqueue_active(&root->log_writer_wait))
241 			wake_up(&root->log_writer_wait);
242 	}
243 }
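
/*
 * Usage sketch (illustrative): writers of the log tree bracket their
 * updates with the helpers above, e.g.
 *
 *	ret = start_log_trans(trans, root, ctx);
 *	if (ret)
 *		return ret;
 *	...insert items into root->log_root...
 *	btrfs_end_log_trans(root);
 *
 * so that btrfs_sync_log() can wait for root->log_writers to reach zero
 * before it writes the log tree to disk.
 */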
244 
245 
246 /*
247  * the walk control struct is used to pass state down the chain when
248  * processing the log tree.  The stage field tells us which part
249  * of the log tree processing we are currently doing.  The others
250  * are state fields used for that specific part
251  */
252 struct walk_control {
253 	/* should we free the extent on disk when done?  This is used
254 	 * at transaction commit time while freeing a log tree
255 	 */
256 	int free;
257 
258 	/* should we write out the extent buffer?  This is used
259 	 * while flushing the log tree to disk during a sync
260 	 */
261 	int write;
262 
263 	/* should we wait for the extent buffer io to finish?  Also used
264 	 * while flushing the log tree to disk for a sync
265 	 */
266 	int wait;
267 
268 	/* pin only walk, we record which extents on disk belong to the
269 	 * log trees
270 	 */
271 	int pin;
272 
273 	/* what stage of the replay code we're currently in */
274 	int stage;
275 
276 	/* the root we are currently replaying */
277 	struct btrfs_root *replay_dest;
278 
279 	/* the trans handle for the current replay */
280 	struct btrfs_trans_handle *trans;
281 
282 	/* the function that gets used to process blocks we find in the
283 	 * tree.  Note the extent_buffer might not be up to date when it is
284 	 * passed in, and it must be checked or read if you need the data
285 	 * inside it
286 	 */
287 	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
288 			    struct walk_control *wc, u64 gen);
289 };
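
/*
 * For example (a sketch based on how this struct is used elsewhere in
 * this file): freeing a log tree at commit time uses roughly
 *
 *	struct walk_control wc = {
 *		.free = 1,
 *		.process_func = process_one_buffer
 *	};
 *
 * while log recovery starts with .stage = LOG_WALK_PIN_ONLY and .pin = 1,
 * advancing the stage on each subsequent walk.
 */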
290 
291 /*
292  * process_func used to pin down extents, write them or wait on them
293  */
294 static int process_one_buffer(struct btrfs_root *log,
295 			      struct extent_buffer *eb,
296 			      struct walk_control *wc, u64 gen)
297 {
298 	struct btrfs_fs_info *fs_info = log->fs_info;
299 	int ret = 0;
300 
301 	/*
302 	 * If this fs is mixed then we need to be able to process the leaves to
303 	 * pin down any logged extents, so we have to read the block.
304 	 */
305 	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
306 		ret = btrfs_read_buffer(eb, gen);
307 		if (ret)
308 			return ret;
309 	}
310 
311 	if (wc->pin)
312 		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
313 						      eb->len);
314 
315 	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
316 		if (wc->pin && btrfs_header_level(eb) == 0)
317 			ret = btrfs_exclude_logged_extents(fs_info, eb);
318 		if (wc->write)
319 			btrfs_write_tree_block(eb);
320 		if (wc->wait)
321 			btrfs_wait_tree_block_writeback(eb);
322 	}
323 	return ret;
324 }
325 
326 /*
327  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
328  * to the src data we are copying out.
329  *
330  * root is the tree we are copying into, and path is a scratch
331  * path for use in this function (it should be released on entry and
332  * will be released on exit).
333  *
334  * If the key is already in the destination tree the existing item is
335  * overwritten.  If the existing item isn't big enough, it is extended.
336  * If it is too large, it is truncated.
337  *
338  * If the key isn't in the destination yet, a new item is inserted.
339  */
340 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
341 				   struct btrfs_root *root,
342 				   struct btrfs_path *path,
343 				   struct extent_buffer *eb, int slot,
344 				   struct btrfs_key *key)
345 {
346 	struct btrfs_fs_info *fs_info = root->fs_info;
347 	int ret;
348 	u32 item_size;
349 	u64 saved_i_size = 0;
350 	int save_old_i_size = 0;
351 	unsigned long src_ptr;
352 	unsigned long dst_ptr;
353 	int overwrite_root = 0;
354 	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
355 
356 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
357 		overwrite_root = 1;
358 
359 	item_size = btrfs_item_size_nr(eb, slot);
360 	src_ptr = btrfs_item_ptr_offset(eb, slot);
361 
362 	/* look for the key in the destination tree */
363 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
364 	if (ret < 0)
365 		return ret;
366 
367 	if (ret == 0) {
368 		char *src_copy;
369 		char *dst_copy;
370 		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
371 						  path->slots[0]);
372 		if (dst_size != item_size)
373 			goto insert;
374 
375 		if (item_size == 0) {
376 			btrfs_release_path(path);
377 			return 0;
378 		}
379 		dst_copy = kmalloc(item_size, GFP_NOFS);
380 		src_copy = kmalloc(item_size, GFP_NOFS);
381 		if (!dst_copy || !src_copy) {
382 			btrfs_release_path(path);
383 			kfree(dst_copy);
384 			kfree(src_copy);
385 			return -ENOMEM;
386 		}
387 
388 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
389 
390 		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
391 		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
392 				   item_size);
393 		ret = memcmp(dst_copy, src_copy, item_size);
394 
395 		kfree(dst_copy);
396 		kfree(src_copy);
397 		/*
398 		 * they have the same contents, just return, this saves
399 		 * us from cowing blocks in the destination tree and doing
400 		 * extra writes that may not have been done by a previous
401 		 * sync
402 		 */
403 		if (ret == 0) {
404 			btrfs_release_path(path);
405 			return 0;
406 		}
407 
408 		/*
409 		 * We need to load the old nbytes into the inode so when we
410 		 * replay the extents we've logged we get the right nbytes.
411 		 */
412 		if (inode_item) {
413 			struct btrfs_inode_item *item;
414 			u64 nbytes;
415 			u32 mode;
416 
417 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
418 					      struct btrfs_inode_item);
419 			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
420 			item = btrfs_item_ptr(eb, slot,
421 					      struct btrfs_inode_item);
422 			btrfs_set_inode_nbytes(eb, item, nbytes);
423 
424 			/*
425 			 * If this is a directory we need to reset the i_size to
426 			 * 0 so that we can set it up properly when replaying
427 			 * the rest of the items in this log.
428 			 */
429 			mode = btrfs_inode_mode(eb, item);
430 			if (S_ISDIR(mode))
431 				btrfs_set_inode_size(eb, item, 0);
432 		}
433 	} else if (inode_item) {
434 		struct btrfs_inode_item *item;
435 		u32 mode;
436 
437 		/*
438 		 * New inode, set nbytes to 0 so that the nbytes comes out
439 		 * properly when we replay the extents.
440 		 */
441 		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
442 		btrfs_set_inode_nbytes(eb, item, 0);
443 
444 		/*
445 		 * If this is a directory we need to reset the i_size to 0 so
446 		 * that we can set it up properly when replaying the rest of
447 		 * the items in this log.
448 		 */
449 		mode = btrfs_inode_mode(eb, item);
450 		if (S_ISDIR(mode))
451 			btrfs_set_inode_size(eb, item, 0);
452 	}
453 insert:
454 	btrfs_release_path(path);
455 	/* try to insert the key into the destination tree */
456 	path->skip_release_on_error = 1;
457 	ret = btrfs_insert_empty_item(trans, root, path,
458 				      key, item_size);
459 	path->skip_release_on_error = 0;
460 
461 	/* make sure any existing item is the correct size */
462 	if (ret == -EEXIST || ret == -EOVERFLOW) {
463 		u32 found_size;
464 		found_size = btrfs_item_size_nr(path->nodes[0],
465 						path->slots[0]);
466 		if (found_size > item_size)
467 			btrfs_truncate_item(fs_info, path, item_size, 1);
468 		else if (found_size < item_size)
469 			btrfs_extend_item(fs_info, path,
470 					  item_size - found_size);
471 	} else if (ret) {
472 		return ret;
473 	}
474 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
475 					path->slots[0]);
476 
477 	/* don't overwrite an existing inode if the generation number
478 	 * was logged as zero.  This is done when the tree logging code
479 	 * is just logging an inode to make sure it exists after recovery.
480 	 *
481 	 * Also, don't overwrite i_size on directories during replay.
482 	 * log replay inserts and removes directory items based on the
483 	 * state of the tree found in the subvolume, and i_size is modified
484 	 * as it goes
485 	 */
486 	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
487 		struct btrfs_inode_item *src_item;
488 		struct btrfs_inode_item *dst_item;
489 
490 		src_item = (struct btrfs_inode_item *)src_ptr;
491 		dst_item = (struct btrfs_inode_item *)dst_ptr;
492 
493 		if (btrfs_inode_generation(eb, src_item) == 0) {
494 			struct extent_buffer *dst_eb = path->nodes[0];
495 			const u64 ino_size = btrfs_inode_size(eb, src_item);
496 
497 			/*
498 			 * For regular files an ino_size == 0 is used only when
499 			 * logging that an inode exists, as part of a directory
500 			 * fsync, and the inode wasn't fsynced before. In this
501 			 * case don't set the size of the inode in the fs/subvol
502 			 * tree, otherwise we would be throwing valid data away.
503 			 */
504 			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
505 			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
506 			    ino_size != 0) {
507 				struct btrfs_map_token token;
508 
509 				btrfs_init_map_token(&token);
510 				btrfs_set_token_inode_size(dst_eb, dst_item,
511 							   ino_size, &token);
512 			}
513 			goto no_copy;
514 		}
515 
516 		if (overwrite_root &&
517 		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
518 		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
519 			save_old_i_size = 1;
520 			saved_i_size = btrfs_inode_size(path->nodes[0],
521 							dst_item);
522 		}
523 	}
524 
525 	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
526 			   src_ptr, item_size);
527 
528 	if (save_old_i_size) {
529 		struct btrfs_inode_item *dst_item;
530 		dst_item = (struct btrfs_inode_item *)dst_ptr;
531 		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
532 	}
533 
534 	/* make sure the generation is filled in */
535 	if (key->type == BTRFS_INODE_ITEM_KEY) {
536 		struct btrfs_inode_item *dst_item;
537 		dst_item = (struct btrfs_inode_item *)dst_ptr;
538 		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
539 			btrfs_set_inode_generation(path->nodes[0], dst_item,
540 						   trans->transid);
541 		}
542 	}
543 no_copy:
544 	btrfs_mark_buffer_dirty(path->nodes[0]);
545 	btrfs_release_path(path);
546 	return 0;
547 }
548 
549 /*
550  * simple helper to read an inode off the disk from a given root
551  * This can only be called for subvolume roots and not for the log
552  */
553 static noinline struct inode *read_one_inode(struct btrfs_root *root,
554 					     u64 objectid)
555 {
556 	struct btrfs_key key;
557 	struct inode *inode;
558 
559 	key.objectid = objectid;
560 	key.type = BTRFS_INODE_ITEM_KEY;
561 	key.offset = 0;
562 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
563 	if (IS_ERR(inode)) {
564 		inode = NULL;
565 	} else if (is_bad_inode(inode)) {
566 		iput(inode);
567 		inode = NULL;
568 	}
569 	return inode;
570 }
571 
572 /* replays a single extent in 'eb' at 'slot' with 'key' into the
573  * subvolume 'root'.  path is released on entry and should be released
574  * on exit.
575  *
576  * extents in the log tree have not been allocated out of the extent
577  * tree yet.  So, this completes the allocation, taking a reference
578  * as required if the extent already exists or creating a new extent
579  * if it isn't in the extent allocation tree yet.
580  *
581  * The extent is inserted into the file, dropping any existing extents
582  * from the file that overlap the new one.
583  */
584 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
585 				      struct btrfs_root *root,
586 				      struct btrfs_path *path,
587 				      struct extent_buffer *eb, int slot,
588 				      struct btrfs_key *key)
589 {
590 	struct btrfs_fs_info *fs_info = root->fs_info;
591 	int found_type;
592 	u64 extent_end;
593 	u64 start = key->offset;
594 	u64 nbytes = 0;
595 	struct btrfs_file_extent_item *item;
596 	struct inode *inode = NULL;
597 	unsigned long size;
598 	int ret = 0;
599 
600 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
601 	found_type = btrfs_file_extent_type(eb, item);
602 
603 	if (found_type == BTRFS_FILE_EXTENT_REG ||
604 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
605 		nbytes = btrfs_file_extent_num_bytes(eb, item);
606 		extent_end = start + nbytes;
607 
608 		/*
609 		 * We don't add to the inode's nbytes if we are prealloc or a
610 		 * hole.
611 		 */
612 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
613 			nbytes = 0;
614 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
615 		size = btrfs_file_extent_inline_len(eb, slot, item);
616 		nbytes = btrfs_file_extent_ram_bytes(eb, item);
617 		extent_end = ALIGN(start + size,
618 				   fs_info->sectorsize);
619 	} else {
620 		ret = 0;
621 		goto out;
622 	}
623 
624 	inode = read_one_inode(root, key->objectid);
625 	if (!inode) {
626 		ret = -EIO;
627 		goto out;
628 	}
629 
630 	/*
631 	 * first check to see if we already have this extent in the
632 	 * file.  This must be done before the btrfs_drop_extents run
633 	 * so we don't try to drop this extent.
634 	 */
635 	ret = btrfs_lookup_file_extent(trans, root, path,
636 			btrfs_ino(BTRFS_I(inode)), start, 0);
637 
638 	if (ret == 0 &&
639 	    (found_type == BTRFS_FILE_EXTENT_REG ||
640 	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
641 		struct btrfs_file_extent_item cmp1;
642 		struct btrfs_file_extent_item cmp2;
643 		struct btrfs_file_extent_item *existing;
644 		struct extent_buffer *leaf;
645 
646 		leaf = path->nodes[0];
647 		existing = btrfs_item_ptr(leaf, path->slots[0],
648 					  struct btrfs_file_extent_item);
649 
650 		read_extent_buffer(eb, &cmp1, (unsigned long)item,
651 				   sizeof(cmp1));
652 		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
653 				   sizeof(cmp2));
654 
655 		/*
656 		 * we already have a pointer to this exact extent,
657 		 * we don't have to do anything
658 		 */
659 		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
660 			btrfs_release_path(path);
661 			goto out;
662 		}
663 	}
664 	btrfs_release_path(path);
665 
666 	/* drop any overlapping extents */
667 	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
668 	if (ret)
669 		goto out;
670 
671 	if (found_type == BTRFS_FILE_EXTENT_REG ||
672 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
673 		u64 offset;
674 		unsigned long dest_offset;
675 		struct btrfs_key ins;
676 
677 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
678 		    btrfs_fs_incompat(fs_info, NO_HOLES))
679 			goto update_inode;
680 
681 		ret = btrfs_insert_empty_item(trans, root, path, key,
682 					      sizeof(*item));
683 		if (ret)
684 			goto out;
685 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
686 						    path->slots[0]);
687 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
688 				(unsigned long)item,  sizeof(*item));
689 
690 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
691 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
692 		ins.type = BTRFS_EXTENT_ITEM_KEY;
693 		offset = key->offset - btrfs_file_extent_offset(eb, item);
694 
695 		/*
696 		 * Manually record the dirty extent: here we did a shallow
697 		 * copy of the file extent item and skipped the normal backref
698 		 * update, modifying the extent tree all by ourselves.
699 		 * So we need to manually record the dirty extent for qgroup,
700 		 * as the owner of the file extent changed from the log tree
701 		 * (doesn't affect qgroup) to the fs/file tree (affects qgroup)
702 		 */
703 		ret = btrfs_qgroup_trace_extent(trans, fs_info,
704 				btrfs_file_extent_disk_bytenr(eb, item),
705 				btrfs_file_extent_disk_num_bytes(eb, item),
706 				GFP_NOFS);
707 		if (ret < 0)
708 			goto out;
709 
710 		if (ins.objectid > 0) {
711 			u64 csum_start;
712 			u64 csum_end;
713 			LIST_HEAD(ordered_sums);
714 			/*
715 			 * is this extent already allocated in the extent
716 			 * allocation tree?  If so, just add a reference
717 			 */
718 			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
719 						ins.offset);
720 			if (ret == 0) {
721 				ret = btrfs_inc_extent_ref(trans, root,
722 						ins.objectid, ins.offset,
723 						0, root->root_key.objectid,
724 						key->objectid, offset);
725 				if (ret)
726 					goto out;
727 			} else {
728 				/*
729 				 * insert the extent pointer in the extent
730 				 * allocation tree
731 				 */
732 				ret = btrfs_alloc_logged_file_extent(trans,
733 						fs_info,
734 						root->root_key.objectid,
735 						key->objectid, offset, &ins);
736 				if (ret)
737 					goto out;
738 			}
739 			btrfs_release_path(path);
740 
741 			if (btrfs_file_extent_compression(eb, item)) {
742 				csum_start = ins.objectid;
743 				csum_end = csum_start + ins.offset;
744 			} else {
745 				csum_start = ins.objectid +
746 					btrfs_file_extent_offset(eb, item);
747 				csum_end = csum_start +
748 					btrfs_file_extent_num_bytes(eb, item);
749 			}
750 
751 			ret = btrfs_lookup_csums_range(root->log_root,
752 						csum_start, csum_end - 1,
753 						&ordered_sums, 0);
754 			if (ret)
755 				goto out;
756 			/*
757 			 * Now delete all existing csums in the csum root that
758 			 * cover our range. We do this because we can have an
759 			 * extent that is completely referenced by one file
760 			 * extent item and partially referenced by another
761 			 * file extent item (like after using the clone or
762 			 * extent_same ioctls). In this case if we end up doing
763 			 * the replay of the one that partially references the
764 			 * extent first, and we do not do the csum deletion
765 			 * below, we can get 2 csum items in the csum tree that
766 			 * overlap each other. For example, imagine our log has
767 			 * the two following file extent items:
768 			 *
769 			 * key (257 EXTENT_DATA 409600)
770 			 *     extent data disk byte 12845056 nr 102400
771 			 *     extent data offset 20480 nr 20480 ram 102400
772 			 *
773 			 * key (257 EXTENT_DATA 819200)
774 			 *     extent data disk byte 12845056 nr 102400
775 			 *     extent data offset 0 nr 102400 ram 102400
776 			 *
777 			 * Where the second one fully references the 100K extent
778 			 * that starts at disk byte 12845056, and the log tree
779 			 * has a single csum item that covers the entire range
780 			 * of the extent:
781 			 *
782 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
783 			 *
784 			 * After the first file extent item is replayed, the
785 			 * csum tree gets the following csum item:
786 			 *
787 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
788 			 *
789 			 * Which covers the 20K sub-range starting at offset 20K
790 			 * of our extent. Now when we replay the second file
791 			 * extent item, if we do not delete existing csum items
792 			 * that cover any of its blocks, we end up getting two
793 			 * csum items in our csum tree that overlap each other:
794 			 *
795 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
796 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
797 			 *
798 			 * Which is a problem, because after this anyone trying
799 			 * to look up the checksum of any block of our
800 			 * extent starting at an offset of 40K or higher, will
801 			 * end up looking at the second csum item only, which
802 			 * does not contain the checksum for any block starting
803 			 * at offset 40K or higher of our extent.
804 			 */
805 			while (!list_empty(&ordered_sums)) {
806 				struct btrfs_ordered_sum *sums;
807 				sums = list_entry(ordered_sums.next,
808 						struct btrfs_ordered_sum,
809 						list);
810 				if (!ret)
811 					ret = btrfs_del_csums(trans, fs_info,
812 							      sums->bytenr,
813 							      sums->len);
814 				if (!ret)
815 					ret = btrfs_csum_file_blocks(trans,
816 						fs_info->csum_root, sums);
817 				list_del(&sums->list);
818 				kfree(sums);
819 			}
820 			if (ret)
821 				goto out;
822 		} else {
823 			btrfs_release_path(path);
824 		}
825 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
826 		/* inline extents are easy, we just overwrite them */
827 		ret = overwrite_item(trans, root, path, eb, slot, key);
828 		if (ret)
829 			goto out;
830 	}
831 
832 	inode_add_bytes(inode, nbytes);
833 update_inode:
834 	ret = btrfs_update_inode(trans, root, inode);
835 out:
836 	if (inode)
837 		iput(inode);
838 	return ret;
839 }
840 
841 /*
842  * when cleaning up conflicts between the directory names in the
843  * subvolume, directory names in the log and directory names in the
844  * inode back references, we may have to unlink inodes from directories.
845  *
846  * This is a helper function to do the unlink of a specific directory
847  * item
848  */
849 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
850 				      struct btrfs_root *root,
851 				      struct btrfs_path *path,
852 				      struct btrfs_inode *dir,
853 				      struct btrfs_dir_item *di)
854 {
855 	struct btrfs_fs_info *fs_info = root->fs_info;
856 	struct inode *inode;
857 	char *name;
858 	int name_len;
859 	struct extent_buffer *leaf;
860 	struct btrfs_key location;
861 	int ret;
862 
863 	leaf = path->nodes[0];
864 
865 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
866 	name_len = btrfs_dir_name_len(leaf, di);
867 	name = kmalloc(name_len, GFP_NOFS);
868 	if (!name)
869 		return -ENOMEM;
870 
871 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
872 	btrfs_release_path(path);
873 
874 	inode = read_one_inode(root, location.objectid);
875 	if (!inode) {
876 		ret = -EIO;
877 		goto out;
878 	}
879 
880 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
881 	if (ret)
882 		goto out;
883 
884 	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
885 			name_len);
886 	if (ret)
887 		goto out;
888 	else
889 		ret = btrfs_run_delayed_items(trans, fs_info);
890 out:
891 	kfree(name);
892 	iput(inode);
893 	return ret;
894 }
895 
896 /*
897  * helper function to see if a given name and sequence number found
898  * in an inode back reference are already in a directory and correctly
899  * point to this inode
900  */
901 static noinline int inode_in_dir(struct btrfs_root *root,
902 				 struct btrfs_path *path,
903 				 u64 dirid, u64 objectid, u64 index,
904 				 const char *name, int name_len)
905 {
906 	struct btrfs_dir_item *di;
907 	struct btrfs_key location;
908 	int match = 0;
909 
910 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
911 					 index, name, name_len, 0);
912 	if (di && !IS_ERR(di)) {
913 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
914 		if (location.objectid != objectid)
915 			goto out;
916 	} else
917 		goto out;
918 	btrfs_release_path(path);
919 
920 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
921 	if (di && !IS_ERR(di)) {
922 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
923 		if (location.objectid != objectid)
924 			goto out;
925 	} else
926 		goto out;
927 	match = 1;
928 out:
929 	btrfs_release_path(path);
930 	return match;
931 }
932 
933 /*
934  * helper function to check a log tree for a named back reference in
935  * an inode.  This is used to decide if a back reference that is
936  * found in the subvolume conflicts with what we find in the log.
937  *
938  * inode backreferences may have multiple refs in a single item;
939  * during replay we process one reference at a time, and we don't
940  * want to delete valid links to a file from the subvolume if that
941  * link is also in the log.
942  */
943 static noinline int backref_in_log(struct btrfs_root *log,
944 				   struct btrfs_key *key,
945 				   u64 ref_objectid,
946 				   const char *name, int namelen)
947 {
948 	struct btrfs_path *path;
949 	struct btrfs_inode_ref *ref;
950 	unsigned long ptr;
951 	unsigned long ptr_end;
952 	unsigned long name_ptr;
953 	int found_name_len;
954 	int item_size;
955 	int ret;
956 	int match = 0;
957 
958 	path = btrfs_alloc_path();
959 	if (!path)
960 		return -ENOMEM;
961 
962 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
963 	if (ret != 0)
964 		goto out;
965 
966 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
967 
968 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
969 		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
970 						   name, namelen, NULL))
971 			match = 1;
972 
973 		goto out;
974 	}
975 
976 	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
977 	ptr_end = ptr + item_size;
978 	while (ptr < ptr_end) {
979 		ref = (struct btrfs_inode_ref *)ptr;
980 		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
981 		if (found_name_len == namelen) {
982 			name_ptr = (unsigned long)(ref + 1);
983 			ret = memcmp_extent_buffer(path->nodes[0], name,
984 						   name_ptr, namelen);
985 			if (ret == 0) {
986 				match = 1;
987 				goto out;
988 			}
989 		}
990 		ptr = (unsigned long)(ref + 1) + found_name_len;
991 	}
992 out:
993 	btrfs_free_path(path);
994 	return match;
995 }
996 
997 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
998 				  struct btrfs_root *root,
999 				  struct btrfs_path *path,
1000 				  struct btrfs_root *log_root,
1001 				  struct btrfs_inode *dir,
1002 				  struct btrfs_inode *inode,
1003 				  u64 inode_objectid, u64 parent_objectid,
1004 				  u64 ref_index, char *name, int namelen,
1005 				  int *search_done)
1006 {
1007 	struct btrfs_fs_info *fs_info = root->fs_info;
1008 	int ret;
1009 	char *victim_name;
1010 	int victim_name_len;
1011 	struct extent_buffer *leaf;
1012 	struct btrfs_dir_item *di;
1013 	struct btrfs_key search_key;
1014 	struct btrfs_inode_extref *extref;
1015 
1016 again:
1017 	/* Search old style refs */
1018 	search_key.objectid = inode_objectid;
1019 	search_key.type = BTRFS_INODE_REF_KEY;
1020 	search_key.offset = parent_objectid;
1021 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1022 	if (ret == 0) {
1023 		struct btrfs_inode_ref *victim_ref;
1024 		unsigned long ptr;
1025 		unsigned long ptr_end;
1026 
1027 		leaf = path->nodes[0];
1028 
1029 		/* are we trying to overwrite a back ref for the root directory?
1030 		 * If so, just jump out, we're done
1031 		 */
1032 		if (search_key.objectid == search_key.offset)
1033 			return 1;
1034 
1035 		/* check all the names in this back reference to see
1036 		 * if they are in the log.  If so, we allow them to stay;
1037 		 * otherwise they must be unlinked as a conflict
1038 		 */
1039 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1040 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1041 		while (ptr < ptr_end) {
1042 			victim_ref = (struct btrfs_inode_ref *)ptr;
1043 			victim_name_len = btrfs_inode_ref_name_len(leaf,
1044 								   victim_ref);
1045 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
1046 			if (!victim_name)
1047 				return -ENOMEM;
1048 
1049 			read_extent_buffer(leaf, victim_name,
1050 					   (unsigned long)(victim_ref + 1),
1051 					   victim_name_len);
1052 
1053 			if (!backref_in_log(log_root, &search_key,
1054 					    parent_objectid,
1055 					    victim_name,
1056 					    victim_name_len)) {
1057 				inc_nlink(&inode->vfs_inode);
1058 				btrfs_release_path(path);
1059 
1060 				ret = btrfs_unlink_inode(trans, root, dir, inode,
1061 						victim_name, victim_name_len);
1062 				kfree(victim_name);
1063 				if (ret)
1064 					return ret;
1065 				ret = btrfs_run_delayed_items(trans, fs_info);
1066 				if (ret)
1067 					return ret;
1068 				*search_done = 1;
1069 				goto again;
1070 			}
1071 			kfree(victim_name);
1072 
1073 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1074 		}
1075 
1076 		/*
1077 		 * NOTE: we have searched the root tree and checked the
1078 		 * corresponding ref, so we do not need to check it again.
1079 		 */
1080 		*search_done = 1;
1081 	}
1082 	btrfs_release_path(path);
1083 
1084 	/* Same search but for extended refs */
1085 	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1086 					   inode_objectid, parent_objectid, 0,
1087 					   0);
1088 	if (!IS_ERR_OR_NULL(extref)) {
1089 		u32 item_size;
1090 		u32 cur_offset = 0;
1091 		unsigned long base;
1092 		struct inode *victim_parent;
1093 
1094 		leaf = path->nodes[0];
1095 
1096 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1097 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1098 
1099 		while (cur_offset < item_size) {
1100 			extref = (struct btrfs_inode_extref *)(base + cur_offset);
1101 
1102 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1103 
1104 			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1105 				goto next;
1106 
1107 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
1108 			if (!victim_name)
1109 				return -ENOMEM;
1110 			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1111 					   victim_name_len);
1112 
1113 			search_key.objectid = inode_objectid;
1114 			search_key.type = BTRFS_INODE_EXTREF_KEY;
1115 			search_key.offset = btrfs_extref_hash(parent_objectid,
1116 							      victim_name,
1117 							      victim_name_len);
1118 			ret = 0;
1119 			if (!backref_in_log(log_root, &search_key,
1120 					    parent_objectid, victim_name,
1121 					    victim_name_len)) {
1122 				ret = -ENOENT;
1123 				victim_parent = read_one_inode(root,
1124 						parent_objectid);
1125 				if (victim_parent) {
1126 					inc_nlink(&inode->vfs_inode);
1127 					btrfs_release_path(path);
1128 
1129 					ret = btrfs_unlink_inode(trans, root,
1130 							BTRFS_I(victim_parent),
1131 							inode,
1132 							victim_name,
1133 							victim_name_len);
1134 					if (!ret)
1135 						ret = btrfs_run_delayed_items(
1136 								  trans,
1137 								  fs_info);
1138 				}
1139 				iput(victim_parent);
1140 				kfree(victim_name);
1141 				if (ret)
1142 					return ret;
1143 				*search_done = 1;
1144 				goto again;
1145 			}
1146 			kfree(victim_name);
1147 next:
1148 			cur_offset += victim_name_len + sizeof(*extref);
1149 		}
1150 		*search_done = 1;
1151 	}
1152 	btrfs_release_path(path);
1153 
1154 	/* look for a conflicting sequence number */
1155 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1156 					 ref_index, name, namelen, 0);
1157 	if (di && !IS_ERR(di)) {
1158 		ret = drop_one_dir_item(trans, root, path, dir, di);
1159 		if (ret)
1160 			return ret;
1161 	}
1162 	btrfs_release_path(path);
1163 
1164 	/* look for a conflicting name */
1165 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1166 				   name, namelen, 0);
1167 	if (di && !IS_ERR(di)) {
1168 		ret = drop_one_dir_item(trans, root, path, dir, di);
1169 		if (ret)
1170 			return ret;
1171 	}
1172 	btrfs_release_path(path);
1173 
1174 	return 0;
1175 }
1176 
1177 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1178 			     u32 *namelen, char **name, u64 *index,
1179 			     u64 *parent_objectid)
1180 {
1181 	struct btrfs_inode_extref *extref;
1182 
1183 	extref = (struct btrfs_inode_extref *)ref_ptr;
1184 
1185 	*namelen = btrfs_inode_extref_name_len(eb, extref);
1186 	*name = kmalloc(*namelen, GFP_NOFS);
1187 	if (*name == NULL)
1188 		return -ENOMEM;
1189 
1190 	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1191 			   *namelen);
1192 
1193 	*index = btrfs_inode_extref_index(eb, extref);
1194 	if (parent_objectid)
1195 		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1196 
1197 	return 0;
1198 }
1199 
1200 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1201 			  u32 *namelen, char **name, u64 *index)
1202 {
1203 	struct btrfs_inode_ref *ref;
1204 
1205 	ref = (struct btrfs_inode_ref *)ref_ptr;
1206 
1207 	*namelen = btrfs_inode_ref_name_len(eb, ref);
1208 	*name = kmalloc(*namelen, GFP_NOFS);
1209 	if (*name == NULL)
1210 		return -ENOMEM;
1211 
1212 	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1213 
1214 	*index = btrfs_inode_ref_index(eb, ref);
1215 
1216 	return 0;
1217 }
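
/*
 * Both helpers above rely on the on-disk layout of ref items: the name
 * bytes are stored inline, immediately after the fixed-size struct, so a
 * ref for the name "foo" looks roughly like (illustrative):
 *
 *	[struct btrfs_inode_ref: index, name_len = 3]["foo"][next ref ...]
 *
 * which is why the walk loops in this file advance with
 * ptr = (unsigned long)(ref + 1) + name_len.
 */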
1218 
1219 /*
1220  * replay one inode back reference item found in the log tree.
1221  * eb, slot and key refer to the buffer and key found in the log tree.
1222  * root is the destination we are replaying into, and path is for temp
1223  * use by this function (it should be released on return).
1224  */
1225 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1226 				  struct btrfs_root *root,
1227 				  struct btrfs_root *log,
1228 				  struct btrfs_path *path,
1229 				  struct extent_buffer *eb, int slot,
1230 				  struct btrfs_key *key)
1231 {
1232 	struct inode *dir = NULL;
1233 	struct inode *inode = NULL;
1234 	unsigned long ref_ptr;
1235 	unsigned long ref_end;
1236 	char *name = NULL;
1237 	int namelen;
1238 	int ret;
1239 	int search_done = 0;
1240 	int log_ref_ver = 0;
1241 	u64 parent_objectid;
1242 	u64 inode_objectid;
1243 	u64 ref_index = 0;
1244 	int ref_struct_size;
1245 
1246 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1247 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1248 
1249 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1250 		struct btrfs_inode_extref *r;
1251 
1252 		ref_struct_size = sizeof(struct btrfs_inode_extref);
1253 		log_ref_ver = 1;
1254 		r = (struct btrfs_inode_extref *)ref_ptr;
1255 		parent_objectid = btrfs_inode_extref_parent(eb, r);
1256 	} else {
1257 		ref_struct_size = sizeof(struct btrfs_inode_ref);
1258 		parent_objectid = key->offset;
1259 	}
1260 	inode_objectid = key->objectid;
1261 
1262 	/*
1263 	 * it is possible that we didn't log all the parent directories
1264 	 * for a given inode.  If we don't find the dir, just don't
1265 	 * copy the back ref in.  The link count fixup code will take
1266 	 * care of the rest
1267 	 */
1268 	dir = read_one_inode(root, parent_objectid);
1269 	if (!dir) {
1270 		ret = -ENOENT;
1271 		goto out;
1272 	}
1273 
1274 	inode = read_one_inode(root, inode_objectid);
1275 	if (!inode) {
1276 		ret = -EIO;
1277 		goto out;
1278 	}
1279 
1280 	while (ref_ptr < ref_end) {
1281 		if (log_ref_ver) {
1282 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1283 						&ref_index, &parent_objectid);
1284 			/*
1285 			 * parent object can change from one array
1286 			 * item to another.
1287 			 */
1288 			if (!dir)
1289 				dir = read_one_inode(root, parent_objectid);
1290 			if (!dir) {
1291 				ret = -ENOENT;
1292 				goto out;
1293 			}
1294 		} else {
1295 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1296 					     &ref_index);
1297 		}
1298 		if (ret)
1299 			goto out;
1300 
1301 		/* if we already have a perfect match, we're done */
1302 		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1303 					btrfs_ino(BTRFS_I(inode)), ref_index,
1304 					name, namelen)) {
1305 			/*
1306 			 * look for a conflicting back reference in the
1307 			 * metadata. if we find one we have to unlink that name
1308 			 * of the file before we add our new link.  Later on, we
1309 			 * overwrite any existing back reference, and we don't
1310 			 * want to create dangling pointers in the directory.
1311 			 */
1312 
1313 			if (!search_done) {
1314 				ret = __add_inode_ref(trans, root, path, log,
1315 						      BTRFS_I(dir),
1316 						      BTRFS_I(inode),
1317 						      inode_objectid,
1318 						      parent_objectid,
1319 						      ref_index, name, namelen,
1320 						      &search_done);
1321 				if (ret) {
1322 					if (ret == 1)
1323 						ret = 0;
1324 					goto out;
1325 				}
1326 			}
1327 
1328 			/* insert our name */
1329 			ret = btrfs_add_link(trans, BTRFS_I(dir),
1330 					BTRFS_I(inode),
1331 					name, namelen, 0, ref_index);
1332 			if (ret)
1333 				goto out;
1334 
1335 			btrfs_update_inode(trans, root, inode);
1336 		}
1337 
1338 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1339 		kfree(name);
1340 		name = NULL;
1341 		if (log_ref_ver) {
1342 			iput(dir);
1343 			dir = NULL;
1344 		}
1345 	}
1346 
1347 	/* finally write the back reference in the inode */
1348 	ret = overwrite_item(trans, root, path, eb, slot, key);
1349 out:
1350 	btrfs_release_path(path);
1351 	kfree(name);
1352 	iput(dir);
1353 	iput(inode);
1354 	return ret;
1355 }
1356 
1357 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1358 			      struct btrfs_root *root, u64 ino)
1359 {
1360 	int ret;
1361 
1362 	ret = btrfs_insert_orphan_item(trans, root, ino);
1363 	if (ret == -EEXIST)
1364 		ret = 0;
1365 
1366 	return ret;
1367 }
1368 
1369 static int count_inode_extrefs(struct btrfs_root *root,
1370 		struct btrfs_inode *inode, struct btrfs_path *path)
1371 {
1372 	int ret = 0;
1373 	int name_len;
1374 	unsigned int nlink = 0;
1375 	u32 item_size;
1376 	u32 cur_offset = 0;
1377 	u64 inode_objectid = btrfs_ino(inode);
1378 	u64 offset = 0;
1379 	unsigned long ptr;
1380 	struct btrfs_inode_extref *extref;
1381 	struct extent_buffer *leaf;
1382 
1383 	while (1) {
1384 		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1385 					    &extref, &offset);
1386 		if (ret)
1387 			break;
1388 
1389 		leaf = path->nodes[0];
1390 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1391 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1392 		cur_offset = 0;
1393 
1394 		while (cur_offset < item_size) {
1395 			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1396 			name_len = btrfs_inode_extref_name_len(leaf, extref);
1397 
1398 			nlink++;
1399 
1400 			cur_offset += name_len + sizeof(*extref);
1401 		}
1402 
1403 		offset++;
1404 		btrfs_release_path(path);
1405 	}
1406 	btrfs_release_path(path);
1407 
1408 	if (ret < 0 && ret != -ENOENT)
1409 		return ret;
1410 	return nlink;
1411 }
1412 
1413 static int count_inode_refs(struct btrfs_root *root,
1414 			struct btrfs_inode *inode, struct btrfs_path *path)
1415 {
1416 	int ret;
1417 	struct btrfs_key key;
1418 	unsigned int nlink = 0;
1419 	unsigned long ptr;
1420 	unsigned long ptr_end;
1421 	int name_len;
1422 	u64 ino = btrfs_ino(inode);
1423 
1424 	key.objectid = ino;
1425 	key.type = BTRFS_INODE_REF_KEY;
1426 	key.offset = (u64)-1;
1427 
1428 	while (1) {
1429 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1430 		if (ret < 0)
1431 			break;
1432 		if (ret > 0) {
1433 			if (path->slots[0] == 0)
1434 				break;
1435 			path->slots[0]--;
1436 		}
1437 process_slot:
1438 		btrfs_item_key_to_cpu(path->nodes[0], &key,
1439 				      path->slots[0]);
1440 		if (key.objectid != ino ||
1441 		    key.type != BTRFS_INODE_REF_KEY)
1442 			break;
1443 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1444 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1445 						   path->slots[0]);
1446 		while (ptr < ptr_end) {
1447 			struct btrfs_inode_ref *ref;
1448 
1449 			ref = (struct btrfs_inode_ref *)ptr;
1450 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1451 							    ref);
1452 			ptr = (unsigned long)(ref + 1) + name_len;
1453 			nlink++;
1454 		}
1455 
1456 		if (key.offset == 0)
1457 			break;
1458 		if (path->slots[0] > 0) {
1459 			path->slots[0]--;
1460 			goto process_slot;
1461 		}
1462 		key.offset--;
1463 		btrfs_release_path(path);
1464 	}
1465 	btrfs_release_path(path);
1466 
1467 	return nlink;
1468 }
1469 
1470 /*
1471  * There are a few corners where the link count of the file can't
1472  * be properly maintained during replay.  So, instead of adding
1473  * lots of complexity to the log code, we just scan the backrefs
1474  * for any file that has been through replay.
1475  *
1476  * The scan will update the link count on the inode to reflect the
1477  * number of back refs found.  If it goes down to zero, the iput
1478  * will free the inode.
1479  */
1480 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1481 					   struct btrfs_root *root,
1482 					   struct inode *inode)
1483 {
1484 	struct btrfs_path *path;
1485 	int ret;
1486 	u64 nlink = 0;
1487 	u64 ino = btrfs_ino(BTRFS_I(inode));
1488 
1489 	path = btrfs_alloc_path();
1490 	if (!path)
1491 		return -ENOMEM;
1492 
1493 	ret = count_inode_refs(root, BTRFS_I(inode), path);
1494 	if (ret < 0)
1495 		goto out;
1496 
1497 	nlink = ret;
1498 
1499 	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1500 	if (ret < 0)
1501 		goto out;
1502 
1503 	nlink += ret;
1504 
1505 	ret = 0;
1506 
1507 	if (nlink != inode->i_nlink) {
1508 		set_nlink(inode, nlink);
1509 		btrfs_update_inode(trans, root, inode);
1510 	}
1511 	BTRFS_I(inode)->index_cnt = (u64)-1;
1512 
1513 	if (inode->i_nlink == 0) {
1514 		if (S_ISDIR(inode->i_mode)) {
1515 			ret = replay_dir_deletes(trans, root, NULL, path,
1516 						 ino, 1);
1517 			if (ret)
1518 				goto out;
1519 		}
1520 		ret = insert_orphan_item(trans, root, ino);
1521 	}
1522 
1523 out:
1524 	btrfs_free_path(path);
1525 	return ret;
1526 }
1527 
1528 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1529 					    struct btrfs_root *root,
1530 					    struct btrfs_path *path)
1531 {
1532 	int ret;
1533 	struct btrfs_key key;
1534 	struct inode *inode;
1535 
1536 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1537 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1538 	key.offset = (u64)-1;
1539 	while (1) {
1540 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1541 		if (ret < 0)
1542 			break;
1543 
1544 		if (ret == 1) {
1545 			if (path->slots[0] == 0)
1546 				break;
1547 			path->slots[0]--;
1548 		}
1549 
1550 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1551 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1552 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1553 			break;
1554 
1555 		ret = btrfs_del_item(trans, root, path);
1556 		if (ret)
1557 			goto out;
1558 
1559 		btrfs_release_path(path);
1560 		inode = read_one_inode(root, key.offset);
1561 		if (!inode)
1562 			return -EIO;
1563 
1564 		ret = fixup_inode_link_count(trans, root, inode);
1565 		iput(inode);
1566 		if (ret)
1567 			goto out;
1568 
1569 		/*
1570 		 * fixup on a directory may create new entries,
1571 		 * make sure we always look for the highest possible
1572 		 * offset
1573 		 */
1574 		key.offset = (u64)-1;
1575 	}
1576 	ret = 0;
1577 out:
1578 	btrfs_release_path(path);
1579 	return ret;
1580 }
1581 
1582 
1583 /*
1584  * record a given inode in the fixup dir so we can check its link
1585  * count when replay is done.  The link count is incremented here
1586  * so the inode won't go away until we check it
1587  */
1588 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1589 				      struct btrfs_root *root,
1590 				      struct btrfs_path *path,
1591 				      u64 objectid)
1592 {
1593 	struct btrfs_key key;
1594 	int ret = 0;
1595 	struct inode *inode;
1596 
1597 	inode = read_one_inode(root, objectid);
1598 	if (!inode)
1599 		return -EIO;
1600 
1601 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1602 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1603 	key.offset = objectid;
1604 
1605 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1606 
1607 	btrfs_release_path(path);
1608 	if (ret == 0) {
1609 		if (!inode->i_nlink)
1610 			set_nlink(inode, 1);
1611 		else
1612 			inc_nlink(inode);
1613 		ret = btrfs_update_inode(trans, root, inode);
1614 	} else if (ret == -EEXIST) {
1615 		ret = 0;
1616 	} else {
1617 		BUG(); /* Logic Error */
1618 	}
1619 	iput(inode);
1620 
1621 	return ret;
1622 }
1623 
1624 /*
1625  * when replaying the log for a directory, we only insert names
1626  * for inodes that actually exist.  This means an fsync on a directory
1627  * does not implicitly fsync all the new files in it
1628  */
1629 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1630 				    struct btrfs_root *root,
1631 				    u64 dirid, u64 index,
1632 				    char *name, int name_len,
1633 				    struct btrfs_key *location)
1634 {
1635 	struct inode *inode;
1636 	struct inode *dir;
1637 	int ret;
1638 
1639 	inode = read_one_inode(root, location->objectid);
1640 	if (!inode)
1641 		return -ENOENT;
1642 
1643 	dir = read_one_inode(root, dirid);
1644 	if (!dir) {
1645 		iput(inode);
1646 		return -EIO;
1647 	}
1648 
1649 	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1650 			name_len, 1, index);
1651 
1652 	/* FIXME, put inode into FIXUP list */
1653 
1654 	iput(inode);
1655 	iput(dir);
1656 	return ret;
1657 }
1658 
1659 /*
1660  * Return true if an inode reference exists in the log for the given name,
1661  * inode and parent inode.
1662  */
1663 static bool name_in_log_ref(struct btrfs_root *log_root,
1664 			    const char *name, const int name_len,
1665 			    const u64 dirid, const u64 ino)
1666 {
1667 	struct btrfs_key search_key;
1668 
1669 	search_key.objectid = ino;
1670 	search_key.type = BTRFS_INODE_REF_KEY;
1671 	search_key.offset = dirid;
1672 	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1673 		return true;
1674 
1675 	search_key.type = BTRFS_INODE_EXTREF_KEY;
1676 	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1677 	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1678 		return true;
1679 
1680 	return false;
1681 }
1682 
1683 /*
1684  * take a single entry in a log directory item and replay it into
1685  * the subvolume.
1686  *
1687  * if a conflicting item exists in the subdirectory already,
1688  * the inode it points to is unlinked and put into the link count
1689  * fix up tree.
1690  *
1691  * If a name from the log points to a file or directory that does
1692  * not exist in the FS, it is skipped.  fsyncs on directories
1693  * do not force down inodes inside that directory, just changes to the
1694  * names or unlinks in a directory.
1695  *
1696  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1697  * non-existing inode) and 1 if the name was replayed.
1698  */
1699 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1700 				    struct btrfs_root *root,
1701 				    struct btrfs_path *path,
1702 				    struct extent_buffer *eb,
1703 				    struct btrfs_dir_item *di,
1704 				    struct btrfs_key *key)
1705 {
1706 	char *name;
1707 	int name_len;
1708 	struct btrfs_dir_item *dst_di;
1709 	struct btrfs_key found_key;
1710 	struct btrfs_key log_key;
1711 	struct inode *dir;
1712 	u8 log_type;
1713 	int exists;
1714 	int ret = 0;
1715 	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1716 	bool name_added = false;
1717 
1718 	dir = read_one_inode(root, key->objectid);
1719 	if (!dir)
1720 		return -EIO;
1721 
1722 	name_len = btrfs_dir_name_len(eb, di);
1723 	name = kmalloc(name_len, GFP_NOFS);
1724 	if (!name) {
1725 		ret = -ENOMEM;
1726 		goto out;
1727 	}
1728 
1729 	log_type = btrfs_dir_type(eb, di);
1730 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1731 		   name_len);
1732 
1733 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1734 	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1735 	if (exists == 0)
1736 		exists = 1;
1737 	else
1738 		exists = 0;
1739 	btrfs_release_path(path);
1740 
1741 	if (key->type == BTRFS_DIR_ITEM_KEY) {
1742 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1743 				       name, name_len, 1);
1744 	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1745 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1746 						     key->objectid,
1747 						     key->offset, name,
1748 						     name_len, 1);
1749 	} else {
1750 		/* Corruption */
1751 		ret = -EINVAL;
1752 		goto out;
1753 	}
1754 	if (IS_ERR_OR_NULL(dst_di)) {
1755 		/* we need a sequence number to insert, so we only
1756 		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1757 		 */
1758 		if (key->type != BTRFS_DIR_INDEX_KEY)
1759 			goto out;
1760 		goto insert;
1761 	}
1762 
1763 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1764 	/* the existing item matches the logged item */
1765 	if (found_key.objectid == log_key.objectid &&
1766 	    found_key.type == log_key.type &&
1767 	    found_key.offset == log_key.offset &&
1768 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1769 		update_size = false;
1770 		goto out;
1771 	}
1772 
1773 	/*
1774 	 * don't drop the conflicting directory entry if the inode
1775 	 * for the new entry doesn't exist
1776 	 */
1777 	if (!exists)
1778 		goto out;
1779 
1780 	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
1781 	if (ret)
1782 		goto out;
1783 
1784 	if (key->type == BTRFS_DIR_INDEX_KEY)
1785 		goto insert;
1786 out:
1787 	btrfs_release_path(path);
1788 	if (!ret && update_size) {
1789 		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
1790 		ret = btrfs_update_inode(trans, root, dir);
1791 	}
1792 	kfree(name);
1793 	iput(dir);
1794 	if (!ret && name_added)
1795 		ret = 1;
1796 	return ret;
1797 
1798 insert:
1799 	if (name_in_log_ref(root->log_root, name, name_len,
1800 			    key->objectid, log_key.objectid)) {
1801 		/* The dentry will be added later. */
1802 		ret = 0;
1803 		update_size = false;
1804 		goto out;
1805 	}
1806 	btrfs_release_path(path);
1807 	ret = insert_one_name(trans, root, key->objectid, key->offset,
1808 			      name, name_len, &log_key);
1809 	if (ret && ret != -ENOENT && ret != -EEXIST)
1810 		goto out;
1811 	if (!ret)
1812 		name_added = true;
1813 	update_size = false;
1814 	ret = 0;
1815 	goto out;
1816 }
1817 
1818 /*
1819  * find all the names in a directory item and reconcile them into
1820  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1821  * one name in a directory item, but the same code gets used for
1822  * both directory entry key types (DIR_ITEM and DIR_INDEX)
1823  */
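/*
 * Hypothetical example of the multi-name case: if names "a1" and "a2"
 * hash to the same value, a single BTRFS_DIR_ITEM_KEY item packs both
 * entries back to back, and the loop below replays each of them through
 * replay_one_name().  BTRFS_DIR_INDEX_KEY items carry exactly one name,
 * since their key offset is the unique index sequence number.
 */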
1824 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1825 					struct btrfs_root *root,
1826 					struct btrfs_path *path,
1827 					struct extent_buffer *eb, int slot,
1828 					struct btrfs_key *key)
1829 {
1830 	int ret = 0;
1831 	u32 item_size = btrfs_item_size_nr(eb, slot);
1832 	struct btrfs_dir_item *di;
1833 	int name_len;
1834 	unsigned long ptr;
1835 	unsigned long ptr_end;
1836 	struct btrfs_path *fixup_path = NULL;
1837 
1838 	ptr = btrfs_item_ptr_offset(eb, slot);
1839 	ptr_end = ptr + item_size;
1840 	while (ptr < ptr_end) {
1841 		di = (struct btrfs_dir_item *)ptr;
1842 		name_len = btrfs_dir_name_len(eb, di);
1843 		ret = replay_one_name(trans, root, path, eb, di, key);
1844 		if (ret < 0)
1845 			break;
1846 		ptr = (unsigned long)(di + 1);
1847 		ptr += name_len;
1848 
1849 		/*
1850 		 * If this entry refers to a non-directory (directories cannot
1851 		 * have a link count > 1) and it was added in the transaction
1852 		 * that was not committed, make sure we fix up the link count of
1853 		 * the inode the entry points to. Otherwise something like
1854 		 * the following would result in a directory pointing to an
1855 		 * inode with a wrong link count that does not account for this
1856 		 * dir entry:
1857 		 *
1858 		 * mkdir testdir
1859 		 * touch testdir/foo
1860 		 * touch testdir/bar
1861 		 * sync
1862 		 *
1863 		 * ln testdir/bar testdir/bar_link
1864 		 * ln testdir/foo testdir/foo_link
1865 		 * xfs_io -c "fsync" testdir/bar
1866 		 *
1867 		 * <power failure>
1868 		 *
1869 		 * mount fs, log replay happens
1870 		 *
1871 		 * File foo would remain with a link count of 1 when it has two
1872 		 * entries pointing to it in the directory testdir. This would
1873 		 * make it impossible to ever delete the parent directory, as
1874 		 * it would result in stale dentries that can never be deleted.
1875 		 */
1876 		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1877 			struct btrfs_key di_key;
1878 
1879 			if (!fixup_path) {
1880 				fixup_path = btrfs_alloc_path();
1881 				if (!fixup_path) {
1882 					ret = -ENOMEM;
1883 					break;
1884 				}
1885 			}
1886 
1887 			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1888 			ret = link_to_fixup_dir(trans, root, fixup_path,
1889 						di_key.objectid);
1890 			if (ret)
1891 				break;
1892 		}
1893 		ret = 0;
1894 	}
1895 	btrfs_free_path(fixup_path);
1896 	return ret;
1897 }
1898 
1899 /*
1900  * directory replay has two parts.  There are the standard directory
1901  * items in the log copied from the subvolume, and range items
1902  * created in the log while the subvolume was logged.
1903  *
1904  * The range items tell us which parts of the key space the log
1905  * is authoritative for.  During replay, if a key in the subvolume
1906  * directory falls inside a logged range item but is not actually in
1907  * the log, that means it was deleted from the directory before the fsync
1908  * and should be removed.
1909  */
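/*
 * Example with invented offsets: if the log holds a range item covering
 * offsets [0, 100] of this dirid, a subvolume index key at offset 50
 * with no matching log entry must have been deleted before the fsync
 * and is removed during replay.  A key at offset 150 falls outside any
 * logged range, so the log says nothing about it and it is left alone.
 */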
1910 static noinline int find_dir_range(struct btrfs_root *root,
1911 				   struct btrfs_path *path,
1912 				   u64 dirid, int key_type,
1913 				   u64 *start_ret, u64 *end_ret)
1914 {
1915 	struct btrfs_key key;
1916 	u64 found_end;
1917 	struct btrfs_dir_log_item *item;
1918 	int ret;
1919 	int nritems;
1920 
1921 	if (*start_ret == (u64)-1)
1922 		return 1;
1923 
1924 	key.objectid = dirid;
1925 	key.type = key_type;
1926 	key.offset = *start_ret;
1927 
1928 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1929 	if (ret < 0)
1930 		goto out;
1931 	if (ret > 0) {
1932 		if (path->slots[0] == 0)
1933 			goto out;
1934 		path->slots[0]--;
1935 	}
1936 	if (ret != 0)
1937 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1938 
1939 	if (key.type != key_type || key.objectid != dirid) {
1940 		ret = 1;
1941 		goto next;
1942 	}
1943 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1944 			      struct btrfs_dir_log_item);
1945 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1946 
1947 	if (*start_ret >= key.offset && *start_ret <= found_end) {
1948 		ret = 0;
1949 		*start_ret = key.offset;
1950 		*end_ret = found_end;
1951 		goto out;
1952 	}
1953 	ret = 1;
1954 next:
1955 	/* check the next slot in the tree to see if it is a valid item */
1956 	nritems = btrfs_header_nritems(path->nodes[0]);
1957 	path->slots[0]++;
1958 	if (path->slots[0] >= nritems) {
1959 		ret = btrfs_next_leaf(root, path);
1960 		if (ret)
1961 			goto out;
1962 	}
1963 
1964 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1965 
1966 	if (key.type != key_type || key.objectid != dirid) {
1967 		ret = 1;
1968 		goto out;
1969 	}
1970 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1971 			      struct btrfs_dir_log_item);
1972 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1973 	*start_ret = key.offset;
1974 	*end_ret = found_end;
1975 	ret = 0;
1976 out:
1977 	btrfs_release_path(path);
1978 	return ret;
1979 }
1980 
1981 /*
1982  * this looks for a given directory item in the log.  If the directory
1983  * item is not in the log, the item is removed and the inode it points
1984  * to is unlinked
1985  */
1986 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1987 				      struct btrfs_root *root,
1988 				      struct btrfs_root *log,
1989 				      struct btrfs_path *path,
1990 				      struct btrfs_path *log_path,
1991 				      struct inode *dir,
1992 				      struct btrfs_key *dir_key)
1993 {
1994 	struct btrfs_fs_info *fs_info = root->fs_info;
1995 	int ret;
1996 	struct extent_buffer *eb;
1997 	int slot;
1998 	u32 item_size;
1999 	struct btrfs_dir_item *di;
2000 	struct btrfs_dir_item *log_di;
2001 	int name_len;
2002 	unsigned long ptr;
2003 	unsigned long ptr_end;
2004 	char *name;
2005 	struct inode *inode;
2006 	struct btrfs_key location;
2007 
2008 again:
2009 	eb = path->nodes[0];
2010 	slot = path->slots[0];
2011 	item_size = btrfs_item_size_nr(eb, slot);
2012 	ptr = btrfs_item_ptr_offset(eb, slot);
2013 	ptr_end = ptr + item_size;
2014 	while (ptr < ptr_end) {
2015 		di = (struct btrfs_dir_item *)ptr;
2016 		name_len = btrfs_dir_name_len(eb, di);
2017 		name = kmalloc(name_len, GFP_NOFS);
2018 		if (!name) {
2019 			ret = -ENOMEM;
2020 			goto out;
2021 		}
2022 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
2023 				  name_len);
2024 		log_di = NULL;
2025 		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2026 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
2027 						       dir_key->objectid,
2028 						       name, name_len, 0);
2029 		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2030 			log_di = btrfs_lookup_dir_index_item(trans, log,
2031 						     log_path,
2032 						     dir_key->objectid,
2033 						     dir_key->offset,
2034 						     name, name_len, 0);
2035 		}
2036 		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
2037 			btrfs_dir_item_key_to_cpu(eb, di, &location);
2038 			btrfs_release_path(path);
2039 			btrfs_release_path(log_path);
2040 			inode = read_one_inode(root, location.objectid);
2041 			if (!inode) {
2042 				kfree(name);
2043 				return -EIO;
2044 			}
2045 
2046 			ret = link_to_fixup_dir(trans, root,
2047 						path, location.objectid);
2048 			if (ret) {
2049 				kfree(name);
2050 				iput(inode);
2051 				goto out;
2052 			}
2053 
2054 			inc_nlink(inode);
2055 			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2056 					BTRFS_I(inode), name, name_len);
2057 			if (!ret)
2058 				ret = btrfs_run_delayed_items(trans, fs_info);
2059 			kfree(name);
2060 			iput(inode);
2061 			if (ret)
2062 				goto out;
2063 
2064 			/* there might still be more names under this key;
2065 			 * check and repeat if required
2066 			 */
2067 			ret = btrfs_search_slot(NULL, root, dir_key, path,
2068 						0, 0);
2069 			if (ret == 0)
2070 				goto again;
2071 			ret = 0;
2072 			goto out;
2073 		} else if (IS_ERR(log_di)) {
2074 			kfree(name);
2075 			return PTR_ERR(log_di);
2076 		}
2077 		btrfs_release_path(log_path);
2078 		kfree(name);
2079 
2080 		ptr = (unsigned long)(di + 1);
2081 		ptr += name_len;
2082 	}
2083 	ret = 0;
2084 out:
2085 	btrfs_release_path(path);
2086 	btrfs_release_path(log_path);
2087 	return ret;
2088 }
2089 
2090 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2091 			      struct btrfs_root *root,
2092 			      struct btrfs_root *log,
2093 			      struct btrfs_path *path,
2094 			      const u64 ino)
2095 {
2096 	struct btrfs_key search_key;
2097 	struct btrfs_path *log_path;
2098 	int i;
2099 	int nritems;
2100 	int ret;
2101 
2102 	log_path = btrfs_alloc_path();
2103 	if (!log_path)
2104 		return -ENOMEM;
2105 
2106 	search_key.objectid = ino;
2107 	search_key.type = BTRFS_XATTR_ITEM_KEY;
2108 	search_key.offset = 0;
2109 again:
2110 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2111 	if (ret < 0)
2112 		goto out;
2113 process_leaf:
2114 	nritems = btrfs_header_nritems(path->nodes[0]);
2115 	for (i = path->slots[0]; i < nritems; i++) {
2116 		struct btrfs_key key;
2117 		struct btrfs_dir_item *di;
2118 		struct btrfs_dir_item *log_di;
2119 		u32 total_size;
2120 		u32 cur;
2121 
2122 		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2123 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2124 			ret = 0;
2125 			goto out;
2126 		}
2127 
2128 		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2129 		total_size = btrfs_item_size_nr(path->nodes[0], i);
2130 		cur = 0;
2131 		while (cur < total_size) {
2132 			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2133 			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2134 			u32 this_len = sizeof(*di) + name_len + data_len;
2135 			char *name;
2136 
2137 			name = kmalloc(name_len, GFP_NOFS);
2138 			if (!name) {
2139 				ret = -ENOMEM;
2140 				goto out;
2141 			}
2142 			read_extent_buffer(path->nodes[0], name,
2143 					   (unsigned long)(di + 1), name_len);
2144 
2145 			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2146 						    name, name_len, 0);
2147 			btrfs_release_path(log_path);
2148 			if (!log_di) {
2149 				/* Doesn't exist in log tree, so delete it. */
2150 				btrfs_release_path(path);
2151 				di = btrfs_lookup_xattr(trans, root, path, ino,
2152 							name, name_len, -1);
2153 				kfree(name);
2154 				if (IS_ERR(di)) {
2155 					ret = PTR_ERR(di);
2156 					goto out;
2157 				}
2158 				ASSERT(di);
2159 				ret = btrfs_delete_one_dir_name(trans, root,
2160 								path, di);
2161 				if (ret)
2162 					goto out;
2163 				btrfs_release_path(path);
2164 				search_key = key;
2165 				goto again;
2166 			}
2167 			kfree(name);
2168 			if (IS_ERR(log_di)) {
2169 				ret = PTR_ERR(log_di);
2170 				goto out;
2171 			}
2172 			cur += this_len;
2173 			di = (struct btrfs_dir_item *)((char *)di + this_len);
2174 		}
2175 	}
2176 	ret = btrfs_next_leaf(root, path);
2177 	if (ret > 0)
2178 		ret = 0;
2179 	else if (ret == 0)
2180 		goto process_leaf;
2181 out:
2182 	btrfs_free_path(log_path);
2183 	btrfs_release_path(path);
2184 	return ret;
2185 }
2186 
2187 
2188 /*
2189  * deletion replay happens before we copy any new directory items
2190  * out of the log or out of backreferences from inodes.  It
2191  * scans the log to find ranges of keys that the log is authoritative for,
2192  * and then scans the directory to find items in those ranges that are
2193  * not present in the log.
2194  *
2195  * Anything we don't find in the log is unlinked and removed from the
2196  * directory.
2197  */
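/*
 * A concrete sequence this handles (hypothetical reproducer):
 *
 * touch dir/a dir/b
 * sync
 * rm dir/a
 * xfs_io -c "fsync" dir
 * <power failure>
 *
 * The log is authoritative for the key range that contained "a", but
 * "a" itself is not in the log, so replay unlinks it from the
 * subvolume.
 */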
2198 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2199 				       struct btrfs_root *root,
2200 				       struct btrfs_root *log,
2201 				       struct btrfs_path *path,
2202 				       u64 dirid, int del_all)
2203 {
2204 	u64 range_start;
2205 	u64 range_end;
2206 	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2207 	int ret = 0;
2208 	struct btrfs_key dir_key;
2209 	struct btrfs_key found_key;
2210 	struct btrfs_path *log_path;
2211 	struct inode *dir;
2212 
2213 	dir_key.objectid = dirid;
2214 	dir_key.type = BTRFS_DIR_ITEM_KEY;
2215 	log_path = btrfs_alloc_path();
2216 	if (!log_path)
2217 		return -ENOMEM;
2218 
2219 	dir = read_one_inode(root, dirid);
2220 	/* it isn't an error if the inode isn't there; that can happen
2221 	 * because we replay the deletes before we copy in the inode item
2222 	 * from the log
2223 	 */
2224 	if (!dir) {
2225 		btrfs_free_path(log_path);
2226 		return 0;
2227 	}
2228 again:
2229 	range_start = 0;
2230 	range_end = 0;
2231 	while (1) {
2232 		if (del_all)
2233 			range_end = (u64)-1;
2234 		else {
2235 			ret = find_dir_range(log, path, dirid, key_type,
2236 					     &range_start, &range_end);
2237 			if (ret != 0)
2238 				break;
2239 		}
2240 
2241 		dir_key.offset = range_start;
2242 		while (1) {
2243 			int nritems;
2244 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2245 						0, 0);
2246 			if (ret < 0)
2247 				goto out;
2248 
2249 			nritems = btrfs_header_nritems(path->nodes[0]);
2250 			if (path->slots[0] >= nritems) {
2251 				ret = btrfs_next_leaf(root, path);
2252 				if (ret)
2253 					break;
2254 			}
2255 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2256 					      path->slots[0]);
2257 			if (found_key.objectid != dirid ||
2258 			    found_key.type != dir_key.type)
2259 				goto next_type;
2260 
2261 			if (found_key.offset > range_end)
2262 				break;
2263 
2264 			ret = check_item_in_log(trans, root, log, path,
2265 						log_path, dir,
2266 						&found_key);
2267 			if (ret)
2268 				goto out;
2269 			if (found_key.offset == (u64)-1)
2270 				break;
2271 			dir_key.offset = found_key.offset + 1;
2272 		}
2273 		btrfs_release_path(path);
2274 		if (range_end == (u64)-1)
2275 			break;
2276 		range_start = range_end + 1;
2277 	}
2278 
2279 next_type:
2280 	ret = 0;
2281 	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2282 		key_type = BTRFS_DIR_LOG_INDEX_KEY;
2283 		dir_key.type = BTRFS_DIR_INDEX_KEY;
2284 		btrfs_release_path(path);
2285 		goto again;
2286 	}
2287 out:
2288 	btrfs_release_path(path);
2289 	btrfs_free_path(log_path);
2290 	iput(dir);
2291 	return ret;
2292 }
2293 
2294 /*
2295  * the process_func used to replay items from the log tree.  This
2296  * gets called in two different stages.  The first stage just looks
2297  * for inodes and makes sure they are all copied into the subvolume.
2298  *
2299  * The second stage copies all the other item types from the log into
2300  * the subvolume.  The two-stage approach is slower, but gets rid of
2301  * lots of complexity around inodes referencing other inodes that exist
2302  * only in the log (references come from either directory items or inode
2303  * back refs).
2304  */
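/*
 * Rough sketch of the per-stage behavior of this function, assuming the
 * walk runs the stages in their usual order:
 *
 *	LOG_WALK_REPLAY_INODES:    inode items, plus xattr and dir
 *				   deletion replay for each inode
 *	LOG_WALK_REPLAY_DIR_INDEX: BTRFS_DIR_INDEX_KEY items
 *	LOG_WALK_REPLAY_ALL:       xattrs, inode refs, file extents and
 *				   BTRFS_DIR_ITEM_KEY items
 */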
2305 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2306 			     struct walk_control *wc, u64 gen)
2307 {
2308 	int nritems;
2309 	struct btrfs_path *path;
2310 	struct btrfs_root *root = wc->replay_dest;
2311 	struct btrfs_key key;
2312 	int level;
2313 	int i;
2314 	int ret;
2315 
2316 	ret = btrfs_read_buffer(eb, gen);
2317 	if (ret)
2318 		return ret;
2319 
2320 	level = btrfs_header_level(eb);
2321 
2322 	if (level != 0)
2323 		return 0;
2324 
2325 	path = btrfs_alloc_path();
2326 	if (!path)
2327 		return -ENOMEM;
2328 
2329 	nritems = btrfs_header_nritems(eb);
2330 	for (i = 0; i < nritems; i++) {
2331 		btrfs_item_key_to_cpu(eb, &key, i);
2332 
2333 		/* inode keys are done during the first stage */
2334 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2335 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2336 			struct btrfs_inode_item *inode_item;
2337 			u32 mode;
2338 
2339 			inode_item = btrfs_item_ptr(eb, i,
2340 					    struct btrfs_inode_item);
2341 			ret = replay_xattr_deletes(wc->trans, root, log,
2342 						   path, key.objectid);
2343 			if (ret)
2344 				break;
2345 			mode = btrfs_inode_mode(eb, inode_item);
2346 			if (S_ISDIR(mode)) {
2347 				ret = replay_dir_deletes(wc->trans,
2348 					 root, log, path, key.objectid, 0);
2349 				if (ret)
2350 					break;
2351 			}
2352 			ret = overwrite_item(wc->trans, root, path,
2353 					     eb, i, &key);
2354 			if (ret)
2355 				break;
2356 
2357 			/* for regular files, make sure a corresponding
2358 			 * orphan item exists. Extents past the new EOF
2359 			 * will be truncated later by orphan cleanup.
2360 			 */
2361 			if (S_ISREG(mode)) {
2362 				ret = insert_orphan_item(wc->trans, root,
2363 							 key.objectid);
2364 				if (ret)
2365 					break;
2366 			}
2367 
2368 			ret = link_to_fixup_dir(wc->trans, root,
2369 						path, key.objectid);
2370 			if (ret)
2371 				break;
2372 		}
2373 
2374 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2375 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2376 			ret = replay_one_dir_item(wc->trans, root, path,
2377 						  eb, i, &key);
2378 			if (ret)
2379 				break;
2380 		}
2381 
2382 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2383 			continue;
2384 
2385 		/* these keys are simply copied */
2386 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2387 			ret = overwrite_item(wc->trans, root, path,
2388 					     eb, i, &key);
2389 			if (ret)
2390 				break;
2391 		} else if (key.type == BTRFS_INODE_REF_KEY ||
2392 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2393 			ret = add_inode_ref(wc->trans, root, log, path,
2394 					    eb, i, &key);
2395 			if (ret && ret != -ENOENT)
2396 				break;
2397 			ret = 0;
2398 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2399 			ret = replay_one_extent(wc->trans, root, path,
2400 						eb, i, &key);
2401 			if (ret)
2402 				break;
2403 		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2404 			ret = replay_one_dir_item(wc->trans, root, path,
2405 						  eb, i, &key);
2406 			if (ret)
2407 				break;
2408 		}
2409 	}
2410 	btrfs_free_path(path);
2411 	return ret;
2412 }
2413 
2414 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2415 				   struct btrfs_root *root,
2416 				   struct btrfs_path *path, int *level,
2417 				   struct walk_control *wc)
2418 {
2419 	struct btrfs_fs_info *fs_info = root->fs_info;
2420 	u64 root_owner;
2421 	u64 bytenr;
2422 	u64 ptr_gen;
2423 	struct extent_buffer *next;
2424 	struct extent_buffer *cur;
2425 	struct extent_buffer *parent;
2426 	u32 blocksize;
2427 	int ret = 0;
2428 
2429 	WARN_ON(*level < 0);
2430 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2431 
2432 	while (*level > 0) {
2433 		WARN_ON(*level < 0);
2434 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
2435 		cur = path->nodes[*level];
2436 
2437 		WARN_ON(btrfs_header_level(cur) != *level);
2438 
2439 		if (path->slots[*level] >=
2440 		    btrfs_header_nritems(cur))
2441 			break;
2442 
2443 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2444 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2445 		blocksize = fs_info->nodesize;
2446 
2447 		parent = path->nodes[*level];
2448 		root_owner = btrfs_header_owner(parent);
2449 
2450 		next = btrfs_find_create_tree_block(fs_info, bytenr);
2451 		if (IS_ERR(next))
2452 			return PTR_ERR(next);
2453 
2454 		if (*level == 1) {
2455 			ret = wc->process_func(root, next, wc, ptr_gen);
2456 			if (ret) {
2457 				free_extent_buffer(next);
2458 				return ret;
2459 			}
2460 
2461 			path->slots[*level]++;
2462 			if (wc->free) {
2463 				ret = btrfs_read_buffer(next, ptr_gen);
2464 				if (ret) {
2465 					free_extent_buffer(next);
2466 					return ret;
2467 				}
2468 
2469 				if (trans) {
2470 					btrfs_tree_lock(next);
2471 					btrfs_set_lock_blocking(next);
2472 					clean_tree_block(fs_info, next);
2473 					btrfs_wait_tree_block_writeback(next);
2474 					btrfs_tree_unlock(next);
2475 				}
2476 
2477 				WARN_ON(root_owner !=
2478 					BTRFS_TREE_LOG_OBJECTID);
2479 				ret = btrfs_free_and_pin_reserved_extent(
2480 							fs_info, bytenr,
2481 							blocksize);
2482 				if (ret) {
2483 					free_extent_buffer(next);
2484 					return ret;
2485 				}
2486 			}
2487 			free_extent_buffer(next);
2488 			continue;
2489 		}
2490 		ret = btrfs_read_buffer(next, ptr_gen);
2491 		if (ret) {
2492 			free_extent_buffer(next);
2493 			return ret;
2494 		}
2495 
2496 		WARN_ON(*level <= 0);
2497 		if (path->nodes[*level-1])
2498 			free_extent_buffer(path->nodes[*level-1]);
2499 		path->nodes[*level-1] = next;
2500 		*level = btrfs_header_level(next);
2501 		path->slots[*level] = 0;
2502 		cond_resched();
2503 	}
2504 	WARN_ON(*level < 0);
2505 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2506 
2507 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2508 
2509 	cond_resched();
2510 	return 0;
2511 }
2512 
2513 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2514 				 struct btrfs_root *root,
2515 				 struct btrfs_path *path, int *level,
2516 				 struct walk_control *wc)
2517 {
2518 	struct btrfs_fs_info *fs_info = root->fs_info;
2519 	u64 root_owner;
2520 	int i;
2521 	int slot;
2522 	int ret;
2523 
2524 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2525 		slot = path->slots[i];
2526 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2527 			path->slots[i]++;
2528 			*level = i;
2529 			WARN_ON(*level == 0);
2530 			return 0;
2531 		} else {
2532 			struct extent_buffer *parent;
2533 			if (path->nodes[*level] == root->node)
2534 				parent = path->nodes[*level];
2535 			else
2536 				parent = path->nodes[*level + 1];
2537 
2538 			root_owner = btrfs_header_owner(parent);
2539 			ret = wc->process_func(root, path->nodes[*level], wc,
2540 				 btrfs_header_generation(path->nodes[*level]));
2541 			if (ret)
2542 				return ret;
2543 
2544 			if (wc->free) {
2545 				struct extent_buffer *next;
2546 
2547 				next = path->nodes[*level];
2548 
2549 				if (trans) {
2550 					btrfs_tree_lock(next);
2551 					btrfs_set_lock_blocking(next);
2552 					clean_tree_block(fs_info, next);
2553 					btrfs_wait_tree_block_writeback(next);
2554 					btrfs_tree_unlock(next);
2555 				}
2556 
2557 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2558 				ret = btrfs_free_and_pin_reserved_extent(
2559 						fs_info,
2560 						path->nodes[*level]->start,
2561 						path->nodes[*level]->len);
2562 				if (ret)
2563 					return ret;
2564 			}
2565 			free_extent_buffer(path->nodes[*level]);
2566 			path->nodes[*level] = NULL;
2567 			*level = i + 1;
2568 		}
2569 	}
2570 	return 1;
2571 }
2572 
2573 /*
2574  * drop the reference count on the tree rooted at 'log'.  This traverses
2575  * the tree freeing any blocks that have a ref count of zero after being
2576  * decremented.
2577  */
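/*
 * Minimal usage sketch, mirroring free_log_tree() below: to free a log
 * tree, walk it with .free set so each visited block is cleaned and its
 * reserved extent released:
 *
 *	struct walk_control wc = {
 *		.free = 1,
 *		.process_func = process_one_buffer
 *	};
 *	ret = walk_log_tree(trans, log, &wc);
 */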
2578 static int walk_log_tree(struct btrfs_trans_handle *trans,
2579 			 struct btrfs_root *log, struct walk_control *wc)
2580 {
2581 	struct btrfs_fs_info *fs_info = log->fs_info;
2582 	int ret = 0;
2583 	int wret;
2584 	int level;
2585 	struct btrfs_path *path;
2586 	int orig_level;
2587 
2588 	path = btrfs_alloc_path();
2589 	if (!path)
2590 		return -ENOMEM;
2591 
2592 	level = btrfs_header_level(log->node);
2593 	orig_level = level;
2594 	path->nodes[level] = log->node;
2595 	extent_buffer_get(log->node);
2596 	path->slots[level] = 0;
2597 
2598 	while (1) {
2599 		wret = walk_down_log_tree(trans, log, path, &level, wc);
2600 		if (wret > 0)
2601 			break;
2602 		if (wret < 0) {
2603 			ret = wret;
2604 			goto out;
2605 		}
2606 
2607 		wret = walk_up_log_tree(trans, log, path, &level, wc);
2608 		if (wret > 0)
2609 			break;
2610 		if (wret < 0) {
2611 			ret = wret;
2612 			goto out;
2613 		}
2614 	}
2615 
2616 	/* was the root node processed? if not, catch it here */
2617 	if (path->nodes[orig_level]) {
2618 		ret = wc->process_func(log, path->nodes[orig_level], wc,
2619 			 btrfs_header_generation(path->nodes[orig_level]));
2620 		if (ret)
2621 			goto out;
2622 		if (wc->free) {
2623 			struct extent_buffer *next;
2624 
2625 			next = path->nodes[orig_level];
2626 
2627 			if (trans) {
2628 				btrfs_tree_lock(next);
2629 				btrfs_set_lock_blocking(next);
2630 				clean_tree_block(fs_info, next);
2631 				btrfs_wait_tree_block_writeback(next);
2632 				btrfs_tree_unlock(next);
2633 			}
2634 
2635 			WARN_ON(log->root_key.objectid !=
2636 				BTRFS_TREE_LOG_OBJECTID);
2637 			ret = btrfs_free_and_pin_reserved_extent(fs_info,
2638 							next->start, next->len);
2639 			if (ret)
2640 				goto out;
2641 		}
2642 	}
2643 
2644 out:
2645 	btrfs_free_path(path);
2646 	return ret;
2647 }
2648 
2649 /*
2650  * helper function to update the item for a given subvolumes log root
2651  * in the tree of log roots
2652  */
2653 static int update_log_root(struct btrfs_trans_handle *trans,
2654 			   struct btrfs_root *log)
2655 {
2656 	struct btrfs_fs_info *fs_info = log->fs_info;
2657 	int ret;
2658 
2659 	if (log->log_transid == 1) {
2660 		/* insert root item on the first sync */
2661 		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2662 				&log->root_key, &log->root_item);
2663 	} else {
2664 		ret = btrfs_update_root(trans, fs_info->log_root_tree,
2665 				&log->root_key, &log->root_item);
2666 	}
2667 	return ret;
2668 }
2669 
2670 static void wait_log_commit(struct btrfs_root *root, int transid)
2671 {
2672 	DEFINE_WAIT(wait);
2673 	int index = transid % 2;
2674 
2675 	/*
2676 	 * we only allow two pending log transactions at a time,
2677 	 * so we know that if ours is more than 2 older than the
2678 	 * current transaction, we're done
2679 	 */
2680 	for (;;) {
2681 		prepare_to_wait(&root->log_commit_wait[index],
2682 				&wait, TASK_UNINTERRUPTIBLE);
2683 
2684 		if (!(root->log_transid_committed < transid &&
2685 		      atomic_read(&root->log_commit[index])))
2686 			break;
2687 
2688 		mutex_unlock(&root->log_mutex);
2689 		schedule();
2690 		mutex_lock(&root->log_mutex);
2691 	}
2692 	finish_wait(&root->log_commit_wait[index], &wait);
2693 }
2694 
2695 static void wait_for_writer(struct btrfs_root *root)
2696 {
2697 	DEFINE_WAIT(wait);
2698 
2699 	for (;;) {
2700 		prepare_to_wait(&root->log_writer_wait, &wait,
2701 				TASK_UNINTERRUPTIBLE);
2702 		if (!atomic_read(&root->log_writers))
2703 			break;
2704 
2705 		mutex_unlock(&root->log_mutex);
2706 		schedule();
2707 		mutex_lock(&root->log_mutex);
2708 	}
2709 	finish_wait(&root->log_writer_wait, &wait);
2710 }
2711 
2712 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2713 					struct btrfs_log_ctx *ctx)
2714 {
2715 	if (!ctx)
2716 		return;
2717 
2718 	mutex_lock(&root->log_mutex);
2719 	list_del_init(&ctx->list);
2720 	mutex_unlock(&root->log_mutex);
2721 }
2722 
2723 /*
2724  * Must be invoked while holding the log mutex, or when it is certain
2725  * that no other task can access the list.
2726  */
2727 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2728 					     int index, int error)
2729 {
2730 	struct btrfs_log_ctx *ctx;
2731 	struct btrfs_log_ctx *safe;
2732 
2733 	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2734 		list_del_init(&ctx->list);
2735 		ctx->log_ret = error;
2736 	}
2737 
2738 	INIT_LIST_HEAD(&root->log_ctxs[index]);
2739 }
2740 
2741 /*
2742  * btrfs_sync_log sends a given tree log down to the disk and
2743  * updates the super blocks to record it.  When this call is done,
2744  * you know that any inodes previously logged are safely on disk only
2745  * if it returns 0.
2746  *
2747  * Any other return value means you need to call btrfs_commit_transaction.
2748  * Some of the edge cases for fsyncing directories that have had unlinks
2749  * or renames done in the past mean that sometimes the only safe
2750  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2751  * that has happened.
2752  */
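/*
 * Hedged caller sketch (not code from this file; error handling
 * trimmed): an fsync path holding a transaction handle would act on the
 * return value roughly like this:
 *
 *	ret = btrfs_sync_log(trans, root, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_end_transaction(trans);
 *	else
 *		ret = btrfs_commit_transaction(trans);
 */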
2753 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2754 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2755 {
2756 	int index1;
2757 	int index2;
2758 	int mark;
2759 	int ret;
2760 	struct btrfs_fs_info *fs_info = root->fs_info;
2761 	struct btrfs_root *log = root->log_root;
2762 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2763 	int log_transid = 0;
2764 	struct btrfs_log_ctx root_log_ctx;
2765 	struct blk_plug plug;
2766 
2767 	mutex_lock(&root->log_mutex);
2768 	log_transid = ctx->log_transid;
2769 	if (root->log_transid_committed >= log_transid) {
2770 		mutex_unlock(&root->log_mutex);
2771 		return ctx->log_ret;
2772 	}
2773 
2774 	index1 = log_transid % 2;
2775 	if (atomic_read(&root->log_commit[index1])) {
2776 		wait_log_commit(root, log_transid);
2777 		mutex_unlock(&root->log_mutex);
2778 		return ctx->log_ret;
2779 	}
2780 	ASSERT(log_transid == root->log_transid);
2781 	atomic_set(&root->log_commit[index1], 1);
2782 
2783 	/* wait for previous tree log sync to complete */
2784 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2785 		wait_log_commit(root, log_transid - 1);
2786 
2787 	while (1) {
2788 		int batch = atomic_read(&root->log_batch);
2789 		/* when we're on an ssd, just kick the log commit out */
2790 		if (!btrfs_test_opt(fs_info, SSD) &&
2791 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2792 			mutex_unlock(&root->log_mutex);
2793 			schedule_timeout_uninterruptible(1);
2794 			mutex_lock(&root->log_mutex);
2795 		}
2796 		wait_for_writer(root);
2797 		if (batch == atomic_read(&root->log_batch))
2798 			break;
2799 	}
2800 
2801 	/* bail out if we need to do a full commit */
2802 	if (btrfs_need_log_full_commit(fs_info, trans)) {
2803 		ret = -EAGAIN;
2804 		btrfs_free_logged_extents(log, log_transid);
2805 		mutex_unlock(&root->log_mutex);
2806 		goto out;
2807 	}
2808 
2809 	if (log_transid % 2 == 0)
2810 		mark = EXTENT_DIRTY;
2811 	else
2812 		mark = EXTENT_NEW;
2813 
2814 	/* we start IO on all the marked extents here, but we don't actually
2815 	 * wait for them until later.
2816 	 */
2817 	blk_start_plug(&plug);
2818 	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2819 	if (ret) {
2820 		blk_finish_plug(&plug);
2821 		btrfs_abort_transaction(trans, ret);
2822 		btrfs_free_logged_extents(log, log_transid);
2823 		btrfs_set_log_full_commit(fs_info, trans);
2824 		mutex_unlock(&root->log_mutex);
2825 		goto out;
2826 	}
2827 
2828 	btrfs_set_root_node(&log->root_item, log->node);
2829 
2830 	root->log_transid++;
2831 	log->log_transid = root->log_transid;
2832 	root->log_start_pid = 0;
2833 	/*
2834 	 * IO has been started, blocks of the log tree have the WRITTEN flag set
2835 	 * in their headers. New modifications of the log will be written to
2836 	 * new positions, so it's safe to allow log writers to go in.
2837 	 */
2838 	mutex_unlock(&root->log_mutex);
2839 
2840 	btrfs_init_log_ctx(&root_log_ctx, NULL);
2841 
2842 	mutex_lock(&log_root_tree->log_mutex);
2843 	atomic_inc(&log_root_tree->log_batch);
2844 	atomic_inc(&log_root_tree->log_writers);
2845 
2846 	index2 = log_root_tree->log_transid % 2;
2847 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2848 	root_log_ctx.log_transid = log_root_tree->log_transid;
2849 
2850 	mutex_unlock(&log_root_tree->log_mutex);
2851 
2852 	ret = update_log_root(trans, log);
2853 
2854 	mutex_lock(&log_root_tree->log_mutex);
2855 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2856 		/*
2857 		 * Implicit memory barrier after atomic_dec_and_test
2858 		 */
2859 		if (waitqueue_active(&log_root_tree->log_writer_wait))
2860 			wake_up(&log_root_tree->log_writer_wait);
2861 	}
2862 
2863 	if (ret) {
2864 		if (!list_empty(&root_log_ctx.list))
2865 			list_del_init(&root_log_ctx.list);
2866 
2867 		blk_finish_plug(&plug);
2868 		btrfs_set_log_full_commit(fs_info, trans);
2869 
2870 		if (ret != -ENOSPC) {
2871 			btrfs_abort_transaction(trans, ret);
2872 			mutex_unlock(&log_root_tree->log_mutex);
2873 			goto out;
2874 		}
2875 		btrfs_wait_tree_log_extents(log, mark);
2876 		btrfs_free_logged_extents(log, log_transid);
2877 		mutex_unlock(&log_root_tree->log_mutex);
2878 		ret = -EAGAIN;
2879 		goto out;
2880 	}
2881 
2882 	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2883 		blk_finish_plug(&plug);
2884 		list_del_init(&root_log_ctx.list);
2885 		mutex_unlock(&log_root_tree->log_mutex);
2886 		ret = root_log_ctx.log_ret;
2887 		goto out;
2888 	}
2889 
2890 	index2 = root_log_ctx.log_transid % 2;
2891 	if (atomic_read(&log_root_tree->log_commit[index2])) {
2892 		blk_finish_plug(&plug);
2893 		ret = btrfs_wait_tree_log_extents(log, mark);
2894 		btrfs_wait_logged_extents(trans, log, log_transid);
2895 		wait_log_commit(log_root_tree,
2896 				root_log_ctx.log_transid);
2897 		mutex_unlock(&log_root_tree->log_mutex);
2898 		if (!ret)
2899 			ret = root_log_ctx.log_ret;
2900 		goto out;
2901 	}
2902 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2903 	atomic_set(&log_root_tree->log_commit[index2], 1);
2904 
2905 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2906 		wait_log_commit(log_root_tree,
2907 				root_log_ctx.log_transid - 1);
2908 	}
2909 
2910 	wait_for_writer(log_root_tree);
2911 
2912 	/*
2913 	 * now that we've moved on to the tree of log tree roots,
2914 	 * check the full commit flag again
2915 	 */
2916 	if (btrfs_need_log_full_commit(fs_info, trans)) {
2917 		blk_finish_plug(&plug);
2918 		btrfs_wait_tree_log_extents(log, mark);
2919 		btrfs_free_logged_extents(log, log_transid);
2920 		mutex_unlock(&log_root_tree->log_mutex);
2921 		ret = -EAGAIN;
2922 		goto out_wake_log_root;
2923 	}
2924 
2925 	ret = btrfs_write_marked_extents(fs_info,
2926 					 &log_root_tree->dirty_log_pages,
2927 					 EXTENT_DIRTY | EXTENT_NEW);
2928 	blk_finish_plug(&plug);
2929 	if (ret) {
2930 		btrfs_set_log_full_commit(fs_info, trans);
2931 		btrfs_abort_transaction(trans, ret);
2932 		btrfs_free_logged_extents(log, log_transid);
2933 		mutex_unlock(&log_root_tree->log_mutex);
2934 		goto out_wake_log_root;
2935 	}
2936 	ret = btrfs_wait_tree_log_extents(log, mark);
2937 	if (!ret)
2938 		ret = btrfs_wait_tree_log_extents(log_root_tree,
2939 						  EXTENT_NEW | EXTENT_DIRTY);
2940 	if (ret) {
2941 		btrfs_set_log_full_commit(fs_info, trans);
2942 		btrfs_free_logged_extents(log, log_transid);
2943 		mutex_unlock(&log_root_tree->log_mutex);
2944 		goto out_wake_log_root;
2945 	}
2946 	btrfs_wait_logged_extents(trans, log, log_transid);
2947 
2948 	btrfs_set_super_log_root(fs_info->super_for_commit,
2949 				 log_root_tree->node->start);
2950 	btrfs_set_super_log_root_level(fs_info->super_for_commit,
2951 				       btrfs_header_level(log_root_tree->node));
2952 
2953 	log_root_tree->log_transid++;
2954 	mutex_unlock(&log_root_tree->log_mutex);
2955 
2956 	/*
2957 	 * nobody else is going to jump in and write the ctree
2958 	 * super here because the log_commit atomic below is protecting
2959 	 * us.  We must be called with a transaction handle pinning
2960 	 * the running transaction open, so a full commit can't hop
2961 	 * in and cause problems either.
2962 	 */
2963 	ret = write_all_supers(fs_info, 1);
2964 	if (ret) {
2965 		btrfs_set_log_full_commit(fs_info, trans);
2966 		btrfs_abort_transaction(trans, ret);
2967 		goto out_wake_log_root;
2968 	}
2969 
2970 	mutex_lock(&root->log_mutex);
2971 	if (root->last_log_commit < log_transid)
2972 		root->last_log_commit = log_transid;
2973 	mutex_unlock(&root->log_mutex);
2974 
2975 out_wake_log_root:
2976 	mutex_lock(&log_root_tree->log_mutex);
2977 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2978 
2979 	log_root_tree->log_transid_committed++;
2980 	atomic_set(&log_root_tree->log_commit[index2], 0);
2981 	mutex_unlock(&log_root_tree->log_mutex);
2982 
2983 	/*
2984 	 * The barrier before waitqueue_active is implied by mutex_unlock
2985 	 */
2986 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2987 		wake_up(&log_root_tree->log_commit_wait[index2]);
2988 out:
2989 	mutex_lock(&root->log_mutex);
2990 	btrfs_remove_all_log_ctxs(root, index1, ret);
2991 	root->log_transid_committed++;
2992 	atomic_set(&root->log_commit[index1], 0);
2993 	mutex_unlock(&root->log_mutex);
2994 
2995 	/*
2996 	 * The barrier before waitqueue_active is implied by mutex_unlock
2997 	 */
2998 	if (waitqueue_active(&root->log_commit_wait[index1]))
2999 		wake_up(&root->log_commit_wait[index1]);
3000 	return ret;
3001 }
3002 
3003 static void free_log_tree(struct btrfs_trans_handle *trans,
3004 			  struct btrfs_root *log)
3005 {
3006 	int ret;
3007 	u64 start;
3008 	u64 end;
3009 	struct walk_control wc = {
3010 		.free = 1,
3011 		.process_func = process_one_buffer
3012 	};
3013 
3014 	ret = walk_log_tree(trans, log, &wc);
3015 	/* I don't think this can happen but just in case */
3016 	if (ret)
3017 		btrfs_abort_transaction(trans, ret);
3018 
3019 	while (1) {
3020 		ret = find_first_extent_bit(&log->dirty_log_pages,
3021 				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
3022 				NULL);
3023 		if (ret)
3024 			break;
3025 
3026 		clear_extent_bits(&log->dirty_log_pages, start, end,
3027 				  EXTENT_DIRTY | EXTENT_NEW);
3028 	}
3029 
3030 	/*
3031 	 * We may have short-circuited the log tree with the full commit logic
3032 	 * and left ordered extents on our list, so clear these out to keep us
3033 	 * from leaking inodes and memory.
3034 	 */
3035 	btrfs_free_logged_extents(log, 0);
3036 	btrfs_free_logged_extents(log, 1);
3037 
3038 	free_extent_buffer(log->node);
3039 	kfree(log);
3040 }
3041 
3042 /*
3043  * free all the extents used by the tree log.  This should be called
3044  * at commit time of the full transaction
3045  */
3046 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3047 {
3048 	if (root->log_root) {
3049 		free_log_tree(trans, root->log_root);
3050 		root->log_root = NULL;
3051 	}
3052 	return 0;
3053 }
3054 
3055 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3056 			     struct btrfs_fs_info *fs_info)
3057 {
3058 	if (fs_info->log_root_tree) {
3059 		free_log_tree(trans, fs_info->log_root_tree);
3060 		fs_info->log_root_tree = NULL;
3061 	}
3062 	return 0;
3063 }
3064 
3065 /*
3066  * If both a file and directory are logged, and unlinks or renames are
3067  * mixed in, we have a few interesting corners:
3068  *
3069  * create file X in dir Y
3070  * link file X to X.link in dir Y
3071  * fsync file X
3072  * unlink file X but leave X.link
3073  * fsync dir Y
3074  *
3075  * After a crash we would expect only X.link to exist.  But file X
3076  * didn't get fsync'd again so the log has back refs for X and X.link.
3077  *
3078  * We solve this by removing directory entries and inode backrefs from the
3079  * log when a file that was logged in the current transaction is
3080  * unlinked.  Any later fsync will include the updated log entries, and
3081  * we'll be able to reconstruct the proper directory items from backrefs.
3082  *
3083  * This optimization allows us to avoid relogging the entire inode
3084  * or the entire directory.
3085  */
3086 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3087 				 struct btrfs_root *root,
3088 				 const char *name, int name_len,
3089 				 struct btrfs_inode *dir, u64 index)
3090 {
3091 	struct btrfs_root *log;
3092 	struct btrfs_dir_item *di;
3093 	struct btrfs_path *path;
3094 	int ret;
3095 	int err = 0;
3096 	int bytes_del = 0;
3097 	u64 dir_ino = btrfs_ino(dir);
3098 
3099 	if (dir->logged_trans < trans->transid)
3100 		return 0;
3101 
3102 	ret = join_running_log_trans(root);
3103 	if (ret)
3104 		return 0;
3105 
3106 	mutex_lock(&dir->log_mutex);
3107 
3108 	log = root->log_root;
3109 	path = btrfs_alloc_path();
3110 	if (!path) {
3111 		err = -ENOMEM;
3112 		goto out_unlock;
3113 	}
3114 
3115 	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3116 				   name, name_len, -1);
3117 	if (IS_ERR(di)) {
3118 		err = PTR_ERR(di);
3119 		goto fail;
3120 	}
3121 	if (di) {
3122 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3123 		bytes_del += name_len;
3124 		if (ret) {
3125 			err = ret;
3126 			goto fail;
3127 		}
3128 	}
3129 	btrfs_release_path(path);
3130 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3131 					 index, name, name_len, -1);
3132 	if (IS_ERR(di)) {
3133 		err = PTR_ERR(di);
3134 		goto fail;
3135 	}
3136 	if (di) {
3137 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3138 		bytes_del += name_len;
3139 		if (ret) {
3140 			err = ret;
3141 			goto fail;
3142 		}
3143 	}
3144 
3145 	/* update the directory size in the log to reflect the names
3146 	 * we have removed
3147 	 */
3148 	if (bytes_del) {
3149 		struct btrfs_key key;
3150 
3151 		key.objectid = dir_ino;
3152 		key.offset = 0;
3153 		key.type = BTRFS_INODE_ITEM_KEY;
3154 		btrfs_release_path(path);
3155 
3156 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3157 		if (ret < 0) {
3158 			err = ret;
3159 			goto fail;
3160 		}
3161 		if (ret == 0) {
3162 			struct btrfs_inode_item *item;
3163 			u64 i_size;
3164 
3165 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3166 					      struct btrfs_inode_item);
3167 			i_size = btrfs_inode_size(path->nodes[0], item);
3168 			if (i_size > bytes_del)
3169 				i_size -= bytes_del;
3170 			else
3171 				i_size = 0;
3172 			btrfs_set_inode_size(path->nodes[0], item, i_size);
3173 			btrfs_mark_buffer_dirty(path->nodes[0]);
3174 		} else
3175 			ret = 0;
3176 		btrfs_release_path(path);
3177 	}
3178 fail:
3179 	btrfs_free_path(path);
3180 out_unlock:
3181 	mutex_unlock(&dir->log_mutex);
3182 	if (ret == -ENOSPC) {
3183 		btrfs_set_log_full_commit(root->fs_info, trans);
3184 		ret = 0;
3185 	} else if (ret < 0)
3186 		btrfs_abort_transaction(trans, ret);
3187 
3188 	btrfs_end_log_trans(root);
3189 
3190 	return err;
3191 }
3192 
3193 /* see comments for btrfs_del_dir_entries_in_log */
3194 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3195 			       struct btrfs_root *root,
3196 			       const char *name, int name_len,
3197 			       struct btrfs_inode *inode, u64 dirid)
3198 {
3199 	struct btrfs_fs_info *fs_info = root->fs_info;
3200 	struct btrfs_root *log;
3201 	u64 index;
3202 	int ret;
3203 
3204 	if (inode->logged_trans < trans->transid)
3205 		return 0;
3206 
3207 	ret = join_running_log_trans(root);
3208 	if (ret)
3209 		return 0;
3210 	log = root->log_root;
3211 	mutex_lock(&inode->log_mutex);
3212 
3213 	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3214 				  dirid, &index);
3215 	mutex_unlock(&inode->log_mutex);
3216 	if (ret == -ENOSPC) {
3217 		btrfs_set_log_full_commit(fs_info, trans);
3218 		ret = 0;
3219 	} else if (ret < 0 && ret != -ENOENT)
3220 		btrfs_abort_transaction(trans, ret);
3221 	btrfs_end_log_trans(root);
3222 
3223 	return ret;
3224 }
3225 
3226 /*
3227  * creates a range item in the log for 'dirid'.  first_offset and
3228  * last_offset tell us which parts of the key space the log should
3229  * be considered authoritative for.
3230  */
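/*
 * For instance (values invented): a call with key_type
 * BTRFS_DIR_INDEX_KEY, dirid 257, first_offset 3 and last_offset 7
 * creates a BTRFS_DIR_LOG_INDEX_KEY item whose key offset is 3 and
 * whose dir_log_end field is 7, declaring the log authoritative for
 * index keys 3..7 of directory 257.
 */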
3231 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3232 				       struct btrfs_root *log,
3233 				       struct btrfs_path *path,
3234 				       int key_type, u64 dirid,
3235 				       u64 first_offset, u64 last_offset)
3236 {
3237 	int ret;
3238 	struct btrfs_key key;
3239 	struct btrfs_dir_log_item *item;
3240 
3241 	key.objectid = dirid;
3242 	key.offset = first_offset;
3243 	if (key_type == BTRFS_DIR_ITEM_KEY)
3244 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
3245 	else
3246 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
3247 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3248 	if (ret)
3249 		return ret;
3250 
3251 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3252 			      struct btrfs_dir_log_item);
3253 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3254 	btrfs_mark_buffer_dirty(path->nodes[0]);
3255 	btrfs_release_path(path);
3256 	return 0;
3257 }
3258 
3259 /*
3260  * log all the items included in the current transaction for a given
3261  * directory.  This also creates the range items in the log tree required
3262  * to replay anything deleted before the fsync
3263  */
3264 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3265 			  struct btrfs_root *root, struct btrfs_inode *inode,
3266 			  struct btrfs_path *path,
3267 			  struct btrfs_path *dst_path, int key_type,
3268 			  struct btrfs_log_ctx *ctx,
3269 			  u64 min_offset, u64 *last_offset_ret)
3270 {
3271 	struct btrfs_key min_key;
3272 	struct btrfs_root *log = root->log_root;
3273 	struct extent_buffer *src;
3274 	int err = 0;
3275 	int ret;
3276 	int i;
3277 	int nritems;
3278 	u64 first_offset = min_offset;
3279 	u64 last_offset = (u64)-1;
3280 	u64 ino = btrfs_ino(inode);
3281 
3282 	log = root->log_root;
3283 
3284 	min_key.objectid = ino;
3285 	min_key.type = key_type;
3286 	min_key.offset = min_offset;
3287 
3288 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3289 
3290 	/*
3291 	 * we didn't find anything from this transaction, see if there
3292 	 * is anything at all
3293 	 */
3294 	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3295 		min_key.objectid = ino;
3296 		min_key.type = key_type;
3297 		min_key.offset = (u64)-1;
3298 		btrfs_release_path(path);
3299 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3300 		if (ret < 0) {
3301 			btrfs_release_path(path);
3302 			return ret;
3303 		}
3304 		ret = btrfs_previous_item(root, path, ino, key_type);
3305 
3306 		/* if ret == 0 there are items for this type,
3307 		 * create a range to tell us the last key of this type.
3308 		 * otherwise, there are no items in this directory after
3309 		 * *min_offset, and we create a range to indicate that.
3310 		 */
3311 		if (ret == 0) {
3312 			struct btrfs_key tmp;
3313 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3314 					      path->slots[0]);
3315 			if (key_type == tmp.type)
3316 				first_offset = max(min_offset, tmp.offset) + 1;
3317 		}
3318 		goto done;
3319 	}
3320 
3321 	/* go backward to find any previous key */
3322 	ret = btrfs_previous_item(root, path, ino, key_type);
3323 	if (ret == 0) {
3324 		struct btrfs_key tmp;
3325 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3326 		if (key_type == tmp.type) {
3327 			first_offset = tmp.offset;
3328 			ret = overwrite_item(trans, log, dst_path,
3329 					     path->nodes[0], path->slots[0],
3330 					     &tmp);
3331 			if (ret) {
3332 				err = ret;
3333 				goto done;
3334 			}
3335 		}
3336 	}
3337 	btrfs_release_path(path);
3338 
3339 	/* find the first key from this transaction again */
3340 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3341 	if (WARN_ON(ret != 0))
3342 		goto done;
3343 
3344 	/*
3345 	 * we have a block from this transaction, log every item in it
3346 	 * from our directory
3347 	 */
3348 	while (1) {
3349 		struct btrfs_key tmp;
3350 		src = path->nodes[0];
3351 		nritems = btrfs_header_nritems(src);
3352 		for (i = path->slots[0]; i < nritems; i++) {
3353 			struct btrfs_dir_item *di;
3354 
3355 			btrfs_item_key_to_cpu(src, &min_key, i);
3356 
3357 			if (min_key.objectid != ino || min_key.type != key_type)
3358 				goto done;
3359 			ret = overwrite_item(trans, log, dst_path, src, i,
3360 					     &min_key);
3361 			if (ret) {
3362 				err = ret;
3363 				goto done;
3364 			}
3365 
3366 			/*
3367 			 * We must make sure that when we log a directory entry,
3368 			 * the corresponding inode, after log replay, has a
3369 			 * matching link count. For example:
3370 			 *
3371 			 * touch foo
3372 			 * mkdir mydir
3373 			 * sync
3374 			 * ln foo mydir/bar
3375 			 * xfs_io -c "fsync" mydir
3376 			 * <crash>
3377 			 * <mount fs and log replay>
3378 			 *
3379 			 * Would result in an fsync log that, when replayed,
3380 			 * leaves our file inode with a link count of 1 even
3381 			 * though two directory entries point to the same
3382 			 * inode. After removing one of the names, it would not
3383 			 * be possible to remove the other name, which always
3384 			 * resulted in stale file handle errors, and it would
3385 			 * not be possible to rmdir the parent directory, since
3386 			 * its i_size could never decrement to the value
3387 			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3388 			 */
3389 			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3390 			btrfs_dir_item_key_to_cpu(src, di, &tmp);
3391 			if (ctx &&
3392 			    (btrfs_dir_transid(src, di) == trans->transid ||
3393 			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3394 			    tmp.type != BTRFS_ROOT_ITEM_KEY)
3395 				ctx->log_new_dentries = true;
3396 		}
3397 		path->slots[0] = nritems;
3398 
3399 		/*
3400 		 * look ahead to the next item and see if it is also
3401 		 * from this directory and from this transaction
3402 		 */
3403 		ret = btrfs_next_leaf(root, path);
3404 		if (ret == 1) {
3405 			last_offset = (u64)-1;
3406 			goto done;
3407 		}
3408 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3409 		if (tmp.objectid != ino || tmp.type != key_type) {
3410 			last_offset = (u64)-1;
3411 			goto done;
3412 		}
3413 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3414 			ret = overwrite_item(trans, log, dst_path,
3415 					     path->nodes[0], path->slots[0],
3416 					     &tmp);
3417 			if (ret)
3418 				err = ret;
3419 			else
3420 				last_offset = tmp.offset;
3421 			goto done;
3422 		}
3423 	}
3424 done:
3425 	btrfs_release_path(path);
3426 	btrfs_release_path(dst_path);
3427 
3428 	if (err == 0) {
3429 		*last_offset_ret = last_offset;
3430 		/*
3431 		 * insert the log range keys to indicate where the log
3432 		 * is valid
3433 		 */
3434 		ret = insert_dir_log_key(trans, log, path, key_type,
3435 					 ino, first_offset, last_offset);
3436 		if (ret)
3437 			err = ret;
3438 	}
3439 	return err;
3440 }
3441 
3442 /*
3443  * logging directories is very similar to logging inodes. We find all the items
3444  * from the current transaction and write them to the log.
3445  *
3446  * The recovery code scans the directory in the subvolume, and if it finds a
3447  * key in the range logged that is not present in the log tree, then it means
3448  * that dir entry was unlinked during the transaction.
3449  *
3450  * In order for that scan to work, we must include one key smaller than
3451  * the smallest logged by this transaction and one key larger than the largest
3452  * key logged by this transaction.
3453  */
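/*
 * Made-up example of the bracketing requirement: if this transaction
 * logged index keys 5..9 of the directory, the range items must cover a
 * window starting below 5 and ending above 9, so that a subvolume key
 * such as 7 that is missing from the log is recognized as a deletion
 * rather than as a key outside the logged range.
 */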
3454 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3455 			  struct btrfs_root *root, struct btrfs_inode *inode,
3456 			  struct btrfs_path *path,
3457 			  struct btrfs_path *dst_path,
3458 			  struct btrfs_log_ctx *ctx)
3459 {
3460 	u64 min_key;
3461 	u64 max_key;
3462 	int ret;
3463 	int key_type = BTRFS_DIR_ITEM_KEY;
3464 
3465 again:
3466 	min_key = 0;
3467 	max_key = 0;
3468 	while (1) {
3469 		ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3470 				ctx, min_key, &max_key);
3471 		if (ret)
3472 			return ret;
3473 		if (max_key == (u64)-1)
3474 			break;
3475 		min_key = max_key + 1;
3476 	}
3477 
3478 	if (key_type == BTRFS_DIR_ITEM_KEY) {
3479 		key_type = BTRFS_DIR_INDEX_KEY;
3480 		goto again;
3481 	}
3482 	return 0;
3483 }
3484 
3485 /*
3486  * a helper function to drop items from the log before we relog an
3487  * inode.  max_key_type indicates the highest item type to remove.
3488  * This cannot be run for file data extents because it does not
3489  * free the extents they point to.
3490  */
3491 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3492 				  struct btrfs_root *log,
3493 				  struct btrfs_path *path,
3494 				  u64 objectid, int max_key_type)
3495 {
3496 	int ret;
3497 	struct btrfs_key key;
3498 	struct btrfs_key found_key;
3499 	int start_slot;
3500 
3501 	key.objectid = objectid;
3502 	key.type = max_key_type;
3503 	key.offset = (u64)-1;
3504 
3505 	while (1) {
3506 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3507 		BUG_ON(ret == 0); /* Logic error */
3508 		if (ret < 0)
3509 			break;
3510 
3511 		if (path->slots[0] == 0)
3512 			break;
3513 
3514 		path->slots[0]--;
3515 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3516 				      path->slots[0]);
3517 
3518 		if (found_key.objectid != objectid)
3519 			break;
3520 
3521 		found_key.offset = 0;
3522 		found_key.type = 0;
3523 		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3524 				       &start_slot);
3525 
3526 		ret = btrfs_del_items(trans, log, path, start_slot,
3527 				      path->slots[0] - start_slot + 1);
3528 		/*
3529 		 * If start slot isn't 0 then we don't need to re-search, we've
3530 		 * found the last guy with the objectid in this tree.
3531 		 */
3532 		if (ret || start_slot != 0)
3533 			break;
3534 		btrfs_release_path(path);
3535 	}
3536 	btrfs_release_path(path);
3537 	if (ret > 0)
3538 		ret = 0;
3539 	return ret;
3540 }
3541 
3542 static void fill_inode_item(struct btrfs_trans_handle *trans,
3543 			    struct extent_buffer *leaf,
3544 			    struct btrfs_inode_item *item,
3545 			    struct inode *inode, int log_inode_only,
3546 			    u64 logged_isize)
3547 {
3548 	struct btrfs_map_token token;
3549 
3550 	btrfs_init_map_token(&token);
3551 
3552 	if (log_inode_only) {
3553 		/* set the generation to zero so the recovery code
3554 		 * can tell the difference between a log entry made
3555 		 * just to say 'this inode exists' and one made
3556 		 * to say 'update this inode with these values'
3557 		 */
3558 		btrfs_set_token_inode_generation(leaf, item, 0, &token);
3559 		btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3560 	} else {
3561 		btrfs_set_token_inode_generation(leaf, item,
3562 						 BTRFS_I(inode)->generation,
3563 						 &token);
3564 		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3565 	}
3566 
3567 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3568 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3569 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3570 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3571 
3572 	btrfs_set_token_timespec_sec(leaf, &item->atime,
3573 				     inode->i_atime.tv_sec, &token);
3574 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
3575 				      inode->i_atime.tv_nsec, &token);
3576 
3577 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
3578 				     inode->i_mtime.tv_sec, &token);
3579 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3580 				      inode->i_mtime.tv_nsec, &token);
3581 
3582 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
3583 				     inode->i_ctime.tv_sec, &token);
3584 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3585 				      inode->i_ctime.tv_nsec, &token);
3586 
3587 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3588 				     &token);
3589 
3590 	btrfs_set_token_inode_sequence(leaf, item,
3591 				       inode_peek_iversion(inode), &token);
3592 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3593 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3594 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3595 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3596 }
3597 
3598 static int log_inode_item(struct btrfs_trans_handle *trans,
3599 			  struct btrfs_root *log, struct btrfs_path *path,
3600 			  struct btrfs_inode *inode)
3601 {
3602 	struct btrfs_inode_item *inode_item;
3603 	int ret;
3604 
3605 	ret = btrfs_insert_empty_item(trans, log, path,
3606 				      &inode->location, sizeof(*inode_item));
3607 	if (ret && ret != -EEXIST)
3608 		return ret;
3609 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3610 				    struct btrfs_inode_item);
3611 	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3612 			0, 0);
3613 	btrfs_release_path(path);
3614 	return 0;
3615 }
3616 
3617 static noinline int copy_items(struct btrfs_trans_handle *trans,
3618 			       struct btrfs_inode *inode,
3619 			       struct btrfs_path *dst_path,
3620 			       struct btrfs_path *src_path, u64 *last_extent,
3621 			       int start_slot, int nr, int inode_only,
3622 			       u64 logged_isize)
3623 {
3624 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
3625 	unsigned long src_offset;
3626 	unsigned long dst_offset;
3627 	struct btrfs_root *log = inode->root->log_root;
3628 	struct btrfs_file_extent_item *extent;
3629 	struct btrfs_inode_item *inode_item;
3630 	struct extent_buffer *src = src_path->nodes[0];
3631 	struct btrfs_key first_key, last_key, key;
3632 	int ret;
3633 	struct btrfs_key *ins_keys;
3634 	u32 *ins_sizes;
3635 	char *ins_data;
3636 	int i;
3637 	struct list_head ordered_sums;
3638 	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
3639 	bool has_extents = false;
3640 	bool need_find_last_extent = true;
3641 	bool done = false;
3642 
3643 	INIT_LIST_HEAD(&ordered_sums);
3644 
3645 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3646 			   nr * sizeof(u32), GFP_NOFS);
3647 	if (!ins_data)
3648 		return -ENOMEM;
3649 
3650 	first_key.objectid = (u64)-1;
3651 
3652 	ins_sizes = (u32 *)ins_data;
3653 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3654 
3655 	for (i = 0; i < nr; i++) {
3656 		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3657 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3658 	}
3659 	ret = btrfs_insert_empty_items(trans, log, dst_path,
3660 				       ins_keys, ins_sizes, nr);
3661 	if (ret) {
3662 		kfree(ins_data);
3663 		return ret;
3664 	}
3665 
3666 	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3667 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3668 						   dst_path->slots[0]);
3669 
3670 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3671 
3672 		if (i == nr - 1)
3673 			last_key = ins_keys[i];
3674 
3675 		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3676 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
3677 						    dst_path->slots[0],
3678 						    struct btrfs_inode_item);
3679 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
3680 					&inode->vfs_inode,
3681 					inode_only == LOG_INODE_EXISTS,
3682 					logged_isize);
3683 		} else {
3684 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3685 					   src_offset, ins_sizes[i]);
3686 		}
3687 
3688 		/*
3689 		 * We set need_find_last_extent here in case we were processing
3690 		 * other items and then walked into the first extent in the
3691 		 * inode.  If we don't hit an extent then nothing changes,
3692 		 * we'll do the last search the next time around.
3693 		 */
3694 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3695 			has_extents = true;
3696 			if (first_key.objectid == (u64)-1)
3697 				first_key = ins_keys[i];
3698 		} else {
3699 			need_find_last_extent = false;
3700 		}
3701 
3702 		/* take a reference on file data extents so that truncates
3703 		 * or deletes of this inode don't have to relog the inode
3704 		 * again
3705 		 */
3706 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3707 		    !skip_csum) {
3708 			int found_type;
3709 			extent = btrfs_item_ptr(src, start_slot + i,
3710 						struct btrfs_file_extent_item);
3711 
3712 			if (btrfs_file_extent_generation(src, extent) < trans->transid)
3713 				continue;
3714 
3715 			found_type = btrfs_file_extent_type(src, extent);
3716 			if (found_type == BTRFS_FILE_EXTENT_REG) {
3717 				u64 ds, dl, cs, cl;
3718 				ds = btrfs_file_extent_disk_bytenr(src,
3719 								extent);
3720 				/* ds == 0 is a hole */
3721 				if (ds == 0)
3722 					continue;
3723 
3724 				dl = btrfs_file_extent_disk_num_bytes(src,
3725 								extent);
3726 				cs = btrfs_file_extent_offset(src, extent);
3727 				cl = btrfs_file_extent_num_bytes(src,
3728 								extent);
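				/*
				 * Compressed extents are checksummed over
				 * the whole on-disk (compressed) extent, so
				 * look up csums for the entire disk range.
				 */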
3729 				if (btrfs_file_extent_compression(src,
3730 								  extent)) {
3731 					cs = 0;
3732 					cl = dl;
3733 				}
3734 
3735 				ret = btrfs_lookup_csums_range(
3736 						fs_info->csum_root,
3737 						ds + cs, ds + cs + cl - 1,
3738 						&ordered_sums, 0);
3739 				if (ret) {
3740 					btrfs_release_path(dst_path);
3741 					kfree(ins_data);
3742 					return ret;
3743 				}
3744 			}
3745 		}
3746 	}
3747 
3748 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3749 	btrfs_release_path(dst_path);
3750 	kfree(ins_data);
3751 
3752 	/*
3753 	 * we have to do this after the loop above to avoid changing the
3754 	 * log tree while trying to change the log tree.
3755 	 */
3756 	ret = 0;
3757 	while (!list_empty(&ordered_sums)) {
3758 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3759 						   struct btrfs_ordered_sum,
3760 						   list);
3761 		if (!ret)
3762 			ret = btrfs_csum_file_blocks(trans, log, sums);
3763 		list_del(&sums->list);
3764 		kfree(sums);
3765 	}
3766 
3767 	if (!has_extents)
3768 		return ret;
3769 
3770 	if (need_find_last_extent && *last_extent == first_key.offset) {
3771 		/*
3772 		 * We don't have any leaves between our current one and the one
3773 		 * we processed before that can have file extent items for our
3774 		 * inode (and have a generation number smaller than our current
3775 		 * transaction id).
3776 		 */
3777 		need_find_last_extent = false;
3778 	}
3779 
3780 	/*
3781 	 * Because we use btrfs_search_forward we could skip leaves that were
3782 	 * not modified and then assume *last_extent is valid when it really
3783 	 * isn't.  So back up to the previous leaf and read the end of the last
3784 	 * extent before we go and fill in holes.
3785 	 */
3786 	if (need_find_last_extent) {
3787 		u64 len;
3788 
3789 		ret = btrfs_prev_leaf(inode->root, src_path);
3790 		if (ret < 0)
3791 			return ret;
3792 		if (ret)
3793 			goto fill_holes;
3794 		if (src_path->slots[0])
3795 			src_path->slots[0]--;
3796 		src = src_path->nodes[0];
3797 		btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
3798 		if (key.objectid != btrfs_ino(inode) ||
3799 		    key.type != BTRFS_EXTENT_DATA_KEY)
3800 			goto fill_holes;
3801 		extent = btrfs_item_ptr(src, src_path->slots[0],
3802 					struct btrfs_file_extent_item);
3803 		if (btrfs_file_extent_type(src, extent) ==
3804 		    BTRFS_FILE_EXTENT_INLINE) {
3805 			len = btrfs_file_extent_inline_len(src,
3806 							   src_path->slots[0],
3807 							   extent);
3808 			*last_extent = ALIGN(key.offset + len,
3809 					     fs_info->sectorsize);
3810 		} else {
3811 			len = btrfs_file_extent_num_bytes(src, extent);
3812 			*last_extent = key.offset + len;
3813 		}
3814 	}
3815 fill_holes:
3816 	/* So we did prev_leaf, now we need to move to the next leaf, but a few
3817 	 * things could have happened
3818 	 *
3819 	 * 1) A merge could have happened, so we could currently be on a leaf
3820 	 * that holds what we were copying in the first place.
3821 	 * 2) A split could have happened, and now not all of the items we want
3822 	 * are on the same leaf.
3823 	 *
3824 	 * So we need to adjust how we search for holes: drop the
3825 	 * path and re-search for the first extent key we found, and then walk
3826 	 * forward until we hit the last one we copied.
3827 	 */
3828 	if (need_find_last_extent) {
3829 		/* btrfs_prev_leaf could return 1 without releasing the path */
3830 		btrfs_release_path(src_path);
3831 		ret = btrfs_search_slot(NULL, inode->root, &first_key,
3832 				src_path, 0, 0);
3833 		if (ret < 0)
3834 			return ret;
3835 		ASSERT(ret == 0);
3836 		src = src_path->nodes[0];
3837 		i = src_path->slots[0];
3838 	} else {
3839 		i = start_slot;
3840 	}
3841 
3842 	/*
3843 	 * Now go through the range we copied and fill in any holes, to make
3844 	 * sure that holes are punched at replay time for areas that previously
3845 	 * had extents.
3846 	 */
3847 	while (!done) {
3848 		u64 offset, len;
3849 		u64 extent_end;
3850 
3851 		if (i >= btrfs_header_nritems(src_path->nodes[0])) {
3852 			ret = btrfs_next_leaf(inode->root, src_path);
3853 			if (ret < 0)
3854 				return ret;
3855 			ASSERT(ret == 0);
3856 			src = src_path->nodes[0];
3857 			i = 0;
3858 		}
3859 
3860 		btrfs_item_key_to_cpu(src, &key, i);
3861 		if (!btrfs_comp_cpu_keys(&key, &last_key))
3862 			done = true;
3863 		if (key.objectid != btrfs_ino(inode) ||
3864 		    key.type != BTRFS_EXTENT_DATA_KEY) {
3865 			i++;
3866 			continue;
3867 		}
3868 		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
3869 		if (btrfs_file_extent_type(src, extent) ==
3870 		    BTRFS_FILE_EXTENT_INLINE) {
3871 			len = btrfs_file_extent_inline_len(src, i, extent);
3872 			extent_end = ALIGN(key.offset + len,
3873 					   fs_info->sectorsize);
3874 		} else {
3875 			len = btrfs_file_extent_num_bytes(src, extent);
3876 			extent_end = key.offset + len;
3877 		}
3878 		i++;
3879 
3880 		if (*last_extent == key.offset) {
3881 			*last_extent = extent_end;
3882 			continue;
3883 		}
3884 		offset = *last_extent;
3885 		len = key.offset - *last_extent;
3886 		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
3887 				offset, 0, 0, len, 0, len, 0, 0, 0);
3888 		if (ret)
3889 			break;
3890 		*last_extent = extent_end;
3891 	}
3892 	/*
3893 	 * Need to let the callers know we dropped the path so they should
3894 	 * re-search.
3895 	 */
3896 	if (!ret && need_find_last_extent)
3897 		ret = 1;
3898 	return ret;
3899 }
3900 
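/* list_sort() comparator: order extent maps by increasing file offset */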
3901 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3902 {
3903 	struct extent_map *em1, *em2;
3904 
3905 	em1 = list_entry(a, struct extent_map, list);
3906 	em2 = list_entry(b, struct extent_map, list);
3907 
3908 	if (em1->start < em2->start)
3909 		return -1;
3910 	else if (em1->start > em2->start)
3911 		return 1;
3912 	return 0;
3913 }
3914 
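/*
 * Wait for the ordered extents that overlap @em to complete and copy their
 * checksums into the log tree, shrinking the [mod_start, mod_start +
 * mod_len) range as csums get copied.  Whatever part of the range is left
 * is then looked up in the csum tree.  Preallocated extents and holes need
 * no csums, and nodatasum inodes skip the csum work entirely.  If writeback
 * of an overlapping ordered extent failed, *ordered_io_error is set.
 */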
3915 static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3916 				struct inode *inode,
3917 				struct btrfs_root *root,
3918 				const struct extent_map *em,
3919 				const struct list_head *logged_list,
3920 				bool *ordered_io_error)
3921 {
3922 	struct btrfs_fs_info *fs_info = root->fs_info;
3923 	struct btrfs_ordered_extent *ordered;
3924 	struct btrfs_root *log = root->log_root;
3925 	u64 mod_start = em->mod_start;
3926 	u64 mod_len = em->mod_len;
3927 	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3928 	u64 csum_offset;
3929 	u64 csum_len;
3930 	LIST_HEAD(ordered_sums);
3931 	int ret = 0;
3932 
3933 	*ordered_io_error = false;
3934 
3935 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3936 	    em->block_start == EXTENT_MAP_HOLE)
3937 		return 0;
3938 
3939 	/*
3940 	 * Wait for any ordered extent that covers our extent map. If it
3941 	 * finishes without an error, first check and see if our csums are on
3942 	 * our outstanding ordered extents.
3943 	 */
3944 	list_for_each_entry(ordered, logged_list, log_list) {
3945 		struct btrfs_ordered_sum *sum;
3946 
3947 		if (!mod_len)
3948 			break;
3949 
3950 		if (ordered->file_offset + ordered->len <= mod_start ||
3951 		    mod_start + mod_len <= ordered->file_offset)
3952 			continue;
3953 
3954 		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3955 		    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3956 		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3957 			const u64 start = ordered->file_offset;
3958 			const u64 end = ordered->file_offset + ordered->len - 1;
3959 
3960 			WARN_ON(ordered->inode != inode);
3961 			filemap_fdatawrite_range(inode->i_mapping, start, end);
3962 		}
3963 
3964 		wait_event(ordered->wait,
3965 			   (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3966 			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3967 
3968 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3969 			/*
3970 			 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3971 			 * i_mapping flags, so that the next fsync won't get
3972 			 * an outdated io error too.
3973 			 */
3974 			filemap_check_errors(inode->i_mapping);
3975 			*ordered_io_error = true;
3976 			break;
3977 		}
3978 		/*
3979 		 * We are going to copy all the csums on this ordered extent, so
3980 		 * go ahead and adjust mod_start and mod_len in case this
3981 		 * ordered extent has already been logged.
3982 		 */
3983 		if (ordered->file_offset > mod_start) {
3984 			if (ordered->file_offset + ordered->len >=
3985 			    mod_start + mod_len)
3986 				mod_len = ordered->file_offset - mod_start;
3987 			/*
3988 			 * If we have this case
3989 			 *
3990 			 * |--------- logged extent ---------|
3991 			 *       |----- ordered extent ----|
3992 			 *
3993 			 * Just don't mess with mod_start and mod_len, we'll
3994 			 * just end up logging more csums than we need and it
3995 			 * will be ok.
3996 			 */
3997 		} else {
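			/*
			 * Otherwise the ordered extent overlaps the start of
			 * our range (illustrative diagram):
			 *
			 *       |--------- logged extent ---------|
			 * |------ ordered extent ------|
			 *
			 * so trim the front of the range we still need to
			 * copy csums for, or all of it if the ordered extent
			 * covers the whole range.
			 */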
3998 			if (ordered->file_offset + ordered->len <
3999 			    mod_start + mod_len) {
4000 				mod_len = (mod_start + mod_len) -
4001 					(ordered->file_offset + ordered->len);
4002 				mod_start = ordered->file_offset +
4003 					ordered->len;
4004 			} else {
4005 				mod_len = 0;
4006 			}
4007 		}
4008 
4009 		if (skip_csum)
4010 			continue;
4011 
4012 		/*
4013 		 * To keep us from looping for the above case of an ordered
4014 		 * extent that falls inside the logged extent.
4015 		 */
4016 		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
4017 				     &ordered->flags))
4018 			continue;
4019 
4020 		list_for_each_entry(sum, &ordered->list, list) {
4021 			ret = btrfs_csum_file_blocks(trans, log, sum);
4022 			if (ret)
4023 				break;
4024 		}
4025 	}
4026 
4027 	if (*ordered_io_error || !mod_len || ret || skip_csum)
4028 		return ret;
4029 
4030 	if (em->compress_type) {
4031 		csum_offset = 0;
4032 		csum_len = max(em->block_len, em->orig_block_len);
4033 	} else {
4034 		csum_offset = mod_start - em->start;
4035 		csum_len = mod_len;
4036 	}
4037 
4038 	/* block start is already adjusted for the file extent offset. */
4039 	ret = btrfs_lookup_csums_range(fs_info->csum_root,
4040 				       em->block_start + csum_offset,
4041 				       em->block_start + csum_offset +
4042 				       csum_len - 1, &ordered_sums, 0);
4043 	if (ret)
4044 		return ret;
4045 
4046 	while (!list_empty(&ordered_sums)) {
4047 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4048 						   struct btrfs_ordered_sum,
4049 						   list);
4050 		if (!ret)
4051 			ret = btrfs_csum_file_blocks(trans, log, sums);
4052 		list_del(&sums->list);
4053 		kfree(sums);
4054 	}
4055 
4056 	return ret;
4057 }
4058 
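/*
 * Log a single extent map: wait for its ordered extents and copy their
 * csums via wait_ordered_extents(), drop any extents already in the log
 * for the range [em->start, em->start + em->len), then write a file extent
 * item describing @em into the log tree.
 */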
4059 static int log_one_extent(struct btrfs_trans_handle *trans,
4060 			  struct btrfs_inode *inode, struct btrfs_root *root,
4061 			  const struct extent_map *em,
4062 			  struct btrfs_path *path,
4063 			  const struct list_head *logged_list,
4064 			  struct btrfs_log_ctx *ctx)
4065 {
4066 	struct btrfs_root *log = root->log_root;
4067 	struct btrfs_file_extent_item *fi;
4068 	struct extent_buffer *leaf;
4069 	struct btrfs_map_token token;
4070 	struct btrfs_key key;
4071 	u64 extent_offset = em->start - em->orig_start;
4072 	u64 block_len;
4073 	int ret;
4074 	int extent_inserted = 0;
4075 	bool ordered_io_err = false;
4076 
4077 	ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em,
4078 			logged_list, &ordered_io_err);
4079 	if (ret)
4080 		return ret;
4081 
4082 	if (ordered_io_err) {
4083 		ctx->io_err = -EIO;
4084 		return ctx->io_err;
4085 	}
4086 
4087 	btrfs_init_map_token(&token);
4088 
4089 	ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
4090 				   em->start + em->len, NULL, 0, 1,
4091 				   sizeof(*fi), &extent_inserted);
4092 	if (ret)
4093 		return ret;
4094 
4095 	if (!extent_inserted) {
4096 		key.objectid = btrfs_ino(inode);
4097 		key.type = BTRFS_EXTENT_DATA_KEY;
4098 		key.offset = em->start;
4099 
4100 		ret = btrfs_insert_empty_item(trans, log, path, &key,
4101 					      sizeof(*fi));
4102 		if (ret)
4103 			return ret;
4104 	}
4105 	leaf = path->nodes[0];
4106 	fi = btrfs_item_ptr(leaf, path->slots[0],
4107 			    struct btrfs_file_extent_item);
4108 
4109 	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
4110 					       &token);
4111 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4112 		btrfs_set_token_file_extent_type(leaf, fi,
4113 						 BTRFS_FILE_EXTENT_PREALLOC,
4114 						 &token);
4115 	else
4116 		btrfs_set_token_file_extent_type(leaf, fi,
4117 						 BTRFS_FILE_EXTENT_REG,
4118 						 &token);
4119 
4120 	block_len = max(em->block_len, em->orig_block_len);
4121 	if (em->compress_type != BTRFS_COMPRESS_NONE) {
4122 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4123 							em->block_start,
4124 							&token);
4125 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4126 							   &token);
4127 	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4128 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4129 							em->block_start -
4130 							extent_offset, &token);
4131 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4132 							   &token);
4133 	} else {
4134 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
4135 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
4136 							   &token);
4137 	}
4138 
4139 	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
4140 	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
4141 	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
4142 	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
4143 						&token);
4144 	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
4145 	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
4146 	btrfs_mark_buffer_dirty(leaf);
4147 
4148 	btrfs_release_path(path);
4149 
4150 	return ret;
4151 }
4152 
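/*
 * Fast fsync path: collect the extent maps modified in the current
 * transaction, sort them by file offset and log each one through
 * log_one_extent().  If more than 32768 extents have accumulated we give
 * up with -EFBIG, since at that point a full transaction commit is cheaper.
 */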
4153 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4154 				     struct btrfs_root *root,
4155 				     struct btrfs_inode *inode,
4156 				     struct btrfs_path *path,
4157 				     struct list_head *logged_list,
4158 				     struct btrfs_log_ctx *ctx,
4159 				     const u64 start,
4160 				     const u64 end)
4161 {
4162 	struct extent_map *em, *n;
4163 	struct list_head extents;
4164 	struct extent_map_tree *tree = &inode->extent_tree;
4165 	u64 logged_start, logged_end;
4166 	u64 test_gen;
4167 	int ret = 0;
4168 	int num = 0;
4169 
4170 	INIT_LIST_HEAD(&extents);
4171 
4172 	down_write(&inode->dio_sem);
4173 	write_lock(&tree->lock);
4174 	test_gen = root->fs_info->last_trans_committed;
4175 	logged_start = start;
4176 	logged_end = end;
4177 
4178 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4179 		list_del_init(&em->list);
4180 		/*
4181 		 * 32768 is an arbitrary cutoff: walking this many extents is
4182 		 * really CPU intensive, and once we have that many a full
4183 		 * transaction commit is faster anyway, so bail out with
4184 		 * -EFBIG to force one.
4185 		 */
4186 		if (++num > 32768) {
4187 			list_del_init(&tree->modified_extents);
4188 			ret = -EFBIG;
4189 			goto process;
4190 		}
4191 
4192 		if (em->generation <= test_gen)
4193 			continue;
4194 
4195 		if (em->start < logged_start)
4196 			logged_start = em->start;
4197 		if ((em->start + em->len - 1) > logged_end)
4198 			logged_end = em->start + em->len - 1;
4199 
4200 		/* Need a ref to keep it from getting evicted from cache */
4201 		refcount_inc(&em->refs);
4202 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4203 		list_add_tail(&em->list, &extents);
4204 		num++;
4205 	}
4206 
4207 	list_sort(NULL, &extents, extent_cmp);
4208 	btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
4209 	/*
4210 	 * Some ordered extents started by fsync might have completed
4211 	 * before we could collect them into the list logged_list, which
4212 	 * means they're gone, not in our logged_list nor in the inode's
4213 	 * ordered tree. We want the application/user space to know an
4214 	 * error happened while attempting to persist file data so that
4215 	 * it can take proper action. If such error happened, we leave
4216 	 * without writing to the log tree and the fsync must report the
4217 	 * file data write error and not commit the current transaction.
4218 	 */
4219 	ret = filemap_check_errors(inode->vfs_inode.i_mapping);
4220 	if (ret)
4221 		ctx->io_err = ret;
4222 process:
4223 	while (!list_empty(&extents)) {
4224 		em = list_entry(extents.next, struct extent_map, list);
4225 
4226 		list_del_init(&em->list);
4227 
4228 		/*
4229 		 * If we had an error we just need to delete everybody from our
4230 		 * private list.
4231 		 */
4232 		if (ret) {
4233 			clear_em_logging(tree, em);
4234 			free_extent_map(em);
4235 			continue;
4236 		}
4237 
4238 		write_unlock(&tree->lock);
4239 
4240 		ret = log_one_extent(trans, inode, root, em, path, logged_list,
4241 				     ctx);
4242 		write_lock(&tree->lock);
4243 		clear_em_logging(tree, em);
4244 		free_extent_map(em);
4245 	}
4246 	WARN_ON(!list_empty(&extents));
4247 	write_unlock(&tree->lock);
4248 	up_write(&inode->dio_sem);
4249 
4250 	btrfs_release_path(path);
4251 	return ret;
4252 }
4253 
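/*
 * Read the inode size currently recorded in the log tree for @inode,
 * returning 0 in *size_ret when the inode item is not in the log yet.
 */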
4254 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4255 			     struct btrfs_path *path, u64 *size_ret)
4256 {
4257 	struct btrfs_key key;
4258 	int ret;
4259 
4260 	key.objectid = btrfs_ino(inode);
4261 	key.type = BTRFS_INODE_ITEM_KEY;
4262 	key.offset = 0;
4263 
4264 	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4265 	if (ret < 0) {
4266 		return ret;
4267 	} else if (ret > 0) {
4268 		*size_ret = 0;
4269 	} else {
4270 		struct btrfs_inode_item *item;
4271 
4272 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4273 				      struct btrfs_inode_item);
4274 		*size_ret = btrfs_inode_size(path->nodes[0], item);
4275 	}
4276 
4277 	btrfs_release_path(path);
4278 	return 0;
4279 }
4280 
4281 /*
4282  * At the moment we always log all xattrs. This is to figure out at log replay
4283  * time which xattrs must have their deletion replayed. If an xattr is missing
4284  * from the log tree but exists in the fs/subvol tree, we delete it. This is
4285  * because if an xattr is deleted, the inode is fsynced and a power failure
4286  * happens, the log is replayed on the next mount and we want the
4287  * xattr to not exist anymore (same behaviour as other filesystems
4288  * with a journal, ext3/4, xfs, f2fs, etc).
4289  */
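/*
 * Illustrative sequence (hypothetical commands, for clarity):
 *
 *      setfattr -n user.foo -v bar /mnt/file
 *      sync
 *      setfattr -x user.foo /mnt/file
 *      xfs_io -c fsync /mnt/file
 *      <power fail>
 *      mount fs, trigger log replay
 *
 * Because all xattrs are logged, replay notices that user.foo is absent
 * from the log and deletes it from the fs/subvol tree.
 */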
4290 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4291 				struct btrfs_root *root,
4292 				struct btrfs_inode *inode,
4293 				struct btrfs_path *path,
4294 				struct btrfs_path *dst_path)
4295 {
4296 	int ret;
4297 	struct btrfs_key key;
4298 	const u64 ino = btrfs_ino(inode);
4299 	int ins_nr = 0;
4300 	int start_slot = 0;
4301 
4302 	key.objectid = ino;
4303 	key.type = BTRFS_XATTR_ITEM_KEY;
4304 	key.offset = 0;
4305 
4306 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4307 	if (ret < 0)
4308 		return ret;
4309 
4310 	while (true) {
4311 		int slot = path->slots[0];
4312 		struct extent_buffer *leaf = path->nodes[0];
4313 		int nritems = btrfs_header_nritems(leaf);
4314 
4315 		if (slot >= nritems) {
4316 			if (ins_nr > 0) {
4317 				u64 last_extent = 0;
4318 
4319 				ret = copy_items(trans, inode, dst_path, path,
4320 						 &last_extent, start_slot,
4321 						 ins_nr, 1, 0);
4322 				/* can't be 1, extent items aren't processed */
4323 				ASSERT(ret <= 0);
4324 				if (ret < 0)
4325 					return ret;
4326 				ins_nr = 0;
4327 			}
4328 			ret = btrfs_next_leaf(root, path);
4329 			if (ret < 0)
4330 				return ret;
4331 			else if (ret > 0)
4332 				break;
4333 			continue;
4334 		}
4335 
4336 		btrfs_item_key_to_cpu(leaf, &key, slot);
4337 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4338 			break;
4339 
4340 		if (ins_nr == 0)
4341 			start_slot = slot;
4342 		ins_nr++;
4343 		path->slots[0]++;
4344 		cond_resched();
4345 	}
4346 	if (ins_nr > 0) {
4347 		u64 last_extent = 0;
4348 
4349 		ret = copy_items(trans, inode, dst_path, path,
4350 				 &last_extent, start_slot,
4351 				 ins_nr, 1, 0);
4352 		/* can't be 1, extent items aren't processed */
4353 		ASSERT(ret <= 0);
4354 		if (ret < 0)
4355 			return ret;
4356 	}
4357 
4358 	return 0;
4359 }
4360 
4361 /*
4362  * If the NO_HOLES feature is enabled we need to make sure any hole between the
4363  * last extent and the i_size of our inode is explicitly marked in the log. This
4364  * is to make sure that doing something like:
4365  *
4366  *      1) create file with 128Kb of data
4367  *      2) truncate file to 64Kb
4368  *      3) truncate file to 256Kb
4369  *      4) fsync file
4370  *      5) <crash/power failure>
4371  *      6) mount fs and trigger log replay
4372  *
4373  * Will give us a file with a size of 256Kb, the first 64Kb of data match what
4374  * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4375  * file correspond to a hole. The presence of explicit holes in a log tree is
4376  * what guarantees that log replay will remove/adjust file extent items in the
4377  * fs/subvol tree.
4378  *
4379  * Here we do not need to care about holes between extents, that is already done
4380  * by copy_items(). We also only need to do this in the full sync path, where we
4381  * lookup for extents from the fs/subvol tree only. In the fast path case, we
4382  * lookup the list of modified extent maps and if any represents a hole, we
4383  * insert a corresponding extent representing a hole in the log tree.
4384  */
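/*
 * Resulting layout for the example above (illustrative sketch):
 *
 *      0           64Kb                               256Kb (i_size)
 *      |--- data ---|-------------- hole --------------|
 *
 * The explicit hole logged for the 64Kb..256Kb range is what makes replay
 * drop the stale extents from step 1 that fall beyond 64Kb.
 */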
4385 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4386 				   struct btrfs_root *root,
4387 				   struct btrfs_inode *inode,
4388 				   struct btrfs_path *path)
4389 {
4390 	struct btrfs_fs_info *fs_info = root->fs_info;
4391 	int ret;
4392 	struct btrfs_key key;
4393 	u64 hole_start;
4394 	u64 hole_size;
4395 	struct extent_buffer *leaf;
4396 	struct btrfs_root *log = root->log_root;
4397 	const u64 ino = btrfs_ino(inode);
4398 	const u64 i_size = i_size_read(&inode->vfs_inode);
4399 
4400 	if (!btrfs_fs_incompat(fs_info, NO_HOLES))
4401 		return 0;
4402 
4403 	key.objectid = ino;
4404 	key.type = BTRFS_EXTENT_DATA_KEY;
4405 	key.offset = (u64)-1;
4406 
4407 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4408 	ASSERT(ret != 0);
4409 	if (ret < 0)
4410 		return ret;
4411 
4412 	ASSERT(path->slots[0] > 0);
4413 	path->slots[0]--;
4414 	leaf = path->nodes[0];
4415 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4416 
4417 	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4418 		/* inode does not have any extents */
4419 		hole_start = 0;
4420 		hole_size = i_size;
4421 	} else {
4422 		struct btrfs_file_extent_item *extent;
4423 		u64 len;
4424 
4425 		/*
4426 		 * If there's an extent beyond i_size, an explicit hole was
4427 		 * already inserted by copy_items().
4428 		 */
4429 		if (key.offset >= i_size)
4430 			return 0;
4431 
4432 		extent = btrfs_item_ptr(leaf, path->slots[0],
4433 					struct btrfs_file_extent_item);
4434 
4435 		if (btrfs_file_extent_type(leaf, extent) ==
4436 		    BTRFS_FILE_EXTENT_INLINE) {
4437 			len = btrfs_file_extent_inline_len(leaf,
4438 							   path->slots[0],
4439 							   extent);
4440 			ASSERT(len == i_size ||
4441 			       (len == fs_info->sectorsize &&
4442 				btrfs_file_extent_compression(leaf, extent) !=
4443 				BTRFS_COMPRESS_NONE));
4444 			return 0;
4445 		}
4446 
4447 		len = btrfs_file_extent_num_bytes(leaf, extent);
4448 		/* Last extent goes beyond i_size, no need to log a hole. */
4449 		if (key.offset + len > i_size)
4450 			return 0;
4451 		hole_start = key.offset + len;
4452 		hole_size = i_size - hole_start;
4453 	}
4454 	btrfs_release_path(path);
4455 
4456 	/* Last extent ends at i_size. */
4457 	if (hole_size == 0)
4458 		return 0;
4459 
4460 	hole_size = ALIGN(hole_size, fs_info->sectorsize);
4461 	ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4462 				       hole_size, 0, hole_size, 0, 0, 0);
4463 	return ret;
4464 }
4465 
4466 /*
4467  * When we are logging a new inode X, check whether it has a reference that
4468  * matches the reference from some other inode Y created in a past transaction
4469  * and that was renamed in the current transaction. If we don't do this, then at
4470  * log replay time we can lose inode Y (and all its files if it's a directory):
4471  *
4472  * mkdir /mnt/x
4473  * echo "hello world" > /mnt/x/foobar
4474  * sync
4475  * mv /mnt/x /mnt/y
4476  * mkdir /mnt/x                 # or touch /mnt/x
4477  * xfs_io -c fsync /mnt/x
4478  * <power fail>
4479  * mount fs, trigger log replay
4480  *
4481  * After the log replay procedure, we would lose the first directory and all its
4482  * files (file foobar).
4483  * For the case where inode Y is not a directory we simply end up losing it:
4484  *
4485  * echo "123" > /mnt/foo
4486  * sync
4487  * mv /mnt/foo /mnt/bar
4488  * echo "abc" > /mnt/foo
4489  * xfs_io -c fsync /mnt/foo
4490  * <power fail>
4491  *
4492  * We also need this for cases where a snapshot entry is replaced by some other
4493  * entry (file or directory) otherwise we end up with an unreplayable log due to
4494  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4495  * if it were a regular entry:
4496  *
4497  * mkdir /mnt/x
4498  * btrfs subvolume snapshot /mnt /mnt/x/snap
4499  * btrfs subvolume delete /mnt/x/snap
4500  * rmdir /mnt/x
4501  * mkdir /mnt/x
4502  * fsync /mnt/x or fsync some new file inside it
4503  * <power fail>
4504  *
4505  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4506  * the same transaction.
4507  */
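/*
 * Return values, as implemented below: 0 when no conflicting name exists,
 * 1 when one does (with *other_ino set to the conflicting inode number),
 * -EAGAIN when the conflicting entry is not a regular inode (e.g. a
 * root/snapshot item), or a negative errno on error.
 */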
4508 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4509 					 const int slot,
4510 					 const struct btrfs_key *key,
4511 					 struct btrfs_inode *inode,
4512 					 u64 *other_ino)
4513 {
4514 	int ret;
4515 	struct btrfs_path *search_path;
4516 	char *name = NULL;
4517 	u32 name_len = 0;
4518 	u32 item_size = btrfs_item_size_nr(eb, slot);
4519 	u32 cur_offset = 0;
4520 	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4521 
4522 	search_path = btrfs_alloc_path();
4523 	if (!search_path)
4524 		return -ENOMEM;
4525 	search_path->search_commit_root = 1;
4526 	search_path->skip_locking = 1;
4527 
4528 	while (cur_offset < item_size) {
4529 		u64 parent;
4530 		u32 this_name_len;
4531 		u32 this_len;
4532 		unsigned long name_ptr;
4533 		struct btrfs_dir_item *di;
4534 
4535 		if (key->type == BTRFS_INODE_REF_KEY) {
4536 			struct btrfs_inode_ref *iref;
4537 
4538 			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4539 			parent = key->offset;
4540 			this_name_len = btrfs_inode_ref_name_len(eb, iref);
4541 			name_ptr = (unsigned long)(iref + 1);
4542 			this_len = sizeof(*iref) + this_name_len;
4543 		} else {
4544 			struct btrfs_inode_extref *extref;
4545 
4546 			extref = (struct btrfs_inode_extref *)(ptr +
4547 							       cur_offset);
4548 			parent = btrfs_inode_extref_parent(eb, extref);
4549 			this_name_len = btrfs_inode_extref_name_len(eb, extref);
4550 			name_ptr = (unsigned long)&extref->name;
4551 			this_len = sizeof(*extref) + this_name_len;
4552 		}
4553 
4554 		if (this_name_len > name_len) {
4555 			char *new_name;
4556 
4557 			new_name = krealloc(name, this_name_len, GFP_NOFS);
4558 			if (!new_name) {
4559 				ret = -ENOMEM;
4560 				goto out;
4561 			}
4562 			name_len = this_name_len;
4563 			name = new_name;
4564 		}
4565 
4566 		read_extent_buffer(eb, name, name_ptr, this_name_len);
4567 		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4568 				parent, name, this_name_len, 0);
4569 		if (di && !IS_ERR(di)) {
4570 			struct btrfs_key di_key;
4571 
4572 			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4573 						  di, &di_key);
4574 			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4575 				ret = 1;
4576 				*other_ino = di_key.objectid;
4577 			} else {
4578 				ret = -EAGAIN;
4579 			}
4580 			goto out;
4581 		} else if (IS_ERR(di)) {
4582 			ret = PTR_ERR(di);
4583 			goto out;
4584 		}
4585 		btrfs_release_path(search_path);
4586 
4587 		cur_offset += this_len;
4588 	}
4589 	ret = 0;
4590 out:
4591 	btrfs_free_path(search_path);
4592 	kfree(name);
4593 	return ret;
4594 }
4595 
4596 /* log a single inode in the tree log.
4597  * At least one parent directory for this inode must exist in the tree
4598  * or be logged already.
4599  *
4600  * Any items from this inode changed by the current transaction are copied
4601  * to the log tree.  An extra reference is taken on any extents in this
4602  * file, allowing us to avoid a whole pile of corner cases around logging
4603  * blocks that have been removed from the tree.
4604  *
4605  * See LOG_INODE_ALL and related defines for a description of what inode_only
4606  * does.
4607  *
4608  * This handles both files and directories.
4609  */
4610 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4611 			   struct btrfs_root *root, struct btrfs_inode *inode,
4612 			   int inode_only,
4613 			   const loff_t start,
4614 			   const loff_t end,
4615 			   struct btrfs_log_ctx *ctx)
4616 {
4617 	struct btrfs_fs_info *fs_info = root->fs_info;
4618 	struct btrfs_path *path;
4619 	struct btrfs_path *dst_path;
4620 	struct btrfs_key min_key;
4621 	struct btrfs_key max_key;
4622 	struct btrfs_root *log = root->log_root;
4623 	LIST_HEAD(logged_list);
4624 	u64 last_extent = 0;
4625 	int err = 0;
4626 	int ret;
4627 	int nritems;
4628 	int ins_start_slot = 0;
4629 	int ins_nr;
4630 	bool fast_search = false;
4631 	u64 ino = btrfs_ino(inode);
4632 	struct extent_map_tree *em_tree = &inode->extent_tree;
4633 	u64 logged_isize = 0;
4634 	bool need_log_inode_item = true;
4635 
4636 	path = btrfs_alloc_path();
4637 	if (!path)
4638 		return -ENOMEM;
4639 	dst_path = btrfs_alloc_path();
4640 	if (!dst_path) {
4641 		btrfs_free_path(path);
4642 		return -ENOMEM;
4643 	}
4644 
4645 	min_key.objectid = ino;
4646 	min_key.type = BTRFS_INODE_ITEM_KEY;
4647 	min_key.offset = 0;
4648 
4649 	max_key.objectid = ino;
4650 
4651 
4653 	if (S_ISDIR(inode->vfs_inode.i_mode) ||
4654 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4655 		       &inode->runtime_flags) &&
4656 	     inode_only >= LOG_INODE_EXISTS))
4657 		max_key.type = BTRFS_XATTR_ITEM_KEY;
4658 	else
4659 		max_key.type = (u8)-1;
4660 	max_key.offset = (u64)-1;
4661 
4662 	/*
4663 	 * Only run delayed items if we are a dir or a new file.
4664 	 * Otherwise commit the delayed inode only, which is needed in
4665 	 * order for the log replay code to mark inodes for link count
4666 	 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
4667 	 */
4668 	if (S_ISDIR(inode->vfs_inode.i_mode) ||
4669 	    inode->generation > fs_info->last_trans_committed)
4670 		ret = btrfs_commit_inode_delayed_items(trans, inode);
4671 	else
4672 		ret = btrfs_commit_inode_delayed_inode(inode);
4673 
4674 	if (ret) {
4675 		btrfs_free_path(path);
4676 		btrfs_free_path(dst_path);
4677 		return ret;
4678 	}
4679 
4680 	if (inode_only == LOG_OTHER_INODE) {
4681 		inode_only = LOG_INODE_EXISTS;
4682 		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
4683 	} else {
4684 		mutex_lock(&inode->log_mutex);
4685 	}
4686 
4687 	/*
4688 	 * a brute force approach to making sure we get the most uptodate
4689 	 * copies of everything.
4690 	 */
4691 	if (S_ISDIR(inode->vfs_inode.i_mode)) {
4692 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
4693 
4694 		if (inode_only == LOG_INODE_EXISTS)
4695 			max_key_type = BTRFS_XATTR_ITEM_KEY;
4696 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4697 	} else {
4698 		if (inode_only == LOG_INODE_EXISTS) {
4699 			/*
4700 			 * Make sure the new inode item we write to the log has
4701 			 * the same isize as the current one (if it exists).
4702 			 * This is necessary to prevent data loss after log
4703 			 * replay, and also to prevent doing a wrong expanding
4704 			 * truncate - for e.g. create file, write 4K into offset
4705 			 * 0, fsync, write 4K into offset 4096, add hard link,
4706 			 * fsync some other file (to sync log), power fail - if
4707 			 * we use the inode's current i_size, after log replay
4708 			 * we get a 8Kb file, with the last 4Kb extent as a hole
4709 			 * (zeroes), as if an expanding truncate happened,
4710 			 * instead of getting a file of 4Kb only.
4711 			 */
4712 			err = logged_inode_size(log, inode, path, &logged_isize);
4713 			if (err)
4714 				goto out_unlock;
4715 		}
4716 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4717 			     &inode->runtime_flags)) {
4718 			if (inode_only == LOG_INODE_EXISTS) {
4719 				max_key.type = BTRFS_XATTR_ITEM_KEY;
4720 				ret = drop_objectid_items(trans, log, path, ino,
4721 							  max_key.type);
4722 			} else {
4723 				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4724 					  &inode->runtime_flags);
4725 				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4726 					  &inode->runtime_flags);
4727 				while (1) {
4728 					ret = btrfs_truncate_inode_items(trans,
4729 						log, &inode->vfs_inode, 0, 0);
4730 					if (ret != -EAGAIN)
4731 						break;
4732 				}
4733 			}
4734 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4735 					      &inode->runtime_flags) ||
4736 			   inode_only == LOG_INODE_EXISTS) {
4737 			if (inode_only == LOG_INODE_ALL)
4738 				fast_search = true;
4739 			max_key.type = BTRFS_XATTR_ITEM_KEY;
4740 			ret = drop_objectid_items(trans, log, path, ino,
4741 						  max_key.type);
4742 		} else {
4743 			if (inode_only == LOG_INODE_ALL)
4744 				fast_search = true;
4745 			goto log_extents;
4746 		}
4747 
4748 	}
4749 	if (ret) {
4750 		err = ret;
4751 		goto out_unlock;
4752 	}
4753 
4754 	while (1) {
4755 		ins_nr = 0;
4756 		ret = btrfs_search_forward(root, &min_key,
4757 					   path, trans->transid);
4758 		if (ret < 0) {
4759 			err = ret;
4760 			goto out_unlock;
4761 		}
4762 		if (ret != 0)
4763 			break;
4764 again:
4765 		/* note, ins_nr might be > 0 here, cleanup outside the loop */
4766 		if (min_key.objectid != ino)
4767 			break;
4768 		if (min_key.type > max_key.type)
4769 			break;
4770 
4771 		if (min_key.type == BTRFS_INODE_ITEM_KEY)
4772 			need_log_inode_item = false;
4773 
4774 		if ((min_key.type == BTRFS_INODE_REF_KEY ||
4775 		     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
4776 		    inode->generation == trans->transid) {
4777 			u64 other_ino = 0;
4778 
4779 			ret = btrfs_check_ref_name_override(path->nodes[0],
4780 					path->slots[0], &min_key, inode,
4781 					&other_ino);
4782 			if (ret < 0) {
4783 				err = ret;
4784 				goto out_unlock;
4785 			} else if (ret > 0 && ctx &&
4786 				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
4787 				struct btrfs_key inode_key;
4788 				struct inode *other_inode;
4789 
4790 				if (ins_nr > 0) {
4791 					ins_nr++;
4792 				} else {
4793 					ins_nr = 1;
4794 					ins_start_slot = path->slots[0];
4795 				}
4796 				ret = copy_items(trans, inode, dst_path, path,
4797 						 &last_extent, ins_start_slot,
4798 						 ins_nr, inode_only,
4799 						 logged_isize);
4800 				if (ret < 0) {
4801 					err = ret;
4802 					goto out_unlock;
4803 				}
4804 				ins_nr = 0;
4805 				btrfs_release_path(path);
4806 				inode_key.objectid = other_ino;
4807 				inode_key.type = BTRFS_INODE_ITEM_KEY;
4808 				inode_key.offset = 0;
4809 				other_inode = btrfs_iget(fs_info->sb,
4810 							 &inode_key, root,
4811 							 NULL);
4812 				/*
4813 				 * If the other inode that had a conflicting dir
4814 				 * entry was deleted in the current transaction,
4815 				 * we don't need to do more work nor fallback to
4816 				 * a transaction commit.
4817 				 */
4818 				if (IS_ERR(other_inode) &&
4819 				    PTR_ERR(other_inode) == -ENOENT) {
4820 					goto next_key;
4821 				} else if (IS_ERR(other_inode)) {
4822 					err = PTR_ERR(other_inode);
4823 					goto out_unlock;
4824 				}
4825 				/*
4826 				 * We are safe logging the other inode without
4827 				 * acquiring its i_mutex as long as we log with
4828 				 * the LOG_INODE_EXISTS mode. We're safe against
4829 				 * concurrent renames of the other inode as well
4830 				 * because during a rename we pin the log and
4831 				 * update the log with the new name before we
4832 				 * unpin it.
4833 				 */
4834 				err = btrfs_log_inode(trans, root,
4835 						BTRFS_I(other_inode),
4836 						LOG_OTHER_INODE, 0, LLONG_MAX,
4837 						ctx);
4838 				iput(other_inode);
4839 				if (err)
4840 					goto out_unlock;
4841 				else
4842 					goto next_key;
4843 			}
4844 		}
4845 
4846 		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
4847 		if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
4848 			if (ins_nr == 0)
4849 				goto next_slot;
4850 			ret = copy_items(trans, inode, dst_path, path,
4851 					 &last_extent, ins_start_slot,
4852 					 ins_nr, inode_only, logged_isize);
4853 			if (ret < 0) {
4854 				err = ret;
4855 				goto out_unlock;
4856 			}
4857 			ins_nr = 0;
4858 			if (ret) {
4859 				btrfs_release_path(path);
4860 				continue;
4861 			}
4862 			goto next_slot;
4863 		}
4864 
4865 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
4866 			ins_nr++;
4867 			goto next_slot;
4868 		} else if (!ins_nr) {
4869 			ins_start_slot = path->slots[0];
4870 			ins_nr = 1;
4871 			goto next_slot;
4872 		}
4873 
4874 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
4875 				 ins_start_slot, ins_nr, inode_only,
4876 				 logged_isize);
4877 		if (ret < 0) {
4878 			err = ret;
4879 			goto out_unlock;
4880 		}
4881 		if (ret) {
4882 			ins_nr = 0;
4883 			btrfs_release_path(path);
4884 			continue;
4885 		}
4886 		ins_nr = 1;
4887 		ins_start_slot = path->slots[0];
4888 next_slot:
4889 
4890 		nritems = btrfs_header_nritems(path->nodes[0]);
4891 		path->slots[0]++;
4892 		if (path->slots[0] < nritems) {
4893 			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
4894 					      path->slots[0]);
4895 			goto again;
4896 		}
4897 		if (ins_nr) {
4898 			ret = copy_items(trans, inode, dst_path, path,
4899 					 &last_extent, ins_start_slot,
4900 					 ins_nr, inode_only, logged_isize);
4901 			if (ret < 0) {
4902 				err = ret;
4903 				goto out_unlock;
4904 			}
4905 			ret = 0;
4906 			ins_nr = 0;
4907 		}
4908 		btrfs_release_path(path);
4909 next_key:
4910 		if (min_key.offset < (u64)-1) {
4911 			min_key.offset++;
4912 		} else if (min_key.type < max_key.type) {
4913 			min_key.type++;
4914 			min_key.offset = 0;
4915 		} else {
4916 			break;
4917 		}
4918 	}
4919 	if (ins_nr) {
4920 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
4921 				 ins_start_slot, ins_nr, inode_only,
4922 				 logged_isize);
4923 		if (ret < 0) {
4924 			err = ret;
4925 			goto out_unlock;
4926 		}
4927 		ret = 0;
4928 		ins_nr = 0;
4929 	}
4930 
4931 	btrfs_release_path(path);
4932 	btrfs_release_path(dst_path);
4933 	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
4934 	if (err)
4935 		goto out_unlock;
4936 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
4937 		btrfs_release_path(path);
4938 		btrfs_release_path(dst_path);
4939 		err = btrfs_log_trailing_hole(trans, root, inode, path);
4940 		if (err)
4941 			goto out_unlock;
4942 	}
4943 log_extents:
4944 	btrfs_release_path(path);
4945 	btrfs_release_path(dst_path);
4946 	if (need_log_inode_item) {
4947 		err = log_inode_item(trans, log, dst_path, inode);
4948 		if (err)
4949 			goto out_unlock;
4950 	}
4951 	if (fast_search) {
4952 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4953 						&logged_list, ctx, start, end);
4954 		if (ret) {
4955 			err = ret;
4956 			goto out_unlock;
4957 		}
4958 	} else if (inode_only == LOG_INODE_ALL) {
4959 		struct extent_map *em, *n;
4960 
4961 		write_lock(&em_tree->lock);
4962 		/*
4963 		 * We can't just remove every em if we're called for a ranged
4964 		 * fsync - that is, one that doesn't cover the whole possible
4965 		 * file range (0 to LLONG_MAX). This is because we can have
4966 		 * em's that fall outside the range we're logging and therefore
4967 		 * their ordered operations haven't completed yet
4968 		 * (btrfs_finish_ordered_io() not invoked yet). This means we
4969 		 * didn't get their respective file extent item in the fs/subvol
4970 		 * tree yet, and need to let the next fast fsync (one which
4971 		 * consults the list of modified extent maps) find the em so
4972 		 * that it logs a matching file extent item and waits for the
4973 		 * respective ordered operation to complete (if it's still
4974 		 * running).
4975 		 *
4976 		 * Removing every em outside the range we're logging would make
4977 		 * the next fast fsync not log their matching file extent items,
4978 		 * therefore making us lose data after a log replay.
4979 		 */
4980 		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
4981 					 list) {
4982 			const u64 mod_end = em->mod_start + em->mod_len - 1;
4983 
4984 			if (em->mod_start >= start && mod_end <= end)
4985 				list_del_init(&em->list);
4986 		}
4987 		write_unlock(&em_tree->lock);
4988 	}
4989 
4990 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
4991 		ret = log_directory_changes(trans, root, inode, path, dst_path,
4992 					ctx);
4993 		if (ret) {
4994 			err = ret;
4995 			goto out_unlock;
4996 		}
4997 	}
4998 
4999 	spin_lock(&inode->lock);
5000 	inode->logged_trans = trans->transid;
5001 	inode->last_log_commit = inode->last_sub_trans;
5002 	spin_unlock(&inode->lock);
5003 out_unlock:
5004 	if (unlikely(err))
5005 		btrfs_put_logged_extents(&logged_list);
5006 	else
5007 		btrfs_submit_logged_extents(&logged_list, log);
5008 	mutex_unlock(&inode->log_mutex);
5009 
5010 	btrfs_free_path(path);
5011 	btrfs_free_path(dst_path);
5012 	return err;
5013 }
5014 
5015 /*
5016  * Check if we must fallback to a transaction commit when logging an inode.
5017  * This must be called after logging the inode and is used only in the context
5018  * when fsyncing an inode requires logging some other inode - in which
5019  * case we can't lock the i_mutex of each other inode we need to log as that
5020  * can lead to deadlocks with concurrent fsync against other inodes (as we can
5021  * log inodes up or down in the hierarchy) or rename operations for example. So
5022  * we take the log_mutex of the inode after we have logged it and then check for
5023  * its last_unlink_trans value - this is safe because any task setting
5024  * last_unlink_trans must take the log_mutex and it must do this before it does
5025  * the actual unlink operation, so if we do this check before a concurrent task
5026  * sets last_unlink_trans it means we've logged a consistent version/state of
5027  * all the inode items, otherwise we are not sure and must do a transaction
5028  * commit (the concurrent task might have only updated last_unlink_trans before
5029  * we logged the inode or it might have also done the unlink).
5030  */
5031 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5032 					  struct btrfs_inode *inode)
5033 {
5034 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
5035 	bool ret = false;
5036 
5037 	mutex_lock(&inode->log_mutex);
5038 	if (inode->last_unlink_trans > fs_info->last_trans_committed) {
5039 		/*
5040 		 * Make sure any commits to the log are forced to be full
5041 		 * commits.
5042 		 */
5043 		btrfs_set_log_full_commit(fs_info, trans);
5044 		ret = true;
5045 	}
5046 	mutex_unlock(&inode->log_mutex);
5047 
5048 	return ret;
5049 }
5050 
5051 /*
5052  * follow the dentry parent pointers up the chain and see if any
5053  * of the directories in it require a full commit before they can
5054  * be logged.  Returns zero if nothing special needs to be done or 1 if
5055  * a full commit is required.
5056  */
5057 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
5058 					       struct btrfs_inode *inode,
5059 					       struct dentry *parent,
5060 					       struct super_block *sb,
5061 					       u64 last_committed)
5062 {
5063 	int ret = 0;
5064 	struct dentry *old_parent = NULL;
5065 	struct btrfs_inode *orig_inode = inode;
5066 
5067 	/*
5068 	 * for regular files, if the inode is already on disk, we don't
5069 	 * have to worry about the parents at all.  This is because
5070 	 * we can use the last_unlink_trans field to record renames
5071 	 * and other fun in this file.
5072 	 */
5073 	if (S_ISREG(inode->vfs_inode.i_mode) &&
5074 	    inode->generation <= last_committed &&
5075 	    inode->last_unlink_trans <= last_committed)
5076 		goto out;
5077 
5078 	if (!S_ISDIR(inode->vfs_inode.i_mode)) {
5079 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5080 			goto out;
5081 		inode = BTRFS_I(d_inode(parent));
5082 	}
5083 
5084 	while (1) {
5085 		/*
5086 		 * If we are logging a directory then we start with our inode,
5087 		 * not our parent's inode, so we need to skip setting the
5088 		 * logged_trans so that further down in the log code we don't
5089 		 * think this inode has already been logged.
5090 		 */
5091 		if (inode != orig_inode)
5092 			inode->logged_trans = trans->transid;
5093 		smp_mb();
5094 
5095 		if (btrfs_must_commit_transaction(trans, inode)) {
5096 			ret = 1;
5097 			break;
5098 		}
5099 
5100 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5101 			break;
5102 
5103 		if (IS_ROOT(parent)) {
5104 			inode = BTRFS_I(d_inode(parent));
5105 			if (btrfs_must_commit_transaction(trans, inode))
5106 				ret = 1;
5107 			break;
5108 		}
5109 
5110 		parent = dget_parent(parent);
5111 		dput(old_parent);
5112 		old_parent = parent;
5113 		inode = BTRFS_I(d_inode(parent));
5114 
5115 	}
5116 	dput(old_parent);
5117 out:
5118 	return ret;
5119 }
5120 
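/*
 * Entry in the list of directories whose new dentries still have to be
 * processed by log_new_dir_dentries() below.
 */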
5121 struct btrfs_dir_list {
5122 	u64 ino;
5123 	struct list_head list;
5124 };
5125 
5126 /*
5127  * Log the inodes of the new dentries of a directory. See log_dir_items() for
5128  * details about why it is needed.
5129  * This is a recursive operation - if an existing dentry corresponds to a
5130  * directory, that directory's new entries are logged too (same behaviour as
5131  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5132  * the dentries point to we do not lock their i_mutex, otherwise lockdep
5133  * complains about the following circular lock dependency / possible deadlock:
5134  *
5135  *        CPU0                                        CPU1
5136  *        ----                                        ----
5137  * lock(&type->i_mutex_dir_key#3/2);
5138  *                                            lock(sb_internal#2);
5139  *                                            lock(&type->i_mutex_dir_key#3/2);
5140  * lock(&sb->s_type->i_mutex_key#14);
5141  *
5142  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5143  * sb_start_intwrite() in btrfs_start_transaction().
5144  * Not locking i_mutex of the inodes is still safe because:
5145  *
5146  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5147  *    that while logging the inode new references (names) are added or removed
5148  *    from the inode, leaving the logged inode item with a link count that does
5149  *    not match the number of logged inode reference items. This is fine because
5150  *    at log replay time we compute the real number of links and correct the
5151  *    link count in the inode item (see replay_one_buffer() and
5152  *    link_to_fixup_dir());
5153  *
5154  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5155  *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5156  *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5157  *    has a size that doesn't match the sum of the lengths of all the logged
5158  *    names. This does not result in a problem because if a dir_item key is
5159  *    logged but its matching dir_index key is not logged, at log replay time we
5160  *    don't use it to replay the respective name (see replay_one_name()). On the
5161  *    other hand if only the dir_index key ends up being logged, the respective
5162  *    name is added to the fs/subvol tree with both the dir_item and dir_index
5163  *    keys created (see replay_one_name()).
5164  *    The directory's inode item with a wrong i_size is not a problem as well,
5165  *    since we don't use it at log replay time to set the i_size in the inode
5166  *    item of the fs/subvol tree (see overwrite_item()).
5167  */
5168 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5169 				struct btrfs_root *root,
5170 				struct btrfs_inode *start_inode,
5171 				struct btrfs_log_ctx *ctx)
5172 {
5173 	struct btrfs_fs_info *fs_info = root->fs_info;
5174 	struct btrfs_root *log = root->log_root;
5175 	struct btrfs_path *path;
5176 	LIST_HEAD(dir_list);
5177 	struct btrfs_dir_list *dir_elem;
5178 	int ret = 0;
5179 
5180 	path = btrfs_alloc_path();
5181 	if (!path)
5182 		return -ENOMEM;
5183 
5184 	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5185 	if (!dir_elem) {
5186 		btrfs_free_path(path);
5187 		return -ENOMEM;
5188 	}
5189 	dir_elem->ino = btrfs_ino(start_inode);
5190 	list_add_tail(&dir_elem->list, &dir_list);
5191 
5192 	while (!list_empty(&dir_list)) {
5193 		struct extent_buffer *leaf;
5194 		struct btrfs_key min_key;
5195 		int nritems;
5196 		int i;
5197 
5198 		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5199 					    list);
5200 		if (ret)
5201 			goto next_dir_inode;
5202 
5203 		min_key.objectid = dir_elem->ino;
5204 		min_key.type = BTRFS_DIR_ITEM_KEY;
5205 		min_key.offset = 0;
5206 again:
5207 		btrfs_release_path(path);
5208 		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5209 		if (ret < 0) {
5210 			goto next_dir_inode;
5211 		} else if (ret > 0) {
5212 			ret = 0;
5213 			goto next_dir_inode;
5214 		}
5215 
5216 process_leaf:
5217 		leaf = path->nodes[0];
5218 		nritems = btrfs_header_nritems(leaf);
5219 		for (i = path->slots[0]; i < nritems; i++) {
5220 			struct btrfs_dir_item *di;
5221 			struct btrfs_key di_key;
5222 			struct inode *di_inode;
5223 			struct btrfs_dir_list *new_dir_elem;
5224 			int log_mode = LOG_INODE_EXISTS;
5225 			int type;
5226 
5227 			btrfs_item_key_to_cpu(leaf, &min_key, i);
5228 			if (min_key.objectid != dir_elem->ino ||
5229 			    min_key.type != BTRFS_DIR_ITEM_KEY)
5230 				goto next_dir_inode;
5231 
5232 			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5233 			type = btrfs_dir_type(leaf, di);
5234 			if (btrfs_dir_transid(leaf, di) < trans->transid &&
5235 			    type != BTRFS_FT_DIR)
5236 				continue;
5237 			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5238 			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5239 				continue;
5240 
5241 			btrfs_release_path(path);
5242 			di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
5243 			if (IS_ERR(di_inode)) {
5244 				ret = PTR_ERR(di_inode);
5245 				goto next_dir_inode;
5246 			}
5247 
5248 			if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
5249 				iput(di_inode);
5250 				break;
5251 			}
5252 
5253 			ctx->log_new_dentries = false;
5254 			if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
5255 				log_mode = LOG_INODE_ALL;
5256 			ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5257 					      log_mode, 0, LLONG_MAX, ctx);
5258 			if (!ret &&
5259 			    btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
5260 				ret = 1;
5261 			iput(di_inode);
5262 			if (ret)
5263 				goto next_dir_inode;
5264 			if (ctx->log_new_dentries) {
5265 				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5266 						       GFP_NOFS);
5267 				if (!new_dir_elem) {
5268 					ret = -ENOMEM;
5269 					goto next_dir_inode;
5270 				}
5271 				new_dir_elem->ino = di_key.objectid;
5272 				list_add_tail(&new_dir_elem->list, &dir_list);
5273 			}
5274 			break;
5275 		}
5276 		if (i == nritems) {
5277 			ret = btrfs_next_leaf(log, path);
5278 			if (ret < 0) {
5279 				goto next_dir_inode;
5280 			} else if (ret > 0) {
5281 				ret = 0;
5282 				goto next_dir_inode;
5283 			}
5284 			goto process_leaf;
5285 		}
5286 		if (min_key.offset < (u64)-1) {
5287 			min_key.offset++;
5288 			goto again;
5289 		}
5290 next_dir_inode:
5291 		list_del(&dir_elem->list);
5292 		kfree(dir_elem);
5293 	}
5294 
5295 	btrfs_free_path(path);
5296 	return ret;
5297 }
5298 
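/*
 * Log every directory that has a link to @inode.  The parents are found by
 * walking the inode's INODE_REF/INODE_EXTREF items in the commit root, and
 * each parent directory is logged in LOG_INODE_ALL mode, recursing into its
 * new dentries when needed.
 */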
5299 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5300 				 struct btrfs_inode *inode,
5301 				 struct btrfs_log_ctx *ctx)
5302 {
5303 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5304 	int ret;
5305 	struct btrfs_path *path;
5306 	struct btrfs_key key;
5307 	struct btrfs_root *root = inode->root;
5308 	const u64 ino = btrfs_ino(inode);
5309 
5310 	path = btrfs_alloc_path();
5311 	if (!path)
5312 		return -ENOMEM;
5313 	path->skip_locking = 1;
5314 	path->search_commit_root = 1;
5315 
5316 	key.objectid = ino;
5317 	key.type = BTRFS_INODE_REF_KEY;
5318 	key.offset = 0;
5319 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5320 	if (ret < 0)
5321 		goto out;
5322 
5323 	while (true) {
5324 		struct extent_buffer *leaf = path->nodes[0];
5325 		int slot = path->slots[0];
5326 		u32 cur_offset = 0;
5327 		u32 item_size;
5328 		unsigned long ptr;
5329 
5330 		if (slot >= btrfs_header_nritems(leaf)) {
5331 			ret = btrfs_next_leaf(root, path);
5332 			if (ret < 0)
5333 				goto out;
5334 			else if (ret > 0)
5335 				break;
5336 			continue;
5337 		}
5338 
5339 		btrfs_item_key_to_cpu(leaf, &key, slot);
5340 		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5341 		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5342 			break;
5343 
5344 		item_size = btrfs_item_size_nr(leaf, slot);
5345 		ptr = btrfs_item_ptr_offset(leaf, slot);
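		/*
		 * An INODE_REF item stores its parent directory in the key
		 * offset, so it yields a single parent; an INODE_EXTREF
		 * item packs one parent per extref entry, so walk them all.
		 */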
5346 		while (cur_offset < item_size) {
5347 			struct btrfs_key inode_key;
5348 			struct inode *dir_inode;
5349 
5350 			inode_key.type = BTRFS_INODE_ITEM_KEY;
5351 			inode_key.offset = 0;
5352 
5353 			if (key.type == BTRFS_INODE_EXTREF_KEY) {
5354 				struct btrfs_inode_extref *extref;
5355 
5356 				extref = (struct btrfs_inode_extref *)
5357 					(ptr + cur_offset);
5358 				inode_key.objectid = btrfs_inode_extref_parent(
5359 					leaf, extref);
5360 				cur_offset += sizeof(*extref);
5361 				cur_offset += btrfs_inode_extref_name_len(leaf,
5362 					extref);
5363 			} else {
5364 				inode_key.objectid = key.offset;
5365 				cur_offset = item_size;
5366 			}
5367 
5368 			dir_inode = btrfs_iget(fs_info->sb, &inode_key,
5369 					       root, NULL);
5370 			/* If parent inode was deleted, skip it. */
5371 			if (IS_ERR(dir_inode))
5372 				continue;
5373 
5374 			if (ctx)
5375 				ctx->log_new_dentries = false;
5376 			ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5377 					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5378 			if (!ret &&
5379 			    btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
5380 				ret = 1;
5381 			if (!ret && ctx && ctx->log_new_dentries)
5382 				ret = log_new_dir_dentries(trans, root,
5383 						   BTRFS_I(dir_inode), ctx);
5384 			iput(dir_inode);
5385 			if (ret)
5386 				goto out;
5387 		}
5388 		path->slots[0]++;
5389 	}
5390 	ret = 0;
5391 out:
5392 	btrfs_free_path(path);
5393 	return ret;
5394 }
5395 
5396 /*
5397  * helper function around btrfs_log_inode to make sure newly created
5398  * parent directories also end up in the log.  Minimal inode-and-backref
5399  * only logging is done for any parent directories that are newer than
5400  * the last committed transaction.
5401  */
5402 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
5403 				  struct btrfs_root *root,
5404 				  struct btrfs_inode *inode,
5405 				  struct dentry *parent,
5406 				  const loff_t start,
5407 				  const loff_t end,
5408 				  int inode_only,
5409 				  struct btrfs_log_ctx *ctx)
5410 {
5411 	struct btrfs_fs_info *fs_info = root->fs_info;
5412 	struct super_block *sb;
5413 	struct dentry *old_parent = NULL;
5414 	int ret = 0;
5415 	u64 last_committed = fs_info->last_trans_committed;
5416 	bool log_dentries = false;
5417 	struct btrfs_inode *orig_inode = inode;
5418 
5419 	sb = inode->vfs_inode.i_sb;
5420 
5421 	if (btrfs_test_opt(fs_info, NOTREELOG)) {
5422 		ret = 1;
5423 		goto end_no_trans;
5424 	}
5425 
5426 	/*
5427 	 * The previous transaction commit didn't complete, so we need to do
5428 	 * a full commit ourselves.
5429 	 */
5430 	if (fs_info->last_trans_log_full_commit >
5431 	    fs_info->last_trans_committed) {
5432 		ret = 1;
5433 		goto end_no_trans;
5434 	}
5435 
5436 	if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) {
5437 		ret = 1;
5438 		goto end_no_trans;
5439 	}
5440 
5441 	ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
5442 			last_committed);
5443 	if (ret)
5444 		goto end_no_trans;
5445 
5446 	if (btrfs_inode_in_log(inode, trans->transid)) {
5447 		ret = BTRFS_NO_LOG_SYNC;
5448 		goto end_no_trans;
5449 	}
5450 
5451 	ret = start_log_trans(trans, root, ctx);
5452 	if (ret)
5453 		goto end_no_trans;
5454 
5455 	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
5456 	if (ret)
5457 		goto end_trans;
5458 
5459 	/*
5460 	 * for a regular file, if its inode is already on disk, we don't
5461 	 * have to worry about the parents at all.  This is because
5462 	 * we can use the last_unlink_trans field to record renames
5463 	 * and other fun in this file.
5464 	 */
5465 	if (S_ISREG(inode->vfs_inode.i_mode) &&
5466 	    inode->generation <= last_committed &&
5467 	    inode->last_unlink_trans <= last_committed) {
5468 		ret = 0;
5469 		goto end_trans;
5470 	}
5471 
5472 	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
5473 		log_dentries = true;
5474 
5475 	/*
5476 	 * On unlink we must make sure all our current and old parent directory
5477 	 * inodes are fully logged. This is to prevent leaving dangling
5478 	 * directory index entries in directories that were our parents but are
5479 	 * not anymore. Not doing this results in the old parent directory being
5480 	 * impossible to delete after log replay (rmdir will always fail with
5481 	 * error -ENOTEMPTY).
5482 	 *
5483 	 * Example 1:
5484 	 *
5485 	 * mkdir testdir
5486 	 * touch testdir/foo
5487 	 * ln testdir/foo testdir/bar
5488 	 * sync
5489 	 * unlink testdir/bar
5490 	 * xfs_io -c fsync testdir/foo
5491 	 * <power failure>
5492 	 * mount fs, triggers log replay
5493 	 *
5494 	 * If we don't log the parent directory (testdir), after log replay the
5495 	 * directory still has an entry pointing to the file inode using the bar
5496 	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
5497 	 * the file inode has a link count of 1.
5498 	 *
5499 	 * Example 2:
5500 	 *
5501 	 * mkdir testdir
5502 	 * touch foo
5503 	 * ln foo testdir/foo2
5504 	 * ln foo testdir/foo3
5505 	 * sync
5506 	 * unlink testdir/foo3
5507 	 * xfs_io -c fsync foo
5508 	 * <power failure>
5509 	 * mount fs, triggers log replay
5510 	 *
5511 	 * Similar to the first example, after log replay the parent directory
5512 	 * testdir still has an entry pointing to the file inode with name foo3
5513 	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
5514 	 * and has a link count of 2.
5515 	 */
5516 	if (inode->last_unlink_trans > last_committed) {
5517 		ret = btrfs_log_all_parents(trans, orig_inode, ctx);
5518 		if (ret)
5519 			goto end_trans;
5520 	}
5521 
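	/*
	 * Walk up the dentry chain, logging a minimal LOG_INODE_EXISTS
	 * record for every ancestor directory created after the last
	 * committed transaction, stopping at the subvolume root or when
	 * the chain crosses into a different root.
	 */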
5522 	while (1) {
5523 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5524 			break;
5525 
5526 		inode = BTRFS_I(d_inode(parent));
5527 		if (root != inode->root)
5528 			break;
5529 
5530 		if (inode->generation > last_committed) {
5531 			ret = btrfs_log_inode(trans, root, inode,
5532 					LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
5533 			if (ret)
5534 				goto end_trans;
5535 		}
5536 		if (IS_ROOT(parent))
5537 			break;
5538 
5539 		parent = dget_parent(parent);
5540 		dput(old_parent);
5541 		old_parent = parent;
5542 	}
5543 	if (log_dentries)
5544 		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
5545 	else
5546 		ret = 0;
5547 end_trans:
5548 	dput(old_parent);
5549 	if (ret < 0) {
5550 		btrfs_set_log_full_commit(fs_info, trans);
5551 		ret = 1;
5552 	}
5553 
5554 	if (ret)
5555 		btrfs_remove_log_ctx(root, ctx);
5556 	btrfs_end_log_trans(root);
5557 end_no_trans:
5558 	return ret;
5559 }
5560 
5561 /*
5562  * it is not safe to log a dentry if the chunk root has added new
5563  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
5564  * If this returns 1, you must commit the transaction to safely get your
5565  * data on disk.
5566  */
5567 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
5568 			  struct btrfs_root *root, struct dentry *dentry,
5569 			  const loff_t start,
5570 			  const loff_t end,
5571 			  struct btrfs_log_ctx *ctx)
5572 {
5573 	struct dentry *parent = dget_parent(dentry);
5574 	int ret;
5575 
5576 	ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)),
5577 			parent, start, end, LOG_INODE_ALL, ctx);
5578 	dput(parent);
5579 
5580 	return ret;
5581 }
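
/*
 * Illustrative caller sketch (assumed, simplified, error handling and
 * locking omitted): the fsync path in fs/btrfs/file.c drives this
 * function roughly like so, committing the whole transaction whenever
 * logging reports that the log alone is not safe:
 *
 *	btrfs_init_log_ctx(&ctx, inode);
 *	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx);
 *	else if (ret != BTRFS_NO_LOG_SYNC)
 *		ret = btrfs_commit_transaction(trans);
 */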
5582 
5583 /*
5584  * should be called during mount to replay any log trees
5585  * from the FS
5586  */
5587 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5588 {
5589 	int ret;
5590 	struct btrfs_path *path;
5591 	struct btrfs_trans_handle *trans;
5592 	struct btrfs_key key;
5593 	struct btrfs_key found_key;
5594 	struct btrfs_key tmp_key;
5595 	struct btrfs_root *log;
5596 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
5597 	struct walk_control wc = {
5598 		.process_func = process_one_buffer,
5599 		.stage = 0,
5600 	};
5601 
5602 	path = btrfs_alloc_path();
5603 	if (!path)
5604 		return -ENOMEM;
5605 
5606 	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5607 
5608 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
5609 	if (IS_ERR(trans)) {
5610 		ret = PTR_ERR(trans);
5611 		goto error;
5612 	}
5613 
5614 	wc.trans = trans;
5615 	wc.pin = 1;
5616 
5617 	ret = walk_log_tree(trans, log_root_tree, &wc);
5618 	if (ret) {
5619 		btrfs_handle_fs_error(fs_info, ret,
5620 			"Failed to pin buffers while recovering log root tree.");
5621 		goto error;
5622 	}
5623 
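	/*
	 * Walk every log tree root item in the log root tree, from the
	 * highest offset down to 0, replaying each log into its matching
	 * subvolume root.  The whole set is walked once per stage: pin,
	 * replay inodes, replay dir indexes, then replay everything.
	 */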
5624 again:
5625 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
5626 	key.offset = (u64)-1;
5627 	key.type = BTRFS_ROOT_ITEM_KEY;
5628 
5629 	while (1) {
5630 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
5631 
5632 		if (ret < 0) {
5633 			btrfs_handle_fs_error(fs_info, ret,
5634 				    "Couldn't find tree log root.");
5635 			goto error;
5636 		}
5637 		if (ret > 0) {
5638 			if (path->slots[0] == 0)
5639 				break;
5640 			path->slots[0]--;
5641 		}
5642 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
5643 				      path->slots[0]);
5644 		btrfs_release_path(path);
5645 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
5646 			break;
5647 
5648 		log = btrfs_read_fs_root(log_root_tree, &found_key);
5649 		if (IS_ERR(log)) {
5650 			ret = PTR_ERR(log);
5651 			btrfs_handle_fs_error(fs_info, ret,
5652 				    "Couldn't read tree log root.");
5653 			goto error;
5654 		}
5655 
5656 		tmp_key.objectid = found_key.offset;
5657 		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
5658 		tmp_key.offset = (u64)-1;
5659 
5660 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
5661 		if (IS_ERR(wc.replay_dest)) {
5662 			ret = PTR_ERR(wc.replay_dest);
5663 			free_extent_buffer(log->node);
5664 			free_extent_buffer(log->commit_root);
5665 			kfree(log);
5666 			btrfs_handle_fs_error(fs_info, ret,
5667 				"Couldn't read target root for tree log recovery.");
5668 			goto error;
5669 		}
5670 
5671 		wc.replay_dest->log_root = log;
5672 		btrfs_record_root_in_trans(trans, wc.replay_dest);
5673 		ret = walk_log_tree(trans, log, &wc);
5674 
5675 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5676 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
5677 						      path);
5678 		}
5679 
5680 		key.offset = found_key.offset - 1;
5681 		wc.replay_dest->log_root = NULL;
5682 		free_extent_buffer(log->node);
5683 		free_extent_buffer(log->commit_root);
5684 		kfree(log);
5685 
5686 		if (ret)
5687 			goto error;
5688 
5689 		if (found_key.offset == 0)
5690 			break;
5691 	}
5692 	btrfs_release_path(path);
5693 
5694 	/* step one is to pin it all, step two is to replay just inodes */
5695 	if (wc.pin) {
5696 		wc.pin = 0;
5697 		wc.process_func = replay_one_buffer;
5698 		wc.stage = LOG_WALK_REPLAY_INODES;
5699 		goto again;
5700 	}
5701 	/* step three is to replay everything */
5702 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
5703 		wc.stage++;
5704 		goto again;
5705 	}
5706 
5707 	btrfs_free_path(path);
5708 
5709 	/* step four: commit the transaction, which also unpins the blocks */
5710 	ret = btrfs_commit_transaction(trans);
5711 	if (ret)
5712 		return ret;
5713 
5714 	free_extent_buffer(log_root_tree->node);
5715 	log_root_tree->log_root = NULL;
5716 	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5717 	kfree(log_root_tree);
5718 
5719 	return 0;
5720 error:
5721 	if (wc.trans)
5722 		btrfs_end_transaction(wc.trans);
5723 	btrfs_free_path(path);
5724 	return ret;
5725 }
5726 
5727 /*
5728  * there are some corner cases where we want to force a full
5729  * commit instead of allowing a directory to be logged.
5730  *
5731  * They revolve around files that were unlinked from the directory, and
5732  * this function updates the parent directory so that a full commit is
5733  * properly done if it is fsync'd later after the unlinks are done.
5734  *
5735  * Must be called before the unlink operations (updates to the subvolume tree,
5736  * inodes, etc) are done.
5737  */
5738 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5739 			     struct btrfs_inode *dir, struct btrfs_inode *inode,
5740 			     int for_rename)
5741 {
5742 	/*
5743 	 * when we're logging a file, if it hasn't been renamed
5744 	 * or unlinked, and its inode is fully committed on disk,
5745 	 * we don't have to worry about walking up the directory chain
5746 	 * to log its parents.
5747 	 *
5748 	 * So, we use the last_unlink_trans field to put this transid
5749 	 * into the file.  When the file is logged we check it and
5750 	 * don't log the parents if the file is fully on disk.
5751 	 */
5752 	mutex_lock(&inode->log_mutex);
5753 	inode->last_unlink_trans = trans->transid;
5754 	mutex_unlock(&inode->log_mutex);
5755 
5756 	/*
5757 	 * if this directory was already logged, any new
5758 	 * names for this file/dir will get recorded
5759 	 */
5760 	smp_mb();
5761 	if (dir->logged_trans == trans->transid)
5762 		return;
5763 
5764 	/*
5765 	 * if the inode we're about to unlink was logged,
5766 	 * the log will be properly updated for any new names
5767 	 */
5768 	if (inode->logged_trans == trans->transid)
5769 		return;
5770 
5771 	/*
5772 	 * when renaming files across directories, if the directory
5773 	 * we're unlinking from gets fsync'd later on, there's
5774 	 * no way to find the destination directory later and fsync it
5775 	 * properly.  So, we have to be conservative and force commits
5776 	 * so the new name gets discovered.
5777 	 */
5778 	if (for_rename)
5779 		goto record;
5780 
5781 	/* we can safely do the unlink without any special recording */
5782 	return;
5783 
5784 record:
5785 	mutex_lock(&dir->log_mutex);
5786 	dir->last_unlink_trans = trans->transid;
5787 	mutex_unlock(&dir->log_mutex);
5788 }
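
/*
 * Hedged usage sketch (call sites assumed from the unlink/rename paths
 * in fs/btrfs/inode.c, simplified): this is invoked before the actual
 * unlink work, with for_rename set only when the removal is part of a
 * rename, e.g.:
 *
 *	btrfs_record_unlink_dir(trans, BTRFS_I(dir),
 *				BTRFS_I(d_inode(dentry)), 0);
 */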
5789 
5790 /*
5791  * Make sure that if someone attempts to fsync the parent directory of a deleted
5792  * snapshot, it ends up triggering a transaction commit. This is to guarantee
5793  * that after replaying the log tree of the parent directory's root we will
5794  * not see the snapshot anymore, and that at log replay time we will not see
5795  * any log tree corresponding to the deleted snapshot's root, which could
5796  * otherwise lead to replaying it after replaying the log tree of the parent
5797  * directory (and thus replaying the snapshot delete operation).
5798  *
5799  * Must be called before the actual snapshot destroy operation (updates to the
5800  * parent root and tree of tree roots, etc) are done.
5801  */
5802 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
5803 				   struct btrfs_inode *dir)
5804 {
5805 	mutex_lock(&dir->log_mutex);
5806 	dir->last_unlink_trans = trans->transid;
5807 	mutex_unlock(&dir->log_mutex);
5808 }
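
/*
 * Hedged usage sketch (caller assumed): the snapshot deletion ioctl path
 * calls this with the snapshot's parent directory before the subvolume
 * is removed, e.g.:
 *
 *	btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
 */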
5809 
5810 /*
5811  * Call this after adding a new name for a file and it will properly
5812  * update the log to reflect the new name.
5813  *
5814  * It will return zero if all goes well, and it will return 1 if a
5815  * full transaction commit is required.
5816  */
5817 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
5818 			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
5819 			struct dentry *parent)
5820 {
5821 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5822 	struct btrfs_root *root = inode->root;
5823 
5824 	/*
5825 	 * this will force the logging code to walk the dentry chain
5826 	 * up for the file
5827 	 */
5828 	if (S_ISREG(inode->vfs_inode.i_mode))
5829 		inode->last_unlink_trans = trans->transid;
5830 
5831 	/*
5832 	 * if this inode hasn't been logged and the directory we're renaming it
5833 	 * from hasn't been logged, we don't need to log it
5834 	 */
5835 	if (inode->logged_trans <= fs_info->last_trans_committed &&
5836 	    (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
5837 		return 0;
5838 
5839 	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
5840 				      LLONG_MAX, LOG_INODE_EXISTS, NULL);
5841 }
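
/*
 * Hedged usage sketch (call sites assumed from the link/rename paths in
 * fs/btrfs/inode.c, simplified): link passes a NULL old_dir since there
 * is no directory the name is moving away from, e.g.:
 *
 *	btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
 * and on rename:
 *	btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
 *			   parent);
 */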
5842 
5843