xref: /openbmc/linux/fs/btrfs/tree-log.c (revision 95188aaf)
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/list_sort.h>
22 #include "ctree.h"
23 #include "transaction.h"
24 #include "disk-io.h"
25 #include "locking.h"
26 #include "print-tree.h"
27 #include "backref.h"
28 #include "compat.h"
29 #include "tree-log.h"
30 #include "hash.h"
31 
32 /* magic values for the inode_only field in btrfs_log_inode:
33  *
34  * LOG_INODE_ALL means to log everything
35  * LOG_INODE_EXISTS means to log just enough to recreate the inode
36  * during log replay
37  */
38 #define LOG_INODE_ALL 0
39 #define LOG_INODE_EXISTS 1
40 
41 /*
42  * directory trouble cases
43  *
44  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
45  * log, we must force a full commit before doing an fsync of the directory
46  * where the unlink was done.
47  * ---> record transid of last unlink/rename per directory
48  *
49  * mkdir foo/some_dir
50  * normal commit
51  * rename foo/some_dir foo2/some_dir
52  * mkdir foo/some_dir
53  * fsync foo/some_dir/some_file
54  *
55  * The fsync above will unlink the original some_dir without recording
56  * it in its new location (foo2).  After a crash, some_dir will be gone
57  * unless the fsync of some_file forces a full commit
58  *
59  * 2) we must log any new names for any file or dir that is in the fsync
60  * log. ---> check inode while renaming/linking.
61  *
62  * 2a) we must log any new names for any file or dir during rename
63  * when the directory they are being removed from was logged.
64  * ---> check inode and old parent dir during rename
65  *
66  *  2a is actually the more important variant.  With the extra logging
67  *  a crash might unlink the old name without recreating the new one
68  *
69  * 3) after a crash, we must go through any directories with a link count
70  * of zero and redo the rm -rf
71  *
72  * mkdir f1/foo
73  * normal commit
74  * rm -rf f1/foo
75  * fsync(f1)
76  *
77  * The directory f1 was fully removed from the FS, but fsync was never
78  * called on f1, only its parent dir.  After a crash the rm -rf must
79  * be replayed.  This must be able to recurse down the entire
80  * directory tree.  The inode link count fixup code takes care of the
81  * ugly details.
82  */
83 
84 /*
85  * stages for the tree walking.  The first
86  * stage (0) is to only pin down the blocks we find
87  * the second stage (1) is to make sure that all the inodes
88  * we find in the log are created in the subvolume.
89  *
90  * The last stage is to deal with directories and links and extents
91  * and all the other fun semantics
92  */
93 #define LOG_WALK_PIN_ONLY 0
94 #define LOG_WALK_REPLAY_INODES 1
95 #define LOG_WALK_REPLAY_ALL 2
96 
97 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
98 			     struct btrfs_root *root, struct inode *inode,
99 			     int inode_only);
100 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
101 			     struct btrfs_root *root,
102 			     struct btrfs_path *path, u64 objectid);
103 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
104 				       struct btrfs_root *root,
105 				       struct btrfs_root *log,
106 				       struct btrfs_path *path,
107 				       u64 dirid, int del_all);
108 
109 /*
110  * tree logging is a special write ahead log used to make sure that
111  * fsyncs and O_SYNCs can happen without doing full tree commits.
112  *
113  * Full tree commits are expensive because they require commonly
114  * modified blocks to be recowed, creating many dirty pages in the
115  * extent tree and a 4x-6x higher write load than ext3.
116  *
117  * Instead of doing a tree commit on every fsync, we use the
118  * key ranges and transaction ids to find items for a given file or directory
119  * that have changed in this transaction.  Those items are copied into
120  * a special tree (one per subvolume root), that tree is written to disk
121  * and then the fsync is considered complete.
122  *
123  * After a crash, items are copied out of the log-tree back into the
124  * subvolume tree.  Any file data extents found are recorded in the extent
125  * allocation tree, and the log-tree freed.
126  *
127  * The log tree is read three times: once to pin down all the extents it is
128  * using in ram, once to create all the inodes logged in the tree
129  * and once to do all the other items.
130  */
131 
132 /*
133  * start a sub transaction and setup the log tree
134  * this increments the log tree writer count to make the people
135  * syncing the tree wait for us to finish
136  */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root)
{
	int ret;
	int err = 0;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		/*
		 * A log tree already exists for this root: just join it.
		 * Track whether more than one pid has written to this log
		 * so the sync code can tune its batching behaviour.
		 */
		if (!root->log_start_pid) {
			root->log_start_pid = current->pid;
			root->log_multiple_pids = false;
		} else if (root->log_start_pid != current->pid) {
			root->log_multiple_pids = true;
		}

		atomic_inc(&root->log_batch);
		atomic_inc(&root->log_writers);
		mutex_unlock(&root->log_mutex);
		return 0;
	}
	root->log_multiple_pids = false;
	root->log_start_pid = current->pid;
	/*
	 * No log tree yet: create the fs-wide log root tree (if this is
	 * the first log in the filesystem) and then this root's log tree.
	 * tree_log_mutex serializes creation across all subvolume roots.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);
	if (!root->fs_info->log_root_tree) {
		ret = btrfs_init_log_root_tree(trans, root->fs_info);
		if (ret)
			err = ret;
	}
	if (err == 0 && !root->log_root) {
		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			err = ret;
	}
	mutex_unlock(&root->fs_info->tree_log_mutex);
	/*
	 * Register as a writer even on error; the caller is expected to
	 * balance this with btrfs_end_log_trans().
	 */
	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return err;
}
176 
177 /*
178  * returns 0 if there was a log transaction running and we were able
179  * to join, or returns -ENOENT if there was no transaction
180  * in progress
181  */
182 static int join_running_log_trans(struct btrfs_root *root)
183 {
184 	int ret = -ENOENT;
185 
186 	smp_mb();
187 	if (!root->log_root)
188 		return -ENOENT;
189 
190 	mutex_lock(&root->log_mutex);
191 	if (root->log_root) {
192 		ret = 0;
193 		atomic_inc(&root->log_writers);
194 	}
195 	mutex_unlock(&root->log_mutex);
196 	return ret;
197 }
198 
199 /*
200  * This either makes the current running log transaction wait
201  * until you call btrfs_end_log_trans() or it makes any future
202  * log transactions wait until you call btrfs_end_log_trans()
203  */
204 int btrfs_pin_log_trans(struct btrfs_root *root)
205 {
206 	int ret = -ENOENT;
207 
208 	mutex_lock(&root->log_mutex);
209 	atomic_inc(&root->log_writers);
210 	mutex_unlock(&root->log_mutex);
211 	return ret;
212 }
213 
214 /*
215  * indicate we're done making changes to the log tree
216  * and wake up anyone waiting to do a sync
217  */
218 void btrfs_end_log_trans(struct btrfs_root *root)
219 {
220 	if (atomic_dec_and_test(&root->log_writers)) {
221 		smp_mb();
222 		if (waitqueue_active(&root->log_writer_wait))
223 			wake_up(&root->log_writer_wait);
224 	}
225 }
226 
227 
228 /*
229  * the walk control struct is used to pass state down the chain when
230  * processing the log tree.  The stage field tells us which part
231  * of the log tree processing we are currently doing.  The others
232  * are state fields used for that specific part
233  */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in
	 * (LOG_WALK_PIN_ONLY, LOG_WALK_REPLAY_INODES or
	 * LOG_WALK_REPLAY_ALL)
	 */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};
272 
273 /*
274  * process_func used to pin down extents, write them or wait on them
275  */
276 static int process_one_buffer(struct btrfs_root *log,
277 			      struct extent_buffer *eb,
278 			      struct walk_control *wc, u64 gen)
279 {
280 	if (wc->pin)
281 		btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
282 						eb->start, eb->len);
283 
284 	if (btrfs_buffer_uptodate(eb, gen, 0)) {
285 		if (wc->write)
286 			btrfs_write_tree_block(eb);
287 		if (wc->wait)
288 			btrfs_wait_tree_block_writeback(eb);
289 	}
290 	return 0;
291 }
292 
293 /*
294  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
295  * to the src data we are copying out.
296  *
297  * root is the tree we are copying into, and path is a scratch
298  * path for use in this function (it should be released on entry and
299  * will be released on exit).
300  *
301  * If the key is already in the destination tree the existing item is
302  * overwritten.  If the existing item isn't big enough, it is extended.
303  * If it is too large, it is truncated.
304  *
305  * If the key isn't in the destination yet, a new item is inserted.
306  */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;

	/* we only preserve directory i_size when copying into a subvolume */
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		/*
		 * same size: copy both items out and memcmp them so we can
		 * skip the COW below when the contents already match
		 */
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(trans, root, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(trans, root, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0)
			goto no_copy;

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	/* restore the directory i_size that the copy above clobbered */
	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}
444 
445 /*
446  * simple helper to read an inode off the disk from a given root
447  * This can only be called for subvolume roots and not for the log
448  */
449 static noinline struct inode *read_one_inode(struct btrfs_root *root,
450 					     u64 objectid)
451 {
452 	struct btrfs_key key;
453 	struct inode *inode;
454 
455 	key.objectid = objectid;
456 	key.type = BTRFS_INODE_ITEM_KEY;
457 	key.offset = 0;
458 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
459 	if (IS_ERR(inode)) {
460 		inode = NULL;
461 	} else if (is_bad_inode(inode)) {
462 		iput(inode);
463 		inode = NULL;
464 	}
465 	return inode;
466 }
467 
468 /* replays a single extent in 'eb' at 'slot' with 'key' into the
469  * subvolume 'root'.  path is released on entry and should be released
470  * on exit.
471  *
472  * extents in the log tree have not been allocated out of the extent
473  * tree yet.  So, this completes the allocation, taking a reference
474  * as required if the extent already exists or creating a new extent
475  * if it isn't in the extent allocation tree yet.
476  *
477  * The extent is inserted into the file, dropping any existing extents
478  * from the file that overlap the new one.
479  */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 saved_nbytes;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	/* compute the file range [start, extent_end) this item covers */
	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC)
		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, item);
		extent_end = ALIGN(start + size, root->sectorsize);
	} else {
		/* unknown extent type: nothing to replay */
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/*
	 * save i_blocks: drop/insert below adjust it, but the logged
	 * inode item already carries the correct value, restored at the end
	 */
	saved_nbytes = inode_get_bytes(inode);
	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	BUG_ON(ret);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		/* copy the file extent item verbatim into the subvolume */
		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		BUG_ON(ret);
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				(unsigned long)item,  sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/* disk_bytenr == 0 means a hole: no extent tree work needed */
		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_extent(root, ins.objectid,
						ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset, 0);
				BUG_ON(ret);
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root, root->root_key.objectid,
						key->objectid, offset, &ins);
				BUG_ON(ret);
			}
			btrfs_release_path(path);

			/*
			 * compressed extents are checksummed over the whole
			 * compressed range on disk; otherwise only the part
			 * of the extent this item actually references
			 */
			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			/* copy the checksums from the log into the csum tree */
			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			BUG_ON(ret);
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				ret = btrfs_csum_file_blocks(trans,
						root->fs_info->csum_root,
						sums);
				BUG_ON(ret);
				list_del(&sums->list);
				kfree(sums);
			}
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		BUG_ON(ret);
	}

	inode_set_bytes(inode, saved_nbytes);
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
645 
646 /*
647  * when cleaning up conflicts between the directory names in the
648  * subvolume, directory names in the log and directory names in the
649  * inode back references, we may have to unlink inodes from directories.
650  *
651  * This is a helper function to do the unlink of a specific directory
652  * item
653  */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	/* copy the name out of the leaf before the path is released */
	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		kfree(name);
		return -EIO;
	}

	/*
	 * park the inode in the fixup dir so its link count is corrected
	 * at the end of replay in case this unlink drops it to zero
	 */
	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	BUG_ON(ret);

	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	BUG_ON(ret);
	kfree(name);

	iput(inode);

	/* the unlink may have queued delayed items; process them now */
	btrfs_run_delayed_items(trans, root);
	return ret;
}
696 
697 /*
698  * helper function to see if a given name and sequence number found
699  * in an inode back reference are already in a directory and correctly
700  * point to this inode
701  */
702 static noinline int inode_in_dir(struct btrfs_root *root,
703 				 struct btrfs_path *path,
704 				 u64 dirid, u64 objectid, u64 index,
705 				 const char *name, int name_len)
706 {
707 	struct btrfs_dir_item *di;
708 	struct btrfs_key location;
709 	int match = 0;
710 
711 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
712 					 index, name, name_len, 0);
713 	if (di && !IS_ERR(di)) {
714 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
715 		if (location.objectid != objectid)
716 			goto out;
717 	} else
718 		goto out;
719 	btrfs_release_path(path);
720 
721 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
722 	if (di && !IS_ERR(di)) {
723 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
724 		if (location.objectid != objectid)
725 			goto out;
726 	} else
727 		goto out;
728 	match = 1;
729 out:
730 	btrfs_release_path(path);
731 	return match;
732 }
733 
734 /*
735  * helper function to check a log tree for a named back reference in
736  * an inode.  This is used to decide if a back reference that is
737  * found in the subvolume conflicts with what we find in the log.
738  *
739  * inode backreferences may have multiple refs in a single item,
740  * during replay we process one reference at a time, and we don't
741  * want to delete valid links to a file from the subvolume if that
742  * link is also in the log.
743  */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		/*
		 * NOTE(review): callers treat the return value as a boolean
		 * "name is in the log", so this -ENOMEM (nonzero) is
		 * silently interpreted as a match — verify whether that is
		 * the intended failure mode.
		 */
		return -ENOMEM;

	/* look up the exact (objectid, type, offset) ref item in the log */
	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	/* extended refs have their own name-matching helper */
	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	/*
	 * old-style refs pack multiple (name, index) pairs into one item;
	 * walk them all looking for an exact name match
	 */
	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}
797 
798 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
799 				  struct btrfs_root *root,
800 				  struct btrfs_path *path,
801 				  struct btrfs_root *log_root,
802 				  struct inode *dir, struct inode *inode,
803 				  struct extent_buffer *eb,
804 				  u64 inode_objectid, u64 parent_objectid,
805 				  u64 ref_index, char *name, int namelen,
806 				  int *search_done)
807 {
808 	int ret;
809 	char *victim_name;
810 	int victim_name_len;
811 	struct extent_buffer *leaf;
812 	struct btrfs_dir_item *di;
813 	struct btrfs_key search_key;
814 	struct btrfs_inode_extref *extref;
815 
816 again:
817 	/* Search old style refs */
818 	search_key.objectid = inode_objectid;
819 	search_key.type = BTRFS_INODE_REF_KEY;
820 	search_key.offset = parent_objectid;
821 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
822 	if (ret == 0) {
823 		struct btrfs_inode_ref *victim_ref;
824 		unsigned long ptr;
825 		unsigned long ptr_end;
826 
827 		leaf = path->nodes[0];
828 
829 		/* are we trying to overwrite a back ref for the root directory
830 		 * if so, just jump out, we're done
831 		 */
832 		if (search_key.objectid == search_key.offset)
833 			return 1;
834 
835 		/* check all the names in this back reference to see
836 		 * if they are in the log.  if so, we allow them to stay
837 		 * otherwise they must be unlinked as a conflict
838 		 */
839 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
840 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
841 		while (ptr < ptr_end) {
842 			victim_ref = (struct btrfs_inode_ref *)ptr;
843 			victim_name_len = btrfs_inode_ref_name_len(leaf,
844 								   victim_ref);
845 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
846 			BUG_ON(!victim_name);
847 
848 			read_extent_buffer(leaf, victim_name,
849 					   (unsigned long)(victim_ref + 1),
850 					   victim_name_len);
851 
852 			if (!backref_in_log(log_root, &search_key,
853 					    parent_objectid,
854 					    victim_name,
855 					    victim_name_len)) {
856 				btrfs_inc_nlink(inode);
857 				btrfs_release_path(path);
858 
859 				ret = btrfs_unlink_inode(trans, root, dir,
860 							 inode, victim_name,
861 							 victim_name_len);
862 				BUG_ON(ret);
863 				btrfs_run_delayed_items(trans, root);
864 				kfree(victim_name);
865 				*search_done = 1;
866 				goto again;
867 			}
868 			kfree(victim_name);
869 
870 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
871 		}
872 		BUG_ON(ret);
873 
874 		/*
875 		 * NOTE: we have searched root tree and checked the
876 		 * coresponding ref, it does not need to check again.
877 		 */
878 		*search_done = 1;
879 	}
880 	btrfs_release_path(path);
881 
882 	/* Same search but for extended refs */
883 	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
884 					   inode_objectid, parent_objectid, 0,
885 					   0);
886 	if (!IS_ERR_OR_NULL(extref)) {
887 		u32 item_size;
888 		u32 cur_offset = 0;
889 		unsigned long base;
890 		struct inode *victim_parent;
891 
892 		leaf = path->nodes[0];
893 
894 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
895 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
896 
897 		while (cur_offset < item_size) {
898 			extref = (struct btrfs_inode_extref *)base + cur_offset;
899 
900 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
901 
902 			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
903 				goto next;
904 
905 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
906 			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
907 					   victim_name_len);
908 
909 			search_key.objectid = inode_objectid;
910 			search_key.type = BTRFS_INODE_EXTREF_KEY;
911 			search_key.offset = btrfs_extref_hash(parent_objectid,
912 							      victim_name,
913 							      victim_name_len);
914 			ret = 0;
915 			if (!backref_in_log(log_root, &search_key,
916 					    parent_objectid, victim_name,
917 					    victim_name_len)) {
918 				ret = -ENOENT;
919 				victim_parent = read_one_inode(root,
920 							       parent_objectid);
921 				if (victim_parent) {
922 					btrfs_inc_nlink(inode);
923 					btrfs_release_path(path);
924 
925 					ret = btrfs_unlink_inode(trans, root,
926 								 victim_parent,
927 								 inode,
928 								 victim_name,
929 								 victim_name_len);
930 					btrfs_run_delayed_items(trans, root);
931 				}
932 				BUG_ON(ret);
933 				iput(victim_parent);
934 				kfree(victim_name);
935 				*search_done = 1;
936 				goto again;
937 			}
938 			kfree(victim_name);
939 			BUG_ON(ret);
940 next:
941 			cur_offset += victim_name_len + sizeof(*extref);
942 		}
943 		*search_done = 1;
944 	}
945 	btrfs_release_path(path);
946 
947 	/* look for a conflicting sequence number */
948 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
949 					 ref_index, name, namelen, 0);
950 	if (di && !IS_ERR(di)) {
951 		ret = drop_one_dir_item(trans, root, path, dir, di);
952 		BUG_ON(ret);
953 	}
954 	btrfs_release_path(path);
955 
956 	/* look for a conflicing name */
957 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
958 				   name, namelen, 0);
959 	if (di && !IS_ERR(di)) {
960 		ret = drop_one_dir_item(trans, root, path, dir, di);
961 		BUG_ON(ret);
962 	}
963 	btrfs_release_path(path);
964 
965 	return 0;
966 }
967 
968 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
969 			     u32 *namelen, char **name, u64 *index,
970 			     u64 *parent_objectid)
971 {
972 	struct btrfs_inode_extref *extref;
973 
974 	extref = (struct btrfs_inode_extref *)ref_ptr;
975 
976 	*namelen = btrfs_inode_extref_name_len(eb, extref);
977 	*name = kmalloc(*namelen, GFP_NOFS);
978 	if (*name == NULL)
979 		return -ENOMEM;
980 
981 	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
982 			   *namelen);
983 
984 	*index = btrfs_inode_extref_index(eb, extref);
985 	if (parent_objectid)
986 		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
987 
988 	return 0;
989 }
990 
991 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
992 			  u32 *namelen, char **name, u64 *index)
993 {
994 	struct btrfs_inode_ref *ref;
995 
996 	ref = (struct btrfs_inode_ref *)ref_ptr;
997 
998 	*namelen = btrfs_inode_ref_name_len(eb, ref);
999 	*name = kmalloc(*namelen, GFP_NOFS);
1000 	if (*name == NULL)
1001 		return -ENOMEM;
1002 
1003 	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1004 
1005 	*index = btrfs_inode_ref_index(eb, ref);
1006 
1007 	return 0;
1008 }
1009 
1010 /*
1011  * replay one inode back reference item found in the log tree.
1012  * eb, slot and key refer to the buffer and key found in the log tree.
1013  * root is the destination we are replaying into, and path is for temp
1014  * use by this function.  (it should be released on return).
1015  */
1016 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1017 				  struct btrfs_root *root,
1018 				  struct btrfs_root *log,
1019 				  struct btrfs_path *path,
1020 				  struct extent_buffer *eb, int slot,
1021 				  struct btrfs_key *key)
1022 {
1023 	struct inode *dir;
1024 	struct inode *inode;
1025 	unsigned long ref_ptr;
1026 	unsigned long ref_end;
1027 	char *name;
1028 	int namelen;
1029 	int ret;
1030 	int search_done = 0;
1031 	int log_ref_ver = 0;
1032 	u64 parent_objectid;
1033 	u64 inode_objectid;
1034 	u64 ref_index = 0;
1035 	int ref_struct_size;
1036 
1037 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1038 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1039 
1040 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1041 		struct btrfs_inode_extref *r;
1042 
1043 		ref_struct_size = sizeof(struct btrfs_inode_extref);
1044 		log_ref_ver = 1;
1045 		r = (struct btrfs_inode_extref *)ref_ptr;
1046 		parent_objectid = btrfs_inode_extref_parent(eb, r);
1047 	} else {
1048 		ref_struct_size = sizeof(struct btrfs_inode_ref);
1049 		parent_objectid = key->offset;
1050 	}
1051 	inode_objectid = key->objectid;
1052 
1053 	/*
1054 	 * it is possible that we didn't log all the parent directories
1055 	 * for a given inode.  If we don't find the dir, just don't
1056 	 * copy the back ref in.  The link count fixup code will take
1057 	 * care of the rest
1058 	 */
1059 	dir = read_one_inode(root, parent_objectid);
1060 	if (!dir)
1061 		return -ENOENT;
1062 
1063 	inode = read_one_inode(root, inode_objectid);
1064 	if (!inode) {
1065 		iput(dir);
1066 		return -EIO;
1067 	}
1068 
1069 	while (ref_ptr < ref_end) {
1070 		if (log_ref_ver) {
1071 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1072 						&ref_index, &parent_objectid);
1073 			/*
1074 			 * parent object can change from one array
1075 			 * item to another.
1076 			 */
1077 			if (!dir)
1078 				dir = read_one_inode(root, parent_objectid);
1079 			if (!dir)
1080 				return -ENOENT;
1081 		} else {
1082 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1083 					     &ref_index);
1084 		}
1085 		if (ret)
1086 			return ret;
1087 
1088 		/* if we already have a perfect match, we're done */
1089 		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1090 				  ref_index, name, namelen)) {
1091 			/*
1092 			 * look for a conflicting back reference in the
1093 			 * metadata. if we find one we have to unlink that name
1094 			 * of the file before we add our new link.  Later on, we
1095 			 * overwrite any existing back reference, and we don't
1096 			 * want to create dangling pointers in the directory.
1097 			 */
1098 
1099 			if (!search_done) {
1100 				ret = __add_inode_ref(trans, root, path, log,
1101 						      dir, inode, eb,
1102 						      inode_objectid,
1103 						      parent_objectid,
1104 						      ref_index, name, namelen,
1105 						      &search_done);
1106 				if (ret == 1)
1107 					goto out;
1108 				BUG_ON(ret);
1109 			}
1110 
1111 			/* insert our name */
1112 			ret = btrfs_add_link(trans, dir, inode, name, namelen,
1113 					     0, ref_index);
1114 			BUG_ON(ret);
1115 
1116 			btrfs_update_inode(trans, root, inode);
1117 		}
1118 
1119 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1120 		kfree(name);
1121 		if (log_ref_ver) {
1122 			iput(dir);
1123 			dir = NULL;
1124 		}
1125 	}
1126 
1127 	/* finally write the back reference in the inode */
1128 	ret = overwrite_item(trans, root, path, eb, slot, key);
1129 	BUG_ON(ret);
1130 
1131 out:
1132 	btrfs_release_path(path);
1133 	iput(dir);
1134 	iput(inode);
1135 	return 0;
1136 }
1137 
1138 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1139 			      struct btrfs_root *root, u64 offset)
1140 {
1141 	int ret;
1142 	ret = btrfs_find_orphan_item(root, offset);
1143 	if (ret > 0)
1144 		ret = btrfs_insert_orphan_item(trans, root, offset);
1145 	return ret;
1146 }
1147 
1148 static int count_inode_extrefs(struct btrfs_root *root,
1149 			       struct inode *inode, struct btrfs_path *path)
1150 {
1151 	int ret = 0;
1152 	int name_len;
1153 	unsigned int nlink = 0;
1154 	u32 item_size;
1155 	u32 cur_offset = 0;
1156 	u64 inode_objectid = btrfs_ino(inode);
1157 	u64 offset = 0;
1158 	unsigned long ptr;
1159 	struct btrfs_inode_extref *extref;
1160 	struct extent_buffer *leaf;
1161 
1162 	while (1) {
1163 		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1164 					    &extref, &offset);
1165 		if (ret)
1166 			break;
1167 
1168 		leaf = path->nodes[0];
1169 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1170 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1171 
1172 		while (cur_offset < item_size) {
1173 			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1174 			name_len = btrfs_inode_extref_name_len(leaf, extref);
1175 
1176 			nlink++;
1177 
1178 			cur_offset += name_len + sizeof(*extref);
1179 		}
1180 
1181 		offset++;
1182 		btrfs_release_path(path);
1183 	}
1184 	btrfs_release_path(path);
1185 
1186 	if (ret < 0)
1187 		return ret;
1188 	return nlink;
1189 }
1190 
1191 static int count_inode_refs(struct btrfs_root *root,
1192 			       struct inode *inode, struct btrfs_path *path)
1193 {
1194 	int ret;
1195 	struct btrfs_key key;
1196 	unsigned int nlink = 0;
1197 	unsigned long ptr;
1198 	unsigned long ptr_end;
1199 	int name_len;
1200 	u64 ino = btrfs_ino(inode);
1201 
1202 	key.objectid = ino;
1203 	key.type = BTRFS_INODE_REF_KEY;
1204 	key.offset = (u64)-1;
1205 
1206 	while (1) {
1207 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1208 		if (ret < 0)
1209 			break;
1210 		if (ret > 0) {
1211 			if (path->slots[0] == 0)
1212 				break;
1213 			path->slots[0]--;
1214 		}
1215 		btrfs_item_key_to_cpu(path->nodes[0], &key,
1216 				      path->slots[0]);
1217 		if (key.objectid != ino ||
1218 		    key.type != BTRFS_INODE_REF_KEY)
1219 			break;
1220 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1221 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1222 						   path->slots[0]);
1223 		while (ptr < ptr_end) {
1224 			struct btrfs_inode_ref *ref;
1225 
1226 			ref = (struct btrfs_inode_ref *)ptr;
1227 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1228 							    ref);
1229 			ptr = (unsigned long)(ref + 1) + name_len;
1230 			nlink++;
1231 		}
1232 
1233 		if (key.offset == 0)
1234 			break;
1235 		key.offset--;
1236 		btrfs_release_path(path);
1237 	}
1238 	btrfs_release_path(path);
1239 
1240 	return nlink;
1241 }
1242 
1243 /*
1244  * There are a few corners where the link count of the file can't
1245  * be properly maintained during replay.  So, instead of adding
1246  * lots of complexity to the log code, we just scan the backrefs
1247  * for any file that has been through replay.
1248  *
1249  * The scan will update the link count on the inode to reflect the
1250  * number of back refs found.  If it goes down to zero, the iput
1251  * will free the inode.
1252  */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* count old-style INODE_REF names first... */
	ret = count_inode_refs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	/* ...then add names stored in INODE_EXTREF items */
	ret = count_inode_extrefs(root, inode, path);
	if (ret == -ENOENT)
		ret = 0;	/* no extref items is not an error */

	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	/* write back the recomputed link count if it changed */
	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	/* force the directory index allocator to resync on next use */
	BTRFS_I(inode)->index_cnt = (u64)-1;

	/*
	 * no names left at all: empty a directory of stale entries and
	 * record an orphan item so the inode is cleaned up later.
	 */
	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			BUG_ON(ret);
		}
		ret = insert_orphan_item(trans, root, ino);
		BUG_ON(ret);
	}

out:
	btrfs_free_path(path);
	return ret;
}
1303 
/*
 * process every pending link-count fixup record: each orphan item
 * under BTRFS_TREE_LOG_FIXUP_OBJECTID names one inode (by key offset)
 * whose link count must be recomputed after replay.
 */
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		/* no exact match: step back to the previous item */
		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		/* delete the fixup record before processing the inode */
		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		BUG_ON(ret);

		iput(inode);

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
1357 
1358 
1359 /*
1360  * record a given inode in the fixup dir so we can check its link
1361  * count when replay is done.  The link count is incremented here
1362  * so the inode won't go away until we check it
1363  */
1364 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1365 				      struct btrfs_root *root,
1366 				      struct btrfs_path *path,
1367 				      u64 objectid)
1368 {
1369 	struct btrfs_key key;
1370 	int ret = 0;
1371 	struct inode *inode;
1372 
1373 	inode = read_one_inode(root, objectid);
1374 	if (!inode)
1375 		return -EIO;
1376 
1377 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1378 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1379 	key.offset = objectid;
1380 
1381 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1382 
1383 	btrfs_release_path(path);
1384 	if (ret == 0) {
1385 		if (!inode->i_nlink)
1386 			set_nlink(inode, 1);
1387 		else
1388 			btrfs_inc_nlink(inode);
1389 		ret = btrfs_update_inode(trans, root, inode);
1390 	} else if (ret == -EEXIST) {
1391 		ret = 0;
1392 	} else {
1393 		BUG();
1394 	}
1395 	iput(inode);
1396 
1397 	return ret;
1398 }
1399 
1400 /*
1401  * when replaying the log for a directory, we only insert names
1402  * for inodes that actually exist.  This means an fsync on a directory
1403  * does not implicitly fsync all the new files in it
1404  */
1405 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1406 				    struct btrfs_root *root,
1407 				    struct btrfs_path *path,
1408 				    u64 dirid, u64 index,
1409 				    char *name, int name_len, u8 type,
1410 				    struct btrfs_key *location)
1411 {
1412 	struct inode *inode;
1413 	struct inode *dir;
1414 	int ret;
1415 
1416 	inode = read_one_inode(root, location->objectid);
1417 	if (!inode)
1418 		return -ENOENT;
1419 
1420 	dir = read_one_inode(root, dirid);
1421 	if (!dir) {
1422 		iput(inode);
1423 		return -EIO;
1424 	}
1425 	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1426 
1427 	/* FIXME, put inode into FIXUP list */
1428 
1429 	iput(inode);
1430 	iput(dir);
1431 	return ret;
1432 }
1433 
1434 /*
1435  * take a single entry in a log directory item and replay it into
1436  * the subvolume.
1437  *
1438  * if a conflicting item exists in the subdirectory already,
1439  * the inode it points to is unlinked and put into the link count
1440  * fix up tree.
1441  *
1442  * If a name from the log points to a file or directory that does
1443  * not exist in the FS, it is skipped.  fsyncs on directories
1444  * do not force down inodes inside that directory, just changes to the
1445  * names or unlinks in a directory.
1446  */
1447 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1448 				    struct btrfs_root *root,
1449 				    struct btrfs_path *path,
1450 				    struct extent_buffer *eb,
1451 				    struct btrfs_dir_item *di,
1452 				    struct btrfs_key *key)
1453 {
1454 	char *name;
1455 	int name_len;
1456 	struct btrfs_dir_item *dst_di;
1457 	struct btrfs_key found_key;
1458 	struct btrfs_key log_key;
1459 	struct inode *dir;
1460 	u8 log_type;
1461 	int exists;
1462 	int ret;
1463 
1464 	dir = read_one_inode(root, key->objectid);
1465 	if (!dir)
1466 		return -EIO;
1467 
1468 	name_len = btrfs_dir_name_len(eb, di);
1469 	name = kmalloc(name_len, GFP_NOFS);
1470 	if (!name)
1471 		return -ENOMEM;
1472 
1473 	log_type = btrfs_dir_type(eb, di);
1474 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1475 		   name_len);
1476 
1477 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1478 	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1479 	if (exists == 0)
1480 		exists = 1;
1481 	else
1482 		exists = 0;
1483 	btrfs_release_path(path);
1484 
1485 	if (key->type == BTRFS_DIR_ITEM_KEY) {
1486 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1487 				       name, name_len, 1);
1488 	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1489 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1490 						     key->objectid,
1491 						     key->offset, name,
1492 						     name_len, 1);
1493 	} else {
1494 		BUG();
1495 	}
1496 	if (IS_ERR_OR_NULL(dst_di)) {
1497 		/* we need a sequence number to insert, so we only
1498 		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1499 		 */
1500 		if (key->type != BTRFS_DIR_INDEX_KEY)
1501 			goto out;
1502 		goto insert;
1503 	}
1504 
1505 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1506 	/* the existing item matches the logged item */
1507 	if (found_key.objectid == log_key.objectid &&
1508 	    found_key.type == log_key.type &&
1509 	    found_key.offset == log_key.offset &&
1510 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1511 		goto out;
1512 	}
1513 
1514 	/*
1515 	 * don't drop the conflicting directory entry if the inode
1516 	 * for the new entry doesn't exist
1517 	 */
1518 	if (!exists)
1519 		goto out;
1520 
1521 	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1522 	BUG_ON(ret);
1523 
1524 	if (key->type == BTRFS_DIR_INDEX_KEY)
1525 		goto insert;
1526 out:
1527 	btrfs_release_path(path);
1528 	kfree(name);
1529 	iput(dir);
1530 	return 0;
1531 
1532 insert:
1533 	btrfs_release_path(path);
1534 	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1535 			      name, name_len, log_type, &log_key);
1536 
1537 	BUG_ON(ret && ret != -ENOENT);
1538 	goto out;
1539 }
1540 
1541 /*
1542  * find all the names in a directory item and reconcile them into
1543  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1544  * one name in a directory item, but the same code gets used for
1545  * both directory index types
1546  */
1547 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1548 					struct btrfs_root *root,
1549 					struct btrfs_path *path,
1550 					struct extent_buffer *eb, int slot,
1551 					struct btrfs_key *key)
1552 {
1553 	int ret;
1554 	u32 item_size = btrfs_item_size_nr(eb, slot);
1555 	struct btrfs_dir_item *di;
1556 	int name_len;
1557 	unsigned long ptr;
1558 	unsigned long ptr_end;
1559 
1560 	ptr = btrfs_item_ptr_offset(eb, slot);
1561 	ptr_end = ptr + item_size;
1562 	while (ptr < ptr_end) {
1563 		di = (struct btrfs_dir_item *)ptr;
1564 		if (verify_dir_item(root, eb, di))
1565 			return -EIO;
1566 		name_len = btrfs_dir_name_len(eb, di);
1567 		ret = replay_one_name(trans, root, path, eb, di, key);
1568 		BUG_ON(ret);
1569 		ptr = (unsigned long)(di + 1);
1570 		ptr += name_len;
1571 	}
1572 	return 0;
1573 }
1574 
1575 /*
1576  * directory replay has two parts.  There are the standard directory
1577  * items in the log copied from the subvolume, and range items
1578  * created in the log while the subvolume was logged.
1579  *
1580  * The range items tell us which parts of the key space the log
1581  * is authoritative for.  During replay, if a key in the subvolume
1582  * directory is in a logged range item, but not actually in the log
1583  * that means it was deleted from the directory before the fsync
1584  * and should be removed.
1585  */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	/* (u64)-1 is the caller's "whole keyspace consumed" sentinel */
	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* no exact match: step back to the item just before *start_ret */
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	/* re-read the key only when we moved off the searched-for key */
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	/* the range item at/before us covers *start_ret: report it */
	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	/* found the next logged range after *start_ret; hand it back */
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
1657 
1658 /*
1659  * this looks for a given directory item in the log.  If the directory
1660  * item is not in the log, the item is removed and the inode it points
1661  * to is unlinked
1662  */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{
	int ret;
	struct extent_buffer *eb;
	int slot;
	u32 item_size;
	struct btrfs_dir_item *di;
	struct btrfs_dir_item *log_di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	char *name;
	struct inode *inode;
	struct btrfs_key location;

again:
	/* path already points at the subvolume dir item to check */
	eb = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size_nr(eb, slot);
	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	/* one item may pack several names; check each against the log */
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di)) {
			ret = -EIO;
			goto out;
		}

		name_len = btrfs_dir_name_len(eb, di);
		name = kmalloc(name_len, GFP_NOFS);
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		read_extent_buffer(eb, name, (unsigned long)(di + 1),
				  name_len);
		/* a NULL log means "delete everything" (del_all replay) */
		log_di = NULL;
		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
			log_di = btrfs_lookup_dir_item(trans, log, log_path,
						       dir_key->objectid,
						       name, name_len, 0);
		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
			log_di = btrfs_lookup_dir_index_item(trans, log,
						     log_path,
						     dir_key->objectid,
						     dir_key->offset,
						     name, name_len, 0);
		}
		/*
		 * the name is not in the log: it was deleted before the
		 * fsync, so unlink it from the subvolume too.
		 */
		if (IS_ERR_OR_NULL(log_di)) {
			btrfs_dir_item_key_to_cpu(eb, di, &location);
			btrfs_release_path(path);
			btrfs_release_path(log_path);
			inode = read_one_inode(root, location.objectid);
			if (!inode) {
				kfree(name);
				return -EIO;
			}

			/* queue a link-count recheck for the victim inode */
			ret = link_to_fixup_dir(trans, root,
						path, location.objectid);
			BUG_ON(ret);
			btrfs_inc_nlink(inode);
			ret = btrfs_unlink_inode(trans, root, dir, inode,
						 name, name_len);
			BUG_ON(ret);

			btrfs_run_delayed_items(trans, root);

			kfree(name);
			iput(inode);

			/* there might still be more names under this key
			 * check and repeat if required
			 */
			ret = btrfs_search_slot(NULL, root, dir_key, path,
						0, 0);
			if (ret == 0)
				goto again;
			ret = 0;
			goto out;
		}
		btrfs_release_path(log_path);
		kfree(name);

		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	btrfs_release_path(log_path);
	return ret;
}
1762 
1763 /*
1764  * deletion replay happens before we copy any new directory items
1765  * out of the log or out of backreferences from inodes.  It
1766  * scans the log to find ranges of keys that log is authoritative for,
1767  * and then scans the directory to find items in those ranges that are
1768  * not present in the log.
1769  *
1770  * Anything we don't find in the log is unlinked and removed from the
1771  * directory.
1772  */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all)
{
	u64 range_start;
	u64 range_end;
	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_ITEM_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}
again:
	/* two passes: DIR_LOG_ITEM ranges first, then DIR_LOG_INDEX */
	range_start = 0;
	range_end = 0;
	while (1) {
		/* with del_all (and log == NULL) every key is in range */
		if (del_all)
			range_end = (u64)-1;
		else {
			ret = find_dir_range(log, path, dirid, key_type,
					     &range_start, &range_end);
			if (ret != 0)
				break;
		}

		/* check every subvolume dir item inside the logged range */
		dir_key.offset = range_start;
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type)
				goto next_type;

			if (found_key.offset > range_end)
				break;

			/* unlink the entry if the log doesn't have it */
			ret = check_item_in_log(trans, root, log, path,
						log_path, dir,
						&found_key);
			BUG_ON(ret);
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}

next_type:
	ret = 0;
	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
		key_type = BTRFS_DIR_LOG_INDEX_KEY;
		dir_key.type = BTRFS_DIR_INDEX_KEY;
		btrfs_release_path(path);
		goto again;
	}
out:
	btrfs_release_path(path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}
1867 
1868 /*
1869  * the process_func used to replay items from the log tree.  This
1870  * gets called in two different stages.  The first stage just looks
1871  * for inodes and makes sure they are all copied into the subvolume.
1872  *
1873  * The second stage copies all the other item types from the log into
1874  * the subvolume.  The two stage approach is slower, but gets rid of
1875  * lots of complexity around inodes referencing other inodes that exist
1876  * only in the log (references come from either directory items or inode
1877  * back refs).
1878  */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	int level;
	int i;
	int ret;

	ret = btrfs_read_buffer(eb, gen);
	if (ret)
		return ret;

	level = btrfs_header_level(eb);

	/* only leaves carry items to replay; interior nodes are skipped */
	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
					    struct btrfs_inode_item);
			mode = btrfs_inode_mode(eb, inode_item);
			/* replay deletes before copying the dir inode in */
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid, 0);
				BUG_ON(ret);
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);

			/* for regular files, make sure corresponding
			 * orphan item exist. extents past the new EOF
			 * will be truncated later by orphan cleanup.
			 */
			if (S_ISREG(mode)) {
				ret = insert_orphan_item(wc->trans, root,
							 key.objectid);
				BUG_ON(ret);
			}

			/* schedule a link-count recheck for after replay */
			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			BUG_ON(ret);
		}
		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_INODE_REF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			/* -ENOENT means a parent dir wasn't logged; ok */
			BUG_ON(ret && ret != -ENOENT);
		} else if (key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			BUG_ON(ret && ret != -ENOENT);
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
						eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
			   key.type == BTRFS_DIR_INDEX_KEY) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			BUG_ON(ret);
		}
	}
	btrfs_free_path(path);
	return 0;
}
1969 
/*
 * descend the log tree from the current position in @path, calling
 * wc->process_func on every node at level 1 and below, and (when
 * wc->free is set) cleaning and un-pinning the visited log blocks.
 * Returns 0 when the current node's slots are exhausted so the caller
 * can walk back up.
 */
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, int *level,
				   struct walk_control *wc)
{
	u64 root_owner;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		if (btrfs_header_level(cur) != *level)
			WARN_ON(1);

		/* this node is exhausted; let the caller walk back up */
		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);

		parent = path->nodes[*level];
		root_owner = btrfs_header_owner(parent);

		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
		if (!next)
			return -ENOMEM;

		/*
		 * children of a level-1 node are leaves: process them
		 * here instead of descending into them.
		 */
		if (*level == 1) {
			ret = wc->process_func(root, next, wc, ptr_gen);
			if (ret)
				return ret;

			path->slots[*level]++;
			if (wc->free) {
				ret = btrfs_read_buffer(next, ptr_gen);
				if (ret) {
					free_extent_buffer(next);
					return ret;
				}

				/* drop the processed leaf from the log */
				btrfs_tree_lock(next);
				btrfs_set_lock_blocking(next);
				clean_tree_block(trans, root, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				WARN_ON(root_owner !=
					BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_and_pin_reserved_extent(root,
							 bytenr, blocksize);
				BUG_ON(ret); /* -ENOMEM or logic errors */
			}
			free_extent_buffer(next);
			continue;
		}
		ret = btrfs_read_buffer(next, ptr_gen);
		if (ret) {
			free_extent_buffer(next);
			return ret;
		}

		/* descend one level into the child node */
		WARN_ON(*level <= 0);
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);

	cond_resched();
	return 0;
}
2060 
/*
 * walk back up the log tree from the current level, processing (and,
 * when wc->free is set, cleaning and un-pinning) each fully-visited
 * node.  Returns 0 when a node with unvisited slots is found so the
 * caller can walk back down, or 1 when the whole tree is done.
 */
static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path, int *level,
				 struct walk_control *wc)
{
	u64 root_owner;
	int i;
	int slot;
	int ret;

	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		/* more siblings at this level: resume the downward walk */
		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			return 0;
		} else {
			struct extent_buffer *parent;
			/* the root node is its own parent for owner checks */
			if (path->nodes[*level] == root->node)
				parent = path->nodes[*level];
			else
				parent = path->nodes[*level + 1];

			root_owner = btrfs_header_owner(parent);
			ret = wc->process_func(root, path->nodes[*level], wc,
				 btrfs_header_generation(path->nodes[*level]));
			if (ret)
				return ret;

			if (wc->free) {
				struct extent_buffer *next;

				next = path->nodes[*level];

				/* drop this finished node from the log */
				btrfs_tree_lock(next);
				btrfs_set_lock_blocking(next);
				clean_tree_block(trans, root, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_and_pin_reserved_extent(root,
						path->nodes[*level]->start,
						path->nodes[*level]->len);
				BUG_ON(ret);
			}
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}
2115 
2116 /*
2117  * drop the reference count on the tree rooted at 'snap'.  This traverses
2118  * the tree freeing any blocks that have a ref count of zero after being
2119  * decremented.
2120  */
static int walk_log_tree(struct btrfs_trans_handle *trans,
			 struct btrfs_root *log, struct walk_control *wc)
{
	int ret = 0;
	int wret;
	int level;
	struct btrfs_path *path;
	int i;
	int orig_level;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* start the walk at the root of the log tree */
	level = btrfs_header_level(log->node);
	orig_level = level;
	path->nodes[level] = log->node;
	extent_buffer_get(log->node);
	path->slots[level] = 0;

	while (1) {
		/* descend to the next unvisited leaf */
		wret = walk_down_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}

		/* pop back up and advance to the next subtree */
		wret = walk_up_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}
	}

	/* was the root node processed? if not, catch it here */
	if (path->nodes[orig_level]) {
		ret = wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]));
		if (ret)
			goto out;
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			/*
			 * make sure writeback on the root block is done
			 * before its extent is handed back to the allocator
			 */
			btrfs_tree_lock(next);
			btrfs_set_lock_blocking(next);
			clean_tree_block(trans, log, next);
			btrfs_wait_tree_block_writeback(next);
			btrfs_tree_unlock(next);

			WARN_ON(log->root_key.objectid !=
				BTRFS_TREE_LOG_OBJECTID);
			ret = btrfs_free_and_pin_reserved_extent(log, next->start,
							 next->len);
			BUG_ON(ret); /* -ENOMEM or logic errors */
		}
	}

out:
	/* drop whatever references the walk still holds on the path */
	for (i = 0; i <= orig_level; i++) {
		if (path->nodes[i]) {
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}
	}
	btrfs_free_path(path);
	return ret;
}
2194 
2195 /*
2196  * helper function to update the item for a given subvolumes log root
2197  * in the tree of log roots
2198  */
2199 static int update_log_root(struct btrfs_trans_handle *trans,
2200 			   struct btrfs_root *log)
2201 {
2202 	int ret;
2203 
2204 	if (log->log_transid == 1) {
2205 		/* insert root item on the first sync */
2206 		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
2207 				&log->root_key, &log->root_item);
2208 	} else {
2209 		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2210 				&log->root_key, &log->root_item);
2211 	}
2212 	return ret;
2213 }
2214 
/*
 * wait_log_commit - wait for the log commit with the given transid
 *
 * Sleeps (dropping root->log_mutex while asleep) until the commit slot
 * for @transid is no longer busy, the log has advanced at least two
 * transids past @transid, or a full transaction commit has been forced.
 * Called and returns with root->log_mutex held.
 */
static int wait_log_commit(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, unsigned long transid)
{
	DEFINE_WAIT(wait);
	int index = transid % 2;

	/*
	 * we only allow two pending log transactions at a time,
	 * so we know that if ours is more than 2 older than the
	 * current transaction, we're done
	 */
	do {
		prepare_to_wait(&root->log_commit_wait[index],
				&wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&root->log_mutex);

		/* recheck after queueing ourselves to avoid missed wakeups */
		if (root->fs_info->last_trans_log_full_commit !=
		    trans->transid && root->log_transid < transid + 2 &&
		    atomic_read(&root->log_commit[index]))
			schedule();

		finish_wait(&root->log_commit_wait[index], &wait);
		mutex_lock(&root->log_mutex);
	} while (root->fs_info->last_trans_log_full_commit !=
		 trans->transid && root->log_transid < transid + 2 &&
		 atomic_read(&root->log_commit[index]));
	return 0;
}
2243 
/*
 * wait_for_writer - block until all active log writers have finished,
 * unless a full transaction commit has been forced (waiting would be
 * pointless then).  Called and returns with root->log_mutex held; the
 * mutex is dropped while sleeping so writers can make progress.
 */
static void wait_for_writer(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	DEFINE_WAIT(wait);
	while (root->fs_info->last_trans_log_full_commit !=
	       trans->transid && atomic_read(&root->log_writers)) {
		prepare_to_wait(&root->log_writer_wait,
				&wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&root->log_mutex);
		/* recheck after queueing ourselves to avoid missed wakeups */
		if (root->fs_info->last_trans_log_full_commit !=
		    trans->transid && atomic_read(&root->log_writers))
			schedule();
		mutex_lock(&root->log_mutex);
		finish_wait(&root->log_writer_wait, &wait);
	}
}
2260 
2261 /*
 * btrfs_sync_log sends a given tree log down to the disk and
2263  * updates the super blocks to record it.  When this call is done,
2264  * you know that any inodes previously logged are safely on disk only
2265  * if it returns 0.
2266  *
2267  * Any other return value means you need to call btrfs_commit_transaction.
2268  * Some of the edge cases for fsyncing directories that have had unlinks
2269  * or renames done in the past mean that sometimes the only safe
2270  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2271  * that has happened.
2272  */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root)
{
	int index1;
	int index2;
	int mark;
	int ret;
	struct btrfs_root *log = root->log_root;
	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
	unsigned long log_transid = 0;

	mutex_lock(&root->log_mutex);
	log_transid = root->log_transid;
	index1 = root->log_transid % 2;
	/* someone else is already committing this log transid: wait for it */
	if (atomic_read(&root->log_commit[index1])) {
		wait_log_commit(trans, root, root->log_transid);
		mutex_unlock(&root->log_mutex);
		return 0;
	}
	atomic_set(&root->log_commit[index1], 1);

	/* wait for previous tree log sync to complete */
	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
		wait_log_commit(trans, root, root->log_transid - 1);
	while (1) {
		int batch = atomic_read(&root->log_batch);
		/* when we're on an ssd, just kick the log commit out */
		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
			mutex_unlock(&root->log_mutex);
			schedule_timeout_uninterruptible(1);
			mutex_lock(&root->log_mutex);
		}
		wait_for_writer(trans, root);
		/* loop until no new writers joined while we waited */
		if (batch == atomic_read(&root->log_batch))
			break;
	}

	/* bail out if we need to do a full commit */
	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
		ret = -EAGAIN;
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	/* the two log transids alternate between the two extent marks */
	if (log_transid % 2 == 0)
		mark = EXTENT_DIRTY;
	else
		mark = EXTENT_NEW;

	/* we start IO on all the marked extents here, but we don't actually
	 * wait for them until later.
	 */
	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	btrfs_set_root_node(&log->root_item, log->node);

	root->log_transid++;
	log->log_transid = root->log_transid;
	root->log_start_pid = 0;
	smp_mb();
	/*
	 * IO has been started, blocks of the log tree have WRITTEN flag set
	 * in their headers. new modifications of the log will be written to
	 * new positions. so it's safe to allow log writers to go in.
	 */
	mutex_unlock(&root->log_mutex);

	/* register as a writer of the log root tree before updating it */
	mutex_lock(&log_root_tree->log_mutex);
	atomic_inc(&log_root_tree->log_batch);
	atomic_inc(&log_root_tree->log_writers);
	mutex_unlock(&log_root_tree->log_mutex);

	ret = update_log_root(trans, log);

	mutex_lock(&log_root_tree->log_mutex);
	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
		smp_mb();
		if (waitqueue_active(&log_root_tree->log_writer_wait))
			wake_up(&log_root_tree->log_writer_wait);
	}

	if (ret) {
		if (ret != -ENOSPC) {
			btrfs_abort_transaction(trans, root, ret);
			mutex_unlock(&log_root_tree->log_mutex);
			goto out;
		}
		/* no space in the log: force a full transaction commit */
		root->fs_info->last_trans_log_full_commit = trans->transid;
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = -EAGAIN;
		goto out;
	}

	index2 = log_root_tree->log_transid % 2;
	/* another committer already covers the log root tree: wait and ride */
	if (atomic_read(&log_root_tree->log_commit[index2])) {
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		wait_log_commit(trans, log_root_tree,
				log_root_tree->log_transid);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = 0;
		goto out;
	}
	atomic_set(&log_root_tree->log_commit[index2], 1);

	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
		wait_log_commit(trans, log_root_tree,
				log_root_tree->log_transid - 1);
	}

	wait_for_writer(trans, log_root_tree);

	/*
	 * now that we've moved on to the tree of log tree roots,
	 * check the full commit flag again
	 */
	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = -EAGAIN;
		goto out_wake_log_root;
	}

	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
				&log_root_tree->dirty_log_pages,
				EXTENT_DIRTY | EXTENT_NEW);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
	btrfs_wait_logged_extents(log, log_transid);

	/* point the super block copy at the new log root */
	btrfs_set_super_log_root(root->fs_info->super_for_commit,
				log_root_tree->node->start);
	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
				btrfs_header_level(log_root_tree->node));

	log_root_tree->log_transid++;
	smp_mb();

	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * nobody else is going to jump in and write the ctree
	 * super here because the log_commit atomic below is protecting
	 * us.  We must be called with a transaction handle pinning
	 * the running transaction open, so a full commit can't hop
	 * in and cause problems either.
	 */
	btrfs_scrub_pause_super(root);
	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
	btrfs_scrub_continue_super(root);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_wake_log_root;
	}

	/* record how far this subvolume's log has been committed */
	mutex_lock(&root->log_mutex);
	if (root->last_log_commit < log_transid)
		root->last_log_commit = log_transid;
	mutex_unlock(&root->log_mutex);

out_wake_log_root:
	atomic_set(&log_root_tree->log_commit[index2], 0);
	smp_mb();
	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
		wake_up(&log_root_tree->log_commit_wait[index2]);
out:
	atomic_set(&root->log_commit[index1], 0);
	smp_mb();
	if (waitqueue_active(&root->log_commit_wait[index1]))
		wake_up(&root->log_commit_wait[index1]);
	return ret;
}
2460 
2461 static void free_log_tree(struct btrfs_trans_handle *trans,
2462 			  struct btrfs_root *log)
2463 {
2464 	int ret;
2465 	u64 start;
2466 	u64 end;
2467 	struct walk_control wc = {
2468 		.free = 1,
2469 		.process_func = process_one_buffer
2470 	};
2471 
2472 	if (trans) {
2473 		ret = walk_log_tree(trans, log, &wc);
2474 		BUG_ON(ret);
2475 	}
2476 
2477 	while (1) {
2478 		ret = find_first_extent_bit(&log->dirty_log_pages,
2479 				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2480 				NULL);
2481 		if (ret)
2482 			break;
2483 
2484 		clear_extent_bits(&log->dirty_log_pages, start, end,
2485 				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2486 	}
2487 
2488 	/*
2489 	 * We may have short-circuited the log tree with the full commit logic
2490 	 * and left ordered extents on our list, so clear these out to keep us
2491 	 * from leaking inodes and memory.
2492 	 */
2493 	btrfs_free_logged_extents(log, 0);
2494 	btrfs_free_logged_extents(log, 1);
2495 
2496 	free_extent_buffer(log->node);
2497 	kfree(log);
2498 }
2499 
2500 /*
2501  * free all the extents used by the tree log.  This should be called
2502  * at commit time of the full transaction
2503  */
2504 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2505 {
2506 	if (root->log_root) {
2507 		free_log_tree(trans, root->log_root);
2508 		root->log_root = NULL;
2509 	}
2510 	return 0;
2511 }
2512 
2513 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2514 			     struct btrfs_fs_info *fs_info)
2515 {
2516 	if (fs_info->log_root_tree) {
2517 		free_log_tree(trans, fs_info->log_root_tree);
2518 		fs_info->log_root_tree = NULL;
2519 	}
2520 	return 0;
2521 }
2522 
2523 /*
2524  * If both a file and directory are logged, and unlinks or renames are
2525  * mixed in, we have a few interesting corners:
2526  *
2527  * create file X in dir Y
2528  * link file X to X.link in dir Y
2529  * fsync file X
2530  * unlink file X but leave X.link
2531  * fsync dir Y
2532  *
2533  * After a crash we would expect only X.link to exist.  But file X
2534  * didn't get fsync'd again so the log has back refs for X and X.link.
2535  *
2536  * We solve this by removing directory entries and inode backrefs from the
2537  * log when a file that was logged in the current transaction is
2538  * unlinked.  Any later fsync will include the updated log entries, and
2539  * we'll be able to reconstruct the proper directory items from backrefs.
2540  *
 * This optimization allows us to avoid relogging the entire inode
2542  * or the entire directory.
2543  */
/*
 * Remove @name from the log copy of directory @dir, if @dir was logged in
 * the current transaction, so a later log replay does not resurrect the
 * unlinked name.  Returns 0 on success (or when there was nothing to do),
 * otherwise a negative errno.
 */
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 const char *name, int name_len,
				 struct inode *dir, u64 index)
{
	struct btrfs_root *log;
	struct btrfs_dir_item *di;
	struct btrfs_path *path;
	int ret;
	int err = 0;
	int bytes_del = 0;
	u64 dir_ino = btrfs_ino(dir);

	/* the dir was not logged in this transaction: nothing to clean up */
	if (BTRFS_I(dir)->logged_trans < trans->transid)
		return 0;

	/* no log transaction running means there is no log tree to edit */
	ret = join_running_log_trans(root);
	if (ret)
		return 0;

	mutex_lock(&BTRFS_I(dir)->log_mutex);

	log = root->log_root;
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_unlock;
	}

	/* remove the logged dir item for this name, if any */
	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
				   name, name_len, -1);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto fail;
	}
	if (di) {
		ret = btrfs_delete_one_dir_name(trans, log, path, di);
		bytes_del += name_len;
		BUG_ON(ret);
	}
	btrfs_release_path(path);
	/* and the matching logged dir index item */
	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
					 index, name, name_len, -1);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto fail;
	}
	if (di) {
		ret = btrfs_delete_one_dir_name(trans, log, path, di);
		bytes_del += name_len;
		BUG_ON(ret);
	}

	/* update the directory size in the log to reflect the names
	 * we have removed
	 */
	if (bytes_del) {
		struct btrfs_key key;

		key.objectid = dir_ino;
		key.offset = 0;
		key.type = BTRFS_INODE_ITEM_KEY;
		btrfs_release_path(path);

		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (ret == 0) {
			struct btrfs_inode_item *item;
			u64 i_size;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			i_size = btrfs_inode_size(path->nodes[0], item);
			/* clamp at zero so the logged size never wraps */
			if (i_size > bytes_del)
				i_size -= bytes_del;
			else
				i_size = 0;
			btrfs_set_inode_size(path->nodes[0], item, i_size);
			btrfs_mark_buffer_dirty(path->nodes[0]);
		} else
			ret = 0;
		btrfs_release_path(path);
	}
fail:
	btrfs_free_path(path);
out_unlock:
	mutex_unlock(&BTRFS_I(dir)->log_mutex);
	/*
	 * -ENOSPC in the log forces a full transaction commit instead of
	 * failing the unlink.  NOTE(review): 'ret' (the last tree op), not
	 * 'err', is inspected here — confirm that is intentional.
	 */
	if (ret == -ENOSPC) {
		root->fs_info->last_trans_log_full_commit = trans->transid;
		ret = 0;
	} else if (ret < 0)
		btrfs_abort_transaction(trans, root, ret);

	btrfs_end_log_trans(root);

	return err;
}
2644 
2645 /* see comments for btrfs_del_dir_entries_in_log */
/*
 * Remove the back reference for @name from the log copy of @inode, if the
 * inode was logged in the current transaction.  -ENOSPC from the log is
 * turned into a forced full transaction commit instead of an error;
 * -ENOENT (the ref was never logged) is passed through without aborting.
 */
int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const char *name, int name_len,
			       struct inode *inode, u64 dirid)
{
	struct btrfs_root *log;
	u64 index;
	int ret;

	/* the inode was not logged in this transaction: nothing to do */
	if (BTRFS_I(inode)->logged_trans < trans->transid)
		return 0;

	/* no log transaction running means there is no log tree to edit */
	ret = join_running_log_trans(root);
	if (ret)
		return 0;
	log = root->log_root;
	mutex_lock(&BTRFS_I(inode)->log_mutex);

	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
				  dirid, &index);
	mutex_unlock(&BTRFS_I(inode)->log_mutex);
	if (ret == -ENOSPC) {
		/* out of log space: fall back to a full commit */
		root->fs_info->last_trans_log_full_commit = trans->transid;
		ret = 0;
	} else if (ret < 0 && ret != -ENOENT)
		btrfs_abort_transaction(trans, root, ret);
	btrfs_end_log_trans(root);

	return ret;
}
2676 
2677 /*
2678  * creates a range item in the log for 'dirid'.  first_offset and
2679  * last_offset tell us which parts of the key space the log should
2680  * be considered authoritative for.
2681  */
2682 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2683 				       struct btrfs_root *log,
2684 				       struct btrfs_path *path,
2685 				       int key_type, u64 dirid,
2686 				       u64 first_offset, u64 last_offset)
2687 {
2688 	int ret;
2689 	struct btrfs_key key;
2690 	struct btrfs_dir_log_item *item;
2691 
2692 	key.objectid = dirid;
2693 	key.offset = first_offset;
2694 	if (key_type == BTRFS_DIR_ITEM_KEY)
2695 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
2696 	else
2697 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
2698 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2699 	if (ret)
2700 		return ret;
2701 
2702 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2703 			      struct btrfs_dir_log_item);
2704 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2705 	btrfs_mark_buffer_dirty(path->nodes[0]);
2706 	btrfs_release_path(path);
2707 	return 0;
2708 }
2709 
2710 /*
2711  * log all the items included in the current transaction for a given
2712  * directory.  This also creates the range items in the log tree required
2713  * to replay anything deleted before the fsync
2714  */
2715 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2716 			  struct btrfs_root *root, struct inode *inode,
2717 			  struct btrfs_path *path,
2718 			  struct btrfs_path *dst_path, int key_type,
2719 			  u64 min_offset, u64 *last_offset_ret)
2720 {
2721 	struct btrfs_key min_key;
2722 	struct btrfs_key max_key;
2723 	struct btrfs_root *log = root->log_root;
2724 	struct extent_buffer *src;
2725 	int err = 0;
2726 	int ret;
2727 	int i;
2728 	int nritems;
2729 	u64 first_offset = min_offset;
2730 	u64 last_offset = (u64)-1;
2731 	u64 ino = btrfs_ino(inode);
2732 
2733 	log = root->log_root;
2734 	max_key.objectid = ino;
2735 	max_key.offset = (u64)-1;
2736 	max_key.type = key_type;
2737 
2738 	min_key.objectid = ino;
2739 	min_key.type = key_type;
2740 	min_key.offset = min_offset;
2741 
2742 	path->keep_locks = 1;
2743 
2744 	ret = btrfs_search_forward(root, &min_key, &max_key,
2745 				   path, trans->transid);
2746 
2747 	/*
2748 	 * we didn't find anything from this transaction, see if there
2749 	 * is anything at all
2750 	 */
2751 	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
2752 		min_key.objectid = ino;
2753 		min_key.type = key_type;
2754 		min_key.offset = (u64)-1;
2755 		btrfs_release_path(path);
2756 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2757 		if (ret < 0) {
2758 			btrfs_release_path(path);
2759 			return ret;
2760 		}
2761 		ret = btrfs_previous_item(root, path, ino, key_type);
2762 
2763 		/* if ret == 0 there are items for this type,
2764 		 * create a range to tell us the last key of this type.
2765 		 * otherwise, there are no items in this directory after
2766 		 * *min_offset, and we create a range to indicate that.
2767 		 */
2768 		if (ret == 0) {
2769 			struct btrfs_key tmp;
2770 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2771 					      path->slots[0]);
2772 			if (key_type == tmp.type)
2773 				first_offset = max(min_offset, tmp.offset) + 1;
2774 		}
2775 		goto done;
2776 	}
2777 
2778 	/* go backward to find any previous key */
2779 	ret = btrfs_previous_item(root, path, ino, key_type);
2780 	if (ret == 0) {
2781 		struct btrfs_key tmp;
2782 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2783 		if (key_type == tmp.type) {
2784 			first_offset = tmp.offset;
2785 			ret = overwrite_item(trans, log, dst_path,
2786 					     path->nodes[0], path->slots[0],
2787 					     &tmp);
2788 			if (ret) {
2789 				err = ret;
2790 				goto done;
2791 			}
2792 		}
2793 	}
2794 	btrfs_release_path(path);
2795 
2796 	/* find the first key from this transaction again */
2797 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2798 	if (ret != 0) {
2799 		WARN_ON(1);
2800 		goto done;
2801 	}
2802 
2803 	/*
2804 	 * we have a block from this transaction, log every item in it
2805 	 * from our directory
2806 	 */
2807 	while (1) {
2808 		struct btrfs_key tmp;
2809 		src = path->nodes[0];
2810 		nritems = btrfs_header_nritems(src);
2811 		for (i = path->slots[0]; i < nritems; i++) {
2812 			btrfs_item_key_to_cpu(src, &min_key, i);
2813 
2814 			if (min_key.objectid != ino || min_key.type != key_type)
2815 				goto done;
2816 			ret = overwrite_item(trans, log, dst_path, src, i,
2817 					     &min_key);
2818 			if (ret) {
2819 				err = ret;
2820 				goto done;
2821 			}
2822 		}
2823 		path->slots[0] = nritems;
2824 
2825 		/*
2826 		 * look ahead to the next item and see if it is also
2827 		 * from this directory and from this transaction
2828 		 */
2829 		ret = btrfs_next_leaf(root, path);
2830 		if (ret == 1) {
2831 			last_offset = (u64)-1;
2832 			goto done;
2833 		}
2834 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2835 		if (tmp.objectid != ino || tmp.type != key_type) {
2836 			last_offset = (u64)-1;
2837 			goto done;
2838 		}
2839 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2840 			ret = overwrite_item(trans, log, dst_path,
2841 					     path->nodes[0], path->slots[0],
2842 					     &tmp);
2843 			if (ret)
2844 				err = ret;
2845 			else
2846 				last_offset = tmp.offset;
2847 			goto done;
2848 		}
2849 	}
2850 done:
2851 	btrfs_release_path(path);
2852 	btrfs_release_path(dst_path);
2853 
2854 	if (err == 0) {
2855 		*last_offset_ret = last_offset;
2856 		/*
2857 		 * insert the log range keys to indicate where the log
2858 		 * is valid
2859 		 */
2860 		ret = insert_dir_log_key(trans, log, path, key_type,
2861 					 ino, first_offset, last_offset);
2862 		if (ret)
2863 			err = ret;
2864 	}
2865 	return err;
2866 }
2867 
2868 /*
2869  * logging directories is very similar to logging inodes, We find all the items
2870  * from the current transaction and write them to the log.
2871  *
2872  * The recovery code scans the directory in the subvolume, and if it finds a
2873  * key in the range logged that is not present in the log tree, then it means
2874  * that dir entry was unlinked during the transaction.
2875  *
2876  * In order for that scan to work, we must include one key smaller than
2877  * the smallest logged by this transaction and one key larger than the largest
2878  * key logged by this transaction.
2879  */
2880 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2881 			  struct btrfs_root *root, struct inode *inode,
2882 			  struct btrfs_path *path,
2883 			  struct btrfs_path *dst_path)
2884 {
2885 	u64 min_key;
2886 	u64 max_key;
2887 	int ret;
2888 	int key_type = BTRFS_DIR_ITEM_KEY;
2889 
2890 again:
2891 	min_key = 0;
2892 	max_key = 0;
2893 	while (1) {
2894 		ret = log_dir_items(trans, root, inode, path,
2895 				    dst_path, key_type, min_key,
2896 				    &max_key);
2897 		if (ret)
2898 			return ret;
2899 		if (max_key == (u64)-1)
2900 			break;
2901 		min_key = max_key + 1;
2902 	}
2903 
2904 	if (key_type == BTRFS_DIR_ITEM_KEY) {
2905 		key_type = BTRFS_DIR_INDEX_KEY;
2906 		goto again;
2907 	}
2908 	return 0;
2909 }
2910 
2911 /*
2912  * a helper function to drop items from the log before we relog an
2913  * inode.  max_key_type indicates the highest item type to remove.
2914  * This cannot be run for file data extents because it does not
2915  * free the extents they point to.
2916  */
static int drop_objectid_items(struct btrfs_trans_handle *trans,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  u64 objectid, int max_key_type)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int start_slot;

	/* search from the highest possible key for this objectid/type */
	key.objectid = objectid;
	key.type = max_key_type;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
		BUG_ON(ret == 0); /* an exact match on offset (u64)-1 is never expected */
		if (ret < 0)
			break;

		/* nothing sorts before our key: no matching items remain */
		if (path->slots[0] == 0)
			break;

		/* step back to the last item that can belong to objectid */
		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);

		if (found_key.objectid != objectid)
			break;

		/*
		 * find the first slot in this leaf belonging to objectid so
		 * the whole run can be deleted in one btrfs_del_items call.
		 * NOTE(review): the btrfs_bin_search return value is ignored
		 * here; it appears start_slot is always set — confirm.
		 */
		found_key.offset = 0;
		found_key.type = 0;
		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
				       &start_slot);

		ret = btrfs_del_items(trans, log, path, start_slot,
				      path->slots[0] - start_slot + 1);
		/*
		 * If start slot isn't 0 then we don't need to re-search, we've
		 * found the last guy with the objectid in this tree.
		 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	if (ret > 0)
		ret = 0;
	return ret;
}
2967 
2968 static void fill_inode_item(struct btrfs_trans_handle *trans,
2969 			    struct extent_buffer *leaf,
2970 			    struct btrfs_inode_item *item,
2971 			    struct inode *inode, int log_inode_only)
2972 {
2973 	struct btrfs_map_token token;
2974 
2975 	btrfs_init_map_token(&token);
2976 
2977 	if (log_inode_only) {
2978 		/* set the generation to zero so the recover code
2979 		 * can tell the difference between an logging
2980 		 * just to say 'this inode exists' and a logging
2981 		 * to say 'update this inode with these values'
2982 		 */
2983 		btrfs_set_token_inode_generation(leaf, item, 0, &token);
2984 		btrfs_set_token_inode_size(leaf, item, 0, &token);
2985 	} else {
2986 		btrfs_set_token_inode_generation(leaf, item,
2987 						 BTRFS_I(inode)->generation,
2988 						 &token);
2989 		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
2990 	}
2991 
2992 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
2993 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
2994 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
2995 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2996 
2997 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2998 				     inode->i_atime.tv_sec, &token);
2999 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3000 				      inode->i_atime.tv_nsec, &token);
3001 
3002 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3003 				     inode->i_mtime.tv_sec, &token);
3004 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3005 				      inode->i_mtime.tv_nsec, &token);
3006 
3007 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3008 				     inode->i_ctime.tv_sec, &token);
3009 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3010 				      inode->i_ctime.tv_nsec, &token);
3011 
3012 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3013 				     &token);
3014 
3015 	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3016 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3017 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3018 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3019 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3020 }
3021 
3022 static int log_inode_item(struct btrfs_trans_handle *trans,
3023 			  struct btrfs_root *log, struct btrfs_path *path,
3024 			  struct inode *inode)
3025 {
3026 	struct btrfs_inode_item *inode_item;
3027 	struct btrfs_key key;
3028 	int ret;
3029 
3030 	memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3031 	ret = btrfs_insert_empty_item(trans, log, path, &key,
3032 				      sizeof(*inode_item));
3033 	if (ret && ret != -EEXIST)
3034 		return ret;
3035 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3036 				    struct btrfs_inode_item);
3037 	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3038 	btrfs_release_path(path);
3039 	return 0;
3040 }
3041 
/*
 * Copy @nr items, starting at @start_slot, from the source leaf @src
 * into the inode's log tree.
 *
 * Inode items are not copied verbatim: they are re-filled from the
 * in-memory inode via fill_inode_item() so the log carries the most up
 * to date values.  For regular file extent items created in the current
 * transaction, the matching data checksums are looked up in the csum
 * tree and then inserted into the log so replay can restore them.
 *
 * Returns 0 on success or a negative errno.
 */
static noinline int copy_items(struct btrfs_trans_handle *trans,
			       struct inode *inode,
			       struct btrfs_path *dst_path,
			       struct extent_buffer *src,
			       int start_slot, int nr, int inode_only)
{
	unsigned long src_offset;
	unsigned long dst_offset;
	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
	struct btrfs_file_extent_item *extent;
	struct btrfs_inode_item *inode_item;
	int ret;
	struct btrfs_key *ins_keys;
	u32 *ins_sizes;
	char *ins_data;
	int i;
	struct list_head ordered_sums;
	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	INIT_LIST_HEAD(&ordered_sums);

	/* one allocation holds both the item size array and the key array */
	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
			   nr * sizeof(u32), GFP_NOFS);
	if (!ins_data)
		return -ENOMEM;

	ins_sizes = (u32 *)ins_data;
	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));

	for (i = 0; i < nr; i++) {
		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
	}
	/* reserve room for all nr items in the log tree in one shot */
	ret = btrfs_insert_empty_items(trans, log, dst_path,
				       ins_keys, ins_sizes, nr);
	if (ret) {
		kfree(ins_data);
		return ret;
	}

	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
						   dst_path->slots[0]);

		src_offset = btrfs_item_ptr_offset(src, start_slot + i);

		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
			/* rebuild from the vfs inode instead of copying so
			 * the log gets the latest in-memory state */
			inode_item = btrfs_item_ptr(dst_path->nodes[0],
						    dst_path->slots[0],
						    struct btrfs_inode_item);
			fill_inode_item(trans, dst_path->nodes[0], inode_item,
					inode, inode_only == LOG_INODE_EXISTS);
		} else {
			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
					   src_offset, ins_sizes[i]);
		}

		/* take a reference on file data extents so that truncates
		 * or deletes of this inode don't have to relog the inode
		 * again
		 */
		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
		    !skip_csum) {
			int found_type;
			extent = btrfs_item_ptr(src, start_slot + i,
						struct btrfs_file_extent_item);

			/* extents from older transactions already have their
			 * csums safely on disk */
			if (btrfs_file_extent_generation(src, extent) < trans->transid)
				continue;

			found_type = btrfs_file_extent_type(src, extent);
			if (found_type == BTRFS_FILE_EXTENT_REG) {
				u64 ds, dl, cs, cl;
				ds = btrfs_file_extent_disk_bytenr(src,
								extent);
				/* ds == 0 is a hole */
				if (ds == 0)
					continue;

				dl = btrfs_file_extent_disk_num_bytes(src,
								extent);
				cs = btrfs_file_extent_offset(src, extent);
				cl = btrfs_file_extent_num_bytes(src,
								extent);
				/* compressed extents are checksummed over
				 * the whole on-disk extent */
				if (btrfs_file_extent_compression(src,
								  extent)) {
					cs = 0;
					cl = dl;
				}

				ret = btrfs_lookup_csums_range(
						log->fs_info->csum_root,
						ds + cs, ds + cs + cl - 1,
						&ordered_sums, 0);
				BUG_ON(ret);
			}
		}
	}

	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
	btrfs_release_path(dst_path);
	kfree(ins_data);

	/*
	 * we have to do this after the loop above to avoid changing the
	 * log tree while trying to change the log tree.
	 */
	ret = 0;
	while (!list_empty(&ordered_sums)) {
		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
						   struct btrfs_ordered_sum,
						   list);
		/* on error keep draining and freeing the list, but latch
		 * the first error code for the caller */
		if (!ret)
			ret = btrfs_csum_file_blocks(trans, log, sums);
		list_del(&sums->list);
		kfree(sums);
	}
	return ret;
}
3161 
3162 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3163 {
3164 	struct extent_map *em1, *em2;
3165 
3166 	em1 = list_entry(a, struct extent_map, list);
3167 	em2 = list_entry(b, struct extent_map, list);
3168 
3169 	if (em1->start < em2->start)
3170 		return -1;
3171 	else if (em1->start > em2->start)
3172 		return 1;
3173 	return 0;
3174 }
3175 
/*
 * After logging the file extent for @em, walk forward through the log
 * tree and dispose of any file extent items of this inode that fall
 * inside [em->start, em->start + em->len).
 *
 * Items fully covered by the logged range are deleted, batched into
 * contiguous (del_slot, del_nr) runs to minimize btrfs_del_items()
 * calls.  An item that straddles the end of the range is trimmed
 * instead: its key offset is moved up to em->start + em->len and its
 * extent offset / num_bytes adjusted to drop the overlapped front part.
 *
 * @path must already point at the item for @em.  Returns 0 on success
 * or a negative errno.
 */
static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode,
				 struct extent_map *em,
				 struct btrfs_path *path)
{
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	struct btrfs_key key, new_key;
	struct btrfs_map_token token;
	u64 extent_end;
	u64 extent_offset = 0;
	int extent_type;
	int del_slot = 0;
	int del_nr = 0;
	int ret = 0;

	while (1) {
		btrfs_init_map_token(&token);
		leaf = path->nodes[0];
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			/* flush any pending deletions before leaving this
			 * leaf; the slots would be invalid on the next one */
			if (del_nr) {
				ret = btrfs_del_items(trans, root, path,
						      del_slot, del_nr);
				if (ret)
					return ret;
				del_nr = 0;
			}

			ret = btrfs_next_leaf_write(trans, root, path, 1);
			if (ret < 0)
				return ret;
			if (ret > 0)
				return 0;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		/* stop once past this inode's extents or past the range */
		if (key.objectid != btrfs_ino(inode) ||
		    key.type != BTRFS_EXTENT_DATA_KEY ||
		    key.offset >= em->start + em->len)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			extent_offset = btrfs_token_file_extent_offset(leaf,
								fi, &token);
			extent_end = key.offset +
				btrfs_token_file_extent_num_bytes(leaf, fi,
								  &token);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
		} else {
			BUG();
		}

		/* item entirely inside the logged range: queue for delete */
		if (extent_end <= em->len + em->start) {
			if (!del_nr) {
				del_slot = path->slots[0];
			}
			del_nr++;
			continue;
		}

		/*
		 * Ok so we'll ignore previous items if we log a new extent,
		 * which can lead to overlapping extents, so if we have an
		 * existing extent we want to adjust we _have_ to check the next
		 * guy to make sure we even need this extent anymore, this keeps
		 * us from panicing in set_item_key_safe.
		 */
		if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
			struct btrfs_key tmp_key;

			btrfs_item_key_to_cpu(leaf, &tmp_key,
					      path->slots[0] + 1);
			if (tmp_key.objectid == btrfs_ino(inode) &&
			    tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
			    tmp_key.offset <= em->start + em->len) {
				if (!del_nr)
					del_slot = path->slots[0];
				del_nr++;
				continue;
			}
		}

		/* trim the straddling extent: move its start to the end of
		 * the logged range and shrink it accordingly */
		BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
		memcpy(&new_key, &key, sizeof(new_key));
		new_key.offset = em->start + em->len;
		btrfs_set_item_key_safe(trans, root, path, &new_key);
		extent_offset += em->start + em->len - key.offset;
		btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
						   &token);
		btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
						      (em->start + em->len),
						      &token);
		btrfs_mark_buffer_dirty(leaf);
	}

	/* flush any deletions still pending from the final leaf */
	if (del_nr)
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);

	return ret;
}
3284 
/*
 * Insert a file extent item for @em into the log tree and make sure the
 * matching data checksums make it into the log as well.
 *
 * Checksums come from two places: outstanding ordered extents for this
 * inode (we wait for their csums to be recorded and copy them), and the
 * csum tree for whatever part of the modified range remains.  Prealloc
 * extents and holes carry no csums and are skipped.
 *
 * Returns 0 on success or a negative errno.
 */
static int log_one_extent(struct btrfs_trans_handle *trans,
			  struct inode *inode, struct btrfs_root *root,
			  struct extent_map *em, struct btrfs_path *path)
{
	struct btrfs_root *log = root->log_root;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	struct btrfs_ordered_extent *ordered;
	struct list_head ordered_sums;
	struct btrfs_map_token token;
	struct btrfs_key key;
	u64 mod_start = em->mod_start;
	u64 mod_len = em->mod_len;
	u64 csum_offset;
	u64 csum_len;
	u64 extent_offset = em->start - em->orig_start;
	u64 block_len;
	int ret;
	int index = log->log_transid % 2;
	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

insert:
	INIT_LIST_HEAD(&ordered_sums);
	btrfs_init_map_token(&token);
	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = em->start;
	/* keep the path's locks across the insert so the later
	 * drop_adjacent_extents() walk sees a stable leaf */
	path->really_keep_locks = 1;

	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
	if (ret && ret != -EEXIST) {
		path->really_keep_locks = 0;
		return ret;
	}
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	/*
	 * If we are overwriting an inline extent with a real one then we need
	 * to just delete the inline extent as it may not be large enough to
	 * have the entire file_extent_item.
	 */
	if (ret && btrfs_token_file_extent_type(leaf, fi, &token) ==
	    BTRFS_FILE_EXTENT_INLINE) {
		ret = btrfs_del_item(trans, log, path);
		btrfs_release_path(path);
		if (ret) {
			path->really_keep_locks = 0;
			return ret;
		}
		goto insert;
	}

	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
					       &token);
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
		/* prealloc extents have no data, hence no csums */
		skip_csum = true;
		btrfs_set_token_file_extent_type(leaf, fi,
						 BTRFS_FILE_EXTENT_PREALLOC,
						 &token);
	} else {
		btrfs_set_token_file_extent_type(leaf, fi,
						 BTRFS_FILE_EXTENT_REG,
						 &token);
		/* block_start == 0 means a hole: nothing to checksum */
		if (em->block_start == 0)
			skip_csum = true;
	}

	block_len = max(em->block_len, em->orig_block_len);
	if (em->compress_type != BTRFS_COMPRESS_NONE) {
		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
							em->block_start,
							&token);
		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
							   &token);
	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
							em->block_start -
							extent_offset, &token);
		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
							   &token);
	} else {
		/* hole / inline: no on-disk extent to point at */
		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
							   &token);
	}

	btrfs_set_token_file_extent_offset(leaf, fi,
					   em->start - em->orig_start,
					   &token);
	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
						&token);
	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
	btrfs_mark_buffer_dirty(leaf);

	/*
	 * Have to check the extent to the right of us to make sure it doesn't
	 * fall in our current range.  We're ok if the previous extent is in our
	 * range since the recovery stuff will run us in key order and thus just
	 * drop the part we overwrote.
	 */
	ret = drop_adjacent_extents(trans, log, inode, em, path);
	btrfs_release_path(path);
	path->really_keep_locks = 0;
	if (ret) {
		return ret;
	}

	if (skip_csum)
		return 0;

	/*
	 * NOTE(review): csum_offset/csum_len set here are unconditionally
	 * overwritten below before they are used — this looks like dead
	 * code; confirm against upstream history before removing.
	 */
	if (em->compress_type) {
		csum_offset = 0;
		csum_len = block_len;
	}

	/*
	 * First check and see if our csums are on our outstanding ordered
	 * extents.
	 */
again:
	spin_lock_irq(&log->log_extents_lock[index]);
	list_for_each_entry(ordered, &log->logged_list[index], log_list) {
		struct btrfs_ordered_sum *sum;

		if (!mod_len)
			break;

		if (ordered->inode != inode)
			continue;

		/* skip ordered extents that don't overlap the modified range */
		if (ordered->file_offset + ordered->len <= mod_start ||
		    mod_start + mod_len <= ordered->file_offset)
			continue;

		/*
		 * We are going to copy all the csums on this ordered extent, so
		 * go ahead and adjust mod_start and mod_len in case this
		 * ordered extent has already been logged.
		 */
		if (ordered->file_offset > mod_start) {
			if (ordered->file_offset + ordered->len >=
			    mod_start + mod_len)
				mod_len = ordered->file_offset - mod_start;
			/*
			 * If we have this case
			 *
			 * |--------- logged extent ---------|
			 *       |----- ordered extent ----|
			 *
			 * Just don't mess with mod_start and mod_len, we'll
			 * just end up logging more csums than we need and it
			 * will be ok.
			 */
		} else {
			if (ordered->file_offset + ordered->len <
			    mod_start + mod_len) {
				mod_len = (mod_start + mod_len) -
					(ordered->file_offset + ordered->len);
				mod_start = ordered->file_offset +
					ordered->len;
			} else {
				mod_len = 0;
			}
		}

		/*
		 * To keep us from looping for the above case of an ordered
		 * extent that falls inside of the logged extent.
		 */
		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
				     &ordered->flags))
			continue;
		atomic_inc(&ordered->refs);
		spin_unlock_irq(&log->log_extents_lock[index]);
		/*
		 * we've dropped the lock, we must either break or
		 * start over after this.
		 */

		/* block until the ordered extent's csums are all recorded */
		wait_event(ordered->wait, ordered->csum_bytes_left == 0);

		list_for_each_entry(sum, &ordered->list, list) {
			ret = btrfs_csum_file_blocks(trans, log, sum);
			if (ret) {
				btrfs_put_ordered_extent(ordered);
				goto unlocked;
			}
		}
		btrfs_put_ordered_extent(ordered);
		goto again;

	}
	spin_unlock_irq(&log->log_extents_lock[index]);
unlocked:

	if (!mod_len || ret)
		return ret;

	/* whatever is left of the modified range gets its csums from the
	 * csum tree */
	csum_offset = mod_start - em->start;
	csum_len = mod_len;

	/* block start is already adjusted for the file extent offset. */
	ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
				       em->block_start + csum_offset,
				       em->block_start + csum_offset +
				       csum_len - 1, &ordered_sums, 0);
	if (ret)
		return ret;

	while (!list_empty(&ordered_sums)) {
		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
						   struct btrfs_ordered_sum,
						   list);
		/* on error keep freeing the list but latch the error code */
		if (!ret)
			ret = btrfs_csum_file_blocks(trans, log, sums);
		list_del(&sums->list);
		kfree(sums);
	}

	return ret;
}
3511 
3512 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3513 				     struct btrfs_root *root,
3514 				     struct inode *inode,
3515 				     struct btrfs_path *path)
3516 {
3517 	struct extent_map *em, *n;
3518 	struct list_head extents;
3519 	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3520 	u64 test_gen;
3521 	int ret = 0;
3522 	int num = 0;
3523 
3524 	INIT_LIST_HEAD(&extents);
3525 
3526 	write_lock(&tree->lock);
3527 	test_gen = root->fs_info->last_trans_committed;
3528 
3529 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3530 		list_del_init(&em->list);
3531 
3532 		/*
3533 		 * Just an arbitrary number, this can be really CPU intensive
3534 		 * once we start getting a lot of extents, and really once we
3535 		 * have a bunch of extents we just want to commit since it will
3536 		 * be faster.
3537 		 */
3538 		if (++num > 32768) {
3539 			list_del_init(&tree->modified_extents);
3540 			ret = -EFBIG;
3541 			goto process;
3542 		}
3543 
3544 		if (em->generation <= test_gen)
3545 			continue;
3546 		/* Need a ref to keep it from getting evicted from cache */
3547 		atomic_inc(&em->refs);
3548 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3549 		list_add_tail(&em->list, &extents);
3550 		num++;
3551 	}
3552 
3553 	list_sort(NULL, &extents, extent_cmp);
3554 
3555 process:
3556 	while (!list_empty(&extents)) {
3557 		em = list_entry(extents.next, struct extent_map, list);
3558 
3559 		list_del_init(&em->list);
3560 
3561 		/*
3562 		 * If we had an error we just need to delete everybody from our
3563 		 * private list.
3564 		 */
3565 		if (ret) {
3566 			clear_em_logging(tree, em);
3567 			free_extent_map(em);
3568 			continue;
3569 		}
3570 
3571 		write_unlock(&tree->lock);
3572 
3573 		ret = log_one_extent(trans, inode, root, em, path);
3574 		write_lock(&tree->lock);
3575 		clear_em_logging(tree, em);
3576 		free_extent_map(em);
3577 	}
3578 	WARN_ON(!list_empty(&extents));
3579 	write_unlock(&tree->lock);
3580 
3581 	btrfs_release_path(path);
3582 	return ret;
3583 }
3584 
/* log a single inode in the tree log.
 * At least one parent directory for this inode must exist in the tree
 * or be logged already.
 *
 * Any items from this inode changed by the current transaction are copied
 * to the log tree.  An extra reference is taken on any extents in this
 * file, allowing us to avoid a whole pile of corner cases around logging
 * blocks that have been removed from the tree.
 *
 * See LOG_INODE_ALL and related defines for a description of what inode_only
 * does.
 *
 * This handles both files and directories.
 */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct inode *inode,
			     int inode_only)
{
	struct btrfs_path *path;
	struct btrfs_path *dst_path;
	struct btrfs_key min_key;
	struct btrfs_key max_key;
	struct btrfs_root *log = root->log_root;
	struct extent_buffer *src = NULL;
	int err = 0;
	int ret;
	int nritems;
	int ins_start_slot = 0;
	int ins_nr;
	bool fast_search = false;
	u64 ino = btrfs_ino(inode);

	/* NOTE(review): redundant — log was already initialized above */
	log = root->log_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	dst_path = btrfs_alloc_path();
	if (!dst_path) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	/* key range covering every item belonging to this inode */
	min_key.objectid = ino;
	min_key.type = BTRFS_INODE_ITEM_KEY;
	min_key.offset = 0;

	max_key.objectid = ino;


	/* today the code can only do partial logging of directories */
	if (S_ISDIR(inode->i_mode) ||
	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
		       &BTRFS_I(inode)->runtime_flags) &&
	     inode_only == LOG_INODE_EXISTS))
		max_key.type = BTRFS_XATTR_ITEM_KEY;
	else
		max_key.type = (u8)-1;
	max_key.offset = (u64)-1;

	/* Only run delayed items if we are a dir or a new file */
	if (S_ISDIR(inode->i_mode) ||
	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
		ret = btrfs_commit_inode_delayed_items(trans, inode);
		if (ret) {
			btrfs_free_path(path);
			btrfs_free_path(dst_path);
			return ret;
		}
	}

	mutex_lock(&BTRFS_I(inode)->log_mutex);

	btrfs_get_logged_extents(log, inode);

	/*
	 * a brute force approach to making sure we get the most uptodate
	 * copies of everything.
	 */
	if (S_ISDIR(inode->i_mode)) {
		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;

		if (inode_only == LOG_INODE_EXISTS)
			max_key_type = BTRFS_XATTR_ITEM_KEY;
		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
	} else {
		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				       &BTRFS_I(inode)->runtime_flags)) {
			/* full sync requested: wipe this inode from the log
			 * and re-copy everything below */
			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
				  &BTRFS_I(inode)->runtime_flags);
			ret = btrfs_truncate_inode_items(trans, log,
							 inode, 0, 0);
		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
					      &BTRFS_I(inode)->runtime_flags)) {
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			max_key.type = BTRFS_XATTR_ITEM_KEY;
			ret = drop_objectid_items(trans, log, path, ino,
						  max_key.type);
		} else {
			/* nothing but the inode item changed; log it and
			 * jump straight to the extent logging */
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			ret = log_inode_item(trans, log, dst_path, inode);
			if (ret) {
				err = ret;
				goto out_unlock;
			}
			goto log_extents;
		}

	}
	if (ret) {
		err = ret;
		goto out_unlock;
	}
	path->keep_locks = 1;

	/* walk all items of this inode changed in this transaction and
	 * batch-copy contiguous slot runs into the log */
	while (1) {
		ins_nr = 0;
		ret = btrfs_search_forward(root, &min_key, &max_key,
					   path, trans->transid);
		if (ret != 0)
			break;
again:
		/* note, ins_nr might be > 0 here, cleanup outside the loop */
		if (min_key.objectid != ino)
			break;
		if (min_key.type > max_key.type)
			break;

		src = path->nodes[0];
		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
			/* still contiguous: extend the pending batch */
			ins_nr++;
			goto next_slot;
		} else if (!ins_nr) {
			ins_start_slot = path->slots[0];
			ins_nr = 1;
			goto next_slot;
		}

		/* gap in slots: flush the batch, start a new one here */
		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
				 ins_nr, inode_only);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
		ins_nr = 1;
		ins_start_slot = path->slots[0];
next_slot:

		nritems = btrfs_header_nritems(path->nodes[0]);
		path->slots[0]++;
		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
					      path->slots[0]);
			goto again;
		}
		/* leaf exhausted: flush the batch before moving on */
		if (ins_nr) {
			ret = copy_items(trans, inode, dst_path, src,
					 ins_start_slot,
					 ins_nr, inode_only);
			if (ret) {
				err = ret;
				goto out_unlock;
			}
			ins_nr = 0;
		}
		btrfs_release_path(path);

		/* advance min_key just past the last item we saw */
		if (min_key.offset < (u64)-1)
			min_key.offset++;
		else if (min_key.type < (u8)-1)
			min_key.type++;
		else if (min_key.objectid < (u64)-1)
			min_key.objectid++;
		else
			break;
	}
	if (ins_nr) {
		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
				 ins_nr, inode_only);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
		ins_nr = 0;
	}

log_extents:
	if (fast_search) {
		btrfs_release_path(dst_path);
		ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	} else {
		/* not logging extents: just clear the modified list */
		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
		struct extent_map *em, *n;

		write_lock(&tree->lock);
		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
			list_del_init(&em->list);
		write_unlock(&tree->lock);
	}

	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
		btrfs_release_path(path);
		btrfs_release_path(dst_path);
		ret = log_directory_changes(trans, root, inode, path, dst_path);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	}
	/* remember that this inode is now fully represented in the log */
	BTRFS_I(inode)->logged_trans = trans->transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
	if (err)
		btrfs_free_logged_extents(log, log->log_transid);
	mutex_unlock(&BTRFS_I(inode)->log_mutex);

	btrfs_free_path(path);
	btrfs_free_path(dst_path);
	return err;
}
3811 
/*
 * follow the dentry parent pointers up the chain and see if any
 * of the directories in it require a full commit before they can
 * be logged.  Returns zero if nothing special needs to be done or 1 if
 * a full commit is required.
 */
static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
					       struct inode *inode,
					       struct dentry *parent,
					       struct super_block *sb,
					       u64 last_committed)
{
	int ret = 0;
	struct btrfs_root *root;
	struct dentry *old_parent = NULL;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all.  This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
			goto out;

	/* for non-directories, start the walk at the parent directory */
	if (!S_ISDIR(inode->i_mode)) {
		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			goto out;
		inode = parent->d_inode;
	}

	while (1) {
		BTRFS_I(inode)->logged_trans = trans->transid;
		smp_mb();

		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
			root = BTRFS_I(inode)->root;

			/*
			 * make sure any commits to the log are forced
			 * to be full commits
			 */
			root->fs_info->last_trans_log_full_commit =
				trans->transid;
			ret = 1;
			break;
		}

		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			break;

		if (IS_ROOT(parent))
			break;

		/* climb one level, dropping the ref on the previous parent */
		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
		inode = parent->d_inode;

	}
	dput(old_parent);
out:
	return ret;
}
3878 
/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log.  A minimal inode and backref
 * only logging is done of any parent directories that are older than
 * the last committed transaction
 *
 * Returns 0 when the inode (and any needed parents) were logged,
 * BTRFS_NO_LOG_SYNC when the inode is already in the log, or 1 when the
 * caller must fall back to a full transaction commit.
 */
int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct inode *inode,
		    struct dentry *parent, int exists_only)
{
	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
	struct super_block *sb;
	struct dentry *old_parent = NULL;
	int ret = 0;
	u64 last_committed = root->fs_info->last_trans_committed;

	sb = inode->i_sb;

	/* tree log disabled by mount option: force a full commit */
	if (btrfs_test_opt(root, NOTREELOG)) {
		ret = 1;
		goto end_no_trans;
	}

	/* something already demanded a full commit this transaction */
	if (root->fs_info->last_trans_log_full_commit >
	    root->fs_info->last_trans_committed) {
		ret = 1;
		goto end_no_trans;
	}

	if (root != BTRFS_I(inode)->root ||
	    btrfs_root_refs(&root->root_item) == 0) {
		ret = 1;
		goto end_no_trans;
	}

	ret = check_parent_dirs_for_sync(trans, inode, parent,
					 sb, last_committed);
	if (ret)
		goto end_no_trans;

	if (btrfs_inode_in_log(inode, trans->transid)) {
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root);
	if (ret)
		goto end_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only);
	if (ret)
		goto end_trans;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all.  This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
		ret = 0;
		goto end_trans;
	}

	/* walk up the dentry chain, logging each new parent minimally */
	inode_only = LOG_INODE_EXISTS;
	while (1) {
		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			break;

		inode = parent->d_inode;
		if (root != BTRFS_I(inode)->root)
			break;

		if (BTRFS_I(inode)->generation >
		    root->fs_info->last_trans_committed) {
			ret = btrfs_log_inode(trans, root, inode, inode_only);
			if (ret)
				goto end_trans;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	ret = 0;
end_trans:
	dput(old_parent);
	if (ret < 0) {
		/* a logging error means the log is unusable: force a full
		 * commit and report 1 to the caller */
		root->fs_info->last_trans_log_full_commit = trans->transid;
		ret = 1;
	}
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}
3978 
3979 /*
3980  * it is not safe to log dentry if the chunk root has added new
3981  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
3982  * If this returns 1, you must commit the transaction to safely get your
3983  * data on disk.
3984  */
3985 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3986 			  struct btrfs_root *root, struct dentry *dentry)
3987 {
3988 	struct dentry *parent = dget_parent(dentry);
3989 	int ret;
3990 
3991 	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3992 	dput(parent);
3993 
3994 	return ret;
3995 }
3996 
/*
 * should be called during mount to recover any replay any log trees
 * from the FS
 *
 * Walks the log root tree for every per-subvolume log root and replays
 * it into its target fs root in three passes (pin, replay inodes,
 * replay everything), then commits the transaction to unpin the blocks.
 * Returns 0 on success or a negative errno.
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key tmp_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = 0,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	fs_info->log_root_recovering = 1;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_error(fs_info, ret, "Failed to pin buffers while "
			    "recovering log root tree.");
		goto error;
	}

again:
	/* iterate log roots from the highest key downwards */
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_error(fs_info, ret,
				    "Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_fs_root_no_radix(log_root_tree,
						  &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_error(fs_info, ret,
				    "Couldn't read tree log root.");
			goto error;
		}

		/* the log root's key offset names the subvolume it logs */
		tmp_key.objectid = found_key.offset;
		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
		tmp_key.offset = (u64)-1;

		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);
			btrfs_error(fs_info, ret, "Couldn't read target root "
				    "for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);
		BUG_ON(ret);

		if (wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
			BUG_ON(ret);
		}

		key.offset = found_key.offset - 1;
		wc.replay_dest->log_root = NULL;
		free_extent_buffer(log->node);
		free_extent_buffer(log->commit_root);
		kfree(log);

		if (found_key.offset == 0)
			break;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	free_extent_buffer(log_root_tree->node);
	log_root_tree->log_root = NULL;
	fs_info->log_root_recovering = 0;

	/* step 4: commit the transaction, which also unpins the blocks */
	btrfs_commit_transaction(trans, fs_info->tree_root);

	kfree(log_root_tree);
	return 0;

error:
	/*
	 * NOTE(review): error paths reached after btrfs_start_transaction()
	 * succeeds never end or abort 'trans' — confirm whether the handle
	 * should be cleaned up here (later upstream code aborts it).
	 */
	btrfs_free_path(path);
	return ret;
}
4134 
4135 /*
4136  * there are some corner cases where we want to force a full
4137  * commit instead of allowing a directory to be logged.
4138  *
4139  * They revolve around files there were unlinked from the directory, and
4140  * this function updates the parent directory so that a full commit is
4141  * properly done if it is fsync'd later after the unlinks are done.
4142  */
4143 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
4144 			     struct inode *dir, struct inode *inode,
4145 			     int for_rename)
4146 {
4147 	/*
4148 	 * when we're logging a file, if it hasn't been renamed
4149 	 * or unlinked, and its inode is fully committed on disk,
4150 	 * we don't have to worry about walking up the directory chain
4151 	 * to log its parents.
4152 	 *
4153 	 * So, we use the last_unlink_trans field to put this transid
4154 	 * into the file.  When the file is logged we check it and
4155 	 * don't log the parents if the file is fully on disk.
4156 	 */
4157 	if (S_ISREG(inode->i_mode))
4158 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
4159 
4160 	/*
4161 	 * if this directory was already logged any new
4162 	 * names for this file/dir will get recorded
4163 	 */
4164 	smp_mb();
4165 	if (BTRFS_I(dir)->logged_trans == trans->transid)
4166 		return;
4167 
4168 	/*
4169 	 * if the inode we're about to unlink was logged,
4170 	 * the log will be properly updated for any new names
4171 	 */
4172 	if (BTRFS_I(inode)->logged_trans == trans->transid)
4173 		return;
4174 
4175 	/*
4176 	 * when renaming files across directories, if the directory
4177 	 * there we're unlinking from gets fsync'd later on, there's
4178 	 * no way to find the destination directory later and fsync it
4179 	 * properly.  So, we have to be conservative and force commits
4180 	 * so the new name gets discovered.
4181 	 */
4182 	if (for_rename)
4183 		goto record;
4184 
4185 	/* we can safely do the unlink without any special recording */
4186 	return;
4187 
4188 record:
4189 	BTRFS_I(dir)->last_unlink_trans = trans->transid;
4190 }
4191 
4192 /*
4193  * Call this after adding a new name for a file and it will properly
4194  * update the log to reflect the new name.
4195  *
4196  * It will return zero if all goes well, and it will return 1 if a
4197  * full transaction commit is required.
4198  */
4199 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4200 			struct inode *inode, struct inode *old_dir,
4201 			struct dentry *parent)
4202 {
4203 	struct btrfs_root * root = BTRFS_I(inode)->root;
4204 
4205 	/*
4206 	 * this will force the logging code to walk the dentry chain
4207 	 * up for the file
4208 	 */
4209 	if (S_ISREG(inode->i_mode))
4210 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
4211 
4212 	/*
4213 	 * if this inode hasn't been logged and directory we're renaming it
4214 	 * from hasn't been logged, we don't need to log it
4215 	 */
4216 	if (BTRFS_I(inode)->logged_trans <=
4217 	    root->fs_info->last_trans_committed &&
4218 	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
4219 		    root->fs_info->last_trans_committed))
4220 		return 0;
4221 
4222 	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
4223 }
4224 
4225