xref: /openbmc/linux/fs/btrfs/tree-log.c (revision 82ced6fd)
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "compat.h"
#include "tree-log.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory (one way to
 *      keep such a record is sketched below this comment)
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit.
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one.
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */
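
/*
 * Editor's sketch (illustrative, not part of this file): the per-directory
 * "transid of last unlink/rename" record from case 1 above amounts to
 * stamping the directory's in-memory inode whenever an unlink or rename
 * leaves it; the helper name below is hypothetical.
 *
 *	static void record_unlink_trans(struct inode *dir, u64 transid)
 *	{
 *		// fsync of this dir must force a full commit if it sees
 *		// a last_unlink_trans from the running transaction
 *		BTRFS_I(dir)->last_unlink_trans = transid;
 *	}
 */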

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_ALL 2

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct inode *inode,
			     int inode_only);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
 */
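
/*
 * Editor's sketch (illustrative, not part of this file): the rough shape of
 * an fsync through the log, assuming the caller drives it the way the fsync
 * path does; locking and error handling omitted.
 *
 *	start_log_trans(trans, root);		// join or create the log tree
 *	btrfs_log_inode(trans, root, inode, LOG_INODE_ALL);
 *	btrfs_end_log_trans(root);		// drop our writer count
 *	btrfs_sync_log(trans, root);		// commit the log tree to disk
 */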

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make anyone
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root)
{
	int ret;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		root->log_batch++;
		atomic_inc(&root->log_writers);
		mutex_unlock(&root->log_mutex);
		return 0;
	}
	mutex_lock(&root->fs_info->tree_log_mutex);
	if (!root->fs_info->log_root_tree) {
		ret = btrfs_init_log_root_tree(trans, root->fs_info);
		BUG_ON(ret);
	}
	if (!root->log_root) {
		ret = btrfs_add_log_tree(trans, root);
		BUG_ON(ret);
	}
	mutex_unlock(&root->fs_info->tree_log_mutex);
	root->log_batch++;
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return 0;
}
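
/*
 * Editor's note (illustrative): every successful start_log_trans() or
 * join_running_log_trans() must be paired with btrfs_end_log_trans() so
 * log_writers can drop to zero and a concurrent sync can make progress:
 *
 *	ret = join_running_log_trans(root);
 *	if (ret == 0) {
 *		... log or unlog items ...
 *		btrfs_end_log_trans(root);
 *	}
 */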

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if no transaction was in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return 0;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
int btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		smp_mb();
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
	return 0;
}
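
/*
 * Editor's sketch (illustrative, not part of this file): rename is the
 * typical user of the pin/end pair, holding the log open while names move:
 *
 *	btrfs_pin_log_trans(root);
 *	... update the directory items in the subvolume ...
 *	btrfs_end_log_trans(root);
 */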

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};
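
/*
 * Editor's sketch (illustrative): freeing a log tree at commit time sets up
 * walk_control roughly like this, with process_one_buffer() below as the
 * callback:
 *
 *	struct walk_control wc = {
 *		.free = 1,
 *		.process_func = process_one_buffer
 *	};
 *
 *	walk_log_tree(trans, log, &wc);
 */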

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	if (wc->pin)
		btrfs_update_pinned_extents(log->fs_info->extent_root,
					    eb->start, eb->len, 1);

	if (btrfs_buffer_uptodate(eb, gen)) {
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return 0;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(root, path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(root, path);
			return 0;
		}

	}
insert:
	btrfs_release_path(root, path);
	/* try to insert the key into the destination tree */
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size) {
			btrfs_truncate_item(trans, root, path, item_size, 1);
		} else if (found_size < item_size) {
			ret = btrfs_extend_item(trans, root, path,
						item_size - found_size);
			BUG_ON(ret);
		}
	} else if (ret) {
		BUG();
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0)
			goto no_copy;

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(root, path);
	return 0;
}
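
/*
 * Editor's note (illustrative): overwrite_item() is the whole replay step
 * for the simple item types; xattrs, for example, are brought over with a
 * single call from replay_one_buffer():
 *
 *	ret = overwrite_item(wc->trans, root, path, eb, i, &key);
 */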

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;
	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
	if (inode->i_state & I_NEW) {
		BTRFS_I(inode)->root = root;
		BTRFS_I(inode)->location.objectid = objectid;
		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
		BTRFS_I(inode)->location.offset = 0;
		btrfs_read_locked_inode(inode);
		unlock_new_inode(inode);
	}
	if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 mask = root->sectorsize - 1;
	u64 extent_end;
	u64 alloc_hint;
	u64 start = key->offset;
	u64 saved_nbytes;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC)
		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, item);
		extent_end = (start + size + mask) & ~mask;
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(root, path);
			goto out;
		}
	}
	btrfs_release_path(root, path);

	saved_nbytes = inode_get_bytes(inode);
	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode,
			 start, extent_end, extent_end, start, &alloc_hint);
	BUG_ON(ret);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		unsigned long dest_offset;
		struct btrfs_key ins;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		BUG_ON(ret);
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				(unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_extent(root, ins.objectid,
						ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						path->nodes[0]->start,
						root->root_key.objectid,
						trans->transid, key->objectid);
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_extent(trans, root,
						path->nodes[0]->start,
						root->root_key.objectid,
						trans->transid, key->objectid,
						&ins);
				BUG_ON(ret);
			}
			btrfs_release_path(root, path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums);
			BUG_ON(ret);
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				ret = btrfs_csum_file_blocks(trans,
						root->fs_info->csum_root,
						sums);
				BUG_ON(ret);
				list_del(&sums->list);
				kfree(sums);
			}
		} else {
			btrfs_release_path(root, path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		BUG_ON(ret);
	}

	inode_set_bytes(inode, saved_nbytes);
	btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(root, path);

	inode = read_one_inode(root, location.objectid);
	BUG_ON(!inode);

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	BUG_ON(ret);

	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	BUG_ON(ret);
	kfree(name);

	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(root, path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(root, path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}
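
/*
 * Editor's note (illustrative): the loop above walks the packed payload of
 * an inode ref item, where each name is stored inline after its header:
 *
 *	[ btrfs_inode_ref ][ name bytes ][ btrfs_inode_ref ][ name bytes ] ...
 *
 * which is why the cursor advances by sizeof(*ref) + name_len on each pass.
 */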

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir;
	int ret;
	struct btrfs_key location;
	struct btrfs_inode_ref *ref;
	struct btrfs_dir_item *di;
	struct inode *inode;
	char *name;
	int namelen;
	unsigned long ref_ptr;
	unsigned long ref_end;

	location.objectid = key->objectid;
	location.type = BTRFS_INODE_ITEM_KEY;
	location.offset = 0;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, key->offset);
	if (!dir)
		return -ENOENT;

	inode = read_one_inode(root, key->objectid);
	BUG_ON(!inode);

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

again:
	ref = (struct btrfs_inode_ref *)ref_ptr;

	namelen = btrfs_inode_ref_name_len(eb, ref);
	name = kmalloc(namelen, GFP_NOFS);
	BUG_ON(!name);

	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);

	/* if we already have a perfect match, we're done */
	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
			 btrfs_inode_ref_index(eb, ref),
			 name, namelen)) {
		goto out;
	}

	/*
	 * look for a conflicting back reference in the metadata.
	 * if we find one we have to unlink that name of the file
	 * before we add our new link.  Later on, we overwrite any
	 * existing back reference, and we don't want to create
	 * dangling pointers in the directory.
	 */
conflict_again:
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		char *victim_name;
		int victim_name_len;
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;
		struct extent_buffer *leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (key->objectid == key->offset)
			goto out_nowrite;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			BUG_ON(!victim_name);

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log, key, victim_name,
					    victim_name_len)) {
				btrfs_inc_nlink(inode);
				btrfs_release_path(root, path);

				ret = btrfs_unlink_inode(trans, root, dir,
							 inode, victim_name,
							 victim_name_len);
				kfree(victim_name);
				btrfs_release_path(root, path);
				goto conflict_again;
			}
			kfree(victim_name);
			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}
		BUG_ON(ret);
	}
	btrfs_release_path(root, path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
					 btrfs_inode_ref_index(eb, ref),
					 name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		BUG_ON(ret);
	}
	btrfs_release_path(root, path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		BUG_ON(ret);
	}
	btrfs_release_path(root, path);

	/* insert our name */
	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
			     btrfs_inode_ref_index(eb, ref));
	BUG_ON(ret);

	btrfs_update_inode(trans, root, inode);

out:
	ref_ptr = (unsigned long)(ref + 1) + namelen;
	kfree(name);
	if (ref_ptr < ref_end)
		goto again;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
	BUG_ON(ret);

out_nowrite:
	btrfs_release_path(root, path);
	iput(dir);
	iput(inode);
	return 0;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	u64 nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;

	key.objectid = inode->i_ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	path = btrfs_alloc_path();

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != inode->i_ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		key.offset--;
		btrfs_release_path(root, path);
	}
	btrfs_release_path(root, path);
	if (nlink != inode->i_nlink) {
		inode->i_nlink = nlink;
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
		ret = replay_dir_deletes(trans, root, NULL, path,
					 inode->i_ino, 1);
		BUG_ON(ret);
	}
	btrfs_free_path(path);

	return 0;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		BUG_ON(ret);

		btrfs_release_path(root, path);
		inode = read_one_inode(root, key.offset);
		BUG_ON(!inode);

		ret = fixup_inode_link_count(trans, root, inode);
		BUG_ON(ret);

		iput(inode);

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(root, path);
	return 0;
}

/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	BUG_ON(!inode);

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(root, path);
	if (ret == 0) {
		btrfs_inc_nlink(inode);
		btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG();
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}
	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret;

	dir = read_one_inode(root, key->objectid);
	BUG_ON(!dir);

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
		   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(root, path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		BUG();
	}
	if (!dst_di || IS_ERR(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
	BUG_ON(ret);

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(root, path);
	kfree(name);
	iput(dir);
	return 0;

insert:
	btrfs_release_path(root, path);
	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
			      name, name_len, log_type, &log_key);

	BUG_ON(ret && ret != -ENOENT);
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		BUG_ON(ret);
		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	return 0;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log,
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(root, path);
	return ret;
}
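
/*
 * Editor's example (illustrative): a dir_log item with key
 * (dirid, BTRFS_DIR_LOG_INDEX_KEY, 3) whose btrfs_dir_log_end() is 10 says
 * the log is authoritative for index offsets 3 through 10, so
 *
 *	u64 start = 3, end;
 *	ret = find_dir_range(log, path, dirid, BTRFS_DIR_LOG_INDEX_KEY,
 *			     &start, &end);
 *
 * returns 0 with start == 3 and end == 10, and replay may delete any
 * subvolume entry in that index range that is missing from the log.
 */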

/*
 * this looks for a given directory item in the log.  If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{
	int ret;
	struct extent_buffer *eb;
	int slot;
	u32 item_size;
	struct btrfs_dir_item *di;
	struct btrfs_dir_item *log_di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	char *name;
	struct inode *inode;
	struct btrfs_key location;

again:
	eb = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size_nr(eb, slot);
	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		name_len = btrfs_dir_name_len(eb, di);
		name = kmalloc(name_len, GFP_NOFS);
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		read_extent_buffer(eb, name, (unsigned long)(di + 1),
				  name_len);
		log_di = NULL;
		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
			log_di = btrfs_lookup_dir_item(trans, log, log_path,
						       dir_key->objectid,
						       name, name_len, 0);
		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
			log_di = btrfs_lookup_dir_index_item(trans, log,
						     log_path,
						     dir_key->objectid,
						     dir_key->offset,
						     name, name_len, 0);
		}
		if (!log_di || IS_ERR(log_di)) {
			btrfs_dir_item_key_to_cpu(eb, di, &location);
			btrfs_release_path(root, path);
			btrfs_release_path(log, log_path);
			inode = read_one_inode(root, location.objectid);
			BUG_ON(!inode);

			ret = link_to_fixup_dir(trans, root,
						path, location.objectid);
			BUG_ON(ret);
			btrfs_inc_nlink(inode);
			ret = btrfs_unlink_inode(trans, root, dir, inode,
						 name, name_len);
			BUG_ON(ret);
			kfree(name);
			iput(inode);

			/* there might still be more names under this key
			 * check and repeat if required
			 */
			ret = btrfs_search_slot(NULL, root, dir_key, path,
						0, 0);
			if (ret == 0)
				goto again;
			ret = 0;
			goto out;
		}
		btrfs_release_path(log, log_path);
		kfree(name);

		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	ret = 0;
out:
	btrfs_release_path(root, path);
	btrfs_release_path(log, log_path);
	return ret;
}

/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that the log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all)
{
	u64 range_start;
	u64 range_end;
	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_ITEM_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}
again:
	range_start = 0;
	range_end = 0;
	while (1) {
		if (del_all)
			range_end = (u64)-1;
		else {
			ret = find_dir_range(log, path, dirid, key_type,
					     &range_start, &range_end);
			if (ret != 0)
				break;
		}

		dir_key.offset = range_start;
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type)
				goto next_type;

			if (found_key.offset > range_end)
				break;

			ret = check_item_in_log(trans, root, log, path,
						log_path, dir,
						&found_key);
			BUG_ON(ret);
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(root, path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}

next_type:
	ret = 0;
	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
		key_type = BTRFS_DIR_LOG_INDEX_KEY;
		dir_key.type = BTRFS_DIR_INDEX_KEY;
		btrfs_release_path(root, path);
		goto again;
	}
out:
	btrfs_release_path(root, path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}

/*
 * the process_func used to replay items from the log tree.  This
 * gets called in two different stages.  The first stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The second stage copies all the other item types from the log into
 * the subvolume.  The two stage approach is slower, but gets rid of
 * lots of complexity around inodes referencing other inodes that exist
 * only in the log (references come from either directory items or inode
 * back refs).
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	u32 item_size;
	int level;
	int i;
	int ret;

	btrfs_read_buffer(eb, gen);

	level = btrfs_header_level(eb);

	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		item_size = btrfs_item_size_nr(eb, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct inode *inode;
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
					    struct btrfs_inode_item);
			mode = btrfs_inode_mode(eb, inode_item);
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid, 0);
				BUG_ON(ret);
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);

			/* for regular files, truncate away
			 * extents past the new EOF
			 */
			if (S_ISREG(mode)) {
				inode = read_one_inode(root,
						       key.objectid);
				BUG_ON(!inode);

				ret = btrfs_truncate_inode_items(wc->trans,
					root, inode, inode->i_size,
					BTRFS_EXTENT_DATA_KEY);
				BUG_ON(ret);

				/* if the nlink count is zero here, the iput
				 * will free the inode.  We bump it to make
				 * sure it doesn't get freed until the link
				 * count fixup is done
				 */
				if (inode->i_nlink == 0) {
					btrfs_inc_nlink(inode);
					btrfs_update_inode(wc->trans,
							   root, inode);
				}
				iput(inode);
			}
			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			BUG_ON(ret);
		}
		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_INODE_REF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			BUG_ON(ret && ret != -ENOENT);
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
						eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
			   key.type == BTRFS_DIR_INDEX_KEY) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			BUG_ON(ret);
		}
	}
	btrfs_free_path(path);
	return 0;
}
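
/*
 * Editor's sketch (illustrative, not part of this file): log recovery drives
 * replay_one_buffer() by re-walking each log tree once per stage, roughly:
 *
 *	struct walk_control wc = {
 *		.process_func = replay_one_buffer,
 *		.stage = LOG_WALK_REPLAY_INODES
 *	};
 *
 *	while (1) {
 *		walk_log_tree(trans, log, &wc);
 *		if (wc.stage == LOG_WALK_REPLAY_ALL)
 *			break;
 *		wc.stage++;
 *	}
 *
 * so inodes exist before the directory items and back references that
 * point at them are copied in.
 */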

static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, int *level,
				   struct walk_control *wc)
{
	u64 root_owner;
	u64 root_gen;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		if (btrfs_header_level(cur) != *level)
			WARN_ON(1);

		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);

		parent = path->nodes[*level];
		root_owner = btrfs_header_owner(parent);
		root_gen = btrfs_header_generation(parent);

		next = btrfs_find_create_tree_block(root, bytenr, blocksize);

		wc->process_func(root, next, wc, ptr_gen);

		if (*level == 1) {
			path->slots[*level]++;
			if (wc->free) {
				btrfs_read_buffer(next, ptr_gen);

				btrfs_tree_lock(next);
				clean_tree_block(trans, root, next);
				btrfs_set_lock_blocking(next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				ret = btrfs_drop_leaf_ref(trans, root, next);
				BUG_ON(ret);

				WARN_ON(root_owner !=
					BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_reserved_extent(root,
							 bytenr, blocksize);
				BUG_ON(ret);
			}
			free_extent_buffer(next);
			continue;
		}
		btrfs_read_buffer(next, ptr_gen);

		WARN_ON(*level <= 0);
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	if (path->nodes[*level] == root->node)
		parent = path->nodes[*level];
	else
		parent = path->nodes[*level + 1];

	bytenr = path->nodes[*level]->start;

	blocksize = btrfs_level_size(root, *level);
	root_owner = btrfs_header_owner(parent);
	root_gen = btrfs_header_generation(parent);

	wc->process_func(root, path->nodes[*level], wc,
			 btrfs_header_generation(path->nodes[*level]));

	if (wc->free) {
		next = path->nodes[*level];
		btrfs_tree_lock(next);
		clean_tree_block(trans, root, next);
		btrfs_set_lock_blocking(next);
		btrfs_wait_tree_block_writeback(next);
		btrfs_tree_unlock(next);

		if (*level == 0) {
			ret = btrfs_drop_leaf_ref(trans, root, next);
			BUG_ON(ret);
		}
		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
		ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
		BUG_ON(ret);
	}
	free_extent_buffer(path->nodes[*level]);
	path->nodes[*level] = NULL;
	*level += 1;

	cond_resched();
	return 0;
}

static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path, int *level,
				 struct walk_control *wc)
{
	u64 root_owner;
	u64 root_gen;
	int i;
	int slot;
	int ret;

	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
			struct extent_buffer *node;
			node = path->nodes[i];
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			return 0;
		} else {
			struct extent_buffer *parent;
			if (path->nodes[*level] == root->node)
				parent = path->nodes[*level];
			else
				parent = path->nodes[*level + 1];

			root_owner = btrfs_header_owner(parent);
			root_gen = btrfs_header_generation(parent);
			wc->process_func(root, path->nodes[*level], wc,
				 btrfs_header_generation(path->nodes[*level]));
			if (wc->free) {
				struct extent_buffer *next;

				next = path->nodes[*level];

				btrfs_tree_lock(next);
				clean_tree_block(trans, root, next);
				btrfs_set_lock_blocking(next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				if (*level == 0) {
					ret = btrfs_drop_leaf_ref(trans, root,
								  next);
					BUG_ON(ret);
				}

				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_reserved_extent(root,
						path->nodes[*level]->start,
						path->nodes[*level]->len);
				BUG_ON(ret);
			}
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}
1833 
1834 /*
1835  * drop the reference count on the tree rooted at 'log'.  This traverses
1836  * the tree freeing any blocks that have a ref count of zero after being
1837  * decremented.
1838  */
1839 static int walk_log_tree(struct btrfs_trans_handle *trans,
1840 			 struct btrfs_root *log, struct walk_control *wc)
1841 {
1842 	int ret = 0;
1843 	int wret;
1844 	int level;
1845 	struct btrfs_path *path;
1846 	int i;
1847 	int orig_level;
1848 
1849 	path = btrfs_alloc_path();
1850 	BUG_ON(!path);
1851 
1852 	level = btrfs_header_level(log->node);
1853 	orig_level = level;
1854 	path->nodes[level] = log->node;
1855 	extent_buffer_get(log->node);
1856 	path->slots[level] = 0;
1857 
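	/*
	 * classic two phase walk (a summary of the loop below):
	 * walk_down_log_tree descends and visits child blocks until it
	 * runs out of lower levels, walk_up_log_tree pops finished
	 * nodes and advances to the next slot.  A return value > 0
	 * from either means the whole tree has been covered.
	 */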
1858 	while (1) {
1859 		wret = walk_down_log_tree(trans, log, path, &level, wc);
1860 		if (wret > 0)
1861 			break;
1862 		if (wret < 0)
1863 			ret = wret;
1864 
1865 		wret = walk_up_log_tree(trans, log, path, &level, wc);
1866 		if (wret > 0)
1867 			break;
1868 		if (wret < 0)
1869 			ret = wret;
1870 	}
1871 
1872 	/* was the root node processed? if not, catch it here */
1873 	if (path->nodes[orig_level]) {
1874 		wc->process_func(log, path->nodes[orig_level], wc,
1875 			 btrfs_header_generation(path->nodes[orig_level]));
1876 		if (wc->free) {
1877 			struct extent_buffer *next;
1878 
1879 			next = path->nodes[orig_level];
1880 
1881 			btrfs_tree_lock(next);
1882 			clean_tree_block(trans, log, next);
1883 			btrfs_set_lock_blocking(next);
1884 			btrfs_wait_tree_block_writeback(next);
1885 			btrfs_tree_unlock(next);
1886 
1887 			if (orig_level == 0) {
1888 				ret = btrfs_drop_leaf_ref(trans, log,
1889 							  next);
1890 				BUG_ON(ret);
1891 			}
1892 			WARN_ON(log->root_key.objectid !=
1893 				BTRFS_TREE_LOG_OBJECTID);
1894 			ret = btrfs_free_reserved_extent(log, next->start,
1895 							 next->len);
1896 			BUG_ON(ret);
1897 		}
1898 	}
1899 
1900 	for (i = 0; i <= orig_level; i++) {
1901 		if (path->nodes[i]) {
1902 			free_extent_buffer(path->nodes[i]);
1903 			path->nodes[i] = NULL;
1904 		}
1905 	}
1906 	btrfs_free_path(path);
1907 	return ret;
1908 }
1909 
1910 /*
1911  * helper function to update the item for a given subvolumes log root
1912  * in the tree of log roots
1913  */
1914 static int update_log_root(struct btrfs_trans_handle *trans,
1915 			   struct btrfs_root *log)
1916 {
1917 	int ret;
1918 
1919 	if (log->log_transid == 1) {
1920 		/* insert root item on the first sync */
1921 		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
1922 				&log->root_key, &log->root_item);
1923 	} else {
1924 		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
1925 				&log->root_key, &log->root_item);
1926 	}
1927 	return ret;
1928 }
1929 
1930 static int wait_log_commit(struct btrfs_trans_handle *trans,
1931 			   struct btrfs_root *root, unsigned long transid)
1932 {
1933 	DEFINE_WAIT(wait);
1934 	int index = transid % 2;
1935 
1936 	/*
1937 	 * we only allow two pending log transactions at a time,
1938 	 * so we know that once the current log transid is at least
1939 	 * two ahead of the one we're waiting on, ours has finished
1940 	 */
1941 	do {
1942 		prepare_to_wait(&root->log_commit_wait[index],
1943 				&wait, TASK_UNINTERRUPTIBLE);
1944 		mutex_unlock(&root->log_mutex);
1945 
1946 		if (root->fs_info->last_trans_log_full_commit !=
1947 		    trans->transid && root->log_transid < transid + 2 &&
1948 		    atomic_read(&root->log_commit[index]))
1949 			schedule();
1950 
1951 		finish_wait(&root->log_commit_wait[index], &wait);
1952 		mutex_lock(&root->log_mutex);
1953 	} while (root->log_transid < transid + 2 &&
1954 		 atomic_read(&root->log_commit[index]));
1955 	return 0;
1956 }
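
/*
 * a worked example of the two slot scheme above (an illustration
 * only, not code): a waiter for log transid 7 uses slot 7 % 2 == 1
 * and sleeps on log_commit_wait[1].  It can return once
 * log_commit[1] is cleared, or once root->log_transid reaches 9
 * (transid + 2), because by then slot 1 has been reused and our
 * commit must have finished long ago.
 */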
1957 
1958 static int wait_for_writer(struct btrfs_trans_handle *trans,
1959 			   struct btrfs_root *root)
1960 {
1961 	DEFINE_WAIT(wait);
1962 	while (atomic_read(&root->log_writers)) {
1963 		prepare_to_wait(&root->log_writer_wait,
1964 				&wait, TASK_UNINTERRUPTIBLE);
1965 		mutex_unlock(&root->log_mutex);
1966 		if (root->fs_info->last_trans_log_full_commit !=
1967 		    trans->transid && atomic_read(&root->log_writers))
1968 			schedule();
1969 		mutex_lock(&root->log_mutex);
1970 		finish_wait(&root->log_writer_wait, &wait);
1971 	}
1972 	return 0;
1973 }
1974 
1975 /*
1976  * btrfs_sync_log sends a given tree log down to the disk and
1977  * updates the super blocks to record it.  When this call is done,
1978  * you know that any inodes previously logged are safely on disk only
1979  * if it returns 0.
1980  *
1981  * Any other return value means you need to call btrfs_commit_transaction.
1982  * Some of the edge cases for fsyncing directories that have had unlinks
1983  * or renames done in the past mean that sometimes the only safe
1984  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
1985  * that has happened.
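 *
 * A sketch of the expected calling pattern (an illustration, not the
 * real fsync code, which handles more cases):
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	ret = btrfs_log_dentry_safe(trans, root, dentry);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root);
 *	if (ret)
 *		btrfs_commit_transaction(trans, root);
 *	else
 *		btrfs_end_transaction(trans, root);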
1986  */
1987 int btrfs_sync_log(struct btrfs_trans_handle *trans,
1988 		   struct btrfs_root *root)
1989 {
1990 	int index1;
1991 	int index2;
1992 	int ret;
1993 	struct btrfs_root *log = root->log_root;
1994 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1995 
1996 	mutex_lock(&root->log_mutex);
1997 	index1 = root->log_transid % 2;
1998 	if (atomic_read(&root->log_commit[index1])) {
1999 		wait_log_commit(trans, root, root->log_transid);
2000 		mutex_unlock(&root->log_mutex);
2001 		return 0;
2002 	}
2003 	atomic_set(&root->log_commit[index1], 1);
2004 
2005 	/* wait for previous tree log sync to complete */
2006 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2007 		wait_log_commit(trans, root, root->log_transid - 1);
2008 
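	/*
	 * try to batch up fsyncs from other writers: sleep a jiffy,
	 * let in flight writers finish, and only move on once
	 * log_batch stops changing, meaning nobody new joined while
	 * we waited.
	 */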
2009 	while (1) {
2010 		unsigned long batch = root->log_batch;
2011 		mutex_unlock(&root->log_mutex);
2012 		schedule_timeout_uninterruptible(1);
2013 		mutex_lock(&root->log_mutex);
2014 
2015 		wait_for_writer(trans, root);
2016 		if (batch == root->log_batch)
2017 			break;
2018 	}
2019 
2020 	/* bail out if we need to do a full commit */
2021 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2022 		ret = -EAGAIN;
2023 		mutex_unlock(&root->log_mutex);
2024 		goto out;
2025 	}
2026 
2027 	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
2028 	BUG_ON(ret);
2029 
2030 	btrfs_set_root_bytenr(&log->root_item, log->node->start);
2031 	btrfs_set_root_generation(&log->root_item, trans->transid);
2032 	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2033 
2034 	root->log_batch = 0;
2035 	root->log_transid++;
2036 	log->log_transid = root->log_transid;
2037 	smp_mb();
2038 	/*
2039 	 * log tree has been flushed to disk, new modifications of
2040 	 * the log will be written to new positions, so it's safe to
2041 	 * allow log writers to go in.
2042 	 */
2043 	mutex_unlock(&root->log_mutex);
2044 
2045 	mutex_lock(&log_root_tree->log_mutex);
2046 	log_root_tree->log_batch++;
2047 	atomic_inc(&log_root_tree->log_writers);
2048 	mutex_unlock(&log_root_tree->log_mutex);
2049 
2050 	ret = update_log_root(trans, log);
2051 	BUG_ON(ret);
2052 
2053 	mutex_lock(&log_root_tree->log_mutex);
2054 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2055 		smp_mb();
2056 		if (waitqueue_active(&log_root_tree->log_writer_wait))
2057 			wake_up(&log_root_tree->log_writer_wait);
2058 	}
2059 
2060 	index2 = log_root_tree->log_transid % 2;
2061 	if (atomic_read(&log_root_tree->log_commit[index2])) {
2062 		wait_log_commit(trans, log_root_tree,
2063 				log_root_tree->log_transid);
2064 		mutex_unlock(&log_root_tree->log_mutex);
2065 		goto out;
2066 	}
2067 	atomic_set(&log_root_tree->log_commit[index2], 1);
2068 
2069 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2070 		wait_log_commit(trans, log_root_tree,
2071 				log_root_tree->log_transid - 1);
2072 	}
2073 
2074 	wait_for_writer(trans, log_root_tree);
2075 
2076 	/*
2077 	 * now that we've moved on to the tree of log tree roots,
2078 	 * check the full commit flag again
2079 	 */
2080 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2081 		mutex_unlock(&log_root_tree->log_mutex);
2082 		ret = -EAGAIN;
2083 		goto out_wake_log_root;
2084 	}
2085 
2086 	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
2087 				&log_root_tree->dirty_log_pages);
2088 	BUG_ON(ret);
2089 
2090 	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
2091 				log_root_tree->node->start);
2092 	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
2093 				btrfs_header_level(log_root_tree->node));
2094 
2095 	log_root_tree->log_batch = 0;
2096 	log_root_tree->log_transid++;
2097 	smp_mb();
2098 
2099 	mutex_unlock(&log_root_tree->log_mutex);
2100 
2101 	/*
2102 	 * nobody else is going to jump in and write the ctree
2103 	 * super here because the log_commit atomic below is protecting
2104 	 * us.  We must be called with a transaction handle pinning
2105 	 * the running transaction open, so a full commit can't hop
2106 	 * in and cause problems either.
2107 	 */
2108 	write_ctree_super(trans, root->fs_info->tree_root, 2);
2109 	ret = 0;
2110 
2111 out_wake_log_root:
2112 	atomic_set(&log_root_tree->log_commit[index2], 0);
2113 	smp_mb();
2114 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2115 		wake_up(&log_root_tree->log_commit_wait[index2]);
2116 out:
2117 	atomic_set(&root->log_commit[index1], 0);
2118 	smp_mb();
2119 	if (waitqueue_active(&root->log_commit_wait[index1]))
2120 		wake_up(&root->log_commit_wait[index1]);
2121 	return 0;
2122 }
2123 
2124 /*
2125  * free all the extents used by the tree log.  This should be called
2126  * at commit time of the full transaction
2127  */
2128 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2129 {
2130 	int ret;
2131 	struct btrfs_root *log;
2133 	u64 start;
2134 	u64 end;
2135 	struct walk_control wc = {
2136 		.free = 1,
2137 		.process_func = process_one_buffer
2138 	};
2139 
2140 	if (!root->log_root || root->fs_info->log_root_recovering)
2141 		return 0;
2142 
2143 	log = root->log_root;
2144 	ret = walk_log_tree(trans, log, &wc);
2145 	BUG_ON(ret);
2146 
2147 	while (1) {
2148 		ret = find_first_extent_bit(&log->dirty_log_pages,
2149 				    0, &start, &end, EXTENT_DIRTY);
2150 		if (ret)
2151 			break;
2152 
2153 		clear_extent_dirty(&log->dirty_log_pages,
2154 				   start, end, GFP_NOFS);
2155 	}
2156 
2157 	if (log->log_transid > 0) {
2158 		ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2159 				     &log->root_key);
2160 		BUG_ON(ret);
2161 	}
2162 	root->log_root = NULL;
2163 	free_extent_buffer(log->node);
2164 	kfree(log);
2165 	return 0;
2166 }
2167 
2168 /*
2169  * If both a file and directory are logged, and unlinks or renames are
2170  * mixed in, we have a few interesting corners:
2171  *
2172  * create file X in dir Y
2173  * link file X to X.link in dir Y
2174  * fsync file X
2175  * unlink file X but leave X.link
2176  * fsync dir Y
2177  *
2178  * After a crash we would expect only X.link to exist.  But file X
2179  * didn't get fsync'd again so the log has back refs for X and X.link.
2180  *
2181  * We solve this by removing directory entries and inode backrefs from the
2182  * log when a file that was logged in the current transaction is
2183  * unlinked.  Any later fsync will include the updated log entries, and
2184  * we'll be able to reconstruct the proper directory items from backrefs.
2185  *
2186  * This optimization allows us to avoid relogging the entire inode
2187  * or the entire directory.
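 *
 * Concretely, the helper below removes both flavours of the entry
 * for the name (the name-hash keyed DIR_ITEM and the DIR_INDEX item)
 * from the log, then shrinks the logged directory's i_size by the
 * bytes those names accounted for.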
2188  */
2189 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2190 				 struct btrfs_root *root,
2191 				 const char *name, int name_len,
2192 				 struct inode *dir, u64 index)
2193 {
2194 	struct btrfs_root *log;
2195 	struct btrfs_dir_item *di;
2196 	struct btrfs_path *path;
2197 	int ret;
2198 	int bytes_del = 0;
2199 
2200 	if (BTRFS_I(dir)->logged_trans < trans->transid)
2201 		return 0;
2202 
2203 	ret = join_running_log_trans(root);
2204 	if (ret)
2205 		return 0;
2206 
2207 	mutex_lock(&BTRFS_I(dir)->log_mutex);
2208 
2209 	log = root->log_root;
2210 	path = btrfs_alloc_path();
2211 	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2212 				   name, name_len, -1);
2213 	if (di && !IS_ERR(di)) {
2214 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
2215 		bytes_del += name_len;
2216 		BUG_ON(ret);
2217 	}
2218 	btrfs_release_path(log, path);
2219 	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2220 					 index, name, name_len, -1);
2221 	if (di && !IS_ERR(di)) {
2222 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
2223 		bytes_del += name_len;
2224 		BUG_ON(ret);
2225 	}
2226 
2227 	/* update the directory size in the log to reflect the names
2228 	 * we have removed
2229 	 */
2230 	if (bytes_del) {
2231 		struct btrfs_key key;
2232 
2233 		key.objectid = dir->i_ino;
2234 		key.offset = 0;
2235 		key.type = BTRFS_INODE_ITEM_KEY;
2236 		btrfs_release_path(log, path);
2237 
2238 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2239 		if (ret == 0) {
2240 			struct btrfs_inode_item *item;
2241 			u64 i_size;
2242 
2243 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2244 					      struct btrfs_inode_item);
2245 			i_size = btrfs_inode_size(path->nodes[0], item);
2246 			if (i_size > bytes_del)
2247 				i_size -= bytes_del;
2248 			else
2249 				i_size = 0;
2250 			btrfs_set_inode_size(path->nodes[0], item, i_size);
2251 			btrfs_mark_buffer_dirty(path->nodes[0]);
2252 		} else
2253 			ret = 0;
2254 		btrfs_release_path(log, path);
2255 	}
2256 
2257 	btrfs_free_path(path);
2258 	mutex_unlock(&BTRFS_I(dir)->log_mutex);
2259 	btrfs_end_log_trans(root);
2260 
2261 	return 0;
2262 }
2263 
2264 /* see comments for btrfs_del_dir_entries_in_log */
2265 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2266 			       struct btrfs_root *root,
2267 			       const char *name, int name_len,
2268 			       struct inode *inode, u64 dirid)
2269 {
2270 	struct btrfs_root *log;
2271 	u64 index;
2272 	int ret;
2273 
2274 	if (BTRFS_I(inode)->logged_trans < trans->transid)
2275 		return 0;
2276 
2277 	ret = join_running_log_trans(root);
2278 	if (ret)
2279 		return 0;
2280 	log = root->log_root;
2281 	mutex_lock(&BTRFS_I(inode)->log_mutex);
2282 
2283 	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2284 				  dirid, &index);
2285 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
2286 	btrfs_end_log_trans(root);
2287 
2288 	return ret;
2289 }
2290 
2291 /*
2292  * creates a range item in the log for 'dirid'.  first_offset and
2293  * last_offset tell us which parts of the key space the log should
2294  * be considered authoritative for.
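 *
 * For illustration (values invented): a range item saying the log is
 * authoritative for index offsets [3, 10] of directory inode 257 is
 * keyed (257, BTRFS_DIR_LOG_INDEX_KEY, 3), with 10 stored in the
 * dir_log_end field of its btrfs_dir_log_item body.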
2295  */
2296 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2297 				       struct btrfs_root *log,
2298 				       struct btrfs_path *path,
2299 				       int key_type, u64 dirid,
2300 				       u64 first_offset, u64 last_offset)
2301 {
2302 	int ret;
2303 	struct btrfs_key key;
2304 	struct btrfs_dir_log_item *item;
2305 
2306 	key.objectid = dirid;
2307 	key.offset = first_offset;
2308 	if (key_type == BTRFS_DIR_ITEM_KEY)
2309 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
2310 	else
2311 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
2312 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2313 	BUG_ON(ret);
2314 
2315 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2316 			      struct btrfs_dir_log_item);
2317 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2318 	btrfs_mark_buffer_dirty(path->nodes[0]);
2319 	btrfs_release_path(log, path);
2320 	return 0;
2321 }
2322 
2323 /*
2324  * log all the items included in the current transaction for a given
2325  * directory.  This also creates the range items in the log tree required
2326  * to replay anything deleted before the fsync
2327  */
2328 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2329 			  struct btrfs_root *root, struct inode *inode,
2330 			  struct btrfs_path *path,
2331 			  struct btrfs_path *dst_path, int key_type,
2332 			  u64 min_offset, u64 *last_offset_ret)
2333 {
2334 	struct btrfs_key min_key;
2335 	struct btrfs_key max_key;
2336 	struct btrfs_root *log = root->log_root;
2337 	struct extent_buffer *src;
2338 	int ret;
2339 	int i;
2340 	int nritems;
2341 	u64 first_offset = min_offset;
2342 	u64 last_offset = (u64)-1;
2343 
2345 	max_key.objectid = inode->i_ino;
2346 	max_key.offset = (u64)-1;
2347 	max_key.type = key_type;
2348 
2349 	min_key.objectid = inode->i_ino;
2350 	min_key.type = key_type;
2351 	min_key.offset = min_offset;
2352 
2353 	path->keep_locks = 1;
2354 
2355 	ret = btrfs_search_forward(root, &min_key, &max_key,
2356 				   path, 0, trans->transid);
2357 
2358 	/*
2359 	 * we didn't find anything from this transaction, see if there
2360 	 * is anything at all
2361 	 */
2362 	if (ret != 0 || min_key.objectid != inode->i_ino ||
2363 	    min_key.type != key_type) {
2364 		min_key.objectid = inode->i_ino;
2365 		min_key.type = key_type;
2366 		min_key.offset = (u64)-1;
2367 		btrfs_release_path(root, path);
2368 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2369 		if (ret < 0) {
2370 			btrfs_release_path(root, path);
2371 			return ret;
2372 		}
2373 		ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2374 
2375 		/* if ret == 0 there are items for this type,
2376 		 * create a range to tell us the last key of this type.
2377 		 * otherwise, there are no items in this directory after
2378 		 * *min_offset, and we create a range to indicate that.
2379 		 */
2380 		if (ret == 0) {
2381 			struct btrfs_key tmp;
2382 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2383 					      path->slots[0]);
2384 			if (key_type == tmp.type)
2385 				first_offset = max(min_offset, tmp.offset) + 1;
2386 		}
2387 		goto done;
2388 	}
2389 
2390 	/* go backward to find any previous key */
2391 	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2392 	if (ret == 0) {
2393 		struct btrfs_key tmp;
2394 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2395 		if (key_type == tmp.type) {
2396 			first_offset = tmp.offset;
2397 			ret = overwrite_item(trans, log, dst_path,
2398 					     path->nodes[0], path->slots[0],
2399 					     &tmp);
2400 		}
2401 	}
2402 	btrfs_release_path(root, path);
2403 
2404 	/* find the first key from this transaction again */
2405 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2406 	if (ret != 0) {
2407 		WARN_ON(1);
2408 		goto done;
2409 	}
2410 
2411 	/*
2412 	 * we have a block from this transaction, log every item in it
2413 	 * from our directory
2414 	 */
2415 	while (1) {
2416 		struct btrfs_key tmp;
2417 		src = path->nodes[0];
2418 		nritems = btrfs_header_nritems(src);
2419 		for (i = path->slots[0]; i < nritems; i++) {
2420 			btrfs_item_key_to_cpu(src, &min_key, i);
2421 
2422 			if (min_key.objectid != inode->i_ino ||
2423 			    min_key.type != key_type)
2424 				goto done;
2425 			ret = overwrite_item(trans, log, dst_path, src, i,
2426 					     &min_key);
2427 			BUG_ON(ret);
2428 		}
2429 		path->slots[0] = nritems;
2430 
2431 		/*
2432 		 * look ahead to the next item and see if it is also
2433 		 * from this directory and from this transaction
2434 		 */
2435 		ret = btrfs_next_leaf(root, path);
2436 		if (ret == 1) {
2437 			last_offset = (u64)-1;
2438 			goto done;
2439 		}
2440 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2441 		if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2442 			last_offset = (u64)-1;
2443 			goto done;
2444 		}
2445 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2446 			ret = overwrite_item(trans, log, dst_path,
2447 					     path->nodes[0], path->slots[0],
2448 					     &tmp);
2449 
2450 			BUG_ON(ret);
2451 			last_offset = tmp.offset;
2452 			goto done;
2453 		}
2454 	}
2455 done:
2456 	*last_offset_ret = last_offset;
2457 	btrfs_release_path(root, path);
2458 	btrfs_release_path(log, dst_path);
2459 
2460 	/* insert the log range keys to indicate where the log is valid */
2461 	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2462 				 first_offset, last_offset);
2463 	BUG_ON(ret);
2464 	return 0;
2465 }
2466 
2467 /*
2468  * logging directories is very similar to logging inodes.  We find all the items
2469  * from the current transaction and write them to the log.
2470  *
2471  * The recovery code scans the directory in the subvolume, and if it finds a
2472  * key in the range logged that is not present in the log tree, then it means
2473  * that dir entry was unlinked during the transaction.
2474  *
2475  * In order for that scan to work, we must include one key smaller than
2476  * the smallest logged by this transaction and one key larger than the largest
2477  * key logged by this transaction.
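 *
 * As an example (numbers invented): if this transaction touched index
 * keys 5 through 9, log_dir_items() also copies the item at the
 * previous key, say 2, and writes a dir_log range item, so replay can
 * tell that an entry missing from the log inside that range really
 * was unlinked, rather than simply never logged.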
2478  */
2479 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2480 			  struct btrfs_root *root, struct inode *inode,
2481 			  struct btrfs_path *path,
2482 			  struct btrfs_path *dst_path)
2483 {
2484 	u64 min_key;
2485 	u64 max_key;
2486 	int ret;
2487 	int key_type = BTRFS_DIR_ITEM_KEY;
2488 
2489 again:
2490 	min_key = 0;
2491 	max_key = 0;
2492 	while (1) {
2493 		ret = log_dir_items(trans, root, inode, path,
2494 				    dst_path, key_type, min_key,
2495 				    &max_key);
2496 		BUG_ON(ret);
2497 		if (max_key == (u64)-1)
2498 			break;
2499 		min_key = max_key + 1;
2500 	}
2501 
2502 	if (key_type == BTRFS_DIR_ITEM_KEY) {
2503 		key_type = BTRFS_DIR_INDEX_KEY;
2504 		goto again;
2505 	}
2506 	return 0;
2507 }
2508 
2509 /*
2510  * a helper function to drop items from the log before we relog an
2511  * inode.  max_key_type indicates the highest item type to remove.
2512  * This cannot be run for file data extents because it does not
2513  * free the extents they point to.
2514  */
2515 static int drop_objectid_items(struct btrfs_trans_handle *trans,
2516 				  struct btrfs_root *log,
2517 				  struct btrfs_path *path,
2518 				  u64 objectid, int max_key_type)
2519 {
2520 	int ret;
2521 	struct btrfs_key key;
2522 	struct btrfs_key found_key;
2523 
2524 	key.objectid = objectid;
2525 	key.type = max_key_type;
2526 	key.offset = (u64)-1;
2527 
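	/*
	 * (objectid, max_key_type, -1ULL) can never be an exact match,
	 * so the search below is expected to return 1 and leave the
	 * path just past the last item for this objectid.  Stepping one
	 * slot back then deletes items highest-key-first until the
	 * objectid changes.
	 */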
2528 	while (1) {
2529 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2530 
2531 		if (ret != 1)
2532 			break;
2533 
2534 		if (path->slots[0] == 0)
2535 			break;
2536 
2537 		path->slots[0]--;
2538 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2539 				      path->slots[0]);
2540 
2541 		if (found_key.objectid != objectid)
2542 			break;
2543 
2544 		ret = btrfs_del_item(trans, log, path);
2545 		BUG_ON(ret);
2546 		btrfs_release_path(log, path);
2547 	}
2548 	btrfs_release_path(log, path);
2549 	return 0;
2550 }
2551 
2552 static noinline int copy_items(struct btrfs_trans_handle *trans,
2553 			       struct btrfs_root *log,
2554 			       struct btrfs_path *dst_path,
2555 			       struct extent_buffer *src,
2556 			       int start_slot, int nr, int inode_only)
2557 {
2558 	unsigned long src_offset;
2559 	unsigned long dst_offset;
2560 	struct btrfs_file_extent_item *extent;
2561 	struct btrfs_inode_item *inode_item;
2562 	int ret;
2563 	struct btrfs_key *ins_keys;
2564 	u32 *ins_sizes;
2565 	char *ins_data;
2566 	int i;
2567 	struct list_head ordered_sums;
2568 
2569 	INIT_LIST_HEAD(&ordered_sums);
2570 
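	/*
	 * a single allocation backs both scratch arrays: nr u32 item
	 * sizes first, then nr struct btrfs_key entries right after.
	 */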
2571 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2572 			   nr * sizeof(u32), GFP_NOFS);
2573 	ins_sizes = (u32 *)ins_data;
2574 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2575 
2576 	for (i = 0; i < nr; i++) {
2577 		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2578 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2579 	}
2580 	ret = btrfs_insert_empty_items(trans, log, dst_path,
2581 				       ins_keys, ins_sizes, nr);
2582 	BUG_ON(ret);
2583 
2584 	for (i = 0; i < nr; i++) {
2585 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2586 						   dst_path->slots[0]);
2587 
2588 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2589 
2590 		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2591 				   src_offset, ins_sizes[i]);
2592 
2593 		if (inode_only == LOG_INODE_EXISTS &&
2594 		    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2595 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
2596 						    dst_path->slots[0],
2597 						    struct btrfs_inode_item);
2598 			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2599 
2600 			/* set the generation to zero so the recovery code
2601 			 * can tell the difference between logging an inode
2602 			 * just to say 'this inode exists' and logging it
2603 			 * to say 'update this inode with these values'
2604 			 */
2605 			btrfs_set_inode_generation(dst_path->nodes[0],
2606 						   inode_item, 0);
2607 		}
2608 		/* take a reference on file data extents so that truncates
2609 		 * or deletes of this inode don't have to relog the inode
2610 		 * again
2611 		 */
2612 		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2613 			int found_type;
2614 			extent = btrfs_item_ptr(src, start_slot + i,
2615 						struct btrfs_file_extent_item);
2616 
2617 			found_type = btrfs_file_extent_type(src, extent);
2618 			if (found_type == BTRFS_FILE_EXTENT_REG ||
2619 			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2620 				u64 ds = btrfs_file_extent_disk_bytenr(src,
2621 								   extent);
2622 				u64 dl = btrfs_file_extent_disk_num_bytes(src,
2623 								      extent);
2624 				u64 cs = btrfs_file_extent_offset(src, extent);
2625 				u64 cl = btrfs_file_extent_num_bytes(src,
2626 								     extent);
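				/*
				 * for compressed extents the csums
				 * cover the whole on-disk extent, so
				 * widen the csum lookup below to
				 * [ds, ds + dl)
				 */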
2627 				if (btrfs_file_extent_compression(src,
2628 								  extent)) {
2629 					cs = 0;
2630 					cl = dl;
2631 				}
2632 				/* ds == 0 is a hole */
2633 				if (ds != 0) {
2634 					ret = btrfs_inc_extent_ref(trans, log,
2635 						   ds, dl,
2636 						   dst_path->nodes[0]->start,
2637 						   BTRFS_TREE_LOG_OBJECTID,
2638 						   trans->transid,
2639 						   ins_keys[i].objectid);
2640 					BUG_ON(ret);
2641 					ret = btrfs_lookup_csums_range(
2642 						   log->fs_info->csum_root,
2643 						   ds + cs, ds + cs + cl - 1,
2644 						   &ordered_sums);
2645 					BUG_ON(ret);
2646 				}
2647 			}
2648 		}
2649 		dst_path->slots[0]++;
2650 	}
2651 
2652 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2653 	btrfs_release_path(log, dst_path);
2654 	kfree(ins_data);
2655 
2656 	/*
2657 	 * we have to do this after the loop above to avoid changing the
2658 	 * log tree while trying to change the log tree.
2659 	 */
2660 	while (!list_empty(&ordered_sums)) {
2661 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2662 						   struct btrfs_ordered_sum,
2663 						   list);
2664 		ret = btrfs_csum_file_blocks(trans, log, sums);
2665 		BUG_ON(ret);
2666 		list_del(&sums->list);
2667 		kfree(sums);
2668 	}
2669 	return 0;
2670 }
2671 
2672 /* log a single inode in the tree log.
2673  * At least one parent directory for this inode must exist in the tree
2674  * or be logged already.
2675  *
2676  * Any items from this inode changed by the current transaction are copied
2677  * to the log tree.  An extra reference is taken on any extents in this
2678  * file, allowing us to avoid a whole pile of corner cases around logging
2679  * blocks that have been removed from the tree.
2680  *
2681  * See LOG_INODE_ALL and related defines for a description of what inode_only
2682  * does.
2683  *
2684  * This handles both files and directories.
2685  */
2686 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2687 			     struct btrfs_root *root, struct inode *inode,
2688 			     int inode_only)
2689 {
2690 	struct btrfs_path *path;
2691 	struct btrfs_path *dst_path;
2692 	struct btrfs_key min_key;
2693 	struct btrfs_key max_key;
2694 	struct btrfs_root *log = root->log_root;
2695 	struct extent_buffer *src = NULL;
2696 	u32 size;
2697 	int ret;
2698 	int nritems;
2699 	int ins_start_slot = 0;
2700 	int ins_nr;
2701 
2704 	path = btrfs_alloc_path();
2705 	dst_path = btrfs_alloc_path();
2706 
2707 	min_key.objectid = inode->i_ino;
2708 	min_key.type = BTRFS_INODE_ITEM_KEY;
2709 	min_key.offset = 0;
2710 
2711 	max_key.objectid = inode->i_ino;
2712 
2713 	/* today the code can only do partial logging of directories */
2714 	if (!S_ISDIR(inode->i_mode))
2715 		inode_only = LOG_INODE_ALL;
2716 
2717 	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2718 		max_key.type = BTRFS_XATTR_ITEM_KEY;
2719 	else
2720 		max_key.type = (u8)-1;
2721 	max_key.offset = (u64)-1;
2722 
2723 	mutex_lock(&BTRFS_I(inode)->log_mutex);
2724 
2725 	/*
2726 	 * a brute force approach to making sure we get the most uptodate
2727 	 * copies of everything.
2728 	 */
2729 	if (S_ISDIR(inode->i_mode)) {
2730 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2731 
2732 		if (inode_only == LOG_INODE_EXISTS)
2733 			max_key_type = BTRFS_XATTR_ITEM_KEY;
2734 		ret = drop_objectid_items(trans, log, path,
2735 					  inode->i_ino, max_key_type);
2736 	} else {
2737 		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2738 	}
2739 	BUG_ON(ret);
2740 	path->keep_locks = 1;
2741 
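	/*
	 * the loop below batches runs of adjacent leaf slots: ins_nr
	 * counts contiguous items starting at ins_start_slot, and one
	 * copy_items() call flushes each run into the log.
	 */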
2742 	while (1) {
2743 		ins_nr = 0;
2744 		ret = btrfs_search_forward(root, &min_key, &max_key,
2745 					   path, 0, trans->transid);
2746 		if (ret != 0)
2747 			break;
2748 again:
2749 		/* note, ins_nr might be > 0 here, cleanup outside the loop */
2750 		if (min_key.objectid != inode->i_ino)
2751 			break;
2752 		if (min_key.type > max_key.type)
2753 			break;
2754 
2755 		src = path->nodes[0];
2756 		size = btrfs_item_size_nr(src, path->slots[0]);
2757 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2758 			ins_nr++;
2759 			goto next_slot;
2760 		} else if (!ins_nr) {
2761 			ins_start_slot = path->slots[0];
2762 			ins_nr = 1;
2763 			goto next_slot;
2764 		}
2765 
2766 		ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2767 				 ins_nr, inode_only);
2768 		BUG_ON(ret);
2769 		ins_nr = 1;
2770 		ins_start_slot = path->slots[0];
2771 next_slot:
2772 
2773 		nritems = btrfs_header_nritems(path->nodes[0]);
2774 		path->slots[0]++;
2775 		if (path->slots[0] < nritems) {
2776 			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2777 					      path->slots[0]);
2778 			goto again;
2779 		}
2780 		if (ins_nr) {
2781 			ret = copy_items(trans, log, dst_path, src,
2782 					 ins_start_slot,
2783 					 ins_nr, inode_only);
2784 			BUG_ON(ret);
2785 			ins_nr = 0;
2786 		}
2787 		btrfs_release_path(root, path);
2788 
2789 		if (min_key.offset < (u64)-1)
2790 			min_key.offset++;
2791 		else if (min_key.type < (u8)-1)
2792 			min_key.type++;
2793 		else if (min_key.objectid < (u64)-1)
2794 			min_key.objectid++;
2795 		else
2796 			break;
2797 	}
2798 	if (ins_nr) {
2799 		ret = copy_items(trans, log, dst_path, src,
2800 				 ins_start_slot,
2801 				 ins_nr, inode_only);
2802 		BUG_ON(ret);
2803 		ins_nr = 0;
2804 	}
2805 	WARN_ON(ins_nr);
2806 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2807 		btrfs_release_path(root, path);
2808 		btrfs_release_path(log, dst_path);
2809 		ret = log_directory_changes(trans, root, inode, path, dst_path);
2810 		BUG_ON(ret);
2811 	}
2812 	BTRFS_I(inode)->logged_trans = trans->transid;
2813 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
2814 
2815 	btrfs_free_path(path);
2816 	btrfs_free_path(dst_path);
2817 	return 0;
2818 }
2819 
2820 /*
2821  * follow the dentry parent pointers up the chain and see if any
2822  * of the directories in it require a full commit before they can
2823  * be logged.  Returns zero if nothing special needs to be done or 1 if
2824  * a full commit is required.
2825  */
2826 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2827 					       struct inode *inode,
2828 					       struct dentry *parent,
2829 					       struct super_block *sb,
2830 					       u64 last_committed)
2831 {
2832 	int ret = 0;
2833 	struct btrfs_root *root;
2834 
2835 	/*
2836 	 * for a regular file, if its inode is already on disk, we don't
2837 	 * have to worry about the parents at all.  This is because
2838 	 * we can use the last_unlink_trans field to record renames
2839 	 * and other fun in this file.
2840 	 */
2841 	if (S_ISREG(inode->i_mode) &&
2842 	    BTRFS_I(inode)->generation <= last_committed &&
2843 	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
2844 			goto out;
2845 
2846 	if (!S_ISDIR(inode->i_mode)) {
2847 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2848 			goto out;
2849 		inode = parent->d_inode;
2850 	}
2851 
2852 	while (1) {
2853 		BTRFS_I(inode)->logged_trans = trans->transid;
2854 		smp_mb();
2855 
2856 		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2857 			root = BTRFS_I(inode)->root;
2858 
2859 			/*
2860 			 * make sure any commits to the log are forced
2861 			 * to be full commits
2862 			 */
2863 			root->fs_info->last_trans_log_full_commit =
2864 				trans->transid;
2865 			ret = 1;
2866 			break;
2867 		}
2868 
2869 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2870 			break;
2871 
2872 		if (parent == sb->s_root)
2873 			break;
2874 
2875 		parent = parent->d_parent;
2876 		inode = parent->d_inode;
2877 
2878 	}
2879 out:
2880 	return ret;
2881 }
2882 
2883 /*
2884  * helper function around btrfs_log_inode to make sure newly created
2885  * parent directories also end up in the log.  A minimal
2886  * inode-and-backref-only logging is done for any parent directories
2887  * that are newer than the last committed transaction
2888  */
2889 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2890 		    struct btrfs_root *root, struct inode *inode,
2891 		    struct dentry *parent, int exists_only)
2892 {
2893 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2894 	struct super_block *sb;
2895 	int ret = 0;
2896 	u64 last_committed = root->fs_info->last_trans_committed;
2897 
2898 	sb = inode->i_sb;
2899 
2900 	if (btrfs_test_opt(root, NOTREELOG)) {
2901 		ret = 1;
2902 		goto end_no_trans;
2903 	}
2904 
2905 	if (root->fs_info->last_trans_log_full_commit >
2906 	    root->fs_info->last_trans_committed) {
2907 		ret = 1;
2908 		goto end_no_trans;
2909 	}
2910 
2911 	ret = check_parent_dirs_for_sync(trans, inode, parent,
2912 					 sb, last_committed);
2913 	if (ret)
2914 		goto end_no_trans;
2915 
2916 	start_log_trans(trans, root);
2917 
2918 	ret = btrfs_log_inode(trans, root, inode, inode_only);
2919 	BUG_ON(ret);
2920 
2921 	/*
2922 	 * for a regular file, if its inode is already on disk, we don't
2923 	 * have to worry about the parents at all.  This is because
2924 	 * we can use the last_unlink_trans field to record renames
2925 	 * and other fun in this file.
2926 	 */
2927 	if (S_ISREG(inode->i_mode) &&
2928 	    BTRFS_I(inode)->generation <= last_committed &&
2929 	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
2930 			goto no_parent;
2931 
2932 	inode_only = LOG_INODE_EXISTS;
2933 	while (1) {
2934 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2935 			break;
2936 
2937 		inode = parent->d_inode;
2938 		if (BTRFS_I(inode)->generation >
2939 		    root->fs_info->last_trans_committed) {
2940 			ret = btrfs_log_inode(trans, root, inode, inode_only);
2941 			BUG_ON(ret);
2942 		}
2943 		if (parent == sb->s_root)
2944 			break;
2945 
2946 		parent = parent->d_parent;
2947 	}
2948 no_parent:
2949 	ret = 0;
2950 	btrfs_end_log_trans(root);
2951 end_no_trans:
2952 	return ret;
2953 }
2954 
2955 /*
2956  * it is not safe to log a dentry if the chunk root has added new
2957  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
2958  * If this returns 1, you must commit the transaction to safely get your
2959  * data on disk.
2960  */
2961 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2962 			  struct btrfs_root *root, struct dentry *dentry)
2963 {
2964 	return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2965 				      dentry->d_parent, 0);
2966 }
2967 
2968 /*
2969  * should be called during mount to replay any log trees
2970  * from the FS
2971  */
2972 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2973 {
2974 	int ret;
2975 	struct btrfs_path *path;
2976 	struct btrfs_trans_handle *trans;
2977 	struct btrfs_key key;
2978 	struct btrfs_key found_key;
2979 	struct btrfs_key tmp_key;
2980 	struct btrfs_root *log;
2981 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2982 	u64 highest_inode;
2983 	struct walk_control wc = {
2984 		.process_func = process_one_buffer,
2985 		.stage = 0,
2986 	};
2987 
2988 	fs_info->log_root_recovering = 1;
2989 	path = btrfs_alloc_path();
2990 	BUG_ON(!path);
2991 
2992 	trans = btrfs_start_transaction(fs_info->tree_root, 1);
2993 
2994 	wc.trans = trans;
2995 	wc.pin = 1;
2996 
2997 	walk_log_tree(trans, log_root_tree, &wc);
2998 
2999 again:
3000 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
3001 	key.offset = (u64)-1;
3002 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
3003 
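	/*
	 * log tree roots are keyed (BTRFS_TREE_LOG_OBJECTID,
	 * ROOT_ITEM, subvolume id), so searching from offset (u64)-1
	 * and walking key.offset downward visits every per-subvolume
	 * log exactly once.
	 */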
3004 	while (1) {
3005 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
3006 		if (ret < 0)
3007 			break;
3008 		if (ret > 0) {
3009 			if (path->slots[0] == 0)
3010 				break;
3011 			path->slots[0]--;
3012 		}
3013 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3014 				      path->slots[0]);
3015 		btrfs_release_path(log_root_tree, path);
3016 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
3017 			break;
3018 
3019 		log = btrfs_read_fs_root_no_radix(log_root_tree,
3020 						  &found_key);
3021 		BUG_ON(!log);
3022 
3024 		tmp_key.objectid = found_key.offset;
3025 		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
3026 		tmp_key.offset = (u64)-1;
3027 
3028 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
3029 		BUG_ON(!wc.replay_dest);
3030 
3031 		wc.replay_dest->log_root = log;
3032 		mutex_lock(&fs_info->trans_mutex);
3033 		btrfs_record_root_in_trans(wc.replay_dest);
3034 		mutex_unlock(&fs_info->trans_mutex);
3035 		ret = walk_log_tree(trans, log, &wc);
3036 		BUG_ON(ret);
3037 
3038 		if (wc.stage == LOG_WALK_REPLAY_ALL) {
3039 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
3040 						      path);
3041 			BUG_ON(ret);
3042 		}
3043 		ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
3044 		if (ret == 0) {
3045 			wc.replay_dest->highest_inode = highest_inode;
3046 			wc.replay_dest->last_inode_alloc = highest_inode;
3047 		}
3048 
3049 		key.offset = found_key.offset - 1;
3050 		wc.replay_dest->log_root = NULL;
3051 		free_extent_buffer(log->node);
3052 		kfree(log);
3053 
3054 		if (found_key.offset == 0)
3055 			break;
3056 	}
3057 	btrfs_release_path(log_root_tree, path);
3058 
3059 	/* step one is to pin it all, step two is to replay just inodes */
3060 	if (wc.pin) {
3061 		wc.pin = 0;
3062 		wc.process_func = replay_one_buffer;
3063 		wc.stage = LOG_WALK_REPLAY_INODES;
3064 		goto again;
3065 	}
3066 	/* step three is to replay everything */
3067 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
3068 		wc.stage++;
3069 		goto again;
3070 	}
3071 
3072 	btrfs_free_path(path);
3073 
3074 	free_extent_buffer(log_root_tree->node);
3075 	log_root_tree->log_root = NULL;
3076 	fs_info->log_root_recovering = 0;
3077 
3078 	/* step four: commit the transaction, which also unpins the blocks */
3079 	btrfs_commit_transaction(trans, fs_info->tree_root);
3080 
3081 	kfree(log_root_tree);
3082 	return 0;
3083 }
3084 
3085 /*
3086  * there are some corner cases where we want to force a full
3087  * commit instead of allowing a directory to be logged.
3088  *
3089  * They revolve around files that were unlinked from the directory, and
3090  * this function updates the parent directory so that a full commit is
3091  * properly done if it is fsync'd later after the unlinks are done.
3092  */
3093 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3094 			     struct inode *dir, struct inode *inode,
3095 			     int for_rename)
3096 {
3097 	/*
3098 	 * when we're logging a file, if it hasn't been renamed
3099 	 * or unlinked, and its inode is fully committed on disk,
3100 	 * we don't have to worry about walking up the directory chain
3101 	 * to log its parents.
3102 	 *
3103 	 * So, we use the last_unlink_trans field to put this transid
3104 	 * into the file.  When the file is logged we check it and
3105 	 * don't log the parents if the file is fully on disk.
3106 	 */
3107 	if (S_ISREG(inode->i_mode))
3108 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
3109 
3110 	/*
3111 	 * if this directory was already logged any new
3112 	 * names for this file/dir will get recorded
3113 	 */
3114 	smp_mb();
3115 	if (BTRFS_I(dir)->logged_trans == trans->transid)
3116 		return;
3117 
3118 	/*
3119 	 * if the inode we're about to unlink was logged,
3120 	 * the log will be properly updated for any new names
3121 	 */
3122 	if (BTRFS_I(inode)->logged_trans == trans->transid)
3123 		return;
3124 
3125 	/*
3126 	 * when renaming files across directories, if the directory
3127 	 * we're unlinking from gets fsync'd later on, there's
3128 	 * no way to find the destination directory later and fsync it
3129 	 * properly.  So, we have to be conservative and force commits
3130 	 * so the new name gets discovered.
3131 	 */
3132 	if (for_rename)
3133 		goto record;
3134 
3135 	/* we can safely do the unlink without any special recording */
3136 	return;
3137 
3138 record:
3139 	BTRFS_I(dir)->last_unlink_trans = trans->transid;
3140 }
3141 
3142 /*
3143  * Call this after adding a new name for a file and it will properly
3144  * update the log to reflect the new name.
3145  *
3146  * It will return zero if all goes well, and it will return 1 if a
3147  * full transaction commit is required.
3148  */
3149 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3150 			struct inode *inode, struct inode *old_dir,
3151 			struct dentry *parent)
3152 {
3153 	struct btrfs_root *root = BTRFS_I(inode)->root;
3154 
3155 	/*
3156 	 * this will force the logging code to walk the dentry chain
3157 	 * up for the file
3158 	 */
3159 	if (S_ISREG(inode->i_mode))
3160 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
3161 
3162 	/*
3163 	 * if this inode hasn't been logged and the directory we're renaming it
3164 	 * from hasn't been logged, we don't need to log it
3165 	 */
3166 	if (BTRFS_I(inode)->logged_trans <=
3167 	    root->fs_info->last_trans_committed &&
3168 	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3169 		    root->fs_info->last_trans_committed))
3170 		return 0;
3171 
3172 	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3173 }
3174 
3175