xref: /openbmc/linux/fs/btrfs/tree-log.c (revision b8bb76713ec50df2f11efee386e16f93d51e1076)
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include "ctree.h"
21 #include "transaction.h"
22 #include "disk-io.h"
23 #include "locking.h"
24 #include "print-tree.h"
25 #include "compat.h"
26 #include "tree-log.h"
27 
28 /* magic values for the inode_only field in btrfs_log_inode:
29  *
30  * LOG_INODE_ALL means to log everything
31  * LOG_INODE_EXISTS means to log just enough to recreate the inode
32  * during log replay
33  */
34 #define LOG_INODE_ALL 0
35 #define LOG_INODE_EXISTS 1
36 
37 /*
38  * directory trouble cases
39  *
40  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41  * log, we must force a full commit before doing an fsync of the directory
42  * where the unlink was done.
43  * ---> record transid of last unlink/rename per directory
44  *
45  * mkdir foo/some_dir
46  * normal commit
47  * rename foo/some_dir foo2/some_dir
48  * mkdir foo/some_dir
49  * fsync foo/some_dir/some_file
50  *
51  * The fsync above will unlink the original some_dir without recording
52  * it in its new location (foo2).  After a crash, some_dir will be gone
53  * unless the fsync of some_file forces a full commit
54  *
55  * 2) we must log any new names for any file or dir that is in the fsync
56  * log. ---> check inode while renaming/linking.
57  *
58  * 2a) we must log any new names for any file or dir during rename
59  * when the directory they are being removed from was logged.
60  * ---> check inode and old parent dir during rename
61  *
62  *  2a is actually the more important variant.  With the extra logging
63  *  a crash might unlink the old name without recreating the new one
64  *
65  * 3) after a crash, we must go through any directories with a link count
66  * of zero and redo the rm -rf
67  *
68  * mkdir f1/foo
69  * normal commit
70  * rm -rf f1/foo
71  * fsync(f1)
72  *
73  * The directory f1 was fully removed from the FS, but fsync was never
74  * called on f1, only its parent dir.  After a crash the rm -rf must
75  * be replayed.  This must be able to recurse down the entire
76  * directory tree.  The inode link count fixup code takes care of the
77  * ugly details.
78  */
79 
80 /*
81  * stages for the tree walking.  The first
82  * stage (0) is to only pin down the blocks we find
83  * the second stage (1) is to make sure that all the inodes
84  * we find in the log are created in the subvolume.
85  *
86  * The last stage is to deal with directories and links and extents
87  * and all the other fun semantics
88  */
89 #define LOG_WALK_PIN_ONLY 0
90 #define LOG_WALK_REPLAY_INODES 1
91 #define LOG_WALK_REPLAY_ALL 2
92 
93 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
94 			     struct btrfs_root *root, struct inode *inode,
95 			     int inode_only);
96 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
97 			     struct btrfs_root *root,
98 			     struct btrfs_path *path, u64 objectid);
99 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 				       struct btrfs_root *root,
101 				       struct btrfs_root *log,
102 				       struct btrfs_path *path,
103 				       u64 dirid, int del_all);
104 
105 /*
106  * tree logging is a special write ahead log used to make sure that
107  * fsyncs and O_SYNCs can happen without doing full tree commits.
108  *
109  * Full tree commits are expensive because they require commonly
110  * modified blocks to be recowed, creating many dirty pages in the
111  * extent tree an 4x-6x higher write load than ext3.
112  *
113  * Instead of doing a tree commit on every fsync, we use the
114  * key ranges and transaction ids to find items for a given file or directory
115  * that have changed in this transaction.  Those items are copied into
116  * a special tree (one per subvolume root), that tree is written to disk
117  * and then the fsync is considered complete.
118  *
119  * After a crash, items are copied out of the log-tree back into the
120  * subvolume tree.  Any file data extents found are recorded in the extent
121  * allocation tree, and the log-tree freed.
122  *
123  * The log tree is read three times, once to pin down all the extents it is
124  * using in ram and once, once to create all the inodes logged in the tree
125  * and once to do all the other items.
126  */
127 
128 /*
129  * start a sub transaction and setup the log tree
130  * this increments the log tree writer count to make the people
131  * syncing the tree wait for us to finish
132  */
133 static int start_log_trans(struct btrfs_trans_handle *trans,
134 			   struct btrfs_root *root)
135 {
136 	int ret;
137 
138 	mutex_lock(&root->log_mutex);
139 	if (root->log_root) {
140 		root->log_batch++;
141 		atomic_inc(&root->log_writers);
142 		mutex_unlock(&root->log_mutex);
143 		return 0;
144 	}
145 	mutex_lock(&root->fs_info->tree_log_mutex);
146 	if (!root->fs_info->log_root_tree) {
147 		ret = btrfs_init_log_root_tree(trans, root->fs_info);
148 		BUG_ON(ret);
149 	}
150 	if (!root->log_root) {
151 		ret = btrfs_add_log_tree(trans, root);
152 		BUG_ON(ret);
153 	}
154 	mutex_unlock(&root->fs_info->tree_log_mutex);
155 	root->log_batch++;
156 	atomic_inc(&root->log_writers);
157 	mutex_unlock(&root->log_mutex);
158 	return 0;
159 }
160 
161 /*
162  * returns 0 if there was a log transaction running and we were able
163  * to join, or returns -ENOENT if there were not transactions
164  * in progress
165  */
166 static int join_running_log_trans(struct btrfs_root *root)
167 {
168 	int ret = -ENOENT;
169 
170 	smp_mb();
171 	if (!root->log_root)
172 		return -ENOENT;
173 
174 	mutex_lock(&root->log_mutex);
175 	if (root->log_root) {
176 		ret = 0;
177 		atomic_inc(&root->log_writers);
178 	}
179 	mutex_unlock(&root->log_mutex);
180 	return ret;
181 }
182 
183 /*
184  * This either makes the current running log transaction wait
185  * until you call btrfs_end_log_trans() or it makes any future
186  * log transactions wait until you call btrfs_end_log_trans()
187  */
188 int btrfs_pin_log_trans(struct btrfs_root *root)
189 {
190 	int ret = -ENOENT;
191 
192 	mutex_lock(&root->log_mutex);
193 	atomic_inc(&root->log_writers);
194 	mutex_unlock(&root->log_mutex);
195 	return ret;
196 }
197 
198 /*
199  * indicate we're done making changes to the log tree
200  * and wake up anyone waiting to do a sync
201  */
202 int btrfs_end_log_trans(struct btrfs_root *root)
203 {
204 	if (atomic_dec_and_test(&root->log_writers)) {
205 		smp_mb();
206 		if (waitqueue_active(&root->log_writer_wait))
207 			wake_up(&root->log_writer_wait);
208 	}
209 	return 0;
210 }
211 
212 
213 /*
214  * the walk control struct is used to pass state down the chain when
215  * processing the log tree.  The stage field tells us which part
216  * of the log tree processing we are currently doing.  The others
217  * are state fields used for that specific part
218  */
219 struct walk_control {
220 	/* should we free the extent on disk when done?  This is used
221 	 * at transaction commit time while freeing a log tree
222 	 */
223 	int free;
224 
225 	/* should we write out the extent buffer?  This is used
226 	 * while flushing the log tree to disk during a sync
227 	 */
228 	int write;
229 
230 	/* should we wait for the extent buffer io to finish?  Also used
231 	 * while flushing the log tree to disk for a sync
232 	 */
233 	int wait;
234 
235 	/* pin only walk, we record which extents on disk belong to the
236 	 * log trees
237 	 */
238 	int pin;
239 
240 	/* what stage of the replay code we're currently in */
241 	int stage;
242 
243 	/* the root we are currently replaying */
244 	struct btrfs_root *replay_dest;
245 
246 	/* the trans handle for the current replay */
247 	struct btrfs_trans_handle *trans;
248 
249 	/* the function that gets used to process blocks we find in the
250 	 * tree.  Note the extent_buffer might not be up to date when it is
251 	 * passed in, and it must be checked or read if you need the data
252 	 * inside it
253 	 */
254 	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
255 			    struct walk_control *wc, u64 gen);
256 };
257 
258 /*
259  * process_func used to pin down extents, write them or wait on them
260  */
261 static int process_one_buffer(struct btrfs_root *log,
262 			      struct extent_buffer *eb,
263 			      struct walk_control *wc, u64 gen)
264 {
265 	if (wc->pin) {
266 		mutex_lock(&log->fs_info->pinned_mutex);
267 		btrfs_update_pinned_extents(log->fs_info->extent_root,
268 					    eb->start, eb->len, 1);
269 	}
270 
271 	if (btrfs_buffer_uptodate(eb, gen)) {
272 		if (wc->write)
273 			btrfs_write_tree_block(eb);
274 		if (wc->wait)
275 			btrfs_wait_tree_block_writeback(eb);
276 	}
277 	return 0;
278 }
279 
280 /*
281  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
282  * to the src data we are copying out.
283  *
284  * root is the tree we are copying into, and path is a scratch
285  * path for use in this function (it should be released on entry and
286  * will be released on exit).
287  *
288  * If the key is already in the destination tree the existing item is
289  * overwritten.  If the existing item isn't big enough, it is extended.
290  * If it is too large, it is truncated.
291  *
292  * If the key isn't in the destination yet, a new item is inserted.
293  */
294 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
295 				   struct btrfs_root *root,
296 				   struct btrfs_path *path,
297 				   struct extent_buffer *eb, int slot,
298 				   struct btrfs_key *key)
299 {
300 	int ret;
301 	u32 item_size;
302 	u64 saved_i_size = 0;
303 	int save_old_i_size = 0;
304 	unsigned long src_ptr;
305 	unsigned long dst_ptr;
306 	int overwrite_root = 0;
307 
308 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
309 		overwrite_root = 1;
310 
311 	item_size = btrfs_item_size_nr(eb, slot);
312 	src_ptr = btrfs_item_ptr_offset(eb, slot);
313 
314 	/* look for the key in the destination tree */
315 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
316 	if (ret == 0) {
317 		char *src_copy;
318 		char *dst_copy;
319 		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
320 						  path->slots[0]);
321 		if (dst_size != item_size)
322 			goto insert;
323 
324 		if (item_size == 0) {
325 			btrfs_release_path(root, path);
326 			return 0;
327 		}
328 		dst_copy = kmalloc(item_size, GFP_NOFS);
329 		src_copy = kmalloc(item_size, GFP_NOFS);
330 
331 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
332 
333 		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
334 		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
335 				   item_size);
336 		ret = memcmp(dst_copy, src_copy, item_size);
337 
338 		kfree(dst_copy);
339 		kfree(src_copy);
340 		/*
341 		 * they have the same contents, just return, this saves
342 		 * us from cowing blocks in the destination tree and doing
343 		 * extra writes that may not have been done by a previous
344 		 * sync
345 		 */
346 		if (ret == 0) {
347 			btrfs_release_path(root, path);
348 			return 0;
349 		}
350 
351 	}
352 insert:
353 	btrfs_release_path(root, path);
354 	/* try to insert the key into the destination tree */
355 	ret = btrfs_insert_empty_item(trans, root, path,
356 				      key, item_size);
357 
358 	/* make sure any existing item is the correct size */
359 	if (ret == -EEXIST) {
360 		u32 found_size;
361 		found_size = btrfs_item_size_nr(path->nodes[0],
362 						path->slots[0]);
363 		if (found_size > item_size) {
364 			btrfs_truncate_item(trans, root, path, item_size, 1);
365 		} else if (found_size < item_size) {
366 			ret = btrfs_extend_item(trans, root, path,
367 						item_size - found_size);
368 			BUG_ON(ret);
369 		}
370 	} else if (ret) {
371 		BUG();
372 	}
373 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
374 					path->slots[0]);
375 
376 	/* don't overwrite an existing inode if the generation number
377 	 * was logged as zero.  This is done when the tree logging code
378 	 * is just logging an inode to make sure it exists after recovery.
379 	 *
380 	 * Also, don't overwrite i_size on directories during replay.
381 	 * log replay inserts and removes directory items based on the
382 	 * state of the tree found in the subvolume, and i_size is modified
383 	 * as it goes
384 	 */
385 	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
386 		struct btrfs_inode_item *src_item;
387 		struct btrfs_inode_item *dst_item;
388 
389 		src_item = (struct btrfs_inode_item *)src_ptr;
390 		dst_item = (struct btrfs_inode_item *)dst_ptr;
391 
392 		if (btrfs_inode_generation(eb, src_item) == 0)
393 			goto no_copy;
394 
395 		if (overwrite_root &&
396 		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
397 		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
398 			save_old_i_size = 1;
399 			saved_i_size = btrfs_inode_size(path->nodes[0],
400 							dst_item);
401 		}
402 	}
403 
404 	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
405 			   src_ptr, item_size);
406 
407 	if (save_old_i_size) {
408 		struct btrfs_inode_item *dst_item;
409 		dst_item = (struct btrfs_inode_item *)dst_ptr;
410 		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
411 	}
412 
413 	/* make sure the generation is filled in */
414 	if (key->type == BTRFS_INODE_ITEM_KEY) {
415 		struct btrfs_inode_item *dst_item;
416 		dst_item = (struct btrfs_inode_item *)dst_ptr;
417 		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
418 			btrfs_set_inode_generation(path->nodes[0], dst_item,
419 						   trans->transid);
420 		}
421 	}
422 no_copy:
423 	btrfs_mark_buffer_dirty(path->nodes[0]);
424 	btrfs_release_path(root, path);
425 	return 0;
426 }
427 
428 /*
429  * simple helper to read an inode off the disk from a given root
430  * This can only be called for subvolume roots and not for the log
431  */
432 static noinline struct inode *read_one_inode(struct btrfs_root *root,
433 					     u64 objectid)
434 {
435 	struct inode *inode;
436 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
437 	if (inode->i_state & I_NEW) {
438 		BTRFS_I(inode)->root = root;
439 		BTRFS_I(inode)->location.objectid = objectid;
440 		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
441 		BTRFS_I(inode)->location.offset = 0;
442 		btrfs_read_locked_inode(inode);
443 		unlock_new_inode(inode);
444 
445 	}
446 	if (is_bad_inode(inode)) {
447 		iput(inode);
448 		inode = NULL;
449 	}
450 	return inode;
451 }
452 
453 /* replays a single extent in 'eb' at 'slot' with 'key' into the
454  * subvolume 'root'.  path is released on entry and should be released
455  * on exit.
456  *
457  * extents in the log tree have not been allocated out of the extent
458  * tree yet.  So, this completes the allocation, taking a reference
459  * as required if the extent already exists or creating a new extent
460  * if it isn't in the extent allocation tree yet.
461  *
462  * The extent is inserted into the file, dropping any existing extents
463  * from the file that overlap the new one.
464  */
465 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
466 				      struct btrfs_root *root,
467 				      struct btrfs_path *path,
468 				      struct extent_buffer *eb, int slot,
469 				      struct btrfs_key *key)
470 {
471 	int found_type;
472 	u64 mask = root->sectorsize - 1;
473 	u64 extent_end;
474 	u64 alloc_hint;
475 	u64 start = key->offset;
476 	u64 saved_nbytes;
477 	struct btrfs_file_extent_item *item;
478 	struct inode *inode = NULL;
479 	unsigned long size;
480 	int ret = 0;
481 
482 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
483 	found_type = btrfs_file_extent_type(eb, item);
484 
485 	if (found_type == BTRFS_FILE_EXTENT_REG ||
486 	    found_type == BTRFS_FILE_EXTENT_PREALLOC)
487 		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
488 	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
489 		size = btrfs_file_extent_inline_len(eb, item);
490 		extent_end = (start + size + mask) & ~mask;
491 	} else {
492 		ret = 0;
493 		goto out;
494 	}
495 
496 	inode = read_one_inode(root, key->objectid);
497 	if (!inode) {
498 		ret = -EIO;
499 		goto out;
500 	}
501 
502 	/*
503 	 * first check to see if we already have this extent in the
504 	 * file.  This must be done before the btrfs_drop_extents run
505 	 * so we don't try to drop this extent.
506 	 */
507 	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
508 				       start, 0);
509 
510 	if (ret == 0 &&
511 	    (found_type == BTRFS_FILE_EXTENT_REG ||
512 	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
513 		struct btrfs_file_extent_item cmp1;
514 		struct btrfs_file_extent_item cmp2;
515 		struct btrfs_file_extent_item *existing;
516 		struct extent_buffer *leaf;
517 
518 		leaf = path->nodes[0];
519 		existing = btrfs_item_ptr(leaf, path->slots[0],
520 					  struct btrfs_file_extent_item);
521 
522 		read_extent_buffer(eb, &cmp1, (unsigned long)item,
523 				   sizeof(cmp1));
524 		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
525 				   sizeof(cmp2));
526 
527 		/*
528 		 * we already have a pointer to this exact extent,
529 		 * we don't have to do anything
530 		 */
531 		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
532 			btrfs_release_path(root, path);
533 			goto out;
534 		}
535 	}
536 	btrfs_release_path(root, path);
537 
538 	saved_nbytes = inode_get_bytes(inode);
539 	/* drop any overlapping extents */
540 	ret = btrfs_drop_extents(trans, root, inode,
541 			 start, extent_end, start, &alloc_hint);
542 	BUG_ON(ret);
543 
544 	if (found_type == BTRFS_FILE_EXTENT_REG ||
545 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
546 		unsigned long dest_offset;
547 		struct btrfs_key ins;
548 
549 		ret = btrfs_insert_empty_item(trans, root, path, key,
550 					      sizeof(*item));
551 		BUG_ON(ret);
552 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
553 						    path->slots[0]);
554 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
555 				(unsigned long)item,  sizeof(*item));
556 
557 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
558 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
559 		ins.type = BTRFS_EXTENT_ITEM_KEY;
560 
561 		if (ins.objectid > 0) {
562 			u64 csum_start;
563 			u64 csum_end;
564 			LIST_HEAD(ordered_sums);
565 			/*
566 			 * is this extent already allocated in the extent
567 			 * allocation tree?  If so, just add a reference
568 			 */
569 			ret = btrfs_lookup_extent(root, ins.objectid,
570 						ins.offset);
571 			if (ret == 0) {
572 				ret = btrfs_inc_extent_ref(trans, root,
573 						ins.objectid, ins.offset,
574 						path->nodes[0]->start,
575 						root->root_key.objectid,
576 						trans->transid, key->objectid);
577 			} else {
578 				/*
579 				 * insert the extent pointer in the extent
580 				 * allocation tree
581 				 */
582 				ret = btrfs_alloc_logged_extent(trans, root,
583 						path->nodes[0]->start,
584 						root->root_key.objectid,
585 						trans->transid, key->objectid,
586 						&ins);
587 				BUG_ON(ret);
588 			}
589 			btrfs_release_path(root, path);
590 
591 			if (btrfs_file_extent_compression(eb, item)) {
592 				csum_start = ins.objectid;
593 				csum_end = csum_start + ins.offset;
594 			} else {
595 				csum_start = ins.objectid +
596 					btrfs_file_extent_offset(eb, item);
597 				csum_end = csum_start +
598 					btrfs_file_extent_num_bytes(eb, item);
599 			}
600 
601 			ret = btrfs_lookup_csums_range(root->log_root,
602 						csum_start, csum_end - 1,
603 						&ordered_sums);
604 			BUG_ON(ret);
605 			while (!list_empty(&ordered_sums)) {
606 				struct btrfs_ordered_sum *sums;
607 				sums = list_entry(ordered_sums.next,
608 						struct btrfs_ordered_sum,
609 						list);
610 				ret = btrfs_csum_file_blocks(trans,
611 						root->fs_info->csum_root,
612 						sums);
613 				BUG_ON(ret);
614 				list_del(&sums->list);
615 				kfree(sums);
616 			}
617 		} else {
618 			btrfs_release_path(root, path);
619 		}
620 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
621 		/* inline extents are easy, we just overwrite them */
622 		ret = overwrite_item(trans, root, path, eb, slot, key);
623 		BUG_ON(ret);
624 	}
625 
626 	inode_set_bytes(inode, saved_nbytes);
627 	btrfs_update_inode(trans, root, inode);
628 out:
629 	if (inode)
630 		iput(inode);
631 	return ret;
632 }
633 
634 /*
635  * when cleaning up conflicts between the directory names in the
636  * subvolume, directory names in the log and directory names in the
637  * inode back references, we may have to unlink inodes from directories.
638  *
639  * This is a helper function to do the unlink of a specific directory
640  * item
641  */
642 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
643 				      struct btrfs_root *root,
644 				      struct btrfs_path *path,
645 				      struct inode *dir,
646 				      struct btrfs_dir_item *di)
647 {
648 	struct inode *inode;
649 	char *name;
650 	int name_len;
651 	struct extent_buffer *leaf;
652 	struct btrfs_key location;
653 	int ret;
654 
655 	leaf = path->nodes[0];
656 
657 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
658 	name_len = btrfs_dir_name_len(leaf, di);
659 	name = kmalloc(name_len, GFP_NOFS);
660 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
661 	btrfs_release_path(root, path);
662 
663 	inode = read_one_inode(root, location.objectid);
664 	BUG_ON(!inode);
665 
666 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
667 	BUG_ON(ret);
668 
669 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
670 	BUG_ON(ret);
671 	kfree(name);
672 
673 	iput(inode);
674 	return ret;
675 }
676 
677 /*
678  * helper function to see if a given name and sequence number found
679  * in an inode back reference are already in a directory and correctly
680  * point to this inode
681  */
682 static noinline int inode_in_dir(struct btrfs_root *root,
683 				 struct btrfs_path *path,
684 				 u64 dirid, u64 objectid, u64 index,
685 				 const char *name, int name_len)
686 {
687 	struct btrfs_dir_item *di;
688 	struct btrfs_key location;
689 	int match = 0;
690 
691 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
692 					 index, name, name_len, 0);
693 	if (di && !IS_ERR(di)) {
694 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
695 		if (location.objectid != objectid)
696 			goto out;
697 	} else
698 		goto out;
699 	btrfs_release_path(root, path);
700 
701 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
702 	if (di && !IS_ERR(di)) {
703 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
704 		if (location.objectid != objectid)
705 			goto out;
706 	} else
707 		goto out;
708 	match = 1;
709 out:
710 	btrfs_release_path(root, path);
711 	return match;
712 }
713 
714 /*
715  * helper function to check a log tree for a named back reference in
716  * an inode.  This is used to decide if a back reference that is
717  * found in the subvolume conflicts with what we find in the log.
718  *
719  * inode backreferences may have multiple refs in a single item,
720  * during replay we process one reference at a time, and we don't
721  * want to delete valid links to a file from the subvolume if that
722  * link is also in the log.
723  */
724 static noinline int backref_in_log(struct btrfs_root *log,
725 				   struct btrfs_key *key,
726 				   char *name, int namelen)
727 {
728 	struct btrfs_path *path;
729 	struct btrfs_inode_ref *ref;
730 	unsigned long ptr;
731 	unsigned long ptr_end;
732 	unsigned long name_ptr;
733 	int found_name_len;
734 	int item_size;
735 	int ret;
736 	int match = 0;
737 
738 	path = btrfs_alloc_path();
739 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
740 	if (ret != 0)
741 		goto out;
742 
743 	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
744 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
745 	ptr_end = ptr + item_size;
746 	while (ptr < ptr_end) {
747 		ref = (struct btrfs_inode_ref *)ptr;
748 		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
749 		if (found_name_len == namelen) {
750 			name_ptr = (unsigned long)(ref + 1);
751 			ret = memcmp_extent_buffer(path->nodes[0], name,
752 						   name_ptr, namelen);
753 			if (ret == 0) {
754 				match = 1;
755 				goto out;
756 			}
757 		}
758 		ptr = (unsigned long)(ref + 1) + found_name_len;
759 	}
760 out:
761 	btrfs_free_path(path);
762 	return match;
763 }
764 
765 
766 /*
767  * replay one inode back reference item found in the log tree.
768  * eb, slot and key refer to the buffer and key found in the log tree.
769  * root is the destination we are replaying into, and path is for temp
770  * use by this function.  (it should be released on return).
771  */
772 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
773 				  struct btrfs_root *root,
774 				  struct btrfs_root *log,
775 				  struct btrfs_path *path,
776 				  struct extent_buffer *eb, int slot,
777 				  struct btrfs_key *key)
778 {
779 	struct inode *dir;
780 	int ret;
781 	struct btrfs_key location;
782 	struct btrfs_inode_ref *ref;
783 	struct btrfs_dir_item *di;
784 	struct inode *inode;
785 	char *name;
786 	int namelen;
787 	unsigned long ref_ptr;
788 	unsigned long ref_end;
789 
790 	location.objectid = key->objectid;
791 	location.type = BTRFS_INODE_ITEM_KEY;
792 	location.offset = 0;
793 
794 	/*
795 	 * it is possible that we didn't log all the parent directories
796 	 * for a given inode.  If we don't find the dir, just don't
797 	 * copy the back ref in.  The link count fixup code will take
798 	 * care of the rest
799 	 */
800 	dir = read_one_inode(root, key->offset);
801 	if (!dir)
802 		return -ENOENT;
803 
804 	inode = read_one_inode(root, key->objectid);
805 	BUG_ON(!dir);
806 
807 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
808 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
809 
810 again:
811 	ref = (struct btrfs_inode_ref *)ref_ptr;
812 
813 	namelen = btrfs_inode_ref_name_len(eb, ref);
814 	name = kmalloc(namelen, GFP_NOFS);
815 	BUG_ON(!name);
816 
817 	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
818 
819 	/* if we already have a perfect match, we're done */
820 	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
821 			 btrfs_inode_ref_index(eb, ref),
822 			 name, namelen)) {
823 		goto out;
824 	}
825 
826 	/*
827 	 * look for a conflicting back reference in the metadata.
828 	 * if we find one we have to unlink that name of the file
829 	 * before we add our new link.  Later on, we overwrite any
830 	 * existing back reference, and we don't want to create
831 	 * dangling pointers in the directory.
832 	 */
833 conflict_again:
834 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
835 	if (ret == 0) {
836 		char *victim_name;
837 		int victim_name_len;
838 		struct btrfs_inode_ref *victim_ref;
839 		unsigned long ptr;
840 		unsigned long ptr_end;
841 		struct extent_buffer *leaf = path->nodes[0];
842 
843 		/* are we trying to overwrite a back ref for the root directory
844 		 * if so, just jump out, we're done
845 		 */
846 		if (key->objectid == key->offset)
847 			goto out_nowrite;
848 
849 		/* check all the names in this back reference to see
850 		 * if they are in the log.  if so, we allow them to stay
851 		 * otherwise they must be unlinked as a conflict
852 		 */
853 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
854 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
855 		while (ptr < ptr_end) {
856 			victim_ref = (struct btrfs_inode_ref *)ptr;
857 			victim_name_len = btrfs_inode_ref_name_len(leaf,
858 								   victim_ref);
859 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
860 			BUG_ON(!victim_name);
861 
862 			read_extent_buffer(leaf, victim_name,
863 					   (unsigned long)(victim_ref + 1),
864 					   victim_name_len);
865 
866 			if (!backref_in_log(log, key, victim_name,
867 					    victim_name_len)) {
868 				btrfs_inc_nlink(inode);
869 				btrfs_release_path(root, path);
870 
871 				ret = btrfs_unlink_inode(trans, root, dir,
872 							 inode, victim_name,
873 							 victim_name_len);
874 				kfree(victim_name);
875 				btrfs_release_path(root, path);
876 				goto conflict_again;
877 			}
878 			kfree(victim_name);
879 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
880 		}
881 		BUG_ON(ret);
882 	}
883 	btrfs_release_path(root, path);
884 
885 	/* look for a conflicting sequence number */
886 	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
887 					 btrfs_inode_ref_index(eb, ref),
888 					 name, namelen, 0);
889 	if (di && !IS_ERR(di)) {
890 		ret = drop_one_dir_item(trans, root, path, dir, di);
891 		BUG_ON(ret);
892 	}
893 	btrfs_release_path(root, path);
894 
895 
896 	/* look for a conflicting name */
897 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
898 				   name, namelen, 0);
899 	if (di && !IS_ERR(di)) {
900 		ret = drop_one_dir_item(trans, root, path, dir, di);
901 		BUG_ON(ret);
902 	}
903 	btrfs_release_path(root, path);
904 
905 	/* insert our name */
906 	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
907 			     btrfs_inode_ref_index(eb, ref));
908 	BUG_ON(ret);
909 
910 	btrfs_update_inode(trans, root, inode);
911 
912 out:
913 	ref_ptr = (unsigned long)(ref + 1) + namelen;
914 	kfree(name);
915 	if (ref_ptr < ref_end)
916 		goto again;
917 
918 	/* finally write the back reference in the inode */
919 	ret = overwrite_item(trans, root, path, eb, slot, key);
920 	BUG_ON(ret);
921 
922 out_nowrite:
923 	btrfs_release_path(root, path);
924 	iput(dir);
925 	iput(inode);
926 	return 0;
927 }
928 
929 /*
930  * There are a few corners where the link count of the file can't
931  * be properly maintained during replay.  So, instead of adding
932  * lots of complexity to the log code, we just scan the backrefs
933  * for any file that has been through replay.
934  *
935  * The scan will update the link count on the inode to reflect the
936  * number of back refs found.  If it goes down to zero, the iput
937  * will free the inode.
938  */
939 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
940 					   struct btrfs_root *root,
941 					   struct inode *inode)
942 {
943 	struct btrfs_path *path;
944 	int ret;
945 	struct btrfs_key key;
946 	u64 nlink = 0;
947 	unsigned long ptr;
948 	unsigned long ptr_end;
949 	int name_len;
950 
951 	key.objectid = inode->i_ino;
952 	key.type = BTRFS_INODE_REF_KEY;
953 	key.offset = (u64)-1;
954 
955 	path = btrfs_alloc_path();
956 
957 	while (1) {
958 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
959 		if (ret < 0)
960 			break;
961 		if (ret > 0) {
962 			if (path->slots[0] == 0)
963 				break;
964 			path->slots[0]--;
965 		}
966 		btrfs_item_key_to_cpu(path->nodes[0], &key,
967 				      path->slots[0]);
968 		if (key.objectid != inode->i_ino ||
969 		    key.type != BTRFS_INODE_REF_KEY)
970 			break;
971 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
972 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
973 						   path->slots[0]);
974 		while (ptr < ptr_end) {
975 			struct btrfs_inode_ref *ref;
976 
977 			ref = (struct btrfs_inode_ref *)ptr;
978 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
979 							    ref);
980 			ptr = (unsigned long)(ref + 1) + name_len;
981 			nlink++;
982 		}
983 
984 		if (key.offset == 0)
985 			break;
986 		key.offset--;
987 		btrfs_release_path(root, path);
988 	}
989 	btrfs_release_path(root, path);
990 	if (nlink != inode->i_nlink) {
991 		inode->i_nlink = nlink;
992 		btrfs_update_inode(trans, root, inode);
993 	}
994 	BTRFS_I(inode)->index_cnt = (u64)-1;
995 
996 	if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
997 		ret = replay_dir_deletes(trans, root, NULL, path,
998 					 inode->i_ino, 1);
999 		BUG_ON(ret);
1000 	}
1001 	btrfs_free_path(path);
1002 
1003 	return 0;
1004 }
1005 
1006 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1007 					    struct btrfs_root *root,
1008 					    struct btrfs_path *path)
1009 {
1010 	int ret;
1011 	struct btrfs_key key;
1012 	struct inode *inode;
1013 
1014 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1015 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1016 	key.offset = (u64)-1;
1017 	while (1) {
1018 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1019 		if (ret < 0)
1020 			break;
1021 
1022 		if (ret == 1) {
1023 			if (path->slots[0] == 0)
1024 				break;
1025 			path->slots[0]--;
1026 		}
1027 
1028 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1029 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1030 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1031 			break;
1032 
1033 		ret = btrfs_del_item(trans, root, path);
1034 		BUG_ON(ret);
1035 
1036 		btrfs_release_path(root, path);
1037 		inode = read_one_inode(root, key.offset);
1038 		BUG_ON(!inode);
1039 
1040 		ret = fixup_inode_link_count(trans, root, inode);
1041 		BUG_ON(ret);
1042 
1043 		iput(inode);
1044 
1045 		/*
1046 		 * fixup on a directory may create new entries,
1047 		 * make sure we always look for the highset possible
1048 		 * offset
1049 		 */
1050 		key.offset = (u64)-1;
1051 	}
1052 	btrfs_release_path(root, path);
1053 	return 0;
1054 }
1055 
1056 
1057 /*
1058  * record a given inode in the fixup dir so we can check its link
1059  * count when replay is done.  The link count is incremented here
1060  * so the inode won't go away until we check it
1061  */
1062 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1063 				      struct btrfs_root *root,
1064 				      struct btrfs_path *path,
1065 				      u64 objectid)
1066 {
1067 	struct btrfs_key key;
1068 	int ret = 0;
1069 	struct inode *inode;
1070 
1071 	inode = read_one_inode(root, objectid);
1072 	BUG_ON(!inode);
1073 
1074 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1075 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1076 	key.offset = objectid;
1077 
1078 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1079 
1080 	btrfs_release_path(root, path);
1081 	if (ret == 0) {
1082 		btrfs_inc_nlink(inode);
1083 		btrfs_update_inode(trans, root, inode);
1084 	} else if (ret == -EEXIST) {
1085 		ret = 0;
1086 	} else {
1087 		BUG();
1088 	}
1089 	iput(inode);
1090 
1091 	return ret;
1092 }
1093 
1094 /*
1095  * when replaying the log for a directory, we only insert names
1096  * for inodes that actually exist.  This means an fsync on a directory
1097  * does not implicitly fsync all the new files in it
1098  */
1099 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1100 				    struct btrfs_root *root,
1101 				    struct btrfs_path *path,
1102 				    u64 dirid, u64 index,
1103 				    char *name, int name_len, u8 type,
1104 				    struct btrfs_key *location)
1105 {
1106 	struct inode *inode;
1107 	struct inode *dir;
1108 	int ret;
1109 
1110 	inode = read_one_inode(root, location->objectid);
1111 	if (!inode)
1112 		return -ENOENT;
1113 
1114 	dir = read_one_inode(root, dirid);
1115 	if (!dir) {
1116 		iput(inode);
1117 		return -EIO;
1118 	}
1119 	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1120 
1121 	/* FIXME, put inode into FIXUP list */
1122 
1123 	iput(inode);
1124 	iput(dir);
1125 	return ret;
1126 }
1127 
1128 /*
1129  * take a single entry in a log directory item and replay it into
1130  * the subvolume.
1131  *
1132  * if a conflicting item exists in the subdirectory already,
1133  * the inode it points to is unlinked and put into the link count
1134  * fix up tree.
1135  *
1136  * If a name from the log points to a file or directory that does
1137  * not exist in the FS, it is skipped.  fsyncs on directories
1138  * do not force down inodes inside that directory, just changes to the
1139  * names or unlinks in a directory.
1140  */
1141 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1142 				    struct btrfs_root *root,
1143 				    struct btrfs_path *path,
1144 				    struct extent_buffer *eb,
1145 				    struct btrfs_dir_item *di,
1146 				    struct btrfs_key *key)
1147 {
1148 	char *name;
1149 	int name_len;
1150 	struct btrfs_dir_item *dst_di;
1151 	struct btrfs_key found_key;
1152 	struct btrfs_key log_key;
1153 	struct inode *dir;
1154 	u8 log_type;
1155 	int exists;
1156 	int ret;
1157 
1158 	dir = read_one_inode(root, key->objectid);
1159 	BUG_ON(!dir);
1160 
1161 	name_len = btrfs_dir_name_len(eb, di);
1162 	name = kmalloc(name_len, GFP_NOFS);
1163 	log_type = btrfs_dir_type(eb, di);
1164 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1165 		   name_len);
1166 
1167 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1168 	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1169 	if (exists == 0)
1170 		exists = 1;
1171 	else
1172 		exists = 0;
1173 	btrfs_release_path(root, path);
1174 
1175 	if (key->type == BTRFS_DIR_ITEM_KEY) {
1176 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1177 				       name, name_len, 1);
1178 	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1179 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1180 						     key->objectid,
1181 						     key->offset, name,
1182 						     name_len, 1);
1183 	} else {
1184 		BUG();
1185 	}
1186 	if (!dst_di || IS_ERR(dst_di)) {
1187 		/* we need a sequence number to insert, so we only
1188 		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1189 		 */
1190 		if (key->type != BTRFS_DIR_INDEX_KEY)
1191 			goto out;
1192 		goto insert;
1193 	}
1194 
1195 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1196 	/* the existing item matches the logged item */
1197 	if (found_key.objectid == log_key.objectid &&
1198 	    found_key.type == log_key.type &&
1199 	    found_key.offset == log_key.offset &&
1200 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1201 		goto out;
1202 	}
1203 
1204 	/*
1205 	 * don't drop the conflicting directory entry if the inode
1206 	 * for the new entry doesn't exist
1207 	 */
1208 	if (!exists)
1209 		goto out;
1210 
1211 	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1212 	BUG_ON(ret);
1213 
1214 	if (key->type == BTRFS_DIR_INDEX_KEY)
1215 		goto insert;
1216 out:
1217 	btrfs_release_path(root, path);
1218 	kfree(name);
1219 	iput(dir);
1220 	return 0;
1221 
1222 insert:
1223 	btrfs_release_path(root, path);
1224 	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1225 			      name, name_len, log_type, &log_key);
1226 
1227 	if (ret && ret != -ENOENT)
1228 		BUG();
1229 	goto out;
1230 }
1231 
1232 /*
1233  * find all the names in a directory item and reconcile them into
1234  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1235  * one name in a directory item, but the same code gets used for
1236  * both directory index types
1237  */
1238 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1239 					struct btrfs_root *root,
1240 					struct btrfs_path *path,
1241 					struct extent_buffer *eb, int slot,
1242 					struct btrfs_key *key)
1243 {
1244 	int ret;
1245 	u32 item_size = btrfs_item_size_nr(eb, slot);
1246 	struct btrfs_dir_item *di;
1247 	int name_len;
1248 	unsigned long ptr;
1249 	unsigned long ptr_end;
1250 
1251 	ptr = btrfs_item_ptr_offset(eb, slot);
1252 	ptr_end = ptr + item_size;
1253 	while (ptr < ptr_end) {
1254 		di = (struct btrfs_dir_item *)ptr;
1255 		name_len = btrfs_dir_name_len(eb, di);
1256 		ret = replay_one_name(trans, root, path, eb, di, key);
1257 		BUG_ON(ret);
1258 		ptr = (unsigned long)(di + 1);
1259 		ptr += name_len;
1260 	}
1261 	return 0;
1262 }
1263 
1264 /*
1265  * directory replay has two parts.  There are the standard directory
1266  * items in the log copied from the subvolume, and range items
1267  * created in the log while the subvolume was logged.
1268  *
1269  * The range items tell us which parts of the key space the log
1270  * is authoritative for.  During replay, if a key in the subvolume
1271  * directory is in a logged range item, but not actually in the log
1272  * that means it was deleted from the directory before the fsync
1273  * and should be removed.
1274  */
1275 static noinline int find_dir_range(struct btrfs_root *root,
1276 				   struct btrfs_path *path,
1277 				   u64 dirid, int key_type,
1278 				   u64 *start_ret, u64 *end_ret)
1279 {
1280 	struct btrfs_key key;
1281 	u64 found_end;
1282 	struct btrfs_dir_log_item *item;
1283 	int ret;
1284 	int nritems;
1285 
1286 	if (*start_ret == (u64)-1)
1287 		return 1;
1288 
1289 	key.objectid = dirid;
1290 	key.type = key_type;
1291 	key.offset = *start_ret;
1292 
1293 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1294 	if (ret < 0)
1295 		goto out;
1296 	if (ret > 0) {
1297 		if (path->slots[0] == 0)
1298 			goto out;
1299 		path->slots[0]--;
1300 	}
1301 	if (ret != 0)
1302 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1303 
1304 	if (key.type != key_type || key.objectid != dirid) {
1305 		ret = 1;
1306 		goto next;
1307 	}
1308 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1309 			      struct btrfs_dir_log_item);
1310 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1311 
1312 	if (*start_ret >= key.offset && *start_ret <= found_end) {
1313 		ret = 0;
1314 		*start_ret = key.offset;
1315 		*end_ret = found_end;
1316 		goto out;
1317 	}
1318 	ret = 1;
1319 next:
1320 	/* check the next slot in the tree to see if it is a valid item */
1321 	nritems = btrfs_header_nritems(path->nodes[0]);
1322 	if (path->slots[0] >= nritems) {
1323 		ret = btrfs_next_leaf(root, path);
1324 		if (ret)
1325 			goto out;
1326 	} else {
1327 		path->slots[0]++;
1328 	}
1329 
1330 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1331 
1332 	if (key.type != key_type || key.objectid != dirid) {
1333 		ret = 1;
1334 		goto out;
1335 	}
1336 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1337 			      struct btrfs_dir_log_item);
1338 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1339 	*start_ret = key.offset;
1340 	*end_ret = found_end;
1341 	ret = 0;
1342 out:
1343 	btrfs_release_path(root, path);
1344 	return ret;
1345 }
1346 
1347 /*
1348  * this looks for a given directory item in the log.  If the directory
1349  * item is not in the log, the item is removed and the inode it points
1350  * to is unlinked
1351  */
1352 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1353 				      struct btrfs_root *root,
1354 				      struct btrfs_root *log,
1355 				      struct btrfs_path *path,
1356 				      struct btrfs_path *log_path,
1357 				      struct inode *dir,
1358 				      struct btrfs_key *dir_key)
1359 {
1360 	int ret;
1361 	struct extent_buffer *eb;
1362 	int slot;
1363 	u32 item_size;
1364 	struct btrfs_dir_item *di;
1365 	struct btrfs_dir_item *log_di;
1366 	int name_len;
1367 	unsigned long ptr;
1368 	unsigned long ptr_end;
1369 	char *name;
1370 	struct inode *inode;
1371 	struct btrfs_key location;
1372 
1373 again:
1374 	eb = path->nodes[0];
1375 	slot = path->slots[0];
1376 	item_size = btrfs_item_size_nr(eb, slot);
1377 	ptr = btrfs_item_ptr_offset(eb, slot);
1378 	ptr_end = ptr + item_size;
1379 	while (ptr < ptr_end) {
1380 		di = (struct btrfs_dir_item *)ptr;
1381 		name_len = btrfs_dir_name_len(eb, di);
1382 		name = kmalloc(name_len, GFP_NOFS);
1383 		if (!name) {
1384 			ret = -ENOMEM;
1385 			goto out;
1386 		}
1387 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
1388 				  name_len);
1389 		log_di = NULL;
1390 		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1391 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
1392 						       dir_key->objectid,
1393 						       name, name_len, 0);
1394 		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1395 			log_di = btrfs_lookup_dir_index_item(trans, log,
1396 						     log_path,
1397 						     dir_key->objectid,
1398 						     dir_key->offset,
1399 						     name, name_len, 0);
1400 		}
1401 		if (!log_di || IS_ERR(log_di)) {
1402 			btrfs_dir_item_key_to_cpu(eb, di, &location);
1403 			btrfs_release_path(root, path);
1404 			btrfs_release_path(log, log_path);
1405 			inode = read_one_inode(root, location.objectid);
1406 			BUG_ON(!inode);
1407 
1408 			ret = link_to_fixup_dir(trans, root,
1409 						path, location.objectid);
1410 			BUG_ON(ret);
1411 			btrfs_inc_nlink(inode);
1412 			ret = btrfs_unlink_inode(trans, root, dir, inode,
1413 						 name, name_len);
1414 			BUG_ON(ret);
1415 			kfree(name);
1416 			iput(inode);
1417 
1418 			/* there might still be more names under this key
1419 			 * check and repeat if required
1420 			 */
1421 			ret = btrfs_search_slot(NULL, root, dir_key, path,
1422 						0, 0);
1423 			if (ret == 0)
1424 				goto again;
1425 			ret = 0;
1426 			goto out;
1427 		}
1428 		btrfs_release_path(log, log_path);
1429 		kfree(name);
1430 
1431 		ptr = (unsigned long)(di + 1);
1432 		ptr += name_len;
1433 	}
1434 	ret = 0;
1435 out:
1436 	btrfs_release_path(root, path);
1437 	btrfs_release_path(log, log_path);
1438 	return ret;
1439 }
1440 
1441 /*
1442  * deletion replay happens before we copy any new directory items
1443  * out of the log or out of backreferences from inodes.  It
1444  * scans the log to find ranges of keys that log is authoritative for,
1445  * and then scans the directory to find items in those ranges that are
1446  * not present in the log.
1447  *
1448  * Anything we don't find in the log is unlinked and removed from the
1449  * directory.
1450  */
1451 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1452 				       struct btrfs_root *root,
1453 				       struct btrfs_root *log,
1454 				       struct btrfs_path *path,
1455 				       u64 dirid, int del_all)
1456 {
1457 	u64 range_start;
1458 	u64 range_end;
1459 	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1460 	int ret = 0;
1461 	struct btrfs_key dir_key;
1462 	struct btrfs_key found_key;
1463 	struct btrfs_path *log_path;
1464 	struct inode *dir;
1465 
1466 	dir_key.objectid = dirid;
1467 	dir_key.type = BTRFS_DIR_ITEM_KEY;
1468 	log_path = btrfs_alloc_path();
1469 	if (!log_path)
1470 		return -ENOMEM;
1471 
1472 	dir = read_one_inode(root, dirid);
1473 	/* it isn't an error if the inode isn't there, that can happen
1474 	 * because we replay the deletes before we copy in the inode item
1475 	 * from the log
1476 	 */
1477 	if (!dir) {
1478 		btrfs_free_path(log_path);
1479 		return 0;
1480 	}
1481 again:
1482 	range_start = 0;
1483 	range_end = 0;
1484 	while (1) {
1485 		if (del_all)
1486 			range_end = (u64)-1;
1487 		else {
1488 			ret = find_dir_range(log, path, dirid, key_type,
1489 					     &range_start, &range_end);
1490 			if (ret != 0)
1491 				break;
1492 		}
1493 
1494 		dir_key.offset = range_start;
1495 		while (1) {
1496 			int nritems;
1497 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
1498 						0, 0);
1499 			if (ret < 0)
1500 				goto out;
1501 
1502 			nritems = btrfs_header_nritems(path->nodes[0]);
1503 			if (path->slots[0] >= nritems) {
1504 				ret = btrfs_next_leaf(root, path);
1505 				if (ret)
1506 					break;
1507 			}
1508 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1509 					      path->slots[0]);
1510 			if (found_key.objectid != dirid ||
1511 			    found_key.type != dir_key.type)
1512 				goto next_type;
1513 
1514 			if (found_key.offset > range_end)
1515 				break;
1516 
1517 			ret = check_item_in_log(trans, root, log, path,
1518 						log_path, dir,
1519 						&found_key);
1520 			BUG_ON(ret);
1521 			if (found_key.offset == (u64)-1)
1522 				break;
1523 			dir_key.offset = found_key.offset + 1;
1524 		}
1525 		btrfs_release_path(root, path);
1526 		if (range_end == (u64)-1)
1527 			break;
1528 		range_start = range_end + 1;
1529 	}
1530 
1531 next_type:
1532 	ret = 0;
1533 	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1534 		key_type = BTRFS_DIR_LOG_INDEX_KEY;
1535 		dir_key.type = BTRFS_DIR_INDEX_KEY;
1536 		btrfs_release_path(root, path);
1537 		goto again;
1538 	}
1539 out:
1540 	btrfs_release_path(root, path);
1541 	btrfs_free_path(log_path);
1542 	iput(dir);
1543 	return ret;
1544 }
1545 
1546 /*
1547  * the process_func used to replay items from the log tree.  This
1548  * gets called in two different stages.  The first stage just looks
1549  * for inodes and makes sure they are all copied into the subvolume.
1550  *
1551  * The second stage copies all the other item types from the log into
1552  * the subvolume.  The two stage approach is slower, but gets rid of
1553  * lots of complexity around inodes referencing other inodes that exist
1554  * only in the log (references come from either directory items or inode
1555  * back refs).
1556  */
1557 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1558 			     struct walk_control *wc, u64 gen)
1559 {
1560 	int nritems;
1561 	struct btrfs_path *path;
1562 	struct btrfs_root *root = wc->replay_dest;
1563 	struct btrfs_key key;
1564 	u32 item_size;
1565 	int level;
1566 	int i;
1567 	int ret;
1568 
1569 	btrfs_read_buffer(eb, gen);
1570 
1571 	level = btrfs_header_level(eb);
1572 
1573 	if (level != 0)
1574 		return 0;
1575 
1576 	path = btrfs_alloc_path();
1577 	BUG_ON(!path);
1578 
1579 	nritems = btrfs_header_nritems(eb);
1580 	for (i = 0; i < nritems; i++) {
1581 		btrfs_item_key_to_cpu(eb, &key, i);
1582 		item_size = btrfs_item_size_nr(eb, i);
1583 
1584 		/* inode keys are done during the first stage */
1585 		if (key.type == BTRFS_INODE_ITEM_KEY &&
1586 		    wc->stage == LOG_WALK_REPLAY_INODES) {
1587 			struct inode *inode;
1588 			struct btrfs_inode_item *inode_item;
1589 			u32 mode;
1590 
1591 			inode_item = btrfs_item_ptr(eb, i,
1592 					    struct btrfs_inode_item);
1593 			mode = btrfs_inode_mode(eb, inode_item);
1594 			if (S_ISDIR(mode)) {
1595 				ret = replay_dir_deletes(wc->trans,
1596 					 root, log, path, key.objectid, 0);
1597 				BUG_ON(ret);
1598 			}
1599 			ret = overwrite_item(wc->trans, root, path,
1600 					     eb, i, &key);
1601 			BUG_ON(ret);
1602 
1603 			/* for regular files, truncate away
1604 			 * extents past the new EOF
1605 			 */
1606 			if (S_ISREG(mode)) {
1607 				inode = read_one_inode(root,
1608 						       key.objectid);
1609 				BUG_ON(!inode);
1610 
1611 				ret = btrfs_truncate_inode_items(wc->trans,
1612 					root, inode, inode->i_size,
1613 					BTRFS_EXTENT_DATA_KEY);
1614 				BUG_ON(ret);
1615 
1616 				/* if the nlink count is zero here, the iput
1617 				 * will free the inode.  We bump it to make
1618 				 * sure it doesn't get freed until the link
1619 				 * count fixup is done
1620 				 */
1621 				if (inode->i_nlink == 0) {
1622 					btrfs_inc_nlink(inode);
1623 					btrfs_update_inode(wc->trans,
1624 							   root, inode);
1625 				}
1626 				iput(inode);
1627 			}
1628 			ret = link_to_fixup_dir(wc->trans, root,
1629 						path, key.objectid);
1630 			BUG_ON(ret);
1631 		}
1632 		if (wc->stage < LOG_WALK_REPLAY_ALL)
1633 			continue;
1634 
1635 		/* these keys are simply copied */
1636 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
1637 			ret = overwrite_item(wc->trans, root, path,
1638 					     eb, i, &key);
1639 			BUG_ON(ret);
1640 		} else if (key.type == BTRFS_INODE_REF_KEY) {
1641 			ret = add_inode_ref(wc->trans, root, log, path,
1642 					    eb, i, &key);
1643 			BUG_ON(ret && ret != -ENOENT);
1644 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1645 			ret = replay_one_extent(wc->trans, root, path,
1646 						eb, i, &key);
1647 			BUG_ON(ret);
1648 		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
1649 			   key.type == BTRFS_DIR_INDEX_KEY) {
1650 			ret = replay_one_dir_item(wc->trans, root, path,
1651 						  eb, i, &key);
1652 			BUG_ON(ret);
1653 		}
1654 	}
1655 	btrfs_free_path(path);
1656 	return 0;
1657 }
1658 
1659 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1660 				   struct btrfs_root *root,
1661 				   struct btrfs_path *path, int *level,
1662 				   struct walk_control *wc)
1663 {
1664 	u64 root_owner;
1665 	u64 root_gen;
1666 	u64 bytenr;
1667 	u64 ptr_gen;
1668 	struct extent_buffer *next;
1669 	struct extent_buffer *cur;
1670 	struct extent_buffer *parent;
1671 	u32 blocksize;
1672 	int ret = 0;
1673 
1674 	WARN_ON(*level < 0);
1675 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
1676 
1677 	while (*level > 0) {
1678 		WARN_ON(*level < 0);
1679 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
1680 		cur = path->nodes[*level];
1681 
1682 		if (btrfs_header_level(cur) != *level)
1683 			WARN_ON(1);
1684 
1685 		if (path->slots[*level] >=
1686 		    btrfs_header_nritems(cur))
1687 			break;
1688 
1689 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1690 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1691 		blocksize = btrfs_level_size(root, *level - 1);
1692 
1693 		parent = path->nodes[*level];
1694 		root_owner = btrfs_header_owner(parent);
1695 		root_gen = btrfs_header_generation(parent);
1696 
1697 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1698 
1699 		wc->process_func(root, next, wc, ptr_gen);
1700 
1701 		if (*level == 1) {
1702 			path->slots[*level]++;
1703 			if (wc->free) {
1704 				btrfs_read_buffer(next, ptr_gen);
1705 
1706 				btrfs_tree_lock(next);
1707 				clean_tree_block(trans, root, next);
1708 				btrfs_set_lock_blocking(next);
1709 				btrfs_wait_tree_block_writeback(next);
1710 				btrfs_tree_unlock(next);
1711 
1712 				ret = btrfs_drop_leaf_ref(trans, root, next);
1713 				BUG_ON(ret);
1714 
1715 				WARN_ON(root_owner !=
1716 					BTRFS_TREE_LOG_OBJECTID);
1717 				ret = btrfs_free_reserved_extent(root,
1718 							 bytenr, blocksize);
1719 				BUG_ON(ret);
1720 			}
1721 			free_extent_buffer(next);
1722 			continue;
1723 		}
1724 		btrfs_read_buffer(next, ptr_gen);
1725 
1726 		WARN_ON(*level <= 0);
1727 		if (path->nodes[*level-1])
1728 			free_extent_buffer(path->nodes[*level-1]);
1729 		path->nodes[*level-1] = next;
1730 		*level = btrfs_header_level(next);
1731 		path->slots[*level] = 0;
1732 		cond_resched();
1733 	}
1734 	WARN_ON(*level < 0);
1735 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
1736 
1737 	if (path->nodes[*level] == root->node)
1738 		parent = path->nodes[*level];
1739 	else
1740 		parent = path->nodes[*level + 1];
1741 
1742 	bytenr = path->nodes[*level]->start;
1743 
1744 	blocksize = btrfs_level_size(root, *level);
1745 	root_owner = btrfs_header_owner(parent);
1746 	root_gen = btrfs_header_generation(parent);
1747 
1748 	wc->process_func(root, path->nodes[*level], wc,
1749 			 btrfs_header_generation(path->nodes[*level]));
1750 
1751 	if (wc->free) {
1752 		next = path->nodes[*level];
1753 		btrfs_tree_lock(next);
1754 		clean_tree_block(trans, root, next);
1755 		btrfs_set_lock_blocking(next);
1756 		btrfs_wait_tree_block_writeback(next);
1757 		btrfs_tree_unlock(next);
1758 
1759 		if (*level == 0) {
1760 			ret = btrfs_drop_leaf_ref(trans, root, next);
1761 			BUG_ON(ret);
1762 		}
1763 		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1764 		ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1765 		BUG_ON(ret);
1766 	}
1767 	free_extent_buffer(path->nodes[*level]);
1768 	path->nodes[*level] = NULL;
1769 	*level += 1;
1770 
1771 	cond_resched();
1772 	return 0;
1773 }
1774 
1775 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1776 				 struct btrfs_root *root,
1777 				 struct btrfs_path *path, int *level,
1778 				 struct walk_control *wc)
1779 {
1780 	u64 root_owner;
1781 	u64 root_gen;
1782 	int i;
1783 	int slot;
1784 	int ret;
1785 
1786 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1787 		slot = path->slots[i];
1788 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1789 			struct extent_buffer *node;
1790 			node = path->nodes[i];
1791 			path->slots[i]++;
1792 			*level = i;
1793 			WARN_ON(*level == 0);
1794 			return 0;
1795 		} else {
1796 			struct extent_buffer *parent;
1797 			if (path->nodes[*level] == root->node)
1798 				parent = path->nodes[*level];
1799 			else
1800 				parent = path->nodes[*level + 1];
1801 
1802 			root_owner = btrfs_header_owner(parent);
1803 			root_gen = btrfs_header_generation(parent);
1804 			wc->process_func(root, path->nodes[*level], wc,
1805 				 btrfs_header_generation(path->nodes[*level]));
1806 			if (wc->free) {
1807 				struct extent_buffer *next;
1808 
1809 				next = path->nodes[*level];
1810 
1811 				btrfs_tree_lock(next);
1812 				clean_tree_block(trans, root, next);
1813 				btrfs_set_lock_blocking(next);
1814 				btrfs_wait_tree_block_writeback(next);
1815 				btrfs_tree_unlock(next);
1816 
1817 				if (*level == 0) {
1818 					ret = btrfs_drop_leaf_ref(trans, root,
1819 								  next);
1820 					BUG_ON(ret);
1821 				}
1822 
1823 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1824 				ret = btrfs_free_reserved_extent(root,
1825 						path->nodes[*level]->start,
1826 						path->nodes[*level]->len);
1827 				BUG_ON(ret);
1828 			}
1829 			free_extent_buffer(path->nodes[*level]);
1830 			path->nodes[*level] = NULL;
1831 			*level = i + 1;
1832 		}
1833 	}
1834 	return 1;
1835 }
1836 
1837 /*
1838  * drop the reference count on the tree rooted at 'snap'.  This traverses
1839  * the tree freeing any blocks that have a ref count of zero after being
1840  * decremented.
1841  */
1842 static int walk_log_tree(struct btrfs_trans_handle *trans,
1843 			 struct btrfs_root *log, struct walk_control *wc)
1844 {
1845 	int ret = 0;
1846 	int wret;
1847 	int level;
1848 	struct btrfs_path *path;
1849 	int i;
1850 	int orig_level;
1851 
1852 	path = btrfs_alloc_path();
1853 	BUG_ON(!path);
1854 
1855 	level = btrfs_header_level(log->node);
1856 	orig_level = level;
1857 	path->nodes[level] = log->node;
1858 	extent_buffer_get(log->node);
1859 	path->slots[level] = 0;
1860 
1861 	while (1) {
1862 		wret = walk_down_log_tree(trans, log, path, &level, wc);
1863 		if (wret > 0)
1864 			break;
1865 		if (wret < 0)
1866 			ret = wret;
1867 
1868 		wret = walk_up_log_tree(trans, log, path, &level, wc);
1869 		if (wret > 0)
1870 			break;
1871 		if (wret < 0)
1872 			ret = wret;
1873 	}
1874 
1875 	/* was the root node processed? if not, catch it here */
1876 	if (path->nodes[orig_level]) {
1877 		wc->process_func(log, path->nodes[orig_level], wc,
1878 			 btrfs_header_generation(path->nodes[orig_level]));
1879 		if (wc->free) {
1880 			struct extent_buffer *next;
1881 
1882 			next = path->nodes[orig_level];
1883 
1884 			btrfs_tree_lock(next);
1885 			clean_tree_block(trans, log, next);
1886 			btrfs_set_lock_blocking(next);
1887 			btrfs_wait_tree_block_writeback(next);
1888 			btrfs_tree_unlock(next);
1889 
1890 			if (orig_level == 0) {
1891 				ret = btrfs_drop_leaf_ref(trans, log,
1892 							  next);
1893 				BUG_ON(ret);
1894 			}
1895 			WARN_ON(log->root_key.objectid !=
1896 				BTRFS_TREE_LOG_OBJECTID);
1897 			ret = btrfs_free_reserved_extent(log, next->start,
1898 							 next->len);
1899 			BUG_ON(ret);
1900 		}
1901 	}
1902 
1903 	for (i = 0; i <= orig_level; i++) {
1904 		if (path->nodes[i]) {
1905 			free_extent_buffer(path->nodes[i]);
1906 			path->nodes[i] = NULL;
1907 		}
1908 	}
1909 	btrfs_free_path(path);
1910 	return ret;
1911 }
1912 
1913 /*
1914  * helper function to update the item for a given subvolumes log root
1915  * in the tree of log roots
1916  */
1917 static int update_log_root(struct btrfs_trans_handle *trans,
1918 			   struct btrfs_root *log)
1919 {
1920 	int ret;
1921 
1922 	if (log->log_transid == 1) {
1923 		/* insert root item on the first sync */
1924 		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
1925 				&log->root_key, &log->root_item);
1926 	} else {
1927 		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
1928 				&log->root_key, &log->root_item);
1929 	}
1930 	return ret;
1931 }
1932 
1933 static int wait_log_commit(struct btrfs_trans_handle *trans,
1934 			   struct btrfs_root *root, unsigned long transid)
1935 {
1936 	DEFINE_WAIT(wait);
1937 	int index = transid % 2;
1938 
1939 	/*
1940 	 * we only allow two pending log transactions at a time,
1941 	 * so we know that if ours is more than 2 older than the
1942 	 * current transaction, we're done
1943 	 */
1944 	do {
1945 		prepare_to_wait(&root->log_commit_wait[index],
1946 				&wait, TASK_UNINTERRUPTIBLE);
1947 		mutex_unlock(&root->log_mutex);
1948 
1949 		if (root->fs_info->last_trans_log_full_commit !=
1950 		    trans->transid && root->log_transid < transid + 2 &&
1951 		    atomic_read(&root->log_commit[index]))
1952 			schedule();
1953 
1954 		finish_wait(&root->log_commit_wait[index], &wait);
1955 		mutex_lock(&root->log_mutex);
1956 	} while (root->log_transid < transid + 2 &&
1957 		 atomic_read(&root->log_commit[index]));
1958 	return 0;
1959 }
1960 
1961 static int wait_for_writer(struct btrfs_trans_handle *trans,
1962 			   struct btrfs_root *root)
1963 {
1964 	DEFINE_WAIT(wait);
1965 	while (atomic_read(&root->log_writers)) {
1966 		prepare_to_wait(&root->log_writer_wait,
1967 				&wait, TASK_UNINTERRUPTIBLE);
1968 		mutex_unlock(&root->log_mutex);
1969 		if (root->fs_info->last_trans_log_full_commit !=
1970 		    trans->transid && atomic_read(&root->log_writers))
1971 			schedule();
1972 		mutex_lock(&root->log_mutex);
1973 		finish_wait(&root->log_writer_wait, &wait);
1974 	}
1975 	return 0;
1976 }
1977 
1978 /*
1979  * btrfs_sync_log does sends a given tree log down to the disk and
1980  * updates the super blocks to record it.  When this call is done,
1981  * you know that any inodes previously logged are safely on disk only
1982  * if it returns 0.
1983  *
1984  * Any other return value means you need to call btrfs_commit_transaction.
1985  * Some of the edge cases for fsyncing directories that have had unlinks
1986  * or renames done in the past mean that sometimes the only safe
1987  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
1988  * that has happened.
1989  */
1990 int btrfs_sync_log(struct btrfs_trans_handle *trans,
1991 		   struct btrfs_root *root)
1992 {
1993 	int index1;
1994 	int index2;
1995 	int ret;
1996 	struct btrfs_root *log = root->log_root;
1997 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1998 
1999 	mutex_lock(&root->log_mutex);
2000 	index1 = root->log_transid % 2;
2001 	if (atomic_read(&root->log_commit[index1])) {
2002 		wait_log_commit(trans, root, root->log_transid);
2003 		mutex_unlock(&root->log_mutex);
2004 		return 0;
2005 	}
2006 	atomic_set(&root->log_commit[index1], 1);
2007 
2008 	/* wait for previous tree log sync to complete */
2009 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2010 		wait_log_commit(trans, root, root->log_transid - 1);
2011 
2012 	while (1) {
2013 		unsigned long batch = root->log_batch;
2014 		mutex_unlock(&root->log_mutex);
2015 		schedule_timeout_uninterruptible(1);
2016 		mutex_lock(&root->log_mutex);
2017 
2018 		wait_for_writer(trans, root);
2019 		if (batch == root->log_batch)
2020 			break;
2021 	}
2022 
2023 	/* bail out if we need to do a full commit */
2024 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2025 		ret = -EAGAIN;
2026 		mutex_unlock(&root->log_mutex);
2027 		goto out;
2028 	}
2029 
2030 	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
2031 	BUG_ON(ret);
2032 
2033 	btrfs_set_root_bytenr(&log->root_item, log->node->start);
2034 	btrfs_set_root_generation(&log->root_item, trans->transid);
2035 	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2036 
2037 	root->log_batch = 0;
2038 	root->log_transid++;
2039 	log->log_transid = root->log_transid;
2040 	smp_mb();
2041 	/*
2042 	 * log tree has been flushed to disk, new modifications of
2043 	 * the log will be written to new positions. so it's safe to
2044 	 * allow log writers to go in.
2045 	 */
2046 	mutex_unlock(&root->log_mutex);
2047 
2048 	mutex_lock(&log_root_tree->log_mutex);
2049 	log_root_tree->log_batch++;
2050 	atomic_inc(&log_root_tree->log_writers);
2051 	mutex_unlock(&log_root_tree->log_mutex);
2052 
2053 	ret = update_log_root(trans, log);
2054 	BUG_ON(ret);
2055 
2056 	mutex_lock(&log_root_tree->log_mutex);
2057 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2058 		smp_mb();
2059 		if (waitqueue_active(&log_root_tree->log_writer_wait))
2060 			wake_up(&log_root_tree->log_writer_wait);
2061 	}
2062 
2063 	index2 = log_root_tree->log_transid % 2;
2064 	if (atomic_read(&log_root_tree->log_commit[index2])) {
2065 		wait_log_commit(trans, log_root_tree,
2066 				log_root_tree->log_transid);
2067 		mutex_unlock(&log_root_tree->log_mutex);
2068 		goto out;
2069 	}
2070 	atomic_set(&log_root_tree->log_commit[index2], 1);
2071 
2072 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2073 		wait_log_commit(trans, log_root_tree,
2074 				log_root_tree->log_transid - 1);
2075 	}
2076 
2077 	wait_for_writer(trans, log_root_tree);
2078 
2079 	/*
2080 	 * now that we've moved on to the tree of log tree roots,
2081 	 * check the full commit flag again
2082 	 */
2083 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2084 		mutex_unlock(&log_root_tree->log_mutex);
2085 		ret = -EAGAIN;
2086 		goto out_wake_log_root;
2087 	}
2088 
2089 	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
2090 				&log_root_tree->dirty_log_pages);
2091 	BUG_ON(ret);
2092 
2093 	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
2094 				log_root_tree->node->start);
2095 	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
2096 				btrfs_header_level(log_root_tree->node));
2097 
2098 	log_root_tree->log_batch = 0;
2099 	log_root_tree->log_transid++;
2100 	smp_mb();
2101 
2102 	mutex_unlock(&log_root_tree->log_mutex);
2103 
2104 	/*
2105 	 * nobody else is going to jump in and write the the ctree
2106 	 * super here because the log_commit atomic below is protecting
2107 	 * us.  We must be called with a transaction handle pinning
2108 	 * the running transaction open, so a full commit can't hop
2109 	 * in and cause problems either.
2110 	 */
2111 	write_ctree_super(trans, root->fs_info->tree_root, 2);
2112 	ret = 0;
2113 
2114 out_wake_log_root:
2115 	atomic_set(&log_root_tree->log_commit[index2], 0);
2116 	smp_mb();
2117 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2118 		wake_up(&log_root_tree->log_commit_wait[index2]);
2119 out:
2120 	atomic_set(&root->log_commit[index1], 0);
2121 	smp_mb();
2122 	if (waitqueue_active(&root->log_commit_wait[index1]))
2123 		wake_up(&root->log_commit_wait[index1]);
2124 	return 0;
2125 }
2126 
2127 /*
2128  * free all the extents used by the tree log.  This should be called
2129  * at commit time of the full transaction
2130  */
2131 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2132 {
2133 	int ret;
2134 	struct btrfs_root *log;
2135 	struct key;
2136 	u64 start;
2137 	u64 end;
2138 	struct walk_control wc = {
2139 		.free = 1,
2140 		.process_func = process_one_buffer
2141 	};
2142 
2143 	if (!root->log_root || root->fs_info->log_root_recovering)
2144 		return 0;
2145 
2146 	log = root->log_root;
2147 	ret = walk_log_tree(trans, log, &wc);
2148 	BUG_ON(ret);
2149 
2150 	while (1) {
2151 		ret = find_first_extent_bit(&log->dirty_log_pages,
2152 				    0, &start, &end, EXTENT_DIRTY);
2153 		if (ret)
2154 			break;
2155 
2156 		clear_extent_dirty(&log->dirty_log_pages,
2157 				   start, end, GFP_NOFS);
2158 	}
2159 
2160 	if (log->log_transid > 0) {
2161 		ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2162 				     &log->root_key);
2163 		BUG_ON(ret);
2164 	}
2165 	root->log_root = NULL;
2166 	free_extent_buffer(log->node);
2167 	kfree(log);
2168 	return 0;
2169 }
2170 
2171 /*
2172  * If both a file and directory are logged, and unlinks or renames are
2173  * mixed in, we have a few interesting corners:
2174  *
2175  * create file X in dir Y
2176  * link file X to X.link in dir Y
2177  * fsync file X
2178  * unlink file X but leave X.link
2179  * fsync dir Y
2180  *
2181  * After a crash we would expect only X.link to exist.  But file X
2182  * didn't get fsync'd again so the log has back refs for X and X.link.
2183  *
2184  * We solve this by removing directory entries and inode backrefs from the
2185  * log when a file that was logged in the current transaction is
2186  * unlinked.  Any later fsync will include the updated log entries, and
2187  * we'll be able to reconstruct the proper directory items from backrefs.
2188  *
2189  * This optimizations allows us to avoid relogging the entire inode
2190  * or the entire directory.
2191  */
2192 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2193 				 struct btrfs_root *root,
2194 				 const char *name, int name_len,
2195 				 struct inode *dir, u64 index)
2196 {
2197 	struct btrfs_root *log;
2198 	struct btrfs_dir_item *di;
2199 	struct btrfs_path *path;
2200 	int ret;
2201 	int bytes_del = 0;
2202 
2203 	if (BTRFS_I(dir)->logged_trans < trans->transid)
2204 		return 0;
2205 
2206 	ret = join_running_log_trans(root);
2207 	if (ret)
2208 		return 0;
2209 
2210 	mutex_lock(&BTRFS_I(dir)->log_mutex);
2211 
2212 	log = root->log_root;
2213 	path = btrfs_alloc_path();
2214 	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2215 				   name, name_len, -1);
2216 	if (di && !IS_ERR(di)) {
2217 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
2218 		bytes_del += name_len;
2219 		BUG_ON(ret);
2220 	}
2221 	btrfs_release_path(log, path);
2222 	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2223 					 index, name, name_len, -1);
2224 	if (di && !IS_ERR(di)) {
2225 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
2226 		bytes_del += name_len;
2227 		BUG_ON(ret);
2228 	}
2229 
2230 	/* update the directory size in the log to reflect the names
2231 	 * we have removed
2232 	 */
2233 	if (bytes_del) {
2234 		struct btrfs_key key;
2235 
2236 		key.objectid = dir->i_ino;
2237 		key.offset = 0;
2238 		key.type = BTRFS_INODE_ITEM_KEY;
2239 		btrfs_release_path(log, path);
2240 
2241 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2242 		if (ret == 0) {
2243 			struct btrfs_inode_item *item;
2244 			u64 i_size;
2245 
2246 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2247 					      struct btrfs_inode_item);
2248 			i_size = btrfs_inode_size(path->nodes[0], item);
2249 			if (i_size > bytes_del)
2250 				i_size -= bytes_del;
2251 			else
2252 				i_size = 0;
2253 			btrfs_set_inode_size(path->nodes[0], item, i_size);
2254 			btrfs_mark_buffer_dirty(path->nodes[0]);
2255 		} else
2256 			ret = 0;
2257 		btrfs_release_path(log, path);
2258 	}
2259 
2260 	btrfs_free_path(path);
2261 	mutex_unlock(&BTRFS_I(dir)->log_mutex);
2262 	btrfs_end_log_trans(root);
2263 
2264 	return 0;
2265 }
2266 
2267 /* see comments for btrfs_del_dir_entries_in_log */
2268 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2269 			       struct btrfs_root *root,
2270 			       const char *name, int name_len,
2271 			       struct inode *inode, u64 dirid)
2272 {
2273 	struct btrfs_root *log;
2274 	u64 index;
2275 	int ret;
2276 
2277 	if (BTRFS_I(inode)->logged_trans < trans->transid)
2278 		return 0;
2279 
2280 	ret = join_running_log_trans(root);
2281 	if (ret)
2282 		return 0;
2283 	log = root->log_root;
2284 	mutex_lock(&BTRFS_I(inode)->log_mutex);
2285 
2286 	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2287 				  dirid, &index);
2288 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
2289 	btrfs_end_log_trans(root);
2290 
2291 	return ret;
2292 }
2293 
2294 /*
2295  * creates a range item in the log for 'dirid'.  first_offset and
2296  * last_offset tell us which parts of the key space the log should
2297  * be considered authoritative for.
2298  */
2299 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2300 				       struct btrfs_root *log,
2301 				       struct btrfs_path *path,
2302 				       int key_type, u64 dirid,
2303 				       u64 first_offset, u64 last_offset)
2304 {
2305 	int ret;
2306 	struct btrfs_key key;
2307 	struct btrfs_dir_log_item *item;
2308 
2309 	key.objectid = dirid;
2310 	key.offset = first_offset;
2311 	if (key_type == BTRFS_DIR_ITEM_KEY)
2312 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
2313 	else
2314 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
2315 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2316 	BUG_ON(ret);
2317 
2318 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2319 			      struct btrfs_dir_log_item);
2320 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2321 	btrfs_mark_buffer_dirty(path->nodes[0]);
2322 	btrfs_release_path(log, path);
2323 	return 0;
2324 }
2325 
2326 /*
2327  * log all the items included in the current transaction for a given
2328  * directory.  This also creates the range items in the log tree required
2329  * to replay anything deleted before the fsync
2330  */
2331 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2332 			  struct btrfs_root *root, struct inode *inode,
2333 			  struct btrfs_path *path,
2334 			  struct btrfs_path *dst_path, int key_type,
2335 			  u64 min_offset, u64 *last_offset_ret)
2336 {
2337 	struct btrfs_key min_key;
2338 	struct btrfs_key max_key;
2339 	struct btrfs_root *log = root->log_root;
2340 	struct extent_buffer *src;
2341 	int ret;
2342 	int i;
2343 	int nritems;
2344 	u64 first_offset = min_offset;
2345 	u64 last_offset = (u64)-1;
2346 
2347 	log = root->log_root;
2348 	max_key.objectid = inode->i_ino;
2349 	max_key.offset = (u64)-1;
2350 	max_key.type = key_type;
2351 
2352 	min_key.objectid = inode->i_ino;
2353 	min_key.type = key_type;
2354 	min_key.offset = min_offset;
2355 
2356 	path->keep_locks = 1;
2357 
2358 	ret = btrfs_search_forward(root, &min_key, &max_key,
2359 				   path, 0, trans->transid);
2360 
2361 	/*
2362 	 * we didn't find anything from this transaction, see if there
2363 	 * is anything at all
2364 	 */
2365 	if (ret != 0 || min_key.objectid != inode->i_ino ||
2366 	    min_key.type != key_type) {
2367 		min_key.objectid = inode->i_ino;
2368 		min_key.type = key_type;
2369 		min_key.offset = (u64)-1;
2370 		btrfs_release_path(root, path);
2371 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2372 		if (ret < 0) {
2373 			btrfs_release_path(root, path);
2374 			return ret;
2375 		}
2376 		ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2377 
2378 		/* if ret == 0 there are items for this type,
2379 		 * create a range to tell us the last key of this type.
2380 		 * otherwise, there are no items in this directory after
2381 		 * *min_offset, and we create a range to indicate that.
2382 		 */
2383 		if (ret == 0) {
2384 			struct btrfs_key tmp;
2385 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2386 					      path->slots[0]);
2387 			if (key_type == tmp.type)
2388 				first_offset = max(min_offset, tmp.offset) + 1;
2389 		}
2390 		goto done;
2391 	}
2392 
2393 	/* go backward to find any previous key */
2394 	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2395 	if (ret == 0) {
2396 		struct btrfs_key tmp;
2397 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2398 		if (key_type == tmp.type) {
2399 			first_offset = tmp.offset;
2400 			ret = overwrite_item(trans, log, dst_path,
2401 					     path->nodes[0], path->slots[0],
2402 					     &tmp);
2403 		}
2404 	}
2405 	btrfs_release_path(root, path);
2406 
2407 	/* find the first key from this transaction again */
2408 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2409 	if (ret != 0) {
2410 		WARN_ON(1);
2411 		goto done;
2412 	}
2413 
2414 	/*
2415 	 * we have a block from this transaction, log every item in it
2416 	 * from our directory
2417 	 */
2418 	while (1) {
2419 		struct btrfs_key tmp;
2420 		src = path->nodes[0];
2421 		nritems = btrfs_header_nritems(src);
2422 		for (i = path->slots[0]; i < nritems; i++) {
2423 			btrfs_item_key_to_cpu(src, &min_key, i);
2424 
2425 			if (min_key.objectid != inode->i_ino ||
2426 			    min_key.type != key_type)
2427 				goto done;
2428 			ret = overwrite_item(trans, log, dst_path, src, i,
2429 					     &min_key);
2430 			BUG_ON(ret);
2431 		}
2432 		path->slots[0] = nritems;
2433 
2434 		/*
2435 		 * look ahead to the next item and see if it is also
2436 		 * from this directory and from this transaction
2437 		 */
2438 		ret = btrfs_next_leaf(root, path);
2439 		if (ret == 1) {
2440 			last_offset = (u64)-1;
2441 			goto done;
2442 		}
2443 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2444 		if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2445 			last_offset = (u64)-1;
2446 			goto done;
2447 		}
2448 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2449 			ret = overwrite_item(trans, log, dst_path,
2450 					     path->nodes[0], path->slots[0],
2451 					     &tmp);
2452 
2453 			BUG_ON(ret);
2454 			last_offset = tmp.offset;
2455 			goto done;
2456 		}
2457 	}
2458 done:
2459 	*last_offset_ret = last_offset;
2460 	btrfs_release_path(root, path);
2461 	btrfs_release_path(log, dst_path);
2462 
2463 	/* insert the log range keys to indicate where the log is valid */
2464 	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2465 				 first_offset, last_offset);
2466 	BUG_ON(ret);
2467 	return 0;
2468 }
2469 
2470 /*
2471  * logging directories is very similar to logging inodes, We find all the items
2472  * from the current transaction and write them to the log.
2473  *
2474  * The recovery code scans the directory in the subvolume, and if it finds a
2475  * key in the range logged that is not present in the log tree, then it means
2476  * that dir entry was unlinked during the transaction.
2477  *
2478  * In order for that scan to work, we must include one key smaller than
2479  * the smallest logged by this transaction and one key larger than the largest
2480  * key logged by this transaction.
2481  */
2482 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2483 			  struct btrfs_root *root, struct inode *inode,
2484 			  struct btrfs_path *path,
2485 			  struct btrfs_path *dst_path)
2486 {
2487 	u64 min_key;
2488 	u64 max_key;
2489 	int ret;
2490 	int key_type = BTRFS_DIR_ITEM_KEY;
2491 
2492 again:
2493 	min_key = 0;
2494 	max_key = 0;
2495 	while (1) {
2496 		ret = log_dir_items(trans, root, inode, path,
2497 				    dst_path, key_type, min_key,
2498 				    &max_key);
2499 		BUG_ON(ret);
2500 		if (max_key == (u64)-1)
2501 			break;
2502 		min_key = max_key + 1;
2503 	}
2504 
2505 	if (key_type == BTRFS_DIR_ITEM_KEY) {
2506 		key_type = BTRFS_DIR_INDEX_KEY;
2507 		goto again;
2508 	}
2509 	return 0;
2510 }
2511 
2512 /*
2513  * a helper function to drop items from the log before we relog an
2514  * inode.  max_key_type indicates the highest item type to remove.
2515  * This cannot be run for file data extents because it does not
2516  * free the extents they point to.
2517  */
2518 static int drop_objectid_items(struct btrfs_trans_handle *trans,
2519 				  struct btrfs_root *log,
2520 				  struct btrfs_path *path,
2521 				  u64 objectid, int max_key_type)
2522 {
2523 	int ret;
2524 	struct btrfs_key key;
2525 	struct btrfs_key found_key;
2526 
2527 	key.objectid = objectid;
2528 	key.type = max_key_type;
2529 	key.offset = (u64)-1;
2530 
2531 	while (1) {
2532 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2533 
2534 		if (ret != 1)
2535 			break;
2536 
2537 		if (path->slots[0] == 0)
2538 			break;
2539 
2540 		path->slots[0]--;
2541 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2542 				      path->slots[0]);
2543 
2544 		if (found_key.objectid != objectid)
2545 			break;
2546 
2547 		ret = btrfs_del_item(trans, log, path);
2548 		BUG_ON(ret);
2549 		btrfs_release_path(log, path);
2550 	}
2551 	btrfs_release_path(log, path);
2552 	return 0;
2553 }
2554 
2555 static noinline int copy_items(struct btrfs_trans_handle *trans,
2556 			       struct btrfs_root *log,
2557 			       struct btrfs_path *dst_path,
2558 			       struct extent_buffer *src,
2559 			       int start_slot, int nr, int inode_only)
2560 {
2561 	unsigned long src_offset;
2562 	unsigned long dst_offset;
2563 	struct btrfs_file_extent_item *extent;
2564 	struct btrfs_inode_item *inode_item;
2565 	int ret;
2566 	struct btrfs_key *ins_keys;
2567 	u32 *ins_sizes;
2568 	char *ins_data;
2569 	int i;
2570 	struct list_head ordered_sums;
2571 
2572 	INIT_LIST_HEAD(&ordered_sums);
2573 
2574 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2575 			   nr * sizeof(u32), GFP_NOFS);
2576 	ins_sizes = (u32 *)ins_data;
2577 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2578 
2579 	for (i = 0; i < nr; i++) {
2580 		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2581 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2582 	}
2583 	ret = btrfs_insert_empty_items(trans, log, dst_path,
2584 				       ins_keys, ins_sizes, nr);
2585 	BUG_ON(ret);
2586 
2587 	for (i = 0; i < nr; i++) {
2588 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2589 						   dst_path->slots[0]);
2590 
2591 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2592 
2593 		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2594 				   src_offset, ins_sizes[i]);
2595 
2596 		if (inode_only == LOG_INODE_EXISTS &&
2597 		    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2598 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
2599 						    dst_path->slots[0],
2600 						    struct btrfs_inode_item);
2601 			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2602 
2603 			/* set the generation to zero so the recover code
2604 			 * can tell the difference between an logging
2605 			 * just to say 'this inode exists' and a logging
2606 			 * to say 'update this inode with these values'
2607 			 */
2608 			btrfs_set_inode_generation(dst_path->nodes[0],
2609 						   inode_item, 0);
2610 		}
2611 		/* take a reference on file data extents so that truncates
2612 		 * or deletes of this inode don't have to relog the inode
2613 		 * again
2614 		 */
2615 		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2616 			int found_type;
2617 			extent = btrfs_item_ptr(src, start_slot + i,
2618 						struct btrfs_file_extent_item);
2619 
2620 			found_type = btrfs_file_extent_type(src, extent);
2621 			if (found_type == BTRFS_FILE_EXTENT_REG ||
2622 			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2623 				u64 ds = btrfs_file_extent_disk_bytenr(src,
2624 								   extent);
2625 				u64 dl = btrfs_file_extent_disk_num_bytes(src,
2626 								      extent);
2627 				u64 cs = btrfs_file_extent_offset(src, extent);
2628 				u64 cl = btrfs_file_extent_num_bytes(src,
2629 								     extent);;
2630 				if (btrfs_file_extent_compression(src,
2631 								  extent)) {
2632 					cs = 0;
2633 					cl = dl;
2634 				}
2635 				/* ds == 0 is a hole */
2636 				if (ds != 0) {
2637 					ret = btrfs_inc_extent_ref(trans, log,
2638 						   ds, dl,
2639 						   dst_path->nodes[0]->start,
2640 						   BTRFS_TREE_LOG_OBJECTID,
2641 						   trans->transid,
2642 						   ins_keys[i].objectid);
2643 					BUG_ON(ret);
2644 					ret = btrfs_lookup_csums_range(
2645 						   log->fs_info->csum_root,
2646 						   ds + cs, ds + cs + cl - 1,
2647 						   &ordered_sums);
2648 					BUG_ON(ret);
2649 				}
2650 			}
2651 		}
2652 		dst_path->slots[0]++;
2653 	}
2654 
2655 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2656 	btrfs_release_path(log, dst_path);
2657 	kfree(ins_data);
2658 
2659 	/*
2660 	 * we have to do this after the loop above to avoid changing the
2661 	 * log tree while trying to change the log tree.
2662 	 */
2663 	while (!list_empty(&ordered_sums)) {
2664 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2665 						   struct btrfs_ordered_sum,
2666 						   list);
2667 		ret = btrfs_csum_file_blocks(trans, log, sums);
2668 		BUG_ON(ret);
2669 		list_del(&sums->list);
2670 		kfree(sums);
2671 	}
2672 	return 0;
2673 }
2674 
2675 /* log a single inode in the tree log.
2676  * At least one parent directory for this inode must exist in the tree
2677  * or be logged already.
2678  *
2679  * Any items from this inode changed by the current transaction are copied
2680  * to the log tree.  An extra reference is taken on any extents in this
2681  * file, allowing us to avoid a whole pile of corner cases around logging
2682  * blocks that have been removed from the tree.
2683  *
2684  * See LOG_INODE_ALL and related defines for a description of what inode_only
2685  * does.
2686  *
2687  * This handles both files and directories.
2688  */
2689 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2690 			     struct btrfs_root *root, struct inode *inode,
2691 			     int inode_only)
2692 {
2693 	struct btrfs_path *path;
2694 	struct btrfs_path *dst_path;
2695 	struct btrfs_key min_key;
2696 	struct btrfs_key max_key;
2697 	struct btrfs_root *log = root->log_root;
2698 	struct extent_buffer *src = NULL;
2699 	u32 size;
2700 	int ret;
2701 	int nritems;
2702 	int ins_start_slot = 0;
2703 	int ins_nr;
2704 
2705 	log = root->log_root;
2706 
2707 	path = btrfs_alloc_path();
2708 	dst_path = btrfs_alloc_path();
2709 
2710 	min_key.objectid = inode->i_ino;
2711 	min_key.type = BTRFS_INODE_ITEM_KEY;
2712 	min_key.offset = 0;
2713 
2714 	max_key.objectid = inode->i_ino;
2715 
2716 	/* today the code can only do partial logging of directories */
2717 	if (!S_ISDIR(inode->i_mode))
2718 	    inode_only = LOG_INODE_ALL;
2719 
2720 	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2721 		max_key.type = BTRFS_XATTR_ITEM_KEY;
2722 	else
2723 		max_key.type = (u8)-1;
2724 	max_key.offset = (u64)-1;
2725 
2726 	mutex_lock(&BTRFS_I(inode)->log_mutex);
2727 
2728 	/*
2729 	 * a brute force approach to making sure we get the most uptodate
2730 	 * copies of everything.
2731 	 */
2732 	if (S_ISDIR(inode->i_mode)) {
2733 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2734 
2735 		if (inode_only == LOG_INODE_EXISTS)
2736 			max_key_type = BTRFS_XATTR_ITEM_KEY;
2737 		ret = drop_objectid_items(trans, log, path,
2738 					  inode->i_ino, max_key_type);
2739 	} else {
2740 		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2741 	}
2742 	BUG_ON(ret);
2743 	path->keep_locks = 1;
2744 
2745 	while (1) {
2746 		ins_nr = 0;
2747 		ret = btrfs_search_forward(root, &min_key, &max_key,
2748 					   path, 0, trans->transid);
2749 		if (ret != 0)
2750 			break;
2751 again:
2752 		/* note, ins_nr might be > 0 here, cleanup outside the loop */
2753 		if (min_key.objectid != inode->i_ino)
2754 			break;
2755 		if (min_key.type > max_key.type)
2756 			break;
2757 
2758 		src = path->nodes[0];
2759 		size = btrfs_item_size_nr(src, path->slots[0]);
2760 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2761 			ins_nr++;
2762 			goto next_slot;
2763 		} else if (!ins_nr) {
2764 			ins_start_slot = path->slots[0];
2765 			ins_nr = 1;
2766 			goto next_slot;
2767 		}
2768 
2769 		ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2770 				 ins_nr, inode_only);
2771 		BUG_ON(ret);
2772 		ins_nr = 1;
2773 		ins_start_slot = path->slots[0];
2774 next_slot:
2775 
2776 		nritems = btrfs_header_nritems(path->nodes[0]);
2777 		path->slots[0]++;
2778 		if (path->slots[0] < nritems) {
2779 			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2780 					      path->slots[0]);
2781 			goto again;
2782 		}
2783 		if (ins_nr) {
2784 			ret = copy_items(trans, log, dst_path, src,
2785 					 ins_start_slot,
2786 					 ins_nr, inode_only);
2787 			BUG_ON(ret);
2788 			ins_nr = 0;
2789 		}
2790 		btrfs_release_path(root, path);
2791 
2792 		if (min_key.offset < (u64)-1)
2793 			min_key.offset++;
2794 		else if (min_key.type < (u8)-1)
2795 			min_key.type++;
2796 		else if (min_key.objectid < (u64)-1)
2797 			min_key.objectid++;
2798 		else
2799 			break;
2800 	}
2801 	if (ins_nr) {
2802 		ret = copy_items(trans, log, dst_path, src,
2803 				 ins_start_slot,
2804 				 ins_nr, inode_only);
2805 		BUG_ON(ret);
2806 		ins_nr = 0;
2807 	}
2808 	WARN_ON(ins_nr);
2809 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2810 		btrfs_release_path(root, path);
2811 		btrfs_release_path(log, dst_path);
2812 		ret = log_directory_changes(trans, root, inode, path, dst_path);
2813 		BUG_ON(ret);
2814 	}
2815 	BTRFS_I(inode)->logged_trans = trans->transid;
2816 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
2817 
2818 	btrfs_free_path(path);
2819 	btrfs_free_path(dst_path);
2820 	return 0;
2821 }
2822 
2823 /*
2824  * follow the dentry parent pointers up the chain and see if any
2825  * of the directories in it require a full commit before they can
2826  * be logged.  Returns zero if nothing special needs to be done or 1 if
2827  * a full commit is required.
2828  */
2829 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2830 					       struct inode *inode,
2831 					       struct dentry *parent,
2832 					       struct super_block *sb,
2833 					       u64 last_committed)
2834 {
2835 	int ret = 0;
2836 	struct btrfs_root *root;
2837 
2838 	/*
2839 	 * for regular files, if its inode is already on disk, we don't
2840 	 * have to worry about the parents at all.  This is because
2841 	 * we can use the last_unlink_trans field to record renames
2842 	 * and other fun in this file.
2843 	 */
2844 	if (S_ISREG(inode->i_mode) &&
2845 	    BTRFS_I(inode)->generation <= last_committed &&
2846 	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
2847 			goto out;
2848 
2849 	if (!S_ISDIR(inode->i_mode)) {
2850 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2851 			goto out;
2852 		inode = parent->d_inode;
2853 	}
2854 
2855 	while (1) {
2856 		BTRFS_I(inode)->logged_trans = trans->transid;
2857 		smp_mb();
2858 
2859 		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2860 			root = BTRFS_I(inode)->root;
2861 
2862 			/*
2863 			 * make sure any commits to the log are forced
2864 			 * to be full commits
2865 			 */
2866 			root->fs_info->last_trans_log_full_commit =
2867 				trans->transid;
2868 			ret = 1;
2869 			break;
2870 		}
2871 
2872 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2873 			break;
2874 
2875 		if (parent == sb->s_root)
2876 			break;
2877 
2878 		parent = parent->d_parent;
2879 		inode = parent->d_inode;
2880 
2881 	}
2882 out:
2883 	return ret;
2884 }
2885 
2886 /*
2887  * helper function around btrfs_log_inode to make sure newly created
2888  * parent directories also end up in the log.  A minimal inode and backref
2889  * only logging is done of any parent directories that are older than
2890  * the last committed transaction
2891  */
2892 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2893 		    struct btrfs_root *root, struct inode *inode,
2894 		    struct dentry *parent, int exists_only)
2895 {
2896 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2897 	struct super_block *sb;
2898 	int ret = 0;
2899 	u64 last_committed = root->fs_info->last_trans_committed;
2900 
2901 	sb = inode->i_sb;
2902 
2903 	if (root->fs_info->last_trans_log_full_commit >
2904 	    root->fs_info->last_trans_committed) {
2905 		ret = 1;
2906 		goto end_no_trans;
2907 	}
2908 
2909 	ret = check_parent_dirs_for_sync(trans, inode, parent,
2910 					 sb, last_committed);
2911 	if (ret)
2912 		goto end_no_trans;
2913 
2914 	start_log_trans(trans, root);
2915 
2916 	ret = btrfs_log_inode(trans, root, inode, inode_only);
2917 	BUG_ON(ret);
2918 
2919 	/*
2920 	 * for regular files, if its inode is already on disk, we don't
2921 	 * have to worry about the parents at all.  This is because
2922 	 * we can use the last_unlink_trans field to record renames
2923 	 * and other fun in this file.
2924 	 */
2925 	if (S_ISREG(inode->i_mode) &&
2926 	    BTRFS_I(inode)->generation <= last_committed &&
2927 	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
2928 			goto no_parent;
2929 
2930 	inode_only = LOG_INODE_EXISTS;
2931 	while (1) {
2932 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2933 			break;
2934 
2935 		inode = parent->d_inode;
2936 		if (BTRFS_I(inode)->generation >
2937 		    root->fs_info->last_trans_committed) {
2938 			ret = btrfs_log_inode(trans, root, inode, inode_only);
2939 			BUG_ON(ret);
2940 		}
2941 		if (parent == sb->s_root)
2942 			break;
2943 
2944 		parent = parent->d_parent;
2945 	}
2946 no_parent:
2947 	ret = 0;
2948 	btrfs_end_log_trans(root);
2949 end_no_trans:
2950 	return ret;
2951 }
2952 
2953 /*
2954  * it is not safe to log dentry if the chunk root has added new
2955  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
2956  * If this returns 1, you must commit the transaction to safely get your
2957  * data on disk.
2958  */
2959 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2960 			  struct btrfs_root *root, struct dentry *dentry)
2961 {
2962 	return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2963 				      dentry->d_parent, 0);
2964 }
2965 
2966 /*
2967  * should be called during mount to recover any replay any log trees
2968  * from the FS
2969  */
2970 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2971 {
2972 	int ret;
2973 	struct btrfs_path *path;
2974 	struct btrfs_trans_handle *trans;
2975 	struct btrfs_key key;
2976 	struct btrfs_key found_key;
2977 	struct btrfs_key tmp_key;
2978 	struct btrfs_root *log;
2979 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2980 	u64 highest_inode;
2981 	struct walk_control wc = {
2982 		.process_func = process_one_buffer,
2983 		.stage = 0,
2984 	};
2985 
2986 	fs_info->log_root_recovering = 1;
2987 	path = btrfs_alloc_path();
2988 	BUG_ON(!path);
2989 
2990 	trans = btrfs_start_transaction(fs_info->tree_root, 1);
2991 
2992 	wc.trans = trans;
2993 	wc.pin = 1;
2994 
2995 	walk_log_tree(trans, log_root_tree, &wc);
2996 
2997 again:
2998 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
2999 	key.offset = (u64)-1;
3000 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
3001 
3002 	while (1) {
3003 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
3004 		if (ret < 0)
3005 			break;
3006 		if (ret > 0) {
3007 			if (path->slots[0] == 0)
3008 				break;
3009 			path->slots[0]--;
3010 		}
3011 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3012 				      path->slots[0]);
3013 		btrfs_release_path(log_root_tree, path);
3014 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
3015 			break;
3016 
3017 		log = btrfs_read_fs_root_no_radix(log_root_tree,
3018 						  &found_key);
3019 		BUG_ON(!log);
3020 
3021 
3022 		tmp_key.objectid = found_key.offset;
3023 		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
3024 		tmp_key.offset = (u64)-1;
3025 
3026 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
3027 		BUG_ON(!wc.replay_dest);
3028 
3029 		wc.replay_dest->log_root = log;
3030 		mutex_lock(&fs_info->trans_mutex);
3031 		btrfs_record_root_in_trans(wc.replay_dest);
3032 		mutex_unlock(&fs_info->trans_mutex);
3033 		ret = walk_log_tree(trans, log, &wc);
3034 		BUG_ON(ret);
3035 
3036 		if (wc.stage == LOG_WALK_REPLAY_ALL) {
3037 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
3038 						      path);
3039 			BUG_ON(ret);
3040 		}
3041 		ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
3042 		if (ret == 0) {
3043 			wc.replay_dest->highest_inode = highest_inode;
3044 			wc.replay_dest->last_inode_alloc = highest_inode;
3045 		}
3046 
3047 		key.offset = found_key.offset - 1;
3048 		wc.replay_dest->log_root = NULL;
3049 		free_extent_buffer(log->node);
3050 		kfree(log);
3051 
3052 		if (found_key.offset == 0)
3053 			break;
3054 	}
3055 	btrfs_release_path(log_root_tree, path);
3056 
3057 	/* step one is to pin it all, step two is to replay just inodes */
3058 	if (wc.pin) {
3059 		wc.pin = 0;
3060 		wc.process_func = replay_one_buffer;
3061 		wc.stage = LOG_WALK_REPLAY_INODES;
3062 		goto again;
3063 	}
3064 	/* step three is to replay everything */
3065 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
3066 		wc.stage++;
3067 		goto again;
3068 	}
3069 
3070 	btrfs_free_path(path);
3071 
3072 	free_extent_buffer(log_root_tree->node);
3073 	log_root_tree->log_root = NULL;
3074 	fs_info->log_root_recovering = 0;
3075 
3076 	/* step 4: commit the transaction, which also unpins the blocks */
3077 	btrfs_commit_transaction(trans, fs_info->tree_root);
3078 
3079 	kfree(log_root_tree);
3080 	return 0;
3081 }
3082 
3083 /*
3084  * there are some corner cases where we want to force a full
3085  * commit instead of allowing a directory to be logged.
3086  *
3087  * They revolve around files there were unlinked from the directory, and
3088  * this function updates the parent directory so that a full commit is
3089  * properly done if it is fsync'd later after the unlinks are done.
3090  */
3091 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3092 			     struct inode *dir, struct inode *inode,
3093 			     int for_rename)
3094 {
3095 	/*
3096 	 * when we're logging a file, if it hasn't been renamed
3097 	 * or unlinked, and its inode is fully committed on disk,
3098 	 * we don't have to worry about walking up the directory chain
3099 	 * to log its parents.
3100 	 *
3101 	 * So, we use the last_unlink_trans field to put this transid
3102 	 * into the file.  When the file is logged we check it and
3103 	 * don't log the parents if the file is fully on disk.
3104 	 */
3105 	if (S_ISREG(inode->i_mode))
3106 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
3107 
3108 	/*
3109 	 * if this directory was already logged any new
3110 	 * names for this file/dir will get recorded
3111 	 */
3112 	smp_mb();
3113 	if (BTRFS_I(dir)->logged_trans == trans->transid)
3114 		return;
3115 
3116 	/*
3117 	 * if the inode we're about to unlink was logged,
3118 	 * the log will be properly updated for any new names
3119 	 */
3120 	if (BTRFS_I(inode)->logged_trans == trans->transid)
3121 		return;
3122 
3123 	/*
3124 	 * when renaming files across directories, if the directory
3125 	 * there we're unlinking from gets fsync'd later on, there's
3126 	 * no way to find the destination directory later and fsync it
3127 	 * properly.  So, we have to be conservative and force commits
3128 	 * so the new name gets discovered.
3129 	 */
3130 	if (for_rename)
3131 		goto record;
3132 
3133 	/* we can safely do the unlink without any special recording */
3134 	return;
3135 
3136 record:
3137 	BTRFS_I(dir)->last_unlink_trans = trans->transid;
3138 }
3139 
3140 /*
3141  * Call this after adding a new name for a file and it will properly
3142  * update the log to reflect the new name.
3143  *
3144  * It will return zero if all goes well, and it will return 1 if a
3145  * full transaction commit is required.
3146  */
3147 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3148 			struct inode *inode, struct inode *old_dir,
3149 			struct dentry *parent)
3150 {
3151 	struct btrfs_root * root = BTRFS_I(inode)->root;
3152 
3153 	/*
3154 	 * this will force the logging code to walk the dentry chain
3155 	 * up for the file
3156 	 */
3157 	if (S_ISREG(inode->i_mode))
3158 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
3159 
3160 	/*
3161 	 * if this inode hasn't been logged and directory we're renaming it
3162 	 * from hasn't been logged, we don't need to log it
3163 	 */
3164 	if (BTRFS_I(inode)->logged_trans <=
3165 	    root->fs_info->last_trans_committed &&
3166 	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3167 		    root->fs_info->last_trans_committed))
3168 		return 0;
3169 
3170 	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3171 }
3172 
3173