xref: /openbmc/linux/fs/btrfs/tree-log.c (revision d2999e1b)
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/blkdev.h>
22 #include <linux/list_sort.h>
23 #include "tree-log.h"
24 #include "disk-io.h"
25 #include "locking.h"
26 #include "print-tree.h"
27 #include "backref.h"
28 #include "hash.h"
29 
30 /* magic values for the inode_only field in btrfs_log_inode:
31  *
32  * LOG_INODE_ALL means to log everything
33  * LOG_INODE_EXISTS means to log just enough to recreate the inode
34  * during log replay
35  */
36 #define LOG_INODE_ALL 0
37 #define LOG_INODE_EXISTS 1
38 
39 /*
40  * directory trouble cases
41  *
42  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
43  * log, we must force a full commit before doing an fsync of the directory
44  * where the unlink was done.
45  * ---> record transid of last unlink/rename per directory
46  *
47  * mkdir foo/some_dir
48  * normal commit
49  * rename foo/some_dir foo2/some_dir
50  * mkdir foo/some_dir
51  * fsync foo/some_dir/some_file
52  *
53  * The fsync above will unlink the original some_dir without recording
54  * it in its new location (foo2).  After a crash, some_dir will be gone
55  * unless the fsync of some_file forces a full commit
56  *
57  * 2) we must log any new names for any file or dir that is in the fsync
58  * log. ---> check inode while renaming/linking.
59  *
60  * 2a) we must log any new names for any file or dir during rename
61  * when the directory they are being removed from was logged.
62  * ---> check inode and old parent dir during rename
63  *
64  *  2a is actually the more important variant.  Without the extra logging
65  *  a crash might unlink the old name without recreating the new one
66  *
67  * 3) after a crash, we must go through any directories with a link count
68  * of zero and redo the rm -rf
69  *
70  * mkdir f1/foo
71  * normal commit
72  * rm -rf f1/foo
73  * fsync(f1)
74  *
75  * The directory f1/foo was fully removed from the FS, but fsync was never
76  * called on f1/foo, only its parent dir.  After a crash the rm -rf must
77  * be replayed.  This must be able to recurse down the entire
78  * directory tree.  The inode link count fixup code takes care of the
79  * ugly details.
80  */
81 
82 /*
83  * stages for the tree walking.  The first
84  * stage (0) is to only pin down the blocks we find
85  * the second stage (1) is to make sure that all the inodes
86  * we find in the log are created in the subvolume.
87  *
88  * The last stage is to deal with directories and links and extents
89  * and all the other fun semantics
90  */
91 #define LOG_WALK_PIN_ONLY 0
92 #define LOG_WALK_REPLAY_INODES 1
93 #define LOG_WALK_REPLAY_DIR_INDEX 2
94 #define LOG_WALK_REPLAY_ALL 3
95 
96 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
97 			     struct btrfs_root *root, struct inode *inode,
98 			     int inode_only);
99 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
100 			     struct btrfs_root *root,
101 			     struct btrfs_path *path, u64 objectid);
102 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
103 				       struct btrfs_root *root,
104 				       struct btrfs_root *log,
105 				       struct btrfs_path *path,
106 				       u64 dirid, int del_all);
107 
108 /*
109  * tree logging is a special write ahead log used to make sure that
110  * fsyncs and O_SYNCs can happen without doing full tree commits.
111  *
112  * Full tree commits are expensive because they require commonly
113  * modified blocks to be recowed, creating many dirty pages in the
114  * extent tree and a 4x-6x higher write load than ext3.
115  *
116  * Instead of doing a tree commit on every fsync, we use the
117  * key ranges and transaction ids to find items for a given file or directory
118  * that have changed in this transaction.  Those items are copied into
119  * a special tree (one per subvolume root), that tree is written to disk
120  * and then the fsync is considered complete.
121  *
122  * After a crash, items are copied out of the log-tree back into the
123  * subvolume tree.  Any file data extents found are recorded in the extent
124  * allocation tree, and the log-tree freed.
125  *
126  * The log tree is read three times: once to pin down all the extents it is
127  * using in ram, once to create all the inodes logged in the tree
128  * and once to do all the other items.
129  */
130 
131 /*
132  * start a sub transaction and setup the log tree
133  * this increments the log tree writer count to make the people
134  * syncing the tree wait for us to finish
135  */
136 static int start_log_trans(struct btrfs_trans_handle *trans,
137 			   struct btrfs_root *root,
138 			   struct btrfs_log_ctx *ctx)
139 {
140 	int index;
141 	int ret;
142 
143 	mutex_lock(&root->log_mutex);
144 	if (root->log_root) {
145 		if (btrfs_need_log_full_commit(root->fs_info, trans)) {
146 			ret = -EAGAIN;
147 			goto out;
148 		}
149 		if (!root->log_start_pid) {
150 			root->log_start_pid = current->pid;
151 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
152 		} else if (root->log_start_pid != current->pid) {
153 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
154 		}
155 
156 		atomic_inc(&root->log_batch);
157 		atomic_inc(&root->log_writers);
158 		if (ctx) {
159 			index = root->log_transid % 2;
160 			list_add_tail(&ctx->list, &root->log_ctxs[index]);
161 			ctx->log_transid = root->log_transid;
162 		}
163 		mutex_unlock(&root->log_mutex);
164 		return 0;
165 	}
166 
167 	ret = 0;
168 	mutex_lock(&root->fs_info->tree_log_mutex);
169 	if (!root->fs_info->log_root_tree)
170 		ret = btrfs_init_log_root_tree(trans, root->fs_info);
171 	mutex_unlock(&root->fs_info->tree_log_mutex);
172 	if (ret)
173 		goto out;
174 
175 	if (!root->log_root) {
176 		ret = btrfs_add_log_tree(trans, root);
177 		if (ret)
178 			goto out;
179 	}
180 	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
181 	root->log_start_pid = current->pid;
182 	atomic_inc(&root->log_batch);
183 	atomic_inc(&root->log_writers);
184 	if (ctx) {
185 		index = root->log_transid % 2;
186 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
187 		ctx->log_transid = root->log_transid;
188 	}
189 out:
190 	mutex_unlock(&root->log_mutex);
191 	return ret;
192 }
193 
194 /*
195  * returns 0 if there was a log transaction running and we were able
196  * to join, or returns -ENOENT if there were not transactions
197  * in progress
198  */
199 static int join_running_log_trans(struct btrfs_root *root)
200 {
201 	int ret = -ENOENT;
202 
203 	smp_mb();
204 	if (!root->log_root)
205 		return -ENOENT;
206 
207 	mutex_lock(&root->log_mutex);
208 	if (root->log_root) {
209 		ret = 0;
210 		atomic_inc(&root->log_writers);
211 	}
212 	mutex_unlock(&root->log_mutex);
213 	return ret;
214 }
215 
216 /*
217  * This either makes the current running log transaction wait
218  * until you call btrfs_end_log_trans() or it makes any future
219  * log transactions wait until you call btrfs_end_log_trans()
220  */
221 int btrfs_pin_log_trans(struct btrfs_root *root)
222 {
223 	int ret = -ENOENT;
224 
225 	mutex_lock(&root->log_mutex);
226 	atomic_inc(&root->log_writers);
227 	mutex_unlock(&root->log_mutex);
228 	return ret;
229 }
230 
231 /*
232  * indicate we're done making changes to the log tree
233  * and wake up anyone waiting to do a sync
234  */
235 void btrfs_end_log_trans(struct btrfs_root *root)
236 {
237 	if (atomic_dec_and_test(&root->log_writers)) {
238 		smp_mb();
239 		if (waitqueue_active(&root->log_writer_wait))
240 			wake_up(&root->log_writer_wait);
241 	}
242 }
243 
244 
245 /*
246  * the walk control struct is used to pass state down the chain when
247  * processing the log tree.  The stage field tells us which part
248  * of the log tree processing we are currently doing.  The others
249  * are state fields used for that specific part
250  */
251 struct walk_control {
252 	/* should we free the extent on disk when done?  This is used
253 	 * at transaction commit time while freeing a log tree
254 	 */
255 	int free;
256 
257 	/* should we write out the extent buffer?  This is used
258 	 * while flushing the log tree to disk during a sync
259 	 */
260 	int write;
261 
262 	/* should we wait for the extent buffer io to finish?  Also used
263 	 * while flushing the log tree to disk for a sync
264 	 */
265 	int wait;
266 
267 	/* pin only walk, we record which extents on disk belong to the
268 	 * log trees
269 	 */
270 	int pin;
271 
272 	/* what stage of the replay code we're currently in */
273 	int stage;
274 
275 	/* the root we are currently replaying */
276 	struct btrfs_root *replay_dest;
277 
278 	/* the trans handle for the current replay */
279 	struct btrfs_trans_handle *trans;
280 
281 	/* the function that gets used to process blocks we find in the
282 	 * tree.  Note the extent_buffer might not be up to date when it is
283 	 * passed in, and it must be checked or read if you need the data
284 	 * inside it
285 	 */
286 	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
287 			    struct walk_control *wc, u64 gen);
288 };
289 
290 /*
291  * process_func used to pin down extents, write them or wait on them
292  */
293 static int process_one_buffer(struct btrfs_root *log,
294 			      struct extent_buffer *eb,
295 			      struct walk_control *wc, u64 gen)
296 {
297 	int ret = 0;
298 
299 	/*
300 	 * If this fs is mixed then we need to be able to process the leaves to
301 	 * pin down any logged extents, so we have to read the block.
302 	 */
303 	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
304 		ret = btrfs_read_buffer(eb, gen);
305 		if (ret)
306 			return ret;
307 	}
308 
309 	if (wc->pin)
310 		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
311 						      eb->start, eb->len);
312 
313 	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
314 		if (wc->pin && btrfs_header_level(eb) == 0)
315 			ret = btrfs_exclude_logged_extents(log, eb);
316 		if (wc->write)
317 			btrfs_write_tree_block(eb);
318 		if (wc->wait)
319 			btrfs_wait_tree_block_writeback(eb);
320 	}
321 	return ret;
322 }
323 
324 /*
325  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
326  * to the src data we are copying out.
327  *
328  * root is the tree we are copying into, and path is a scratch
329  * path for use in this function (it should be released on entry and
330  * will be released on exit).
331  *
332  * If the key is already in the destination tree the existing item is
333  * overwritten.  If the existing item isn't big enough, it is extended.
334  * If it is too large, it is truncated.
335  *
336  * If the key isn't in the destination yet, a new item is inserted.
337  */
338 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
339 				   struct btrfs_root *root,
340 				   struct btrfs_path *path,
341 				   struct extent_buffer *eb, int slot,
342 				   struct btrfs_key *key)
343 {
344 	int ret;
345 	u32 item_size;
346 	u64 saved_i_size = 0;
347 	int save_old_i_size = 0;
348 	unsigned long src_ptr;
349 	unsigned long dst_ptr;
350 	int overwrite_root = 0;
351 	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
352 
353 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
354 		overwrite_root = 1;
355 
356 	item_size = btrfs_item_size_nr(eb, slot);
357 	src_ptr = btrfs_item_ptr_offset(eb, slot);
358 
359 	/* look for the key in the destination tree */
360 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
361 	if (ret < 0)
362 		return ret;
363 
364 	if (ret == 0) {
365 		char *src_copy;
366 		char *dst_copy;
367 		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
368 						  path->slots[0]);
369 		if (dst_size != item_size)
370 			goto insert;
371 
372 		if (item_size == 0) {
373 			btrfs_release_path(path);
374 			return 0;
375 		}
376 		dst_copy = kmalloc(item_size, GFP_NOFS);
377 		src_copy = kmalloc(item_size, GFP_NOFS);
378 		if (!dst_copy || !src_copy) {
379 			btrfs_release_path(path);
380 			kfree(dst_copy);
381 			kfree(src_copy);
382 			return -ENOMEM;
383 		}
384 
385 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
386 
387 		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
388 		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
389 				   item_size);
390 		ret = memcmp(dst_copy, src_copy, item_size);
391 
392 		kfree(dst_copy);
393 		kfree(src_copy);
394 		/*
395 		 * they have the same contents, just return, this saves
396 		 * us from cowing blocks in the destination tree and doing
397 		 * extra writes that may not have been done by a previous
398 		 * sync
399 		 */
400 		if (ret == 0) {
401 			btrfs_release_path(path);
402 			return 0;
403 		}
404 
405 		/*
406 		 * We need to load the old nbytes into the inode so when we
407 		 * replay the extents we've logged we get the right nbytes.
408 		 */
409 		if (inode_item) {
410 			struct btrfs_inode_item *item;
411 			u64 nbytes;
412 			u32 mode;
413 
414 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
415 					      struct btrfs_inode_item);
416 			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
417 			item = btrfs_item_ptr(eb, slot,
418 					      struct btrfs_inode_item);
419 			btrfs_set_inode_nbytes(eb, item, nbytes);
420 
421 			/*
422 			 * If this is a directory we need to reset the i_size to
423 			 * 0 so that we can set it up properly when replaying
424 			 * the rest of the items in this log.
425 			 */
426 			mode = btrfs_inode_mode(eb, item);
427 			if (S_ISDIR(mode))
428 				btrfs_set_inode_size(eb, item, 0);
429 		}
430 	} else if (inode_item) {
431 		struct btrfs_inode_item *item;
432 		u32 mode;
433 
434 		/*
435 		 * New inode, set nbytes to 0 so that the nbytes comes out
436 		 * properly when we replay the extents.
437 		 */
438 		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
439 		btrfs_set_inode_nbytes(eb, item, 0);
440 
441 		/*
442 		 * If this is a directory we need to reset the i_size to 0 so
443 		 * that we can set it up properly when replaying the rest of
444 		 * the items in this log.
445 		 */
446 		mode = btrfs_inode_mode(eb, item);
447 		if (S_ISDIR(mode))
448 			btrfs_set_inode_size(eb, item, 0);
449 	}
450 insert:
451 	btrfs_release_path(path);
452 	/* try to insert the key into the destination tree */
453 	ret = btrfs_insert_empty_item(trans, root, path,
454 				      key, item_size);
455 
456 	/* make sure any existing item is the correct size */
457 	if (ret == -EEXIST) {
458 		u32 found_size;
459 		found_size = btrfs_item_size_nr(path->nodes[0],
460 						path->slots[0]);
461 		if (found_size > item_size)
462 			btrfs_truncate_item(root, path, item_size, 1);
463 		else if (found_size < item_size)
464 			btrfs_extend_item(root, path,
465 					  item_size - found_size);
466 	} else if (ret) {
467 		return ret;
468 	}
469 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
470 					path->slots[0]);
471 
472 	/* don't overwrite an existing inode if the generation number
473 	 * was logged as zero.  This is done when the tree logging code
474 	 * is just logging an inode to make sure it exists after recovery.
475 	 *
476 	 * Also, don't overwrite i_size on directories during replay.
477 	 * log replay inserts and removes directory items based on the
478 	 * state of the tree found in the subvolume, and i_size is modified
479 	 * as it goes
480 	 */
481 	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
482 		struct btrfs_inode_item *src_item;
483 		struct btrfs_inode_item *dst_item;
484 
485 		src_item = (struct btrfs_inode_item *)src_ptr;
486 		dst_item = (struct btrfs_inode_item *)dst_ptr;
487 
488 		if (btrfs_inode_generation(eb, src_item) == 0)
489 			goto no_copy;
490 
491 		if (overwrite_root &&
492 		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
493 		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
494 			save_old_i_size = 1;
495 			saved_i_size = btrfs_inode_size(path->nodes[0],
496 							dst_item);
497 		}
498 	}
499 
500 	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
501 			   src_ptr, item_size);
502 
503 	if (save_old_i_size) {
504 		struct btrfs_inode_item *dst_item;
505 		dst_item = (struct btrfs_inode_item *)dst_ptr;
506 		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
507 	}
508 
509 	/* make sure the generation is filled in */
510 	if (key->type == BTRFS_INODE_ITEM_KEY) {
511 		struct btrfs_inode_item *dst_item;
512 		dst_item = (struct btrfs_inode_item *)dst_ptr;
513 		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
514 			btrfs_set_inode_generation(path->nodes[0], dst_item,
515 						   trans->transid);
516 		}
517 	}
518 no_copy:
519 	btrfs_mark_buffer_dirty(path->nodes[0]);
520 	btrfs_release_path(path);
521 	return 0;
522 }
523 
524 /*
525  * simple helper to read an inode off the disk from a given root
526  * This can only be called for subvolume roots and not for the log
527  */
528 static noinline struct inode *read_one_inode(struct btrfs_root *root,
529 					     u64 objectid)
530 {
531 	struct btrfs_key key;
532 	struct inode *inode;
533 
534 	key.objectid = objectid;
535 	key.type = BTRFS_INODE_ITEM_KEY;
536 	key.offset = 0;
537 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
538 	if (IS_ERR(inode)) {
539 		inode = NULL;
540 	} else if (is_bad_inode(inode)) {
541 		iput(inode);
542 		inode = NULL;
543 	}
544 	return inode;
545 }
546 
547 /* replays a single extent in 'eb' at 'slot' with 'key' into the
548  * subvolume 'root'.  path is released on entry and should be released
549  * on exit.
550  *
551  * extents in the log tree have not been allocated out of the extent
552  * tree yet.  So, this completes the allocation, taking a reference
553  * as required if the extent already exists or creating a new extent
554  * if it isn't in the extent allocation tree yet.
555  *
556  * The extent is inserted into the file, dropping any existing extents
557  * from the file that overlap the new one.
558  */
559 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
560 				      struct btrfs_root *root,
561 				      struct btrfs_path *path,
562 				      struct extent_buffer *eb, int slot,
563 				      struct btrfs_key *key)
564 {
565 	int found_type;
566 	u64 extent_end;
567 	u64 start = key->offset;
568 	u64 nbytes = 0;
569 	struct btrfs_file_extent_item *item;
570 	struct inode *inode = NULL;
571 	unsigned long size;
572 	int ret = 0;
573 
574 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
575 	found_type = btrfs_file_extent_type(eb, item);
576 
577 	if (found_type == BTRFS_FILE_EXTENT_REG ||
578 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
579 		nbytes = btrfs_file_extent_num_bytes(eb, item);
580 		extent_end = start + nbytes;
581 
582 		/*
583 		 * We don't add to the inodes nbytes if we are prealloc or a
584 		 * hole.
585 		 */
586 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
587 			nbytes = 0;
588 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
589 		size = btrfs_file_extent_inline_len(eb, slot, item);
590 		nbytes = btrfs_file_extent_ram_bytes(eb, item);
591 		extent_end = ALIGN(start + size, root->sectorsize);
592 	} else {
593 		ret = 0;
594 		goto out;
595 	}
596 
597 	inode = read_one_inode(root, key->objectid);
598 	if (!inode) {
599 		ret = -EIO;
600 		goto out;
601 	}
602 
603 	/*
604 	 * first check to see if we already have this extent in the
605 	 * file.  This must be done before the btrfs_drop_extents run
606 	 * so we don't try to drop this extent.
607 	 */
608 	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
609 				       start, 0);
610 
611 	if (ret == 0 &&
612 	    (found_type == BTRFS_FILE_EXTENT_REG ||
613 	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
614 		struct btrfs_file_extent_item cmp1;
615 		struct btrfs_file_extent_item cmp2;
616 		struct btrfs_file_extent_item *existing;
617 		struct extent_buffer *leaf;
618 
619 		leaf = path->nodes[0];
620 		existing = btrfs_item_ptr(leaf, path->slots[0],
621 					  struct btrfs_file_extent_item);
622 
623 		read_extent_buffer(eb, &cmp1, (unsigned long)item,
624 				   sizeof(cmp1));
625 		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
626 				   sizeof(cmp2));
627 
628 		/*
629 		 * we already have a pointer to this exact extent,
630 		 * we don't have to do anything
631 		 */
632 		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
633 			btrfs_release_path(path);
634 			goto out;
635 		}
636 	}
637 	btrfs_release_path(path);
638 
639 	/* drop any overlapping extents */
640 	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
641 	if (ret)
642 		goto out;
643 
644 	if (found_type == BTRFS_FILE_EXTENT_REG ||
645 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
646 		u64 offset;
647 		unsigned long dest_offset;
648 		struct btrfs_key ins;
649 
650 		ret = btrfs_insert_empty_item(trans, root, path, key,
651 					      sizeof(*item));
652 		if (ret)
653 			goto out;
654 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
655 						    path->slots[0]);
656 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
657 				(unsigned long)item,  sizeof(*item));
658 
659 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
660 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
661 		ins.type = BTRFS_EXTENT_ITEM_KEY;
662 		offset = key->offset - btrfs_file_extent_offset(eb, item);
663 
664 		if (ins.objectid > 0) {
665 			u64 csum_start;
666 			u64 csum_end;
667 			LIST_HEAD(ordered_sums);
668 			/*
669 			 * is this extent already allocated in the extent
670 			 * allocation tree?  If so, just add a reference
671 			 */
672 			ret = btrfs_lookup_extent(root, ins.objectid,
673 						ins.offset);
674 			if (ret == 0) {
675 				ret = btrfs_inc_extent_ref(trans, root,
676 						ins.objectid, ins.offset,
677 						0, root->root_key.objectid,
678 						key->objectid, offset, 0);
679 				if (ret)
680 					goto out;
681 			} else {
682 				/*
683 				 * insert the extent pointer in the extent
684 				 * allocation tree
685 				 */
686 				ret = btrfs_alloc_logged_file_extent(trans,
687 						root, root->root_key.objectid,
688 						key->objectid, offset, &ins);
689 				if (ret)
690 					goto out;
691 			}
692 			btrfs_release_path(path);
693 
694 			if (btrfs_file_extent_compression(eb, item)) {
695 				csum_start = ins.objectid;
696 				csum_end = csum_start + ins.offset;
697 			} else {
698 				csum_start = ins.objectid +
699 					btrfs_file_extent_offset(eb, item);
700 				csum_end = csum_start +
701 					btrfs_file_extent_num_bytes(eb, item);
702 			}
703 
704 			ret = btrfs_lookup_csums_range(root->log_root,
705 						csum_start, csum_end - 1,
706 						&ordered_sums, 0);
707 			if (ret)
708 				goto out;
709 			while (!list_empty(&ordered_sums)) {
710 				struct btrfs_ordered_sum *sums;
711 				sums = list_entry(ordered_sums.next,
712 						struct btrfs_ordered_sum,
713 						list);
714 				if (!ret)
715 					ret = btrfs_csum_file_blocks(trans,
716 						root->fs_info->csum_root,
717 						sums);
718 				list_del(&sums->list);
719 				kfree(sums);
720 			}
721 			if (ret)
722 				goto out;
723 		} else {
724 			btrfs_release_path(path);
725 		}
726 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
727 		/* inline extents are easy, we just overwrite them */
728 		ret = overwrite_item(trans, root, path, eb, slot, key);
729 		if (ret)
730 			goto out;
731 	}
732 
733 	inode_add_bytes(inode, nbytes);
734 	ret = btrfs_update_inode(trans, root, inode);
735 out:
736 	if (inode)
737 		iput(inode);
738 	return ret;
739 }
740 
741 /*
742  * when cleaning up conflicts between the directory names in the
743  * subvolume, directory names in the log and directory names in the
744  * inode back references, we may have to unlink inodes from directories.
745  *
746  * This is a helper function to do the unlink of a specific directory
747  * item
748  */
749 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
750 				      struct btrfs_root *root,
751 				      struct btrfs_path *path,
752 				      struct inode *dir,
753 				      struct btrfs_dir_item *di)
754 {
755 	struct inode *inode;
756 	char *name;
757 	int name_len;
758 	struct extent_buffer *leaf;
759 	struct btrfs_key location;
760 	int ret;
761 
762 	leaf = path->nodes[0];
763 
764 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
765 	name_len = btrfs_dir_name_len(leaf, di);
766 	name = kmalloc(name_len, GFP_NOFS);
767 	if (!name)
768 		return -ENOMEM;
769 
770 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
771 	btrfs_release_path(path);
772 
773 	inode = read_one_inode(root, location.objectid);
774 	if (!inode) {
775 		ret = -EIO;
776 		goto out;
777 	}
778 
779 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
780 	if (ret)
781 		goto out;
782 
783 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
784 	if (ret)
785 		goto out;
786 	else
787 		ret = btrfs_run_delayed_items(trans, root);
788 out:
789 	kfree(name);
790 	iput(inode);
791 	return ret;
792 }
793 
794 /*
795  * helper function to see if a given name and sequence number found
796  * in an inode back reference are already in a directory and correctly
797  * point to this inode
798  */
799 static noinline int inode_in_dir(struct btrfs_root *root,
800 				 struct btrfs_path *path,
801 				 u64 dirid, u64 objectid, u64 index,
802 				 const char *name, int name_len)
803 {
804 	struct btrfs_dir_item *di;
805 	struct btrfs_key location;
806 	int match = 0;
807 
808 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
809 					 index, name, name_len, 0);
810 	if (di && !IS_ERR(di)) {
811 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
812 		if (location.objectid != objectid)
813 			goto out;
814 	} else
815 		goto out;
816 	btrfs_release_path(path);
817 
818 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
819 	if (di && !IS_ERR(di)) {
820 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
821 		if (location.objectid != objectid)
822 			goto out;
823 	} else
824 		goto out;
825 	match = 1;
826 out:
827 	btrfs_release_path(path);
828 	return match;
829 }
830 
831 /*
832  * helper function to check a log tree for a named back reference in
833  * an inode.  This is used to decide if a back reference that is
834  * found in the subvolume conflicts with what we find in the log.
835  *
836  * inode backreferences may have multiple refs in a single item,
837  * during replay we process one reference at a time, and we don't
838  * want to delete valid links to a file from the subvolume if that
839  * link is also in the log.
840  */
841 static noinline int backref_in_log(struct btrfs_root *log,
842 				   struct btrfs_key *key,
843 				   u64 ref_objectid,
844 				   char *name, int namelen)
845 {
846 	struct btrfs_path *path;
847 	struct btrfs_inode_ref *ref;
848 	unsigned long ptr;
849 	unsigned long ptr_end;
850 	unsigned long name_ptr;
851 	int found_name_len;
852 	int item_size;
853 	int ret;
854 	int match = 0;
855 
856 	path = btrfs_alloc_path();
857 	if (!path)
858 		return -ENOMEM;
859 
860 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
861 	if (ret != 0)
862 		goto out;
863 
864 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
865 
866 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
867 		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
868 						   name, namelen, NULL))
869 			match = 1;
870 
871 		goto out;
872 	}
873 
874 	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
875 	ptr_end = ptr + item_size;
876 	while (ptr < ptr_end) {
877 		ref = (struct btrfs_inode_ref *)ptr;
878 		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
879 		if (found_name_len == namelen) {
880 			name_ptr = (unsigned long)(ref + 1);
881 			ret = memcmp_extent_buffer(path->nodes[0], name,
882 						   name_ptr, namelen);
883 			if (ret == 0) {
884 				match = 1;
885 				goto out;
886 			}
887 		}
888 		ptr = (unsigned long)(ref + 1) + found_name_len;
889 	}
890 out:
891 	btrfs_free_path(path);
892 	return match;
893 }
894 
895 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
896 				  struct btrfs_root *root,
897 				  struct btrfs_path *path,
898 				  struct btrfs_root *log_root,
899 				  struct inode *dir, struct inode *inode,
900 				  struct extent_buffer *eb,
901 				  u64 inode_objectid, u64 parent_objectid,
902 				  u64 ref_index, char *name, int namelen,
903 				  int *search_done)
904 {
905 	int ret;
906 	char *victim_name;
907 	int victim_name_len;
908 	struct extent_buffer *leaf;
909 	struct btrfs_dir_item *di;
910 	struct btrfs_key search_key;
911 	struct btrfs_inode_extref *extref;
912 
913 again:
914 	/* Search old style refs */
915 	search_key.objectid = inode_objectid;
916 	search_key.type = BTRFS_INODE_REF_KEY;
917 	search_key.offset = parent_objectid;
918 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
919 	if (ret == 0) {
920 		struct btrfs_inode_ref *victim_ref;
921 		unsigned long ptr;
922 		unsigned long ptr_end;
923 
924 		leaf = path->nodes[0];
925 
926 		/* are we trying to overwrite a back ref for the root directory
927 		 * if so, just jump out, we're done
928 		 */
929 		if (search_key.objectid == search_key.offset)
930 			return 1;
931 
932 		/* check all the names in this back reference to see
933 		 * if they are in the log.  if so, we allow them to stay
934 		 * otherwise they must be unlinked as a conflict
935 		 */
936 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
937 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
938 		while (ptr < ptr_end) {
939 			victim_ref = (struct btrfs_inode_ref *)ptr;
940 			victim_name_len = btrfs_inode_ref_name_len(leaf,
941 								   victim_ref);
942 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
943 			if (!victim_name)
944 				return -ENOMEM;
945 
946 			read_extent_buffer(leaf, victim_name,
947 					   (unsigned long)(victim_ref + 1),
948 					   victim_name_len);
949 
950 			if (!backref_in_log(log_root, &search_key,
951 					    parent_objectid,
952 					    victim_name,
953 					    victim_name_len)) {
954 				inc_nlink(inode);
955 				btrfs_release_path(path);
956 
957 				ret = btrfs_unlink_inode(trans, root, dir,
958 							 inode, victim_name,
959 							 victim_name_len);
960 				kfree(victim_name);
961 				if (ret)
962 					return ret;
963 				ret = btrfs_run_delayed_items(trans, root);
964 				if (ret)
965 					return ret;
966 				*search_done = 1;
967 				goto again;
968 			}
969 			kfree(victim_name);
970 
971 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
972 		}
973 
974 		/*
975 		 * NOTE: we have searched root tree and checked the
976 		 * coresponding ref, it does not need to check again.
977 		 */
978 		*search_done = 1;
979 	}
980 	btrfs_release_path(path);
981 
982 	/* Same search but for extended refs */
983 	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
984 					   inode_objectid, parent_objectid, 0,
985 					   0);
986 	if (!IS_ERR_OR_NULL(extref)) {
987 		u32 item_size;
988 		u32 cur_offset = 0;
989 		unsigned long base;
990 		struct inode *victim_parent;
991 
992 		leaf = path->nodes[0];
993 
994 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
995 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
996 
997 		while (cur_offset < item_size) {
998 			extref = (struct btrfs_inode_extref *)base + cur_offset;
999 
1000 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1001 
1002 			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1003 				goto next;
1004 
1005 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
1006 			if (!victim_name)
1007 				return -ENOMEM;
1008 			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1009 					   victim_name_len);
1010 
1011 			search_key.objectid = inode_objectid;
1012 			search_key.type = BTRFS_INODE_EXTREF_KEY;
1013 			search_key.offset = btrfs_extref_hash(parent_objectid,
1014 							      victim_name,
1015 							      victim_name_len);
1016 			ret = 0;
1017 			if (!backref_in_log(log_root, &search_key,
1018 					    parent_objectid, victim_name,
1019 					    victim_name_len)) {
1020 				ret = -ENOENT;
1021 				victim_parent = read_one_inode(root,
1022 							       parent_objectid);
1023 				if (victim_parent) {
1024 					inc_nlink(inode);
1025 					btrfs_release_path(path);
1026 
1027 					ret = btrfs_unlink_inode(trans, root,
1028 								 victim_parent,
1029 								 inode,
1030 								 victim_name,
1031 								 victim_name_len);
1032 					if (!ret)
1033 						ret = btrfs_run_delayed_items(
1034 								  trans, root);
1035 				}
1036 				iput(victim_parent);
1037 				kfree(victim_name);
1038 				if (ret)
1039 					return ret;
1040 				*search_done = 1;
1041 				goto again;
1042 			}
1043 			kfree(victim_name);
1044 			if (ret)
1045 				return ret;
1046 next:
1047 			cur_offset += victim_name_len + sizeof(*extref);
1048 		}
1049 		*search_done = 1;
1050 	}
1051 	btrfs_release_path(path);
1052 
1053 	/* look for a conflicting sequence number */
1054 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1055 					 ref_index, name, namelen, 0);
1056 	if (di && !IS_ERR(di)) {
1057 		ret = drop_one_dir_item(trans, root, path, dir, di);
1058 		if (ret)
1059 			return ret;
1060 	}
1061 	btrfs_release_path(path);
1062 
1063 	/* look for a conflicing name */
1064 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1065 				   name, namelen, 0);
1066 	if (di && !IS_ERR(di)) {
1067 		ret = drop_one_dir_item(trans, root, path, dir, di);
1068 		if (ret)
1069 			return ret;
1070 	}
1071 	btrfs_release_path(path);
1072 
1073 	return 0;
1074 }
1075 
1076 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1077 			     u32 *namelen, char **name, u64 *index,
1078 			     u64 *parent_objectid)
1079 {
1080 	struct btrfs_inode_extref *extref;
1081 
1082 	extref = (struct btrfs_inode_extref *)ref_ptr;
1083 
1084 	*namelen = btrfs_inode_extref_name_len(eb, extref);
1085 	*name = kmalloc(*namelen, GFP_NOFS);
1086 	if (*name == NULL)
1087 		return -ENOMEM;
1088 
1089 	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1090 			   *namelen);
1091 
1092 	*index = btrfs_inode_extref_index(eb, extref);
1093 	if (parent_objectid)
1094 		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1095 
1096 	return 0;
1097 }
1098 
1099 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1100 			  u32 *namelen, char **name, u64 *index)
1101 {
1102 	struct btrfs_inode_ref *ref;
1103 
1104 	ref = (struct btrfs_inode_ref *)ref_ptr;
1105 
1106 	*namelen = btrfs_inode_ref_name_len(eb, ref);
1107 	*name = kmalloc(*namelen, GFP_NOFS);
1108 	if (*name == NULL)
1109 		return -ENOMEM;
1110 
1111 	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1112 
1113 	*index = btrfs_inode_ref_index(eb, ref);
1114 
1115 	return 0;
1116 }
1117 
1118 /*
1119  * replay one inode back reference item found in the log tree.
1120  * eb, slot and key refer to the buffer and key found in the log tree.
1121  * root is the destination we are replaying into, and path is for temp
1122  * use by this function.  (it should be released on return).
1123  */
1124 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1125 				  struct btrfs_root *root,
1126 				  struct btrfs_root *log,
1127 				  struct btrfs_path *path,
1128 				  struct extent_buffer *eb, int slot,
1129 				  struct btrfs_key *key)
1130 {
1131 	struct inode *dir = NULL;
1132 	struct inode *inode = NULL;
1133 	unsigned long ref_ptr;
1134 	unsigned long ref_end;
1135 	char *name = NULL;
1136 	int namelen;
1137 	int ret;
1138 	int search_done = 0;
1139 	int log_ref_ver = 0;
1140 	u64 parent_objectid;
1141 	u64 inode_objectid;
1142 	u64 ref_index = 0;
1143 	int ref_struct_size;
1144 
1145 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1146 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1147 
1148 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1149 		struct btrfs_inode_extref *r;
1150 
1151 		ref_struct_size = sizeof(struct btrfs_inode_extref);
1152 		log_ref_ver = 1;
1153 		r = (struct btrfs_inode_extref *)ref_ptr;
1154 		parent_objectid = btrfs_inode_extref_parent(eb, r);
1155 	} else {
1156 		ref_struct_size = sizeof(struct btrfs_inode_ref);
1157 		parent_objectid = key->offset;
1158 	}
1159 	inode_objectid = key->objectid;
1160 
1161 	/*
1162 	 * it is possible that we didn't log all the parent directories
1163 	 * for a given inode.  If we don't find the dir, just don't
1164 	 * copy the back ref in.  The link count fixup code will take
1165 	 * care of the rest
1166 	 */
1167 	dir = read_one_inode(root, parent_objectid);
1168 	if (!dir) {
1169 		ret = -ENOENT;
1170 		goto out;
1171 	}
1172 
1173 	inode = read_one_inode(root, inode_objectid);
1174 	if (!inode) {
1175 		ret = -EIO;
1176 		goto out;
1177 	}
1178 
1179 	while (ref_ptr < ref_end) {
1180 		if (log_ref_ver) {
1181 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1182 						&ref_index, &parent_objectid);
1183 			/*
1184 			 * parent object can change from one array
1185 			 * item to another.
1186 			 */
1187 			if (!dir)
1188 				dir = read_one_inode(root, parent_objectid);
1189 			if (!dir) {
1190 				ret = -ENOENT;
1191 				goto out;
1192 			}
1193 		} else {
1194 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1195 					     &ref_index);
1196 		}
1197 		if (ret)
1198 			goto out;
1199 
1200 		/* if we already have a perfect match, we're done */
1201 		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1202 				  ref_index, name, namelen)) {
1203 			/*
1204 			 * look for a conflicting back reference in the
1205 			 * metadata. if we find one we have to unlink that name
1206 			 * of the file before we add our new link.  Later on, we
1207 			 * overwrite any existing back reference, and we don't
1208 			 * want to create dangling pointers in the directory.
1209 			 */
1210 
1211 			if (!search_done) {
1212 				ret = __add_inode_ref(trans, root, path, log,
1213 						      dir, inode, eb,
1214 						      inode_objectid,
1215 						      parent_objectid,
1216 						      ref_index, name, namelen,
1217 						      &search_done);
1218 				if (ret) {
1219 					if (ret == 1)
1220 						ret = 0;
1221 					goto out;
1222 				}
1223 			}
1224 
1225 			/* insert our name */
1226 			ret = btrfs_add_link(trans, dir, inode, name, namelen,
1227 					     0, ref_index);
1228 			if (ret)
1229 				goto out;
1230 
1231 			btrfs_update_inode(trans, root, inode);
1232 		}
1233 
1234 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1235 		kfree(name);
1236 		name = NULL;
1237 		if (log_ref_ver) {
1238 			iput(dir);
1239 			dir = NULL;
1240 		}
1241 	}
1242 
1243 	/* finally write the back reference in the inode */
1244 	ret = overwrite_item(trans, root, path, eb, slot, key);
1245 out:
1246 	btrfs_release_path(path);
1247 	kfree(name);
1248 	iput(dir);
1249 	iput(inode);
1250 	return ret;
1251 }
1252 
1253 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1254 			      struct btrfs_root *root, u64 offset)
1255 {
1256 	int ret;
1257 	ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
1258 			offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
1259 	if (ret > 0)
1260 		ret = btrfs_insert_orphan_item(trans, root, offset);
1261 	return ret;
1262 }
1263 
1264 static int count_inode_extrefs(struct btrfs_root *root,
1265 			       struct inode *inode, struct btrfs_path *path)
1266 {
1267 	int ret = 0;
1268 	int name_len;
1269 	unsigned int nlink = 0;
1270 	u32 item_size;
1271 	u32 cur_offset = 0;
1272 	u64 inode_objectid = btrfs_ino(inode);
1273 	u64 offset = 0;
1274 	unsigned long ptr;
1275 	struct btrfs_inode_extref *extref;
1276 	struct extent_buffer *leaf;
1277 
1278 	while (1) {
1279 		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1280 					    &extref, &offset);
1281 		if (ret)
1282 			break;
1283 
1284 		leaf = path->nodes[0];
1285 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1286 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1287 
1288 		while (cur_offset < item_size) {
1289 			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1290 			name_len = btrfs_inode_extref_name_len(leaf, extref);
1291 
1292 			nlink++;
1293 
1294 			cur_offset += name_len + sizeof(*extref);
1295 		}
1296 
1297 		offset++;
1298 		btrfs_release_path(path);
1299 	}
1300 	btrfs_release_path(path);
1301 
1302 	if (ret < 0)
1303 		return ret;
1304 	return nlink;
1305 }
1306 
1307 static int count_inode_refs(struct btrfs_root *root,
1308 			       struct inode *inode, struct btrfs_path *path)
1309 {
1310 	int ret;
1311 	struct btrfs_key key;
1312 	unsigned int nlink = 0;
1313 	unsigned long ptr;
1314 	unsigned long ptr_end;
1315 	int name_len;
1316 	u64 ino = btrfs_ino(inode);
1317 
1318 	key.objectid = ino;
1319 	key.type = BTRFS_INODE_REF_KEY;
1320 	key.offset = (u64)-1;
1321 
1322 	while (1) {
1323 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1324 		if (ret < 0)
1325 			break;
1326 		if (ret > 0) {
1327 			if (path->slots[0] == 0)
1328 				break;
1329 			path->slots[0]--;
1330 		}
1331 process_slot:
1332 		btrfs_item_key_to_cpu(path->nodes[0], &key,
1333 				      path->slots[0]);
1334 		if (key.objectid != ino ||
1335 		    key.type != BTRFS_INODE_REF_KEY)
1336 			break;
1337 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1338 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1339 						   path->slots[0]);
1340 		while (ptr < ptr_end) {
1341 			struct btrfs_inode_ref *ref;
1342 
1343 			ref = (struct btrfs_inode_ref *)ptr;
1344 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1345 							    ref);
1346 			ptr = (unsigned long)(ref + 1) + name_len;
1347 			nlink++;
1348 		}
1349 
1350 		if (key.offset == 0)
1351 			break;
1352 		if (path->slots[0] > 0) {
1353 			path->slots[0]--;
1354 			goto process_slot;
1355 		}
1356 		key.offset--;
1357 		btrfs_release_path(path);
1358 	}
1359 	btrfs_release_path(path);
1360 
1361 	return nlink;
1362 }
1363 
1364 /*
1365  * There are a few corners where the link count of the file can't
1366  * be properly maintained during replay.  So, instead of adding
1367  * lots of complexity to the log code, we just scan the backrefs
1368  * for any file that has been through replay.
1369  *
1370  * The scan will update the link count on the inode to reflect the
1371  * number of back refs found.  If it goes down to zero, the iput
1372  * will free the inode.
1373  */
1374 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1375 					   struct btrfs_root *root,
1376 					   struct inode *inode)
1377 {
1378 	struct btrfs_path *path;
1379 	int ret;
1380 	u64 nlink = 0;
1381 	u64 ino = btrfs_ino(inode);
1382 
1383 	path = btrfs_alloc_path();
1384 	if (!path)
1385 		return -ENOMEM;
1386 
1387 	ret = count_inode_refs(root, inode, path);
1388 	if (ret < 0)
1389 		goto out;
1390 
1391 	nlink = ret;
1392 
1393 	ret = count_inode_extrefs(root, inode, path);
1394 	if (ret == -ENOENT)
1395 		ret = 0;
1396 
1397 	if (ret < 0)
1398 		goto out;
1399 
1400 	nlink += ret;
1401 
1402 	ret = 0;
1403 
1404 	if (nlink != inode->i_nlink) {
1405 		set_nlink(inode, nlink);
1406 		btrfs_update_inode(trans, root, inode);
1407 	}
1408 	BTRFS_I(inode)->index_cnt = (u64)-1;
1409 
1410 	if (inode->i_nlink == 0) {
1411 		if (S_ISDIR(inode->i_mode)) {
1412 			ret = replay_dir_deletes(trans, root, NULL, path,
1413 						 ino, 1);
1414 			if (ret)
1415 				goto out;
1416 		}
1417 		ret = insert_orphan_item(trans, root, ino);
1418 	}
1419 
1420 out:
1421 	btrfs_free_path(path);
1422 	return ret;
1423 }
1424 
1425 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1426 					    struct btrfs_root *root,
1427 					    struct btrfs_path *path)
1428 {
1429 	int ret;
1430 	struct btrfs_key key;
1431 	struct inode *inode;
1432 
1433 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1434 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1435 	key.offset = (u64)-1;
1436 	while (1) {
1437 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1438 		if (ret < 0)
1439 			break;
1440 
1441 		if (ret == 1) {
1442 			if (path->slots[0] == 0)
1443 				break;
1444 			path->slots[0]--;
1445 		}
1446 
1447 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1448 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1449 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1450 			break;
1451 
1452 		ret = btrfs_del_item(trans, root, path);
1453 		if (ret)
1454 			goto out;
1455 
1456 		btrfs_release_path(path);
1457 		inode = read_one_inode(root, key.offset);
1458 		if (!inode)
1459 			return -EIO;
1460 
1461 		ret = fixup_inode_link_count(trans, root, inode);
1462 		iput(inode);
1463 		if (ret)
1464 			goto out;
1465 
1466 		/*
1467 		 * fixup on a directory may create new entries,
1468 		 * make sure we always look for the highset possible
1469 		 * offset
1470 		 */
1471 		key.offset = (u64)-1;
1472 	}
1473 	ret = 0;
1474 out:
1475 	btrfs_release_path(path);
1476 	return ret;
1477 }
1478 
1479 
1480 /*
1481  * record a given inode in the fixup dir so we can check its link
1482  * count when replay is done.  The link count is incremented here
1483  * so the inode won't go away until we check it
1484  */
1485 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1486 				      struct btrfs_root *root,
1487 				      struct btrfs_path *path,
1488 				      u64 objectid)
1489 {
1490 	struct btrfs_key key;
1491 	int ret = 0;
1492 	struct inode *inode;
1493 
1494 	inode = read_one_inode(root, objectid);
1495 	if (!inode)
1496 		return -EIO;
1497 
1498 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1499 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1500 	key.offset = objectid;
1501 
1502 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1503 
1504 	btrfs_release_path(path);
1505 	if (ret == 0) {
1506 		if (!inode->i_nlink)
1507 			set_nlink(inode, 1);
1508 		else
1509 			inc_nlink(inode);
1510 		ret = btrfs_update_inode(trans, root, inode);
1511 	} else if (ret == -EEXIST) {
1512 		ret = 0;
1513 	} else {
1514 		BUG(); /* Logic Error */
1515 	}
1516 	iput(inode);
1517 
1518 	return ret;
1519 }
1520 
1521 /*
1522  * when replaying the log for a directory, we only insert names
1523  * for inodes that actually exist.  This means an fsync on a directory
1524  * does not implicitly fsync all the new files in it
1525  */
1526 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1527 				    struct btrfs_root *root,
1528 				    struct btrfs_path *path,
1529 				    u64 dirid, u64 index,
1530 				    char *name, int name_len, u8 type,
1531 				    struct btrfs_key *location)
1532 {
1533 	struct inode *inode;
1534 	struct inode *dir;
1535 	int ret;
1536 
1537 	inode = read_one_inode(root, location->objectid);
1538 	if (!inode)
1539 		return -ENOENT;
1540 
1541 	dir = read_one_inode(root, dirid);
1542 	if (!dir) {
1543 		iput(inode);
1544 		return -EIO;
1545 	}
1546 
1547 	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1548 
1549 	/* FIXME, put inode into FIXUP list */
1550 
1551 	iput(inode);
1552 	iput(dir);
1553 	return ret;
1554 }
1555 
1556 /*
1557  * take a single entry in a log directory item and replay it into
1558  * the subvolume.
1559  *
1560  * if a conflicting item exists in the subdirectory already,
1561  * the inode it points to is unlinked and put into the link count
1562  * fix up tree.
1563  *
1564  * If a name from the log points to a file or directory that does
1565  * not exist in the FS, it is skipped.  fsyncs on directories
1566  * do not force down inodes inside that directory, just changes to the
1567  * names or unlinks in a directory.
1568  */
1569 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1570 				    struct btrfs_root *root,
1571 				    struct btrfs_path *path,
1572 				    struct extent_buffer *eb,
1573 				    struct btrfs_dir_item *di,
1574 				    struct btrfs_key *key)
1575 {
1576 	char *name;
1577 	int name_len;
1578 	struct btrfs_dir_item *dst_di;
1579 	struct btrfs_key found_key;
1580 	struct btrfs_key log_key;
1581 	struct inode *dir;
1582 	u8 log_type;
1583 	int exists;
1584 	int ret = 0;
1585 	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1586 
1587 	dir = read_one_inode(root, key->objectid);
1588 	if (!dir)
1589 		return -EIO;
1590 
1591 	name_len = btrfs_dir_name_len(eb, di);
1592 	name = kmalloc(name_len, GFP_NOFS);
1593 	if (!name) {
1594 		ret = -ENOMEM;
1595 		goto out;
1596 	}
1597 
1598 	log_type = btrfs_dir_type(eb, di);
1599 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1600 		   name_len);
1601 
1602 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1603 	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1604 	if (exists == 0)
1605 		exists = 1;
1606 	else
1607 		exists = 0;
1608 	btrfs_release_path(path);
1609 
1610 	if (key->type == BTRFS_DIR_ITEM_KEY) {
1611 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1612 				       name, name_len, 1);
1613 	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1614 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1615 						     key->objectid,
1616 						     key->offset, name,
1617 						     name_len, 1);
1618 	} else {
1619 		/* Corruption */
1620 		ret = -EINVAL;
1621 		goto out;
1622 	}
1623 	if (IS_ERR_OR_NULL(dst_di)) {
1624 		/* we need a sequence number to insert, so we only
1625 		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1626 		 */
1627 		if (key->type != BTRFS_DIR_INDEX_KEY)
1628 			goto out;
1629 		goto insert;
1630 	}
1631 
1632 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1633 	/* the existing item matches the logged item */
1634 	if (found_key.objectid == log_key.objectid &&
1635 	    found_key.type == log_key.type &&
1636 	    found_key.offset == log_key.offset &&
1637 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1638 		goto out;
1639 	}
1640 
1641 	/*
1642 	 * don't drop the conflicting directory entry if the inode
1643 	 * for the new entry doesn't exist
1644 	 */
1645 	if (!exists)
1646 		goto out;
1647 
1648 	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1649 	if (ret)
1650 		goto out;
1651 
1652 	if (key->type == BTRFS_DIR_INDEX_KEY)
1653 		goto insert;
1654 out:
1655 	btrfs_release_path(path);
1656 	if (!ret && update_size) {
1657 		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
1658 		ret = btrfs_update_inode(trans, root, dir);
1659 	}
1660 	kfree(name);
1661 	iput(dir);
1662 	return ret;
1663 
1664 insert:
1665 	btrfs_release_path(path);
1666 	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1667 			      name, name_len, log_type, &log_key);
1668 	if (ret && ret != -ENOENT)
1669 		goto out;
1670 	update_size = false;
1671 	ret = 0;
1672 	goto out;
1673 }
1674 
1675 /*
1676  * find all the names in a directory item and reconcile them into
1677  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1678  * one name in a directory item, but the same code gets used for
1679  * both directory index types
1680  */
1681 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1682 					struct btrfs_root *root,
1683 					struct btrfs_path *path,
1684 					struct extent_buffer *eb, int slot,
1685 					struct btrfs_key *key)
1686 {
1687 	int ret;
1688 	u32 item_size = btrfs_item_size_nr(eb, slot);
1689 	struct btrfs_dir_item *di;
1690 	int name_len;
1691 	unsigned long ptr;
1692 	unsigned long ptr_end;
1693 
1694 	ptr = btrfs_item_ptr_offset(eb, slot);
1695 	ptr_end = ptr + item_size;
1696 	while (ptr < ptr_end) {
1697 		di = (struct btrfs_dir_item *)ptr;
1698 		if (verify_dir_item(root, eb, di))
1699 			return -EIO;
1700 		name_len = btrfs_dir_name_len(eb, di);
1701 		ret = replay_one_name(trans, root, path, eb, di, key);
1702 		if (ret)
1703 			return ret;
1704 		ptr = (unsigned long)(di + 1);
1705 		ptr += name_len;
1706 	}
1707 	return 0;
1708 }
1709 
1710 /*
1711  * directory replay has two parts.  There are the standard directory
1712  * items in the log copied from the subvolume, and range items
1713  * created in the log while the subvolume was logged.
1714  *
1715  * The range items tell us which parts of the key space the log
1716  * is authoritative for.  During replay, if a key in the subvolume
1717  * directory is in a logged range item, but not actually in the log
1718  * that means it was deleted from the directory before the fsync
1719  * and should be removed.
1720  */
1721 static noinline int find_dir_range(struct btrfs_root *root,
1722 				   struct btrfs_path *path,
1723 				   u64 dirid, int key_type,
1724 				   u64 *start_ret, u64 *end_ret)
1725 {
1726 	struct btrfs_key key;
1727 	u64 found_end;
1728 	struct btrfs_dir_log_item *item;
1729 	int ret;
1730 	int nritems;
1731 
1732 	if (*start_ret == (u64)-1)
1733 		return 1;
1734 
1735 	key.objectid = dirid;
1736 	key.type = key_type;
1737 	key.offset = *start_ret;
1738 
1739 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1740 	if (ret < 0)
1741 		goto out;
1742 	if (ret > 0) {
1743 		if (path->slots[0] == 0)
1744 			goto out;
1745 		path->slots[0]--;
1746 	}
1747 	if (ret != 0)
1748 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1749 
1750 	if (key.type != key_type || key.objectid != dirid) {
1751 		ret = 1;
1752 		goto next;
1753 	}
1754 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1755 			      struct btrfs_dir_log_item);
1756 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1757 
1758 	if (*start_ret >= key.offset && *start_ret <= found_end) {
1759 		ret = 0;
1760 		*start_ret = key.offset;
1761 		*end_ret = found_end;
1762 		goto out;
1763 	}
1764 	ret = 1;
1765 next:
1766 	/* check the next slot in the tree to see if it is a valid item */
1767 	nritems = btrfs_header_nritems(path->nodes[0]);
1768 	if (path->slots[0] >= nritems) {
1769 		ret = btrfs_next_leaf(root, path);
1770 		if (ret)
1771 			goto out;
1772 	} else {
1773 		path->slots[0]++;
1774 	}
1775 
1776 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1777 
1778 	if (key.type != key_type || key.objectid != dirid) {
1779 		ret = 1;
1780 		goto out;
1781 	}
1782 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1783 			      struct btrfs_dir_log_item);
1784 	found_end = btrfs_dir_log_end(path->nodes[0], item);
1785 	*start_ret = key.offset;
1786 	*end_ret = found_end;
1787 	ret = 0;
1788 out:
1789 	btrfs_release_path(path);
1790 	return ret;
1791 }
1792 
1793 /*
1794  * this looks for a given directory item in the log.  If the directory
1795  * item is not in the log, the item is removed and the inode it points
1796  * to is unlinked
1797  */
1798 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1799 				      struct btrfs_root *root,
1800 				      struct btrfs_root *log,
1801 				      struct btrfs_path *path,
1802 				      struct btrfs_path *log_path,
1803 				      struct inode *dir,
1804 				      struct btrfs_key *dir_key)
1805 {
1806 	int ret;
1807 	struct extent_buffer *eb;
1808 	int slot;
1809 	u32 item_size;
1810 	struct btrfs_dir_item *di;
1811 	struct btrfs_dir_item *log_di;
1812 	int name_len;
1813 	unsigned long ptr;
1814 	unsigned long ptr_end;
1815 	char *name;
1816 	struct inode *inode;
1817 	struct btrfs_key location;
1818 
1819 again:
1820 	eb = path->nodes[0];
1821 	slot = path->slots[0];
1822 	item_size = btrfs_item_size_nr(eb, slot);
1823 	ptr = btrfs_item_ptr_offset(eb, slot);
1824 	ptr_end = ptr + item_size;
1825 	while (ptr < ptr_end) {
1826 		di = (struct btrfs_dir_item *)ptr;
1827 		if (verify_dir_item(root, eb, di)) {
1828 			ret = -EIO;
1829 			goto out;
1830 		}
1831 
1832 		name_len = btrfs_dir_name_len(eb, di);
1833 		name = kmalloc(name_len, GFP_NOFS);
1834 		if (!name) {
1835 			ret = -ENOMEM;
1836 			goto out;
1837 		}
1838 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
1839 				  name_len);
1840 		log_di = NULL;
1841 		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1842 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
1843 						       dir_key->objectid,
1844 						       name, name_len, 0);
1845 		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1846 			log_di = btrfs_lookup_dir_index_item(trans, log,
1847 						     log_path,
1848 						     dir_key->objectid,
1849 						     dir_key->offset,
1850 						     name, name_len, 0);
1851 		}
1852 		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
1853 			btrfs_dir_item_key_to_cpu(eb, di, &location);
1854 			btrfs_release_path(path);
1855 			btrfs_release_path(log_path);
1856 			inode = read_one_inode(root, location.objectid);
1857 			if (!inode) {
1858 				kfree(name);
1859 				return -EIO;
1860 			}
1861 
1862 			ret = link_to_fixup_dir(trans, root,
1863 						path, location.objectid);
1864 			if (ret) {
1865 				kfree(name);
1866 				iput(inode);
1867 				goto out;
1868 			}
1869 
1870 			inc_nlink(inode);
1871 			ret = btrfs_unlink_inode(trans, root, dir, inode,
1872 						 name, name_len);
1873 			if (!ret)
1874 				ret = btrfs_run_delayed_items(trans, root);
1875 			kfree(name);
1876 			iput(inode);
1877 			if (ret)
1878 				goto out;
1879 
1880 			/* there might still be more names under this key
1881 			 * check and repeat if required
1882 			 */
1883 			ret = btrfs_search_slot(NULL, root, dir_key, path,
1884 						0, 0);
1885 			if (ret == 0)
1886 				goto again;
1887 			ret = 0;
1888 			goto out;
1889 		} else if (IS_ERR(log_di)) {
1890 			kfree(name);
1891 			return PTR_ERR(log_di);
1892 		}
1893 		btrfs_release_path(log_path);
1894 		kfree(name);
1895 
1896 		ptr = (unsigned long)(di + 1);
1897 		ptr += name_len;
1898 	}
1899 	ret = 0;
1900 out:
1901 	btrfs_release_path(path);
1902 	btrfs_release_path(log_path);
1903 	return ret;
1904 }
1905 
1906 /*
1907  * deletion replay happens before we copy any new directory items
1908  * out of the log or out of backreferences from inodes.  It
1909  * scans the log to find ranges of keys that log is authoritative for,
1910  * and then scans the directory to find items in those ranges that are
1911  * not present in the log.
1912  *
1913  * Anything we don't find in the log is unlinked and removed from the
1914  * directory.
1915  */
1916 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1917 				       struct btrfs_root *root,
1918 				       struct btrfs_root *log,
1919 				       struct btrfs_path *path,
1920 				       u64 dirid, int del_all)
1921 {
1922 	u64 range_start;
1923 	u64 range_end;
1924 	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1925 	int ret = 0;
1926 	struct btrfs_key dir_key;
1927 	struct btrfs_key found_key;
1928 	struct btrfs_path *log_path;
1929 	struct inode *dir;
1930 
1931 	dir_key.objectid = dirid;
1932 	dir_key.type = BTRFS_DIR_ITEM_KEY;
1933 	log_path = btrfs_alloc_path();
1934 	if (!log_path)
1935 		return -ENOMEM;
1936 
1937 	dir = read_one_inode(root, dirid);
1938 	/* it isn't an error if the inode isn't there, that can happen
1939 	 * because we replay the deletes before we copy in the inode item
1940 	 * from the log
1941 	 */
1942 	if (!dir) {
1943 		btrfs_free_path(log_path);
1944 		return 0;
1945 	}
1946 again:
1947 	range_start = 0;
1948 	range_end = 0;
1949 	while (1) {
1950 		if (del_all)
1951 			range_end = (u64)-1;
1952 		else {
1953 			ret = find_dir_range(log, path, dirid, key_type,
1954 					     &range_start, &range_end);
1955 			if (ret != 0)
1956 				break;
1957 		}
1958 
1959 		dir_key.offset = range_start;
1960 		while (1) {
1961 			int nritems;
1962 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
1963 						0, 0);
1964 			if (ret < 0)
1965 				goto out;
1966 
1967 			nritems = btrfs_header_nritems(path->nodes[0]);
1968 			if (path->slots[0] >= nritems) {
1969 				ret = btrfs_next_leaf(root, path);
1970 				if (ret)
1971 					break;
1972 			}
1973 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1974 					      path->slots[0]);
1975 			if (found_key.objectid != dirid ||
1976 			    found_key.type != dir_key.type)
1977 				goto next_type;
1978 
1979 			if (found_key.offset > range_end)
1980 				break;
1981 
1982 			ret = check_item_in_log(trans, root, log, path,
1983 						log_path, dir,
1984 						&found_key);
1985 			if (ret)
1986 				goto out;
1987 			if (found_key.offset == (u64)-1)
1988 				break;
1989 			dir_key.offset = found_key.offset + 1;
1990 		}
1991 		btrfs_release_path(path);
1992 		if (range_end == (u64)-1)
1993 			break;
1994 		range_start = range_end + 1;
1995 	}
1996 
1997 next_type:
1998 	ret = 0;
1999 	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2000 		key_type = BTRFS_DIR_LOG_INDEX_KEY;
2001 		dir_key.type = BTRFS_DIR_INDEX_KEY;
2002 		btrfs_release_path(path);
2003 		goto again;
2004 	}
2005 out:
2006 	btrfs_release_path(path);
2007 	btrfs_free_path(log_path);
2008 	iput(dir);
2009 	return ret;
2010 }
2011 
2012 /*
2013  * the process_func used to replay items from the log tree.  This
2014  * gets called in two different stages.  The first stage just looks
2015  * for inodes and makes sure they are all copied into the subvolume.
2016  *
2017  * The second stage copies all the other item types from the log into
2018  * the subvolume.  The two stage approach is slower, but gets rid of
2019  * lots of complexity around inodes referencing other inodes that exist
2020  * only in the log (references come from either directory items or inode
2021  * back refs).
2022  */
2023 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2024 			     struct walk_control *wc, u64 gen)
2025 {
2026 	int nritems;
2027 	struct btrfs_path *path;
2028 	struct btrfs_root *root = wc->replay_dest;
2029 	struct btrfs_key key;
2030 	int level;
2031 	int i;
2032 	int ret;
2033 
2034 	ret = btrfs_read_buffer(eb, gen);
2035 	if (ret)
2036 		return ret;
2037 
2038 	level = btrfs_header_level(eb);
2039 
2040 	if (level != 0)
2041 		return 0;
2042 
2043 	path = btrfs_alloc_path();
2044 	if (!path)
2045 		return -ENOMEM;
2046 
2047 	nritems = btrfs_header_nritems(eb);
2048 	for (i = 0; i < nritems; i++) {
2049 		btrfs_item_key_to_cpu(eb, &key, i);
2050 
2051 		/* inode keys are done during the first stage */
2052 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2053 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2054 			struct btrfs_inode_item *inode_item;
2055 			u32 mode;
2056 
2057 			inode_item = btrfs_item_ptr(eb, i,
2058 					    struct btrfs_inode_item);
2059 			mode = btrfs_inode_mode(eb, inode_item);
2060 			if (S_ISDIR(mode)) {
2061 				ret = replay_dir_deletes(wc->trans,
2062 					 root, log, path, key.objectid, 0);
2063 				if (ret)
2064 					break;
2065 			}
2066 			ret = overwrite_item(wc->trans, root, path,
2067 					     eb, i, &key);
2068 			if (ret)
2069 				break;
2070 
2071 			/* for regular files, make sure corresponding
2072 			 * orhpan item exist. extents past the new EOF
2073 			 * will be truncated later by orphan cleanup.
2074 			 */
2075 			if (S_ISREG(mode)) {
2076 				ret = insert_orphan_item(wc->trans, root,
2077 							 key.objectid);
2078 				if (ret)
2079 					break;
2080 			}
2081 
2082 			ret = link_to_fixup_dir(wc->trans, root,
2083 						path, key.objectid);
2084 			if (ret)
2085 				break;
2086 		}
2087 
2088 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2089 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2090 			ret = replay_one_dir_item(wc->trans, root, path,
2091 						  eb, i, &key);
2092 			if (ret)
2093 				break;
2094 		}
2095 
2096 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2097 			continue;
2098 
2099 		/* these keys are simply copied */
2100 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2101 			ret = overwrite_item(wc->trans, root, path,
2102 					     eb, i, &key);
2103 			if (ret)
2104 				break;
2105 		} else if (key.type == BTRFS_INODE_REF_KEY ||
2106 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2107 			ret = add_inode_ref(wc->trans, root, log, path,
2108 					    eb, i, &key);
2109 			if (ret && ret != -ENOENT)
2110 				break;
2111 			ret = 0;
2112 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2113 			ret = replay_one_extent(wc->trans, root, path,
2114 						eb, i, &key);
2115 			if (ret)
2116 				break;
2117 		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2118 			ret = replay_one_dir_item(wc->trans, root, path,
2119 						  eb, i, &key);
2120 			if (ret)
2121 				break;
2122 		}
2123 	}
2124 	btrfs_free_path(path);
2125 	return ret;
2126 }
2127 
2128 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2129 				   struct btrfs_root *root,
2130 				   struct btrfs_path *path, int *level,
2131 				   struct walk_control *wc)
2132 {
2133 	u64 root_owner;
2134 	u64 bytenr;
2135 	u64 ptr_gen;
2136 	struct extent_buffer *next;
2137 	struct extent_buffer *cur;
2138 	struct extent_buffer *parent;
2139 	u32 blocksize;
2140 	int ret = 0;
2141 
2142 	WARN_ON(*level < 0);
2143 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2144 
2145 	while (*level > 0) {
2146 		WARN_ON(*level < 0);
2147 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
2148 		cur = path->nodes[*level];
2149 
2150 		WARN_ON(btrfs_header_level(cur) != *level);
2151 
2152 		if (path->slots[*level] >=
2153 		    btrfs_header_nritems(cur))
2154 			break;
2155 
2156 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2157 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2158 		blocksize = btrfs_level_size(root, *level - 1);
2159 
2160 		parent = path->nodes[*level];
2161 		root_owner = btrfs_header_owner(parent);
2162 
2163 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
2164 		if (!next)
2165 			return -ENOMEM;
2166 
2167 		if (*level == 1) {
2168 			ret = wc->process_func(root, next, wc, ptr_gen);
2169 			if (ret) {
2170 				free_extent_buffer(next);
2171 				return ret;
2172 			}
2173 
2174 			path->slots[*level]++;
2175 			if (wc->free) {
2176 				ret = btrfs_read_buffer(next, ptr_gen);
2177 				if (ret) {
2178 					free_extent_buffer(next);
2179 					return ret;
2180 				}
2181 
2182 				if (trans) {
2183 					btrfs_tree_lock(next);
2184 					btrfs_set_lock_blocking(next);
2185 					clean_tree_block(trans, root, next);
2186 					btrfs_wait_tree_block_writeback(next);
2187 					btrfs_tree_unlock(next);
2188 				}
2189 
2190 				WARN_ON(root_owner !=
2191 					BTRFS_TREE_LOG_OBJECTID);
2192 				ret = btrfs_free_and_pin_reserved_extent(root,
2193 							 bytenr, blocksize);
2194 				if (ret) {
2195 					free_extent_buffer(next);
2196 					return ret;
2197 				}
2198 			}
2199 			free_extent_buffer(next);
2200 			continue;
2201 		}
2202 		ret = btrfs_read_buffer(next, ptr_gen);
2203 		if (ret) {
2204 			free_extent_buffer(next);
2205 			return ret;
2206 		}
2207 
2208 		WARN_ON(*level <= 0);
2209 		if (path->nodes[*level-1])
2210 			free_extent_buffer(path->nodes[*level-1]);
2211 		path->nodes[*level-1] = next;
2212 		*level = btrfs_header_level(next);
2213 		path->slots[*level] = 0;
2214 		cond_resched();
2215 	}
2216 	WARN_ON(*level < 0);
2217 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
2218 
2219 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2220 
2221 	cond_resched();
2222 	return 0;
2223 }
2224 
2225 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2226 				 struct btrfs_root *root,
2227 				 struct btrfs_path *path, int *level,
2228 				 struct walk_control *wc)
2229 {
2230 	u64 root_owner;
2231 	int i;
2232 	int slot;
2233 	int ret;
2234 
2235 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2236 		slot = path->slots[i];
2237 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2238 			path->slots[i]++;
2239 			*level = i;
2240 			WARN_ON(*level == 0);
2241 			return 0;
2242 		} else {
2243 			struct extent_buffer *parent;
2244 			if (path->nodes[*level] == root->node)
2245 				parent = path->nodes[*level];
2246 			else
2247 				parent = path->nodes[*level + 1];
2248 
2249 			root_owner = btrfs_header_owner(parent);
2250 			ret = wc->process_func(root, path->nodes[*level], wc,
2251 				 btrfs_header_generation(path->nodes[*level]));
2252 			if (ret)
2253 				return ret;
2254 
2255 			if (wc->free) {
2256 				struct extent_buffer *next;
2257 
2258 				next = path->nodes[*level];
2259 
2260 				if (trans) {
2261 					btrfs_tree_lock(next);
2262 					btrfs_set_lock_blocking(next);
2263 					clean_tree_block(trans, root, next);
2264 					btrfs_wait_tree_block_writeback(next);
2265 					btrfs_tree_unlock(next);
2266 				}
2267 
2268 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2269 				ret = btrfs_free_and_pin_reserved_extent(root,
2270 						path->nodes[*level]->start,
2271 						path->nodes[*level]->len);
2272 				if (ret)
2273 					return ret;
2274 			}
2275 			free_extent_buffer(path->nodes[*level]);
2276 			path->nodes[*level] = NULL;
2277 			*level = i + 1;
2278 		}
2279 	}
2280 	return 1;
2281 }
2282 
2283 /*
2284  * drop the reference count on the tree rooted at 'snap'.  This traverses
2285  * the tree freeing any blocks that have a ref count of zero after being
2286  * decremented.
2287  */
2288 static int walk_log_tree(struct btrfs_trans_handle *trans,
2289 			 struct btrfs_root *log, struct walk_control *wc)
2290 {
2291 	int ret = 0;
2292 	int wret;
2293 	int level;
2294 	struct btrfs_path *path;
2295 	int orig_level;
2296 
2297 	path = btrfs_alloc_path();
2298 	if (!path)
2299 		return -ENOMEM;
2300 
2301 	level = btrfs_header_level(log->node);
2302 	orig_level = level;
2303 	path->nodes[level] = log->node;
2304 	extent_buffer_get(log->node);
2305 	path->slots[level] = 0;
2306 
2307 	while (1) {
2308 		wret = walk_down_log_tree(trans, log, path, &level, wc);
2309 		if (wret > 0)
2310 			break;
2311 		if (wret < 0) {
2312 			ret = wret;
2313 			goto out;
2314 		}
2315 
2316 		wret = walk_up_log_tree(trans, log, path, &level, wc);
2317 		if (wret > 0)
2318 			break;
2319 		if (wret < 0) {
2320 			ret = wret;
2321 			goto out;
2322 		}
2323 	}
2324 
2325 	/* was the root node processed? if not, catch it here */
2326 	if (path->nodes[orig_level]) {
2327 		ret = wc->process_func(log, path->nodes[orig_level], wc,
2328 			 btrfs_header_generation(path->nodes[orig_level]));
2329 		if (ret)
2330 			goto out;
2331 		if (wc->free) {
2332 			struct extent_buffer *next;
2333 
2334 			next = path->nodes[orig_level];
2335 
2336 			if (trans) {
2337 				btrfs_tree_lock(next);
2338 				btrfs_set_lock_blocking(next);
2339 				clean_tree_block(trans, log, next);
2340 				btrfs_wait_tree_block_writeback(next);
2341 				btrfs_tree_unlock(next);
2342 			}
2343 
2344 			WARN_ON(log->root_key.objectid !=
2345 				BTRFS_TREE_LOG_OBJECTID);
2346 			ret = btrfs_free_and_pin_reserved_extent(log, next->start,
2347 							 next->len);
2348 			if (ret)
2349 				goto out;
2350 		}
2351 	}
2352 
2353 out:
2354 	btrfs_free_path(path);
2355 	return ret;
2356 }
2357 
2358 /*
2359  * helper function to update the item for a given subvolumes log root
2360  * in the tree of log roots
2361  */
2362 static int update_log_root(struct btrfs_trans_handle *trans,
2363 			   struct btrfs_root *log)
2364 {
2365 	int ret;
2366 
2367 	if (log->log_transid == 1) {
2368 		/* insert root item on the first sync */
2369 		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
2370 				&log->root_key, &log->root_item);
2371 	} else {
2372 		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2373 				&log->root_key, &log->root_item);
2374 	}
2375 	return ret;
2376 }
2377 
2378 static void wait_log_commit(struct btrfs_trans_handle *trans,
2379 			    struct btrfs_root *root, int transid)
2380 {
2381 	DEFINE_WAIT(wait);
2382 	int index = transid % 2;
2383 
2384 	/*
2385 	 * we only allow two pending log transactions at a time,
2386 	 * so we know that if ours is more than 2 older than the
2387 	 * current transaction, we're done
2388 	 */
2389 	do {
2390 		prepare_to_wait(&root->log_commit_wait[index],
2391 				&wait, TASK_UNINTERRUPTIBLE);
2392 		mutex_unlock(&root->log_mutex);
2393 
2394 		if (root->log_transid_committed < transid &&
2395 		    atomic_read(&root->log_commit[index]))
2396 			schedule();
2397 
2398 		finish_wait(&root->log_commit_wait[index], &wait);
2399 		mutex_lock(&root->log_mutex);
2400 	} while (root->log_transid_committed < transid &&
2401 		 atomic_read(&root->log_commit[index]));
2402 }
2403 
2404 static void wait_for_writer(struct btrfs_trans_handle *trans,
2405 			    struct btrfs_root *root)
2406 {
2407 	DEFINE_WAIT(wait);
2408 
2409 	while (atomic_read(&root->log_writers)) {
2410 		prepare_to_wait(&root->log_writer_wait,
2411 				&wait, TASK_UNINTERRUPTIBLE);
2412 		mutex_unlock(&root->log_mutex);
2413 		if (atomic_read(&root->log_writers))
2414 			schedule();
2415 		mutex_lock(&root->log_mutex);
2416 		finish_wait(&root->log_writer_wait, &wait);
2417 	}
2418 }
2419 
2420 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2421 					struct btrfs_log_ctx *ctx)
2422 {
2423 	if (!ctx)
2424 		return;
2425 
2426 	mutex_lock(&root->log_mutex);
2427 	list_del_init(&ctx->list);
2428 	mutex_unlock(&root->log_mutex);
2429 }
2430 
2431 /*
2432  * Invoked in log mutex context, or be sure there is no other task which
2433  * can access the list.
2434  */
2435 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2436 					     int index, int error)
2437 {
2438 	struct btrfs_log_ctx *ctx;
2439 
2440 	if (!error) {
2441 		INIT_LIST_HEAD(&root->log_ctxs[index]);
2442 		return;
2443 	}
2444 
2445 	list_for_each_entry(ctx, &root->log_ctxs[index], list)
2446 		ctx->log_ret = error;
2447 
2448 	INIT_LIST_HEAD(&root->log_ctxs[index]);
2449 }
2450 
2451 /*
2452  * btrfs_sync_log does sends a given tree log down to the disk and
2453  * updates the super blocks to record it.  When this call is done,
2454  * you know that any inodes previously logged are safely on disk only
2455  * if it returns 0.
2456  *
2457  * Any other return value means you need to call btrfs_commit_transaction.
2458  * Some of the edge cases for fsyncing directories that have had unlinks
2459  * or renames done in the past mean that sometimes the only safe
2460  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2461  * that has happened.
2462  */
2463 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2464 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2465 {
2466 	int index1;
2467 	int index2;
2468 	int mark;
2469 	int ret;
2470 	struct btrfs_root *log = root->log_root;
2471 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2472 	int log_transid = 0;
2473 	struct btrfs_log_ctx root_log_ctx;
2474 	struct blk_plug plug;
2475 
2476 	mutex_lock(&root->log_mutex);
2477 	log_transid = ctx->log_transid;
2478 	if (root->log_transid_committed >= log_transid) {
2479 		mutex_unlock(&root->log_mutex);
2480 		return ctx->log_ret;
2481 	}
2482 
2483 	index1 = log_transid % 2;
2484 	if (atomic_read(&root->log_commit[index1])) {
2485 		wait_log_commit(trans, root, log_transid);
2486 		mutex_unlock(&root->log_mutex);
2487 		return ctx->log_ret;
2488 	}
2489 	ASSERT(log_transid == root->log_transid);
2490 	atomic_set(&root->log_commit[index1], 1);
2491 
2492 	/* wait for previous tree log sync to complete */
2493 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2494 		wait_log_commit(trans, root, log_transid - 1);
2495 
2496 	while (1) {
2497 		int batch = atomic_read(&root->log_batch);
2498 		/* when we're on an ssd, just kick the log commit out */
2499 		if (!btrfs_test_opt(root, SSD) &&
2500 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2501 			mutex_unlock(&root->log_mutex);
2502 			schedule_timeout_uninterruptible(1);
2503 			mutex_lock(&root->log_mutex);
2504 		}
2505 		wait_for_writer(trans, root);
2506 		if (batch == atomic_read(&root->log_batch))
2507 			break;
2508 	}
2509 
2510 	/* bail out if we need to do a full commit */
2511 	if (btrfs_need_log_full_commit(root->fs_info, trans)) {
2512 		ret = -EAGAIN;
2513 		btrfs_free_logged_extents(log, log_transid);
2514 		mutex_unlock(&root->log_mutex);
2515 		goto out;
2516 	}
2517 
2518 	if (log_transid % 2 == 0)
2519 		mark = EXTENT_DIRTY;
2520 	else
2521 		mark = EXTENT_NEW;
2522 
2523 	/* we start IO on  all the marked extents here, but we don't actually
2524 	 * wait for them until later.
2525 	 */
2526 	blk_start_plug(&plug);
2527 	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2528 	if (ret) {
2529 		blk_finish_plug(&plug);
2530 		btrfs_abort_transaction(trans, root, ret);
2531 		btrfs_free_logged_extents(log, log_transid);
2532 		btrfs_set_log_full_commit(root->fs_info, trans);
2533 		mutex_unlock(&root->log_mutex);
2534 		goto out;
2535 	}
2536 
2537 	btrfs_set_root_node(&log->root_item, log->node);
2538 
2539 	root->log_transid++;
2540 	log->log_transid = root->log_transid;
2541 	root->log_start_pid = 0;
2542 	/*
2543 	 * IO has been started, blocks of the log tree have WRITTEN flag set
2544 	 * in their headers. new modifications of the log will be written to
2545 	 * new positions. so it's safe to allow log writers to go in.
2546 	 */
2547 	mutex_unlock(&root->log_mutex);
2548 
2549 	btrfs_init_log_ctx(&root_log_ctx);
2550 
2551 	mutex_lock(&log_root_tree->log_mutex);
2552 	atomic_inc(&log_root_tree->log_batch);
2553 	atomic_inc(&log_root_tree->log_writers);
2554 
2555 	index2 = log_root_tree->log_transid % 2;
2556 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2557 	root_log_ctx.log_transid = log_root_tree->log_transid;
2558 
2559 	mutex_unlock(&log_root_tree->log_mutex);
2560 
2561 	ret = update_log_root(trans, log);
2562 
2563 	mutex_lock(&log_root_tree->log_mutex);
2564 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2565 		smp_mb();
2566 		if (waitqueue_active(&log_root_tree->log_writer_wait))
2567 			wake_up(&log_root_tree->log_writer_wait);
2568 	}
2569 
2570 	if (ret) {
2571 		if (!list_empty(&root_log_ctx.list))
2572 			list_del_init(&root_log_ctx.list);
2573 
2574 		blk_finish_plug(&plug);
2575 		btrfs_set_log_full_commit(root->fs_info, trans);
2576 
2577 		if (ret != -ENOSPC) {
2578 			btrfs_abort_transaction(trans, root, ret);
2579 			mutex_unlock(&log_root_tree->log_mutex);
2580 			goto out;
2581 		}
2582 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2583 		btrfs_free_logged_extents(log, log_transid);
2584 		mutex_unlock(&log_root_tree->log_mutex);
2585 		ret = -EAGAIN;
2586 		goto out;
2587 	}
2588 
2589 	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2590 		mutex_unlock(&log_root_tree->log_mutex);
2591 		ret = root_log_ctx.log_ret;
2592 		goto out;
2593 	}
2594 
2595 	index2 = root_log_ctx.log_transid % 2;
2596 	if (atomic_read(&log_root_tree->log_commit[index2])) {
2597 		blk_finish_plug(&plug);
2598 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2599 		wait_log_commit(trans, log_root_tree,
2600 				root_log_ctx.log_transid);
2601 		btrfs_free_logged_extents(log, log_transid);
2602 		mutex_unlock(&log_root_tree->log_mutex);
2603 		ret = root_log_ctx.log_ret;
2604 		goto out;
2605 	}
2606 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2607 	atomic_set(&log_root_tree->log_commit[index2], 1);
2608 
2609 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2610 		wait_log_commit(trans, log_root_tree,
2611 				root_log_ctx.log_transid - 1);
2612 	}
2613 
2614 	wait_for_writer(trans, log_root_tree);
2615 
2616 	/*
2617 	 * now that we've moved on to the tree of log tree roots,
2618 	 * check the full commit flag again
2619 	 */
2620 	if (btrfs_need_log_full_commit(root->fs_info, trans)) {
2621 		blk_finish_plug(&plug);
2622 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2623 		btrfs_free_logged_extents(log, log_transid);
2624 		mutex_unlock(&log_root_tree->log_mutex);
2625 		ret = -EAGAIN;
2626 		goto out_wake_log_root;
2627 	}
2628 
2629 	ret = btrfs_write_marked_extents(log_root_tree,
2630 					 &log_root_tree->dirty_log_pages,
2631 					 EXTENT_DIRTY | EXTENT_NEW);
2632 	blk_finish_plug(&plug);
2633 	if (ret) {
2634 		btrfs_set_log_full_commit(root->fs_info, trans);
2635 		btrfs_abort_transaction(trans, root, ret);
2636 		btrfs_free_logged_extents(log, log_transid);
2637 		mutex_unlock(&log_root_tree->log_mutex);
2638 		goto out_wake_log_root;
2639 	}
2640 	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2641 	btrfs_wait_marked_extents(log_root_tree,
2642 				  &log_root_tree->dirty_log_pages,
2643 				  EXTENT_NEW | EXTENT_DIRTY);
2644 	btrfs_wait_logged_extents(log, log_transid);
2645 
2646 	btrfs_set_super_log_root(root->fs_info->super_for_commit,
2647 				log_root_tree->node->start);
2648 	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2649 				btrfs_header_level(log_root_tree->node));
2650 
2651 	log_root_tree->log_transid++;
2652 	mutex_unlock(&log_root_tree->log_mutex);
2653 
2654 	/*
2655 	 * nobody else is going to jump in and write the the ctree
2656 	 * super here because the log_commit atomic below is protecting
2657 	 * us.  We must be called with a transaction handle pinning
2658 	 * the running transaction open, so a full commit can't hop
2659 	 * in and cause problems either.
2660 	 */
2661 	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2662 	if (ret) {
2663 		btrfs_set_log_full_commit(root->fs_info, trans);
2664 		btrfs_abort_transaction(trans, root, ret);
2665 		goto out_wake_log_root;
2666 	}
2667 
2668 	mutex_lock(&root->log_mutex);
2669 	if (root->last_log_commit < log_transid)
2670 		root->last_log_commit = log_transid;
2671 	mutex_unlock(&root->log_mutex);
2672 
2673 out_wake_log_root:
2674 	/*
2675 	 * We needn't get log_mutex here because we are sure all
2676 	 * the other tasks are blocked.
2677 	 */
2678 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2679 
2680 	mutex_lock(&log_root_tree->log_mutex);
2681 	log_root_tree->log_transid_committed++;
2682 	atomic_set(&log_root_tree->log_commit[index2], 0);
2683 	mutex_unlock(&log_root_tree->log_mutex);
2684 
2685 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2686 		wake_up(&log_root_tree->log_commit_wait[index2]);
2687 out:
2688 	/* See above. */
2689 	btrfs_remove_all_log_ctxs(root, index1, ret);
2690 
2691 	mutex_lock(&root->log_mutex);
2692 	root->log_transid_committed++;
2693 	atomic_set(&root->log_commit[index1], 0);
2694 	mutex_unlock(&root->log_mutex);
2695 
2696 	if (waitqueue_active(&root->log_commit_wait[index1]))
2697 		wake_up(&root->log_commit_wait[index1]);
2698 	return ret;
2699 }
2700 
2701 static void free_log_tree(struct btrfs_trans_handle *trans,
2702 			  struct btrfs_root *log)
2703 {
2704 	int ret;
2705 	u64 start;
2706 	u64 end;
2707 	struct walk_control wc = {
2708 		.free = 1,
2709 		.process_func = process_one_buffer
2710 	};
2711 
2712 	ret = walk_log_tree(trans, log, &wc);
2713 	/* I don't think this can happen but just in case */
2714 	if (ret)
2715 		btrfs_abort_transaction(trans, log, ret);
2716 
2717 	while (1) {
2718 		ret = find_first_extent_bit(&log->dirty_log_pages,
2719 				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2720 				NULL);
2721 		if (ret)
2722 			break;
2723 
2724 		clear_extent_bits(&log->dirty_log_pages, start, end,
2725 				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2726 	}
2727 
2728 	/*
2729 	 * We may have short-circuited the log tree with the full commit logic
2730 	 * and left ordered extents on our list, so clear these out to keep us
2731 	 * from leaking inodes and memory.
2732 	 */
2733 	btrfs_free_logged_extents(log, 0);
2734 	btrfs_free_logged_extents(log, 1);
2735 
2736 	free_extent_buffer(log->node);
2737 	kfree(log);
2738 }
2739 
2740 /*
2741  * free all the extents used by the tree log.  This should be called
2742  * at commit time of the full transaction
2743  */
2744 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2745 {
2746 	if (root->log_root) {
2747 		free_log_tree(trans, root->log_root);
2748 		root->log_root = NULL;
2749 	}
2750 	return 0;
2751 }
2752 
2753 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2754 			     struct btrfs_fs_info *fs_info)
2755 {
2756 	if (fs_info->log_root_tree) {
2757 		free_log_tree(trans, fs_info->log_root_tree);
2758 		fs_info->log_root_tree = NULL;
2759 	}
2760 	return 0;
2761 }
2762 
2763 /*
2764  * If both a file and directory are logged, and unlinks or renames are
2765  * mixed in, we have a few interesting corners:
2766  *
2767  * create file X in dir Y
2768  * link file X to X.link in dir Y
2769  * fsync file X
2770  * unlink file X but leave X.link
2771  * fsync dir Y
2772  *
2773  * After a crash we would expect only X.link to exist.  But file X
2774  * didn't get fsync'd again so the log has back refs for X and X.link.
2775  *
2776  * We solve this by removing directory entries and inode backrefs from the
2777  * log when a file that was logged in the current transaction is
2778  * unlinked.  Any later fsync will include the updated log entries, and
2779  * we'll be able to reconstruct the proper directory items from backrefs.
2780  *
2781  * This optimizations allows us to avoid relogging the entire inode
2782  * or the entire directory.
2783  */
2784 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2785 				 struct btrfs_root *root,
2786 				 const char *name, int name_len,
2787 				 struct inode *dir, u64 index)
2788 {
2789 	struct btrfs_root *log;
2790 	struct btrfs_dir_item *di;
2791 	struct btrfs_path *path;
2792 	int ret;
2793 	int err = 0;
2794 	int bytes_del = 0;
2795 	u64 dir_ino = btrfs_ino(dir);
2796 
2797 	if (BTRFS_I(dir)->logged_trans < trans->transid)
2798 		return 0;
2799 
2800 	ret = join_running_log_trans(root);
2801 	if (ret)
2802 		return 0;
2803 
2804 	mutex_lock(&BTRFS_I(dir)->log_mutex);
2805 
2806 	log = root->log_root;
2807 	path = btrfs_alloc_path();
2808 	if (!path) {
2809 		err = -ENOMEM;
2810 		goto out_unlock;
2811 	}
2812 
2813 	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2814 				   name, name_len, -1);
2815 	if (IS_ERR(di)) {
2816 		err = PTR_ERR(di);
2817 		goto fail;
2818 	}
2819 	if (di) {
2820 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
2821 		bytes_del += name_len;
2822 		if (ret) {
2823 			err = ret;
2824 			goto fail;
2825 		}
2826 	}
2827 	btrfs_release_path(path);
2828 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2829 					 index, name, name_len, -1);
2830 	if (IS_ERR(di)) {
2831 		err = PTR_ERR(di);
2832 		goto fail;
2833 	}
2834 	if (di) {
2835 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
2836 		bytes_del += name_len;
2837 		if (ret) {
2838 			err = ret;
2839 			goto fail;
2840 		}
2841 	}
2842 
2843 	/* update the directory size in the log to reflect the names
2844 	 * we have removed
2845 	 */
2846 	if (bytes_del) {
2847 		struct btrfs_key key;
2848 
2849 		key.objectid = dir_ino;
2850 		key.offset = 0;
2851 		key.type = BTRFS_INODE_ITEM_KEY;
2852 		btrfs_release_path(path);
2853 
2854 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2855 		if (ret < 0) {
2856 			err = ret;
2857 			goto fail;
2858 		}
2859 		if (ret == 0) {
2860 			struct btrfs_inode_item *item;
2861 			u64 i_size;
2862 
2863 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2864 					      struct btrfs_inode_item);
2865 			i_size = btrfs_inode_size(path->nodes[0], item);
2866 			if (i_size > bytes_del)
2867 				i_size -= bytes_del;
2868 			else
2869 				i_size = 0;
2870 			btrfs_set_inode_size(path->nodes[0], item, i_size);
2871 			btrfs_mark_buffer_dirty(path->nodes[0]);
2872 		} else
2873 			ret = 0;
2874 		btrfs_release_path(path);
2875 	}
2876 fail:
2877 	btrfs_free_path(path);
2878 out_unlock:
2879 	mutex_unlock(&BTRFS_I(dir)->log_mutex);
2880 	if (ret == -ENOSPC) {
2881 		btrfs_set_log_full_commit(root->fs_info, trans);
2882 		ret = 0;
2883 	} else if (ret < 0)
2884 		btrfs_abort_transaction(trans, root, ret);
2885 
2886 	btrfs_end_log_trans(root);
2887 
2888 	return err;
2889 }
2890 
2891 /* see comments for btrfs_del_dir_entries_in_log */
2892 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2893 			       struct btrfs_root *root,
2894 			       const char *name, int name_len,
2895 			       struct inode *inode, u64 dirid)
2896 {
2897 	struct btrfs_root *log;
2898 	u64 index;
2899 	int ret;
2900 
2901 	if (BTRFS_I(inode)->logged_trans < trans->transid)
2902 		return 0;
2903 
2904 	ret = join_running_log_trans(root);
2905 	if (ret)
2906 		return 0;
2907 	log = root->log_root;
2908 	mutex_lock(&BTRFS_I(inode)->log_mutex);
2909 
2910 	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2911 				  dirid, &index);
2912 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
2913 	if (ret == -ENOSPC) {
2914 		btrfs_set_log_full_commit(root->fs_info, trans);
2915 		ret = 0;
2916 	} else if (ret < 0 && ret != -ENOENT)
2917 		btrfs_abort_transaction(trans, root, ret);
2918 	btrfs_end_log_trans(root);
2919 
2920 	return ret;
2921 }
2922 
2923 /*
2924  * creates a range item in the log for 'dirid'.  first_offset and
2925  * last_offset tell us which parts of the key space the log should
2926  * be considered authoritative for.
2927  */
2928 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2929 				       struct btrfs_root *log,
2930 				       struct btrfs_path *path,
2931 				       int key_type, u64 dirid,
2932 				       u64 first_offset, u64 last_offset)
2933 {
2934 	int ret;
2935 	struct btrfs_key key;
2936 	struct btrfs_dir_log_item *item;
2937 
2938 	key.objectid = dirid;
2939 	key.offset = first_offset;
2940 	if (key_type == BTRFS_DIR_ITEM_KEY)
2941 		key.type = BTRFS_DIR_LOG_ITEM_KEY;
2942 	else
2943 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
2944 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2945 	if (ret)
2946 		return ret;
2947 
2948 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2949 			      struct btrfs_dir_log_item);
2950 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2951 	btrfs_mark_buffer_dirty(path->nodes[0]);
2952 	btrfs_release_path(path);
2953 	return 0;
2954 }
2955 
2956 /*
2957  * log all the items included in the current transaction for a given
2958  * directory.  This also creates the range items in the log tree required
2959  * to replay anything deleted before the fsync
2960  */
2961 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2962 			  struct btrfs_root *root, struct inode *inode,
2963 			  struct btrfs_path *path,
2964 			  struct btrfs_path *dst_path, int key_type,
2965 			  u64 min_offset, u64 *last_offset_ret)
2966 {
2967 	struct btrfs_key min_key;
2968 	struct btrfs_root *log = root->log_root;
2969 	struct extent_buffer *src;
2970 	int err = 0;
2971 	int ret;
2972 	int i;
2973 	int nritems;
2974 	u64 first_offset = min_offset;
2975 	u64 last_offset = (u64)-1;
2976 	u64 ino = btrfs_ino(inode);
2977 
2978 	log = root->log_root;
2979 
2980 	min_key.objectid = ino;
2981 	min_key.type = key_type;
2982 	min_key.offset = min_offset;
2983 
2984 	path->keep_locks = 1;
2985 
2986 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
2987 
2988 	/*
2989 	 * we didn't find anything from this transaction, see if there
2990 	 * is anything at all
2991 	 */
2992 	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
2993 		min_key.objectid = ino;
2994 		min_key.type = key_type;
2995 		min_key.offset = (u64)-1;
2996 		btrfs_release_path(path);
2997 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2998 		if (ret < 0) {
2999 			btrfs_release_path(path);
3000 			return ret;
3001 		}
3002 		ret = btrfs_previous_item(root, path, ino, key_type);
3003 
3004 		/* if ret == 0 there are items for this type,
3005 		 * create a range to tell us the last key of this type.
3006 		 * otherwise, there are no items in this directory after
3007 		 * *min_offset, and we create a range to indicate that.
3008 		 */
3009 		if (ret == 0) {
3010 			struct btrfs_key tmp;
3011 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3012 					      path->slots[0]);
3013 			if (key_type == tmp.type)
3014 				first_offset = max(min_offset, tmp.offset) + 1;
3015 		}
3016 		goto done;
3017 	}
3018 
3019 	/* go backward to find any previous key */
3020 	ret = btrfs_previous_item(root, path, ino, key_type);
3021 	if (ret == 0) {
3022 		struct btrfs_key tmp;
3023 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3024 		if (key_type == tmp.type) {
3025 			first_offset = tmp.offset;
3026 			ret = overwrite_item(trans, log, dst_path,
3027 					     path->nodes[0], path->slots[0],
3028 					     &tmp);
3029 			if (ret) {
3030 				err = ret;
3031 				goto done;
3032 			}
3033 		}
3034 	}
3035 	btrfs_release_path(path);
3036 
3037 	/* find the first key from this transaction again */
3038 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3039 	if (WARN_ON(ret != 0))
3040 		goto done;
3041 
3042 	/*
3043 	 * we have a block from this transaction, log every item in it
3044 	 * from our directory
3045 	 */
3046 	while (1) {
3047 		struct btrfs_key tmp;
3048 		src = path->nodes[0];
3049 		nritems = btrfs_header_nritems(src);
3050 		for (i = path->slots[0]; i < nritems; i++) {
3051 			btrfs_item_key_to_cpu(src, &min_key, i);
3052 
3053 			if (min_key.objectid != ino || min_key.type != key_type)
3054 				goto done;
3055 			ret = overwrite_item(trans, log, dst_path, src, i,
3056 					     &min_key);
3057 			if (ret) {
3058 				err = ret;
3059 				goto done;
3060 			}
3061 		}
3062 		path->slots[0] = nritems;
3063 
3064 		/*
3065 		 * look ahead to the next item and see if it is also
3066 		 * from this directory and from this transaction
3067 		 */
3068 		ret = btrfs_next_leaf(root, path);
3069 		if (ret == 1) {
3070 			last_offset = (u64)-1;
3071 			goto done;
3072 		}
3073 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3074 		if (tmp.objectid != ino || tmp.type != key_type) {
3075 			last_offset = (u64)-1;
3076 			goto done;
3077 		}
3078 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3079 			ret = overwrite_item(trans, log, dst_path,
3080 					     path->nodes[0], path->slots[0],
3081 					     &tmp);
3082 			if (ret)
3083 				err = ret;
3084 			else
3085 				last_offset = tmp.offset;
3086 			goto done;
3087 		}
3088 	}
3089 done:
3090 	btrfs_release_path(path);
3091 	btrfs_release_path(dst_path);
3092 
3093 	if (err == 0) {
3094 		*last_offset_ret = last_offset;
3095 		/*
3096 		 * insert the log range keys to indicate where the log
3097 		 * is valid
3098 		 */
3099 		ret = insert_dir_log_key(trans, log, path, key_type,
3100 					 ino, first_offset, last_offset);
3101 		if (ret)
3102 			err = ret;
3103 	}
3104 	return err;
3105 }
3106 
3107 /*
3108  * logging directories is very similar to logging inodes, We find all the items
3109  * from the current transaction and write them to the log.
3110  *
3111  * The recovery code scans the directory in the subvolume, and if it finds a
3112  * key in the range logged that is not present in the log tree, then it means
3113  * that dir entry was unlinked during the transaction.
3114  *
3115  * In order for that scan to work, we must include one key smaller than
3116  * the smallest logged by this transaction and one key larger than the largest
3117  * key logged by this transaction.
3118  */
3119 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3120 			  struct btrfs_root *root, struct inode *inode,
3121 			  struct btrfs_path *path,
3122 			  struct btrfs_path *dst_path)
3123 {
3124 	u64 min_key;
3125 	u64 max_key;
3126 	int ret;
3127 	int key_type = BTRFS_DIR_ITEM_KEY;
3128 
3129 again:
3130 	min_key = 0;
3131 	max_key = 0;
3132 	while (1) {
3133 		ret = log_dir_items(trans, root, inode, path,
3134 				    dst_path, key_type, min_key,
3135 				    &max_key);
3136 		if (ret)
3137 			return ret;
3138 		if (max_key == (u64)-1)
3139 			break;
3140 		min_key = max_key + 1;
3141 	}
3142 
3143 	if (key_type == BTRFS_DIR_ITEM_KEY) {
3144 		key_type = BTRFS_DIR_INDEX_KEY;
3145 		goto again;
3146 	}
3147 	return 0;
3148 }
3149 
3150 /*
3151  * a helper function to drop items from the log before we relog an
3152  * inode.  max_key_type indicates the highest item type to remove.
3153  * This cannot be run for file data extents because it does not
3154  * free the extents they point to.
3155  */
3156 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3157 				  struct btrfs_root *log,
3158 				  struct btrfs_path *path,
3159 				  u64 objectid, int max_key_type)
3160 {
3161 	int ret;
3162 	struct btrfs_key key;
3163 	struct btrfs_key found_key;
3164 	int start_slot;
3165 
3166 	key.objectid = objectid;
3167 	key.type = max_key_type;
3168 	key.offset = (u64)-1;
3169 
3170 	while (1) {
3171 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3172 		BUG_ON(ret == 0); /* Logic error */
3173 		if (ret < 0)
3174 			break;
3175 
3176 		if (path->slots[0] == 0)
3177 			break;
3178 
3179 		path->slots[0]--;
3180 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3181 				      path->slots[0]);
3182 
3183 		if (found_key.objectid != objectid)
3184 			break;
3185 
3186 		found_key.offset = 0;
3187 		found_key.type = 0;
3188 		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3189 				       &start_slot);
3190 
3191 		ret = btrfs_del_items(trans, log, path, start_slot,
3192 				      path->slots[0] - start_slot + 1);
3193 		/*
3194 		 * If start slot isn't 0 then we don't need to re-search, we've
3195 		 * found the last guy with the objectid in this tree.
3196 		 */
3197 		if (ret || start_slot != 0)
3198 			break;
3199 		btrfs_release_path(path);
3200 	}
3201 	btrfs_release_path(path);
3202 	if (ret > 0)
3203 		ret = 0;
3204 	return ret;
3205 }
3206 
3207 static void fill_inode_item(struct btrfs_trans_handle *trans,
3208 			    struct extent_buffer *leaf,
3209 			    struct btrfs_inode_item *item,
3210 			    struct inode *inode, int log_inode_only)
3211 {
3212 	struct btrfs_map_token token;
3213 
3214 	btrfs_init_map_token(&token);
3215 
3216 	if (log_inode_only) {
3217 		/* set the generation to zero so the recover code
3218 		 * can tell the difference between an logging
3219 		 * just to say 'this inode exists' and a logging
3220 		 * to say 'update this inode with these values'
3221 		 */
3222 		btrfs_set_token_inode_generation(leaf, item, 0, &token);
3223 		btrfs_set_token_inode_size(leaf, item, 0, &token);
3224 	} else {
3225 		btrfs_set_token_inode_generation(leaf, item,
3226 						 BTRFS_I(inode)->generation,
3227 						 &token);
3228 		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3229 	}
3230 
3231 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3232 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3233 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3234 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3235 
3236 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3237 				     inode->i_atime.tv_sec, &token);
3238 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3239 				      inode->i_atime.tv_nsec, &token);
3240 
3241 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3242 				     inode->i_mtime.tv_sec, &token);
3243 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3244 				      inode->i_mtime.tv_nsec, &token);
3245 
3246 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3247 				     inode->i_ctime.tv_sec, &token);
3248 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3249 				      inode->i_ctime.tv_nsec, &token);
3250 
3251 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3252 				     &token);
3253 
3254 	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3255 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3256 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3257 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3258 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3259 }
3260 
3261 static int log_inode_item(struct btrfs_trans_handle *trans,
3262 			  struct btrfs_root *log, struct btrfs_path *path,
3263 			  struct inode *inode)
3264 {
3265 	struct btrfs_inode_item *inode_item;
3266 	int ret;
3267 
3268 	ret = btrfs_insert_empty_item(trans, log, path,
3269 				      &BTRFS_I(inode)->location,
3270 				      sizeof(*inode_item));
3271 	if (ret && ret != -EEXIST)
3272 		return ret;
3273 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3274 				    struct btrfs_inode_item);
3275 	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3276 	btrfs_release_path(path);
3277 	return 0;
3278 }
3279 
3280 static noinline int copy_items(struct btrfs_trans_handle *trans,
3281 			       struct inode *inode,
3282 			       struct btrfs_path *dst_path,
3283 			       struct btrfs_path *src_path, u64 *last_extent,
3284 			       int start_slot, int nr, int inode_only)
3285 {
3286 	unsigned long src_offset;
3287 	unsigned long dst_offset;
3288 	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
3289 	struct btrfs_file_extent_item *extent;
3290 	struct btrfs_inode_item *inode_item;
3291 	struct extent_buffer *src = src_path->nodes[0];
3292 	struct btrfs_key first_key, last_key, key;
3293 	int ret;
3294 	struct btrfs_key *ins_keys;
3295 	u32 *ins_sizes;
3296 	char *ins_data;
3297 	int i;
3298 	struct list_head ordered_sums;
3299 	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3300 	bool has_extents = false;
3301 	bool need_find_last_extent = (*last_extent == 0);
3302 	bool done = false;
3303 
3304 	INIT_LIST_HEAD(&ordered_sums);
3305 
3306 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3307 			   nr * sizeof(u32), GFP_NOFS);
3308 	if (!ins_data)
3309 		return -ENOMEM;
3310 
3311 	first_key.objectid = (u64)-1;
3312 
3313 	ins_sizes = (u32 *)ins_data;
3314 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3315 
3316 	for (i = 0; i < nr; i++) {
3317 		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3318 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3319 	}
3320 	ret = btrfs_insert_empty_items(trans, log, dst_path,
3321 				       ins_keys, ins_sizes, nr);
3322 	if (ret) {
3323 		kfree(ins_data);
3324 		return ret;
3325 	}
3326 
3327 	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3328 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3329 						   dst_path->slots[0]);
3330 
3331 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3332 
3333 		if ((i == (nr - 1)))
3334 			last_key = ins_keys[i];
3335 
3336 		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3337 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
3338 						    dst_path->slots[0],
3339 						    struct btrfs_inode_item);
3340 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
3341 					inode, inode_only == LOG_INODE_EXISTS);
3342 		} else {
3343 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3344 					   src_offset, ins_sizes[i]);
3345 		}
3346 
3347 		/*
3348 		 * We set need_find_last_extent here in case we know we were
3349 		 * processing other items and then walk into the first extent in
3350 		 * the inode.  If we don't hit an extent then nothing changes,
3351 		 * we'll do the last search the next time around.
3352 		 */
3353 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3354 			has_extents = true;
3355 			if (need_find_last_extent &&
3356 			    first_key.objectid == (u64)-1)
3357 				first_key = ins_keys[i];
3358 		} else {
3359 			need_find_last_extent = false;
3360 		}
3361 
3362 		/* take a reference on file data extents so that truncates
3363 		 * or deletes of this inode don't have to relog the inode
3364 		 * again
3365 		 */
3366 		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3367 		    !skip_csum) {
3368 			int found_type;
3369 			extent = btrfs_item_ptr(src, start_slot + i,
3370 						struct btrfs_file_extent_item);
3371 
3372 			if (btrfs_file_extent_generation(src, extent) < trans->transid)
3373 				continue;
3374 
3375 			found_type = btrfs_file_extent_type(src, extent);
3376 			if (found_type == BTRFS_FILE_EXTENT_REG) {
3377 				u64 ds, dl, cs, cl;
3378 				ds = btrfs_file_extent_disk_bytenr(src,
3379 								extent);
3380 				/* ds == 0 is a hole */
3381 				if (ds == 0)
3382 					continue;
3383 
3384 				dl = btrfs_file_extent_disk_num_bytes(src,
3385 								extent);
3386 				cs = btrfs_file_extent_offset(src, extent);
3387 				cl = btrfs_file_extent_num_bytes(src,
3388 								extent);
3389 				if (btrfs_file_extent_compression(src,
3390 								  extent)) {
3391 					cs = 0;
3392 					cl = dl;
3393 				}
3394 
3395 				ret = btrfs_lookup_csums_range(
3396 						log->fs_info->csum_root,
3397 						ds + cs, ds + cs + cl - 1,
3398 						&ordered_sums, 0);
3399 				if (ret) {
3400 					btrfs_release_path(dst_path);
3401 					kfree(ins_data);
3402 					return ret;
3403 				}
3404 			}
3405 		}
3406 	}
3407 
3408 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3409 	btrfs_release_path(dst_path);
3410 	kfree(ins_data);
3411 
3412 	/*
3413 	 * we have to do this after the loop above to avoid changing the
3414 	 * log tree while trying to change the log tree.
3415 	 */
3416 	ret = 0;
3417 	while (!list_empty(&ordered_sums)) {
3418 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3419 						   struct btrfs_ordered_sum,
3420 						   list);
3421 		if (!ret)
3422 			ret = btrfs_csum_file_blocks(trans, log, sums);
3423 		list_del(&sums->list);
3424 		kfree(sums);
3425 	}
3426 
3427 	if (!has_extents)
3428 		return ret;
3429 
3430 	/*
3431 	 * Because we use btrfs_search_forward we could skip leaves that were
3432 	 * not modified and then assume *last_extent is valid when it really
3433 	 * isn't.  So back up to the previous leaf and read the end of the last
3434 	 * extent before we go and fill in holes.
3435 	 */
3436 	if (need_find_last_extent) {
3437 		u64 len;
3438 
3439 		ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
3440 		if (ret < 0)
3441 			return ret;
3442 		if (ret)
3443 			goto fill_holes;
3444 		if (src_path->slots[0])
3445 			src_path->slots[0]--;
3446 		src = src_path->nodes[0];
3447 		btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
3448 		if (key.objectid != btrfs_ino(inode) ||
3449 		    key.type != BTRFS_EXTENT_DATA_KEY)
3450 			goto fill_holes;
3451 		extent = btrfs_item_ptr(src, src_path->slots[0],
3452 					struct btrfs_file_extent_item);
3453 		if (btrfs_file_extent_type(src, extent) ==
3454 		    BTRFS_FILE_EXTENT_INLINE) {
3455 			len = btrfs_file_extent_inline_len(src,
3456 							   src_path->slots[0],
3457 							   extent);
3458 			*last_extent = ALIGN(key.offset + len,
3459 					     log->sectorsize);
3460 		} else {
3461 			len = btrfs_file_extent_num_bytes(src, extent);
3462 			*last_extent = key.offset + len;
3463 		}
3464 	}
3465 fill_holes:
3466 	/* So we did prev_leaf, now we need to move to the next leaf, but a few
3467 	 * things could have happened
3468 	 *
3469 	 * 1) A merge could have happened, so we could currently be on a leaf
3470 	 * that holds what we were copying in the first place.
3471 	 * 2) A split could have happened, and now not all of the items we want
3472 	 * are on the same leaf.
3473 	 *
3474 	 * So we need to adjust how we search for holes, we need to drop the
3475 	 * path and re-search for the first extent key we found, and then walk
3476 	 * forward until we hit the last one we copied.
3477 	 */
3478 	if (need_find_last_extent) {
3479 		/* btrfs_prev_leaf could return 1 without releasing the path */
3480 		btrfs_release_path(src_path);
3481 		ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
3482 					src_path, 0, 0);
3483 		if (ret < 0)
3484 			return ret;
3485 		ASSERT(ret == 0);
3486 		src = src_path->nodes[0];
3487 		i = src_path->slots[0];
3488 	} else {
3489 		i = start_slot;
3490 	}
3491 
3492 	/*
3493 	 * Ok so here we need to go through and fill in any holes we may have
3494 	 * to make sure that holes are punched for those areas in case they had
3495 	 * extents previously.
3496 	 */
3497 	while (!done) {
3498 		u64 offset, len;
3499 		u64 extent_end;
3500 
3501 		if (i >= btrfs_header_nritems(src_path->nodes[0])) {
3502 			ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
3503 			if (ret < 0)
3504 				return ret;
3505 			ASSERT(ret == 0);
3506 			src = src_path->nodes[0];
3507 			i = 0;
3508 		}
3509 
3510 		btrfs_item_key_to_cpu(src, &key, i);
3511 		if (!btrfs_comp_cpu_keys(&key, &last_key))
3512 			done = true;
3513 		if (key.objectid != btrfs_ino(inode) ||
3514 		    key.type != BTRFS_EXTENT_DATA_KEY) {
3515 			i++;
3516 			continue;
3517 		}
3518 		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
3519 		if (btrfs_file_extent_type(src, extent) ==
3520 		    BTRFS_FILE_EXTENT_INLINE) {
3521 			len = btrfs_file_extent_inline_len(src, i, extent);
3522 			extent_end = ALIGN(key.offset + len, log->sectorsize);
3523 		} else {
3524 			len = btrfs_file_extent_num_bytes(src, extent);
3525 			extent_end = key.offset + len;
3526 		}
3527 		i++;
3528 
3529 		if (*last_extent == key.offset) {
3530 			*last_extent = extent_end;
3531 			continue;
3532 		}
3533 		offset = *last_extent;
3534 		len = key.offset - *last_extent;
3535 		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
3536 					       offset, 0, 0, len, 0, len, 0,
3537 					       0, 0);
3538 		if (ret)
3539 			break;
3540 		*last_extent = offset + len;
3541 	}
3542 	/*
3543 	 * Need to let the callers know we dropped the path so they should
3544 	 * re-search.
3545 	 */
3546 	if (!ret && need_find_last_extent)
3547 		ret = 1;
3548 	return ret;
3549 }
3550 
3551 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3552 {
3553 	struct extent_map *em1, *em2;
3554 
3555 	em1 = list_entry(a, struct extent_map, list);
3556 	em2 = list_entry(b, struct extent_map, list);
3557 
3558 	if (em1->start < em2->start)
3559 		return -1;
3560 	else if (em1->start > em2->start)
3561 		return 1;
3562 	return 0;
3563 }
3564 
3565 static int log_one_extent(struct btrfs_trans_handle *trans,
3566 			  struct inode *inode, struct btrfs_root *root,
3567 			  struct extent_map *em, struct btrfs_path *path,
3568 			  struct list_head *logged_list)
3569 {
3570 	struct btrfs_root *log = root->log_root;
3571 	struct btrfs_file_extent_item *fi;
3572 	struct extent_buffer *leaf;
3573 	struct btrfs_ordered_extent *ordered;
3574 	struct list_head ordered_sums;
3575 	struct btrfs_map_token token;
3576 	struct btrfs_key key;
3577 	u64 mod_start = em->mod_start;
3578 	u64 mod_len = em->mod_len;
3579 	u64 csum_offset;
3580 	u64 csum_len;
3581 	u64 extent_offset = em->start - em->orig_start;
3582 	u64 block_len;
3583 	int ret;
3584 	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3585 	int extent_inserted = 0;
3586 
3587 	INIT_LIST_HEAD(&ordered_sums);
3588 	btrfs_init_map_token(&token);
3589 
3590 	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3591 				   em->start + em->len, NULL, 0, 1,
3592 				   sizeof(*fi), &extent_inserted);
3593 	if (ret)
3594 		return ret;
3595 
3596 	if (!extent_inserted) {
3597 		key.objectid = btrfs_ino(inode);
3598 		key.type = BTRFS_EXTENT_DATA_KEY;
3599 		key.offset = em->start;
3600 
3601 		ret = btrfs_insert_empty_item(trans, log, path, &key,
3602 					      sizeof(*fi));
3603 		if (ret)
3604 			return ret;
3605 	}
3606 	leaf = path->nodes[0];
3607 	fi = btrfs_item_ptr(leaf, path->slots[0],
3608 			    struct btrfs_file_extent_item);
3609 
3610 	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3611 					       &token);
3612 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3613 		skip_csum = true;
3614 		btrfs_set_token_file_extent_type(leaf, fi,
3615 						 BTRFS_FILE_EXTENT_PREALLOC,
3616 						 &token);
3617 	} else {
3618 		btrfs_set_token_file_extent_type(leaf, fi,
3619 						 BTRFS_FILE_EXTENT_REG,
3620 						 &token);
3621 		if (em->block_start == EXTENT_MAP_HOLE)
3622 			skip_csum = true;
3623 	}
3624 
3625 	block_len = max(em->block_len, em->orig_block_len);
3626 	if (em->compress_type != BTRFS_COMPRESS_NONE) {
3627 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3628 							em->block_start,
3629 							&token);
3630 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3631 							   &token);
3632 	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3633 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3634 							em->block_start -
3635 							extent_offset, &token);
3636 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3637 							   &token);
3638 	} else {
3639 		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3640 		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3641 							   &token);
3642 	}
3643 
3644 	btrfs_set_token_file_extent_offset(leaf, fi,
3645 					   em->start - em->orig_start,
3646 					   &token);
3647 	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3648 	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3649 	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3650 						&token);
3651 	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3652 	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3653 	btrfs_mark_buffer_dirty(leaf);
3654 
3655 	btrfs_release_path(path);
3656 	if (ret) {
3657 		return ret;
3658 	}
3659 
3660 	if (skip_csum)
3661 		return 0;
3662 
3663 	/*
3664 	 * First check and see if our csums are on our outstanding ordered
3665 	 * extents.
3666 	 */
3667 	list_for_each_entry(ordered, logged_list, log_list) {
3668 		struct btrfs_ordered_sum *sum;
3669 
3670 		if (!mod_len)
3671 			break;
3672 
3673 		if (ordered->file_offset + ordered->len <= mod_start ||
3674 		    mod_start + mod_len <= ordered->file_offset)
3675 			continue;
3676 
3677 		/*
3678 		 * We are going to copy all the csums on this ordered extent, so
3679 		 * go ahead and adjust mod_start and mod_len in case this
3680 		 * ordered extent has already been logged.
3681 		 */
3682 		if (ordered->file_offset > mod_start) {
3683 			if (ordered->file_offset + ordered->len >=
3684 			    mod_start + mod_len)
3685 				mod_len = ordered->file_offset - mod_start;
3686 			/*
3687 			 * If we have this case
3688 			 *
3689 			 * |--------- logged extent ---------|
3690 			 *       |----- ordered extent ----|
3691 			 *
3692 			 * Just don't mess with mod_start and mod_len, we'll
3693 			 * just end up logging more csums than we need and it
3694 			 * will be ok.
3695 			 */
3696 		} else {
3697 			if (ordered->file_offset + ordered->len <
3698 			    mod_start + mod_len) {
3699 				mod_len = (mod_start + mod_len) -
3700 					(ordered->file_offset + ordered->len);
3701 				mod_start = ordered->file_offset +
3702 					ordered->len;
3703 			} else {
3704 				mod_len = 0;
3705 			}
3706 		}
3707 
3708 		/*
3709 		 * To keep us from looping for the above case of an ordered
3710 		 * extent that falls inside of the logged extent.
3711 		 */
3712 		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3713 				     &ordered->flags))
3714 			continue;
3715 
3716 		if (ordered->csum_bytes_left) {
3717 			btrfs_start_ordered_extent(inode, ordered, 0);
3718 			wait_event(ordered->wait,
3719 				   ordered->csum_bytes_left == 0);
3720 		}
3721 
3722 		list_for_each_entry(sum, &ordered->list, list) {
3723 			ret = btrfs_csum_file_blocks(trans, log, sum);
3724 			if (ret)
3725 				goto unlocked;
3726 		}
3727 
3728 	}
3729 unlocked:
3730 
3731 	if (!mod_len || ret)
3732 		return ret;
3733 
3734 	if (em->compress_type) {
3735 		csum_offset = 0;
3736 		csum_len = block_len;
3737 	} else {
3738 		csum_offset = mod_start - em->start;
3739 		csum_len = mod_len;
3740 	}
3741 
3742 	/* block start is already adjusted for the file extent offset. */
3743 	ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3744 				       em->block_start + csum_offset,
3745 				       em->block_start + csum_offset +
3746 				       csum_len - 1, &ordered_sums, 0);
3747 	if (ret)
3748 		return ret;
3749 
3750 	while (!list_empty(&ordered_sums)) {
3751 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3752 						   struct btrfs_ordered_sum,
3753 						   list);
3754 		if (!ret)
3755 			ret = btrfs_csum_file_blocks(trans, log, sums);
3756 		list_del(&sums->list);
3757 		kfree(sums);
3758 	}
3759 
3760 	return ret;
3761 }
3762 
3763 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3764 				     struct btrfs_root *root,
3765 				     struct inode *inode,
3766 				     struct btrfs_path *path,
3767 				     struct list_head *logged_list)
3768 {
3769 	struct extent_map *em, *n;
3770 	struct list_head extents;
3771 	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3772 	u64 test_gen;
3773 	int ret = 0;
3774 	int num = 0;
3775 
3776 	INIT_LIST_HEAD(&extents);
3777 
3778 	write_lock(&tree->lock);
3779 	test_gen = root->fs_info->last_trans_committed;
3780 
3781 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3782 		list_del_init(&em->list);
3783 
3784 		/*
3785 		 * Just an arbitrary number, this can be really CPU intensive
3786 		 * once we start getting a lot of extents, and really once we
3787 		 * have a bunch of extents we just want to commit since it will
3788 		 * be faster.
3789 		 */
3790 		if (++num > 32768) {
3791 			list_del_init(&tree->modified_extents);
3792 			ret = -EFBIG;
3793 			goto process;
3794 		}
3795 
3796 		if (em->generation <= test_gen)
3797 			continue;
3798 		/* Need a ref to keep it from getting evicted from cache */
3799 		atomic_inc(&em->refs);
3800 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3801 		list_add_tail(&em->list, &extents);
3802 		num++;
3803 	}
3804 
3805 	list_sort(NULL, &extents, extent_cmp);
3806 
3807 process:
3808 	while (!list_empty(&extents)) {
3809 		em = list_entry(extents.next, struct extent_map, list);
3810 
3811 		list_del_init(&em->list);
3812 
3813 		/*
3814 		 * If we had an error we just need to delete everybody from our
3815 		 * private list.
3816 		 */
3817 		if (ret) {
3818 			clear_em_logging(tree, em);
3819 			free_extent_map(em);
3820 			continue;
3821 		}
3822 
3823 		write_unlock(&tree->lock);
3824 
3825 		ret = log_one_extent(trans, inode, root, em, path, logged_list);
3826 		write_lock(&tree->lock);
3827 		clear_em_logging(tree, em);
3828 		free_extent_map(em);
3829 	}
3830 	WARN_ON(!list_empty(&extents));
3831 	write_unlock(&tree->lock);
3832 
3833 	btrfs_release_path(path);
3834 	return ret;
3835 }
3836 
3837 /* log a single inode in the tree log.
3838  * At least one parent directory for this inode must exist in the tree
3839  * or be logged already.
3840  *
3841  * Any items from this inode changed by the current transaction are copied
3842  * to the log tree.  An extra reference is taken on any extents in this
3843  * file, allowing us to avoid a whole pile of corner cases around logging
3844  * blocks that have been removed from the tree.
3845  *
3846  * See LOG_INODE_ALL and related defines for a description of what inode_only
3847  * does.
3848  *
3849  * This handles both files and directories.
3850  */
3851 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3852 			     struct btrfs_root *root, struct inode *inode,
3853 			     int inode_only)
3854 {
3855 	struct btrfs_path *path;
3856 	struct btrfs_path *dst_path;
3857 	struct btrfs_key min_key;
3858 	struct btrfs_key max_key;
3859 	struct btrfs_root *log = root->log_root;
3860 	struct extent_buffer *src = NULL;
3861 	LIST_HEAD(logged_list);
3862 	u64 last_extent = 0;
3863 	int err = 0;
3864 	int ret;
3865 	int nritems;
3866 	int ins_start_slot = 0;
3867 	int ins_nr;
3868 	bool fast_search = false;
3869 	u64 ino = btrfs_ino(inode);
3870 
3871 	path = btrfs_alloc_path();
3872 	if (!path)
3873 		return -ENOMEM;
3874 	dst_path = btrfs_alloc_path();
3875 	if (!dst_path) {
3876 		btrfs_free_path(path);
3877 		return -ENOMEM;
3878 	}
3879 
3880 	min_key.objectid = ino;
3881 	min_key.type = BTRFS_INODE_ITEM_KEY;
3882 	min_key.offset = 0;
3883 
3884 	max_key.objectid = ino;
3885 
3886 
3887 	/* today the code can only do partial logging of directories */
3888 	if (S_ISDIR(inode->i_mode) ||
3889 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3890 		       &BTRFS_I(inode)->runtime_flags) &&
3891 	     inode_only == LOG_INODE_EXISTS))
3892 		max_key.type = BTRFS_XATTR_ITEM_KEY;
3893 	else
3894 		max_key.type = (u8)-1;
3895 	max_key.offset = (u64)-1;
3896 
3897 	/* Only run delayed items if we are a dir or a new file */
3898 	if (S_ISDIR(inode->i_mode) ||
3899 	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
3900 		ret = btrfs_commit_inode_delayed_items(trans, inode);
3901 		if (ret) {
3902 			btrfs_free_path(path);
3903 			btrfs_free_path(dst_path);
3904 			return ret;
3905 		}
3906 	}
3907 
3908 	mutex_lock(&BTRFS_I(inode)->log_mutex);
3909 
3910 	btrfs_get_logged_extents(inode, &logged_list);
3911 
3912 	/*
3913 	 * a brute force approach to making sure we get the most uptodate
3914 	 * copies of everything.
3915 	 */
3916 	if (S_ISDIR(inode->i_mode)) {
3917 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
3918 
3919 		if (inode_only == LOG_INODE_EXISTS)
3920 			max_key_type = BTRFS_XATTR_ITEM_KEY;
3921 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
3922 	} else {
3923 		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3924 				       &BTRFS_I(inode)->runtime_flags)) {
3925 			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3926 				  &BTRFS_I(inode)->runtime_flags);
3927 			ret = btrfs_truncate_inode_items(trans, log,
3928 							 inode, 0, 0);
3929 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3930 					      &BTRFS_I(inode)->runtime_flags) ||
3931 			   inode_only == LOG_INODE_EXISTS) {
3932 			if (inode_only == LOG_INODE_ALL)
3933 				fast_search = true;
3934 			max_key.type = BTRFS_XATTR_ITEM_KEY;
3935 			ret = drop_objectid_items(trans, log, path, ino,
3936 						  max_key.type);
3937 		} else {
3938 			if (inode_only == LOG_INODE_ALL)
3939 				fast_search = true;
3940 			ret = log_inode_item(trans, log, dst_path, inode);
3941 			if (ret) {
3942 				err = ret;
3943 				goto out_unlock;
3944 			}
3945 			goto log_extents;
3946 		}
3947 
3948 	}
3949 	if (ret) {
3950 		err = ret;
3951 		goto out_unlock;
3952 	}
3953 	path->keep_locks = 1;
3954 
3955 	while (1) {
3956 		ins_nr = 0;
3957 		ret = btrfs_search_forward(root, &min_key,
3958 					   path, trans->transid);
3959 		if (ret != 0)
3960 			break;
3961 again:
3962 		/* note, ins_nr might be > 0 here, cleanup outside the loop */
3963 		if (min_key.objectid != ino)
3964 			break;
3965 		if (min_key.type > max_key.type)
3966 			break;
3967 
3968 		src = path->nodes[0];
3969 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
3970 			ins_nr++;
3971 			goto next_slot;
3972 		} else if (!ins_nr) {
3973 			ins_start_slot = path->slots[0];
3974 			ins_nr = 1;
3975 			goto next_slot;
3976 		}
3977 
3978 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
3979 				 ins_start_slot, ins_nr, inode_only);
3980 		if (ret < 0) {
3981 			err = ret;
3982 			goto out_unlock;
3983 		} if (ret) {
3984 			ins_nr = 0;
3985 			btrfs_release_path(path);
3986 			continue;
3987 		}
3988 		ins_nr = 1;
3989 		ins_start_slot = path->slots[0];
3990 next_slot:
3991 
3992 		nritems = btrfs_header_nritems(path->nodes[0]);
3993 		path->slots[0]++;
3994 		if (path->slots[0] < nritems) {
3995 			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
3996 					      path->slots[0]);
3997 			goto again;
3998 		}
3999 		if (ins_nr) {
4000 			ret = copy_items(trans, inode, dst_path, path,
4001 					 &last_extent, ins_start_slot,
4002 					 ins_nr, inode_only);
4003 			if (ret < 0) {
4004 				err = ret;
4005 				goto out_unlock;
4006 			}
4007 			ret = 0;
4008 			ins_nr = 0;
4009 		}
4010 		btrfs_release_path(path);
4011 
4012 		if (min_key.offset < (u64)-1) {
4013 			min_key.offset++;
4014 		} else if (min_key.type < max_key.type) {
4015 			min_key.type++;
4016 			min_key.offset = 0;
4017 		} else {
4018 			break;
4019 		}
4020 	}
4021 	if (ins_nr) {
4022 		ret = copy_items(trans, inode, dst_path, path, &last_extent,
4023 				 ins_start_slot, ins_nr, inode_only);
4024 		if (ret < 0) {
4025 			err = ret;
4026 			goto out_unlock;
4027 		}
4028 		ret = 0;
4029 		ins_nr = 0;
4030 	}
4031 
4032 log_extents:
4033 	btrfs_release_path(path);
4034 	btrfs_release_path(dst_path);
4035 	if (fast_search) {
4036 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4037 						&logged_list);
4038 		if (ret) {
4039 			err = ret;
4040 			goto out_unlock;
4041 		}
4042 	} else if (inode_only == LOG_INODE_ALL) {
4043 		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
4044 		struct extent_map *em, *n;
4045 
4046 		write_lock(&tree->lock);
4047 		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
4048 			list_del_init(&em->list);
4049 		write_unlock(&tree->lock);
4050 	}
4051 
4052 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
4053 		ret = log_directory_changes(trans, root, inode, path, dst_path);
4054 		if (ret) {
4055 			err = ret;
4056 			goto out_unlock;
4057 		}
4058 	}
4059 	BTRFS_I(inode)->logged_trans = trans->transid;
4060 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
4061 out_unlock:
4062 	if (unlikely(err))
4063 		btrfs_put_logged_extents(&logged_list);
4064 	else
4065 		btrfs_submit_logged_extents(&logged_list, log);
4066 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
4067 
4068 	btrfs_free_path(path);
4069 	btrfs_free_path(dst_path);
4070 	return err;
4071 }
4072 
4073 /*
4074  * follow the dentry parent pointers up the chain and see if any
4075  * of the directories in it require a full commit before they can
4076  * be logged.  Returns zero if nothing special needs to be done or 1 if
4077  * a full commit is required.
4078  */
4079 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4080 					       struct inode *inode,
4081 					       struct dentry *parent,
4082 					       struct super_block *sb,
4083 					       u64 last_committed)
4084 {
4085 	int ret = 0;
4086 	struct btrfs_root *root;
4087 	struct dentry *old_parent = NULL;
4088 	struct inode *orig_inode = inode;
4089 
4090 	/*
4091 	 * for regular files, if its inode is already on disk, we don't
4092 	 * have to worry about the parents at all.  This is because
4093 	 * we can use the last_unlink_trans field to record renames
4094 	 * and other fun in this file.
4095 	 */
4096 	if (S_ISREG(inode->i_mode) &&
4097 	    BTRFS_I(inode)->generation <= last_committed &&
4098 	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
4099 			goto out;
4100 
4101 	if (!S_ISDIR(inode->i_mode)) {
4102 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
4103 			goto out;
4104 		inode = parent->d_inode;
4105 	}
4106 
4107 	while (1) {
4108 		/*
4109 		 * If we are logging a directory then we start with our inode,
4110 		 * not our parents inode, so we need to skipp setting the
4111 		 * logged_trans so that further down in the log code we don't
4112 		 * think this inode has already been logged.
4113 		 */
4114 		if (inode != orig_inode)
4115 			BTRFS_I(inode)->logged_trans = trans->transid;
4116 		smp_mb();
4117 
4118 		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
4119 			root = BTRFS_I(inode)->root;
4120 
4121 			/*
4122 			 * make sure any commits to the log are forced
4123 			 * to be full commits
4124 			 */
4125 			btrfs_set_log_full_commit(root->fs_info, trans);
4126 			ret = 1;
4127 			break;
4128 		}
4129 
4130 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
4131 			break;
4132 
4133 		if (IS_ROOT(parent))
4134 			break;
4135 
4136 		parent = dget_parent(parent);
4137 		dput(old_parent);
4138 		old_parent = parent;
4139 		inode = parent->d_inode;
4140 
4141 	}
4142 	dput(old_parent);
4143 out:
4144 	return ret;
4145 }
4146 
4147 /*
4148  * helper function around btrfs_log_inode to make sure newly created
4149  * parent directories also end up in the log.  A minimal inode and backref
4150  * only logging is done of any parent directories that are older than
4151  * the last committed transaction
4152  */
4153 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4154 			    	  struct btrfs_root *root, struct inode *inode,
4155 			    	  struct dentry *parent, int exists_only,
4156 				  struct btrfs_log_ctx *ctx)
4157 {
4158 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
4159 	struct super_block *sb;
4160 	struct dentry *old_parent = NULL;
4161 	int ret = 0;
4162 	u64 last_committed = root->fs_info->last_trans_committed;
4163 
4164 	sb = inode->i_sb;
4165 
4166 	if (btrfs_test_opt(root, NOTREELOG)) {
4167 		ret = 1;
4168 		goto end_no_trans;
4169 	}
4170 
4171 	/*
4172 	 * The prev transaction commit doesn't complete, we need do
4173 	 * full commit by ourselves.
4174 	 */
4175 	if (root->fs_info->last_trans_log_full_commit >
4176 	    root->fs_info->last_trans_committed) {
4177 		ret = 1;
4178 		goto end_no_trans;
4179 	}
4180 
4181 	if (root != BTRFS_I(inode)->root ||
4182 	    btrfs_root_refs(&root->root_item) == 0) {
4183 		ret = 1;
4184 		goto end_no_trans;
4185 	}
4186 
4187 	ret = check_parent_dirs_for_sync(trans, inode, parent,
4188 					 sb, last_committed);
4189 	if (ret)
4190 		goto end_no_trans;
4191 
4192 	if (btrfs_inode_in_log(inode, trans->transid)) {
4193 		ret = BTRFS_NO_LOG_SYNC;
4194 		goto end_no_trans;
4195 	}
4196 
4197 	ret = start_log_trans(trans, root, ctx);
4198 	if (ret)
4199 		goto end_no_trans;
4200 
4201 	ret = btrfs_log_inode(trans, root, inode, inode_only);
4202 	if (ret)
4203 		goto end_trans;
4204 
4205 	/*
4206 	 * for regular files, if its inode is already on disk, we don't
4207 	 * have to worry about the parents at all.  This is because
4208 	 * we can use the last_unlink_trans field to record renames
4209 	 * and other fun in this file.
4210 	 */
4211 	if (S_ISREG(inode->i_mode) &&
4212 	    BTRFS_I(inode)->generation <= last_committed &&
4213 	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
4214 		ret = 0;
4215 		goto end_trans;
4216 	}
4217 
4218 	inode_only = LOG_INODE_EXISTS;
4219 	while (1) {
4220 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
4221 			break;
4222 
4223 		inode = parent->d_inode;
4224 		if (root != BTRFS_I(inode)->root)
4225 			break;
4226 
4227 		if (BTRFS_I(inode)->generation >
4228 		    root->fs_info->last_trans_committed) {
4229 			ret = btrfs_log_inode(trans, root, inode, inode_only);
4230 			if (ret)
4231 				goto end_trans;
4232 		}
4233 		if (IS_ROOT(parent))
4234 			break;
4235 
4236 		parent = dget_parent(parent);
4237 		dput(old_parent);
4238 		old_parent = parent;
4239 	}
4240 	ret = 0;
4241 end_trans:
4242 	dput(old_parent);
4243 	if (ret < 0) {
4244 		btrfs_set_log_full_commit(root->fs_info, trans);
4245 		ret = 1;
4246 	}
4247 
4248 	if (ret)
4249 		btrfs_remove_log_ctx(root, ctx);
4250 	btrfs_end_log_trans(root);
4251 end_no_trans:
4252 	return ret;
4253 }
4254 
4255 /*
4256  * it is not safe to log dentry if the chunk root has added new
4257  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
4258  * If this returns 1, you must commit the transaction to safely get your
4259  * data on disk.
4260  */
4261 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
4262 			  struct btrfs_root *root, struct dentry *dentry,
4263 			  struct btrfs_log_ctx *ctx)
4264 {
4265 	struct dentry *parent = dget_parent(dentry);
4266 	int ret;
4267 
4268 	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
4269 				     0, ctx);
4270 	dput(parent);
4271 
4272 	return ret;
4273 }
4274 
4275 /*
4276  * should be called during mount to recover any replay any log trees
4277  * from the FS
4278  */
4279 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
4280 {
4281 	int ret;
4282 	struct btrfs_path *path;
4283 	struct btrfs_trans_handle *trans;
4284 	struct btrfs_key key;
4285 	struct btrfs_key found_key;
4286 	struct btrfs_key tmp_key;
4287 	struct btrfs_root *log;
4288 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
4289 	struct walk_control wc = {
4290 		.process_func = process_one_buffer,
4291 		.stage = 0,
4292 	};
4293 
4294 	path = btrfs_alloc_path();
4295 	if (!path)
4296 		return -ENOMEM;
4297 
4298 	fs_info->log_root_recovering = 1;
4299 
4300 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
4301 	if (IS_ERR(trans)) {
4302 		ret = PTR_ERR(trans);
4303 		goto error;
4304 	}
4305 
4306 	wc.trans = trans;
4307 	wc.pin = 1;
4308 
4309 	ret = walk_log_tree(trans, log_root_tree, &wc);
4310 	if (ret) {
4311 		btrfs_error(fs_info, ret, "Failed to pin buffers while "
4312 			    "recovering log root tree.");
4313 		goto error;
4314 	}
4315 
4316 again:
4317 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
4318 	key.offset = (u64)-1;
4319 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
4320 
4321 	while (1) {
4322 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
4323 
4324 		if (ret < 0) {
4325 			btrfs_error(fs_info, ret,
4326 				    "Couldn't find tree log root.");
4327 			goto error;
4328 		}
4329 		if (ret > 0) {
4330 			if (path->slots[0] == 0)
4331 				break;
4332 			path->slots[0]--;
4333 		}
4334 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4335 				      path->slots[0]);
4336 		btrfs_release_path(path);
4337 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4338 			break;
4339 
4340 		log = btrfs_read_fs_root(log_root_tree, &found_key);
4341 		if (IS_ERR(log)) {
4342 			ret = PTR_ERR(log);
4343 			btrfs_error(fs_info, ret,
4344 				    "Couldn't read tree log root.");
4345 			goto error;
4346 		}
4347 
4348 		tmp_key.objectid = found_key.offset;
4349 		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
4350 		tmp_key.offset = (u64)-1;
4351 
4352 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
4353 		if (IS_ERR(wc.replay_dest)) {
4354 			ret = PTR_ERR(wc.replay_dest);
4355 			free_extent_buffer(log->node);
4356 			free_extent_buffer(log->commit_root);
4357 			kfree(log);
4358 			btrfs_error(fs_info, ret, "Couldn't read target root "
4359 				    "for tree log recovery.");
4360 			goto error;
4361 		}
4362 
4363 		wc.replay_dest->log_root = log;
4364 		btrfs_record_root_in_trans(trans, wc.replay_dest);
4365 		ret = walk_log_tree(trans, log, &wc);
4366 
4367 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
4368 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
4369 						      path);
4370 		}
4371 
4372 		key.offset = found_key.offset - 1;
4373 		wc.replay_dest->log_root = NULL;
4374 		free_extent_buffer(log->node);
4375 		free_extent_buffer(log->commit_root);
4376 		kfree(log);
4377 
4378 		if (ret)
4379 			goto error;
4380 
4381 		if (found_key.offset == 0)
4382 			break;
4383 	}
4384 	btrfs_release_path(path);
4385 
4386 	/* step one is to pin it all, step two is to replay just inodes */
4387 	if (wc.pin) {
4388 		wc.pin = 0;
4389 		wc.process_func = replay_one_buffer;
4390 		wc.stage = LOG_WALK_REPLAY_INODES;
4391 		goto again;
4392 	}
4393 	/* step three is to replay everything */
4394 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
4395 		wc.stage++;
4396 		goto again;
4397 	}
4398 
4399 	btrfs_free_path(path);
4400 
4401 	/* step 4: commit the transaction, which also unpins the blocks */
4402 	ret = btrfs_commit_transaction(trans, fs_info->tree_root);
4403 	if (ret)
4404 		return ret;
4405 
4406 	free_extent_buffer(log_root_tree->node);
4407 	log_root_tree->log_root = NULL;
4408 	fs_info->log_root_recovering = 0;
4409 	kfree(log_root_tree);
4410 
4411 	return 0;
4412 error:
4413 	if (wc.trans)
4414 		btrfs_end_transaction(wc.trans, fs_info->tree_root);
4415 	btrfs_free_path(path);
4416 	return ret;
4417 }
4418 
4419 /*
4420  * there are some corner cases where we want to force a full
4421  * commit instead of allowing a directory to be logged.
4422  *
4423  * They revolve around files there were unlinked from the directory, and
4424  * this function updates the parent directory so that a full commit is
4425  * properly done if it is fsync'd later after the unlinks are done.
4426  */
4427 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
4428 			     struct inode *dir, struct inode *inode,
4429 			     int for_rename)
4430 {
4431 	/*
4432 	 * when we're logging a file, if it hasn't been renamed
4433 	 * or unlinked, and its inode is fully committed on disk,
4434 	 * we don't have to worry about walking up the directory chain
4435 	 * to log its parents.
4436 	 *
4437 	 * So, we use the last_unlink_trans field to put this transid
4438 	 * into the file.  When the file is logged we check it and
4439 	 * don't log the parents if the file is fully on disk.
4440 	 */
4441 	if (S_ISREG(inode->i_mode))
4442 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
4443 
4444 	/*
4445 	 * if this directory was already logged any new
4446 	 * names for this file/dir will get recorded
4447 	 */
4448 	smp_mb();
4449 	if (BTRFS_I(dir)->logged_trans == trans->transid)
4450 		return;
4451 
4452 	/*
4453 	 * if the inode we're about to unlink was logged,
4454 	 * the log will be properly updated for any new names
4455 	 */
4456 	if (BTRFS_I(inode)->logged_trans == trans->transid)
4457 		return;
4458 
4459 	/*
4460 	 * when renaming files across directories, if the directory
4461 	 * there we're unlinking from gets fsync'd later on, there's
4462 	 * no way to find the destination directory later and fsync it
4463 	 * properly.  So, we have to be conservative and force commits
4464 	 * so the new name gets discovered.
4465 	 */
4466 	if (for_rename)
4467 		goto record;
4468 
4469 	/* we can safely do the unlink without any special recording */
4470 	return;
4471 
4472 record:
4473 	BTRFS_I(dir)->last_unlink_trans = trans->transid;
4474 }
4475 
4476 /*
4477  * Call this after adding a new name for a file and it will properly
4478  * update the log to reflect the new name.
4479  *
4480  * It will return zero if all goes well, and it will return 1 if a
4481  * full transaction commit is required.
4482  */
4483 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4484 			struct inode *inode, struct inode *old_dir,
4485 			struct dentry *parent)
4486 {
4487 	struct btrfs_root * root = BTRFS_I(inode)->root;
4488 
4489 	/*
4490 	 * this will force the logging code to walk the dentry chain
4491 	 * up for the file
4492 	 */
4493 	if (S_ISREG(inode->i_mode))
4494 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
4495 
4496 	/*
4497 	 * if this inode hasn't been logged and directory we're renaming it
4498 	 * from hasn't been logged, we don't need to log it
4499 	 */
4500 	if (BTRFS_I(inode)->logged_trans <=
4501 	    root->fs_info->last_trans_committed &&
4502 	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
4503 		    root->fs_info->last_trans_committed))
4504 		return 0;
4505 
4506 	return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
4507 }
4508 
4509