xref: /openbmc/linux/fs/btrfs/tree-log.c (revision 09138ba68c1487a42c400485e999386a74911dbc)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2008 Oracle.  All rights reserved.
4   */
5  
6  #include <linux/sched.h>
7  #include <linux/slab.h>
8  #include <linux/blkdev.h>
9  #include <linux/list_sort.h>
10  #include <linux/iversion.h>
11  #include "misc.h"
12  #include "ctree.h"
13  #include "tree-log.h"
14  #include "disk-io.h"
15  #include "locking.h"
16  #include "print-tree.h"
17  #include "backref.h"
18  #include "compression.h"
19  #include "qgroup.h"
20  #include "block-group.h"
21  #include "space-info.h"
22  #include "zoned.h"
23  #include "inode-item.h"
24  #include "fs.h"
25  #include "accessors.h"
26  #include "extent-tree.h"
27  #include "root-tree.h"
28  #include "dir-item.h"
29  #include "file-item.h"
30  #include "file.h"
31  #include "orphan.h"
32  #include "tree-checker.h"
33  
34  #define MAX_CONFLICT_INODES 10
35  
36  /* magic values for the inode_only field in btrfs_log_inode:
37   *
38   * LOG_INODE_ALL means to log everything
39   * LOG_INODE_EXISTS means to log just enough to recreate the inode
40   * during log replay
41   */
42  enum {
43  	LOG_INODE_ALL,
44  	LOG_INODE_EXISTS,
45  };
46  
47  /*
48   * directory trouble cases
49   *
50   * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
51   * log, we must force a full commit before doing an fsync of the directory
52   * where the unlink was done.
53   * ---> record transid of last unlink/rename per directory
54   *
55   * mkdir foo/some_dir
56   * normal commit
57   * rename foo/some_dir foo2/some_dir
58   * mkdir foo/some_dir
59   * fsync foo/some_dir/some_file
60   *
61   * The fsync above will unlink the original some_dir without recording
62   * it in its new location (foo2).  After a crash, some_dir will be gone
63   * unless the fsync of some_file forces a full commit
64   *
65   * 2) we must log any new names for any file or dir that is in the fsync
66   * log. ---> check inode while renaming/linking.
67   *
68   * 2a) we must log any new names for any file or dir during rename
69   * when the directory they are being removed from was logged.
70   * ---> check inode and old parent dir during rename
71   *
72   *  2a is actually the more important variant.  With the extra logging
73   *  a crash might unlink the old name without recreating the new one
74   *
75   * 3) after a crash, we must go through any directories with a link count
76   * of zero and redo the rm -rf
77   *
78   * mkdir f1/foo
79   * normal commit
80   * rm -rf f1/foo
81   * fsync(f1)
82   *
83   * The directory f1 was fully removed from the FS, but fsync was never
84   * called on f1, only its parent dir.  After a crash the rm -rf must
85   * be replayed.  This must be able to recurse down the entire
86   * directory tree.  The inode link count fixup code takes care of the
87   * ugly details.
88   */
89  
90  /*
91   * stages for the tree walking.  The first
92   * stage (0) is to only pin down the blocks we find
93   * the second stage (1) is to make sure that all the inodes
94   * we find in the log are created in the subvolume.
95   *
96   * The last stage is to deal with directories and links and extents
97   * and all the other fun semantics
98   */
99  enum {
100  	LOG_WALK_PIN_ONLY,
101  	LOG_WALK_REPLAY_INODES,
102  	LOG_WALK_REPLAY_DIR_INDEX,
103  	LOG_WALK_REPLAY_ALL,
104  };
105  
106  static int btrfs_log_inode(struct btrfs_trans_handle *trans,
107  			   struct btrfs_inode *inode,
108  			   int inode_only,
109  			   struct btrfs_log_ctx *ctx);
110  static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
111  			     struct btrfs_root *root,
112  			     struct btrfs_path *path, u64 objectid);
113  static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
114  				       struct btrfs_root *root,
115  				       struct btrfs_root *log,
116  				       struct btrfs_path *path,
117  				       u64 dirid, int del_all);
118  static void wait_log_commit(struct btrfs_root *root, int transid);
119  
120  /*
121   * tree logging is a special write ahead log used to make sure that
122   * fsyncs and O_SYNCs can happen without doing full tree commits.
123   *
124   * Full tree commits are expensive because they require commonly
125   * modified blocks to be recowed, creating many dirty pages in the
126   * extent tree an 4x-6x higher write load than ext3.
127   *
128   * Instead of doing a tree commit on every fsync, we use the
129   * key ranges and transaction ids to find items for a given file or directory
130   * that have changed in this transaction.  Those items are copied into
131   * a special tree (one per subvolume root), that tree is written to disk
132   * and then the fsync is considered complete.
133   *
134   * After a crash, items are copied out of the log-tree back into the
135   * subvolume tree.  Any file data extents found are recorded in the extent
136   * allocation tree, and the log-tree freed.
137   *
138   * The log tree is read three times, once to pin down all the extents it is
139   * using in ram and once, once to create all the inodes logged in the tree
140   * and once to do all the other items.
141   */
142  
btrfs_iget_logging(u64 objectid,struct btrfs_root * root)143  static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
144  {
145  	unsigned int nofs_flag;
146  	struct inode *inode;
147  
148  	/*
149  	 * We're holding a transaction handle whether we are logging or
150  	 * replaying a log tree, so we must make sure NOFS semantics apply
151  	 * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
152  	 * to allocate an inode, which can recurse back into the filesystem and
153  	 * attempt a transaction commit, resulting in a deadlock.
154  	 */
155  	nofs_flag = memalloc_nofs_save();
156  	inode = btrfs_iget(root->fs_info->sb, objectid, root);
157  	memalloc_nofs_restore(nofs_flag);
158  
159  	return inode;
160  }
161  
162  /*
163   * start a sub transaction and setup the log tree
164   * this increments the log tree writer count to make the people
165   * syncing the tree wait for us to finish
166   */
start_log_trans(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_log_ctx * ctx)167  static int start_log_trans(struct btrfs_trans_handle *trans,
168  			   struct btrfs_root *root,
169  			   struct btrfs_log_ctx *ctx)
170  {
171  	struct btrfs_fs_info *fs_info = root->fs_info;
172  	struct btrfs_root *tree_root = fs_info->tree_root;
173  	const bool zoned = btrfs_is_zoned(fs_info);
174  	int ret = 0;
175  	bool created = false;
176  
177  	/*
178  	 * First check if the log root tree was already created. If not, create
179  	 * it before locking the root's log_mutex, just to keep lockdep happy.
180  	 */
181  	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
182  		mutex_lock(&tree_root->log_mutex);
183  		if (!fs_info->log_root_tree) {
184  			ret = btrfs_init_log_root_tree(trans, fs_info);
185  			if (!ret) {
186  				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
187  				created = true;
188  			}
189  		}
190  		mutex_unlock(&tree_root->log_mutex);
191  		if (ret)
192  			return ret;
193  	}
194  
195  	mutex_lock(&root->log_mutex);
196  
197  again:
198  	if (root->log_root) {
199  		int index = (root->log_transid + 1) % 2;
200  
201  		if (btrfs_need_log_full_commit(trans)) {
202  			ret = BTRFS_LOG_FORCE_COMMIT;
203  			goto out;
204  		}
205  
206  		if (zoned && atomic_read(&root->log_commit[index])) {
207  			wait_log_commit(root, root->log_transid - 1);
208  			goto again;
209  		}
210  
211  		if (!root->log_start_pid) {
212  			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
213  			root->log_start_pid = current->pid;
214  		} else if (root->log_start_pid != current->pid) {
215  			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
216  		}
217  	} else {
218  		/*
219  		 * This means fs_info->log_root_tree was already created
220  		 * for some other FS trees. Do the full commit not to mix
221  		 * nodes from multiple log transactions to do sequential
222  		 * writing.
223  		 */
224  		if (zoned && !created) {
225  			ret = BTRFS_LOG_FORCE_COMMIT;
226  			goto out;
227  		}
228  
229  		ret = btrfs_add_log_tree(trans, root);
230  		if (ret)
231  			goto out;
232  
233  		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
234  		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
235  		root->log_start_pid = current->pid;
236  	}
237  
238  	atomic_inc(&root->log_writers);
239  	if (!ctx->logging_new_name) {
240  		int index = root->log_transid % 2;
241  		list_add_tail(&ctx->list, &root->log_ctxs[index]);
242  		ctx->log_transid = root->log_transid;
243  	}
244  
245  out:
246  	mutex_unlock(&root->log_mutex);
247  	return ret;
248  }
249  
250  /*
251   * returns 0 if there was a log transaction running and we were able
252   * to join, or returns -ENOENT if there were not transactions
253   * in progress
254   */
join_running_log_trans(struct btrfs_root * root)255  static int join_running_log_trans(struct btrfs_root *root)
256  {
257  	const bool zoned = btrfs_is_zoned(root->fs_info);
258  	int ret = -ENOENT;
259  
260  	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
261  		return ret;
262  
263  	mutex_lock(&root->log_mutex);
264  again:
265  	if (root->log_root) {
266  		int index = (root->log_transid + 1) % 2;
267  
268  		ret = 0;
269  		if (zoned && atomic_read(&root->log_commit[index])) {
270  			wait_log_commit(root, root->log_transid - 1);
271  			goto again;
272  		}
273  		atomic_inc(&root->log_writers);
274  	}
275  	mutex_unlock(&root->log_mutex);
276  	return ret;
277  }
278  
279  /*
280   * This either makes the current running log transaction wait
281   * until you call btrfs_end_log_trans() or it makes any future
282   * log transactions wait until you call btrfs_end_log_trans()
283   */
btrfs_pin_log_trans(struct btrfs_root * root)284  void btrfs_pin_log_trans(struct btrfs_root *root)
285  {
286  	atomic_inc(&root->log_writers);
287  }
288  
289  /*
290   * indicate we're done making changes to the log tree
291   * and wake up anyone waiting to do a sync
292   */
btrfs_end_log_trans(struct btrfs_root * root)293  void btrfs_end_log_trans(struct btrfs_root *root)
294  {
295  	if (atomic_dec_and_test(&root->log_writers)) {
296  		/* atomic_dec_and_test implies a barrier */
297  		cond_wake_up_nomb(&root->log_writer_wait);
298  	}
299  }
300  
301  /*
302   * the walk control struct is used to pass state down the chain when
303   * processing the log tree.  The stage field tells us which part
304   * of the log tree processing we are currently doing.  The others
305   * are state fields used for that specific part
306   */
307  struct walk_control {
308  	/* should we free the extent on disk when done?  This is used
309  	 * at transaction commit time while freeing a log tree
310  	 */
311  	int free;
312  
313  	/* pin only walk, we record which extents on disk belong to the
314  	 * log trees
315  	 */
316  	int pin;
317  
318  	/* what stage of the replay code we're currently in */
319  	int stage;
320  
321  	/*
322  	 * Ignore any items from the inode currently being processed. Needs
323  	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
324  	 * the LOG_WALK_REPLAY_INODES stage.
325  	 */
326  	bool ignore_cur_inode;
327  
328  	/* the root we are currently replaying */
329  	struct btrfs_root *replay_dest;
330  
331  	/* the trans handle for the current replay */
332  	struct btrfs_trans_handle *trans;
333  
334  	/* the function that gets used to process blocks we find in the
335  	 * tree.  Note the extent_buffer might not be up to date when it is
336  	 * passed in, and it must be checked or read if you need the data
337  	 * inside it
338  	 */
339  	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
340  			    struct walk_control *wc, u64 gen, int level);
341  };
342  
343  /*
344   * process_func used to pin down extents, write them or wait on them
345   */
process_one_buffer(struct btrfs_root * log,struct extent_buffer * eb,struct walk_control * wc,u64 gen,int level)346  static int process_one_buffer(struct btrfs_root *log,
347  			      struct extent_buffer *eb,
348  			      struct walk_control *wc, u64 gen, int level)
349  {
350  	struct btrfs_fs_info *fs_info = log->fs_info;
351  	int ret = 0;
352  
353  	/*
354  	 * If this fs is mixed then we need to be able to process the leaves to
355  	 * pin down any logged extents, so we have to read the block.
356  	 */
357  	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
358  		struct btrfs_tree_parent_check check = {
359  			.level = level,
360  			.transid = gen
361  		};
362  
363  		ret = btrfs_read_extent_buffer(eb, &check);
364  		if (ret)
365  			return ret;
366  	}
367  
368  	if (wc->pin) {
369  		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
370  						      eb->len);
371  		if (ret)
372  			return ret;
373  
374  		if (btrfs_buffer_uptodate(eb, gen, 0) &&
375  		    btrfs_header_level(eb) == 0)
376  			ret = btrfs_exclude_logged_extents(eb);
377  	}
378  	return ret;
379  }
380  
381  /*
382   * Item overwrite used by replay and tree logging.  eb, slot and key all refer
383   * to the src data we are copying out.
384   *
385   * root is the tree we are copying into, and path is a scratch
386   * path for use in this function (it should be released on entry and
387   * will be released on exit).
388   *
389   * If the key is already in the destination tree the existing item is
390   * overwritten.  If the existing item isn't big enough, it is extended.
391   * If it is too large, it is truncated.
392   *
393   * If the key isn't in the destination yet, a new item is inserted.
394   */
overwrite_item(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct extent_buffer * eb,int slot,struct btrfs_key * key)395  static int overwrite_item(struct btrfs_trans_handle *trans,
396  			  struct btrfs_root *root,
397  			  struct btrfs_path *path,
398  			  struct extent_buffer *eb, int slot,
399  			  struct btrfs_key *key)
400  {
401  	int ret;
402  	u32 item_size;
403  	u64 saved_i_size = 0;
404  	int save_old_i_size = 0;
405  	unsigned long src_ptr;
406  	unsigned long dst_ptr;
407  	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
408  
409  	/*
410  	 * This is only used during log replay, so the root is always from a
411  	 * fs/subvolume tree. In case we ever need to support a log root, then
412  	 * we'll have to clone the leaf in the path, release the path and use
413  	 * the leaf before writing into the log tree. See the comments at
414  	 * copy_items() for more details.
415  	 */
416  	ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
417  
418  	item_size = btrfs_item_size(eb, slot);
419  	src_ptr = btrfs_item_ptr_offset(eb, slot);
420  
421  	/* Look for the key in the destination tree. */
422  	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
423  	if (ret < 0)
424  		return ret;
425  
426  	if (ret == 0) {
427  		char *src_copy;
428  		char *dst_copy;
429  		u32 dst_size = btrfs_item_size(path->nodes[0],
430  						  path->slots[0]);
431  		if (dst_size != item_size)
432  			goto insert;
433  
434  		if (item_size == 0) {
435  			btrfs_release_path(path);
436  			return 0;
437  		}
438  		dst_copy = kmalloc(item_size, GFP_NOFS);
439  		src_copy = kmalloc(item_size, GFP_NOFS);
440  		if (!dst_copy || !src_copy) {
441  			btrfs_release_path(path);
442  			kfree(dst_copy);
443  			kfree(src_copy);
444  			return -ENOMEM;
445  		}
446  
447  		read_extent_buffer(eb, src_copy, src_ptr, item_size);
448  
449  		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
450  		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
451  				   item_size);
452  		ret = memcmp(dst_copy, src_copy, item_size);
453  
454  		kfree(dst_copy);
455  		kfree(src_copy);
456  		/*
457  		 * they have the same contents, just return, this saves
458  		 * us from cowing blocks in the destination tree and doing
459  		 * extra writes that may not have been done by a previous
460  		 * sync
461  		 */
462  		if (ret == 0) {
463  			btrfs_release_path(path);
464  			return 0;
465  		}
466  
467  		/*
468  		 * We need to load the old nbytes into the inode so when we
469  		 * replay the extents we've logged we get the right nbytes.
470  		 */
471  		if (inode_item) {
472  			struct btrfs_inode_item *item;
473  			u64 nbytes;
474  			u32 mode;
475  
476  			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
477  					      struct btrfs_inode_item);
478  			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
479  			item = btrfs_item_ptr(eb, slot,
480  					      struct btrfs_inode_item);
481  			btrfs_set_inode_nbytes(eb, item, nbytes);
482  
483  			/*
484  			 * If this is a directory we need to reset the i_size to
485  			 * 0 so that we can set it up properly when replaying
486  			 * the rest of the items in this log.
487  			 */
488  			mode = btrfs_inode_mode(eb, item);
489  			if (S_ISDIR(mode))
490  				btrfs_set_inode_size(eb, item, 0);
491  		}
492  	} else if (inode_item) {
493  		struct btrfs_inode_item *item;
494  		u32 mode;
495  
496  		/*
497  		 * New inode, set nbytes to 0 so that the nbytes comes out
498  		 * properly when we replay the extents.
499  		 */
500  		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
501  		btrfs_set_inode_nbytes(eb, item, 0);
502  
503  		/*
504  		 * If this is a directory we need to reset the i_size to 0 so
505  		 * that we can set it up properly when replaying the rest of
506  		 * the items in this log.
507  		 */
508  		mode = btrfs_inode_mode(eb, item);
509  		if (S_ISDIR(mode))
510  			btrfs_set_inode_size(eb, item, 0);
511  	}
512  insert:
513  	btrfs_release_path(path);
514  	/* try to insert the key into the destination tree */
515  	path->skip_release_on_error = 1;
516  	ret = btrfs_insert_empty_item(trans, root, path,
517  				      key, item_size);
518  	path->skip_release_on_error = 0;
519  
520  	/* make sure any existing item is the correct size */
521  	if (ret == -EEXIST || ret == -EOVERFLOW) {
522  		u32 found_size;
523  		found_size = btrfs_item_size(path->nodes[0],
524  						path->slots[0]);
525  		if (found_size > item_size)
526  			btrfs_truncate_item(trans, path, item_size, 1);
527  		else if (found_size < item_size)
528  			btrfs_extend_item(trans, path, item_size - found_size);
529  	} else if (ret) {
530  		return ret;
531  	}
532  	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
533  					path->slots[0]);
534  
535  	/* don't overwrite an existing inode if the generation number
536  	 * was logged as zero.  This is done when the tree logging code
537  	 * is just logging an inode to make sure it exists after recovery.
538  	 *
539  	 * Also, don't overwrite i_size on directories during replay.
540  	 * log replay inserts and removes directory items based on the
541  	 * state of the tree found in the subvolume, and i_size is modified
542  	 * as it goes
543  	 */
544  	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
545  		struct btrfs_inode_item *src_item;
546  		struct btrfs_inode_item *dst_item;
547  
548  		src_item = (struct btrfs_inode_item *)src_ptr;
549  		dst_item = (struct btrfs_inode_item *)dst_ptr;
550  
551  		if (btrfs_inode_generation(eb, src_item) == 0) {
552  			struct extent_buffer *dst_eb = path->nodes[0];
553  			const u64 ino_size = btrfs_inode_size(eb, src_item);
554  
555  			/*
556  			 * For regular files an ino_size == 0 is used only when
557  			 * logging that an inode exists, as part of a directory
558  			 * fsync, and the inode wasn't fsynced before. In this
559  			 * case don't set the size of the inode in the fs/subvol
560  			 * tree, otherwise we would be throwing valid data away.
561  			 */
562  			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
563  			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
564  			    ino_size != 0)
565  				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
566  			goto no_copy;
567  		}
568  
569  		if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
570  		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
571  			save_old_i_size = 1;
572  			saved_i_size = btrfs_inode_size(path->nodes[0],
573  							dst_item);
574  		}
575  	}
576  
577  	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
578  			   src_ptr, item_size);
579  
580  	if (save_old_i_size) {
581  		struct btrfs_inode_item *dst_item;
582  		dst_item = (struct btrfs_inode_item *)dst_ptr;
583  		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
584  	}
585  
586  	/* make sure the generation is filled in */
587  	if (key->type == BTRFS_INODE_ITEM_KEY) {
588  		struct btrfs_inode_item *dst_item;
589  		dst_item = (struct btrfs_inode_item *)dst_ptr;
590  		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
591  			btrfs_set_inode_generation(path->nodes[0], dst_item,
592  						   trans->transid);
593  		}
594  	}
595  no_copy:
596  	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
597  	btrfs_release_path(path);
598  	return 0;
599  }
600  
read_alloc_one_name(struct extent_buffer * eb,void * start,int len,struct fscrypt_str * name)601  static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
602  			       struct fscrypt_str *name)
603  {
604  	char *buf;
605  
606  	buf = kmalloc(len, GFP_NOFS);
607  	if (!buf)
608  		return -ENOMEM;
609  
610  	read_extent_buffer(eb, buf, (unsigned long)start, len);
611  	name->name = buf;
612  	name->len = len;
613  	return 0;
614  }
615  
616  /*
617   * simple helper to read an inode off the disk from a given root
618   * This can only be called for subvolume roots and not for the log
619   */
read_one_inode(struct btrfs_root * root,u64 objectid)620  static noinline struct inode *read_one_inode(struct btrfs_root *root,
621  					     u64 objectid)
622  {
623  	struct inode *inode;
624  
625  	inode = btrfs_iget_logging(objectid, root);
626  	if (IS_ERR(inode))
627  		inode = NULL;
628  	return inode;
629  }
630  
631  /* replays a single extent in 'eb' at 'slot' with 'key' into the
632   * subvolume 'root'.  path is released on entry and should be released
633   * on exit.
634   *
635   * extents in the log tree have not been allocated out of the extent
636   * tree yet.  So, this completes the allocation, taking a reference
637   * as required if the extent already exists or creating a new extent
638   * if it isn't in the extent allocation tree yet.
639   *
640   * The extent is inserted into the file, dropping any existing extents
641   * from the file that overlap the new one.
642   */
replay_one_extent(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct extent_buffer * eb,int slot,struct btrfs_key * key)643  static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
644  				      struct btrfs_root *root,
645  				      struct btrfs_path *path,
646  				      struct extent_buffer *eb, int slot,
647  				      struct btrfs_key *key)
648  {
649  	struct btrfs_drop_extents_args drop_args = { 0 };
650  	struct btrfs_fs_info *fs_info = root->fs_info;
651  	int found_type;
652  	u64 extent_end;
653  	u64 start = key->offset;
654  	u64 nbytes = 0;
655  	struct btrfs_file_extent_item *item;
656  	struct inode *inode = NULL;
657  	unsigned long size;
658  	int ret = 0;
659  
660  	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
661  	found_type = btrfs_file_extent_type(eb, item);
662  
663  	if (found_type == BTRFS_FILE_EXTENT_REG ||
664  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
665  		nbytes = btrfs_file_extent_num_bytes(eb, item);
666  		extent_end = start + nbytes;
667  
668  		/*
669  		 * We don't add to the inodes nbytes if we are prealloc or a
670  		 * hole.
671  		 */
672  		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
673  			nbytes = 0;
674  	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
675  		size = btrfs_file_extent_ram_bytes(eb, item);
676  		nbytes = btrfs_file_extent_ram_bytes(eb, item);
677  		extent_end = ALIGN(start + size,
678  				   fs_info->sectorsize);
679  	} else {
680  		ret = 0;
681  		goto out;
682  	}
683  
684  	inode = read_one_inode(root, key->objectid);
685  	if (!inode) {
686  		ret = -EIO;
687  		goto out;
688  	}
689  
690  	/*
691  	 * first check to see if we already have this extent in the
692  	 * file.  This must be done before the btrfs_drop_extents run
693  	 * so we don't try to drop this extent.
694  	 */
695  	ret = btrfs_lookup_file_extent(trans, root, path,
696  			btrfs_ino(BTRFS_I(inode)), start, 0);
697  
698  	if (ret == 0 &&
699  	    (found_type == BTRFS_FILE_EXTENT_REG ||
700  	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
701  		struct btrfs_file_extent_item cmp1;
702  		struct btrfs_file_extent_item cmp2;
703  		struct btrfs_file_extent_item *existing;
704  		struct extent_buffer *leaf;
705  
706  		leaf = path->nodes[0];
707  		existing = btrfs_item_ptr(leaf, path->slots[0],
708  					  struct btrfs_file_extent_item);
709  
710  		read_extent_buffer(eb, &cmp1, (unsigned long)item,
711  				   sizeof(cmp1));
712  		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
713  				   sizeof(cmp2));
714  
715  		/*
716  		 * we already have a pointer to this exact extent,
717  		 * we don't have to do anything
718  		 */
719  		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
720  			btrfs_release_path(path);
721  			goto out;
722  		}
723  	}
724  	btrfs_release_path(path);
725  
726  	/* drop any overlapping extents */
727  	drop_args.start = start;
728  	drop_args.end = extent_end;
729  	drop_args.drop_cache = true;
730  	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
731  	if (ret)
732  		goto out;
733  
734  	if (found_type == BTRFS_FILE_EXTENT_REG ||
735  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
736  		u64 offset;
737  		unsigned long dest_offset;
738  		struct btrfs_key ins;
739  
740  		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
741  		    btrfs_fs_incompat(fs_info, NO_HOLES))
742  			goto update_inode;
743  
744  		ret = btrfs_insert_empty_item(trans, root, path, key,
745  					      sizeof(*item));
746  		if (ret)
747  			goto out;
748  		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
749  						    path->slots[0]);
750  		copy_extent_buffer(path->nodes[0], eb, dest_offset,
751  				(unsigned long)item,  sizeof(*item));
752  
753  		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
754  		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
755  		ins.type = BTRFS_EXTENT_ITEM_KEY;
756  		offset = key->offset - btrfs_file_extent_offset(eb, item);
757  
758  		/*
759  		 * Manually record dirty extent, as here we did a shallow
760  		 * file extent item copy and skip normal backref update,
761  		 * but modifying extent tree all by ourselves.
762  		 * So need to manually record dirty extent for qgroup,
763  		 * as the owner of the file extent changed from log tree
764  		 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
765  		 */
766  		ret = btrfs_qgroup_trace_extent(trans,
767  				btrfs_file_extent_disk_bytenr(eb, item),
768  				btrfs_file_extent_disk_num_bytes(eb, item));
769  		if (ret < 0)
770  			goto out;
771  
772  		if (ins.objectid > 0) {
773  			struct btrfs_ref ref = { 0 };
774  			u64 csum_start;
775  			u64 csum_end;
776  			LIST_HEAD(ordered_sums);
777  
778  			/*
779  			 * is this extent already allocated in the extent
780  			 * allocation tree?  If so, just add a reference
781  			 */
782  			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
783  						ins.offset);
784  			if (ret < 0) {
785  				goto out;
786  			} else if (ret == 0) {
787  				btrfs_init_generic_ref(&ref,
788  						BTRFS_ADD_DELAYED_REF,
789  						ins.objectid, ins.offset, 0);
790  				btrfs_init_data_ref(&ref,
791  						root->root_key.objectid,
792  						key->objectid, offset, 0, false);
793  				ret = btrfs_inc_extent_ref(trans, &ref);
794  				if (ret)
795  					goto out;
796  			} else {
797  				/*
798  				 * insert the extent pointer in the extent
799  				 * allocation tree
800  				 */
801  				ret = btrfs_alloc_logged_file_extent(trans,
802  						root->root_key.objectid,
803  						key->objectid, offset, &ins);
804  				if (ret)
805  					goto out;
806  			}
807  			btrfs_release_path(path);
808  
809  			if (btrfs_file_extent_compression(eb, item)) {
810  				csum_start = ins.objectid;
811  				csum_end = csum_start + ins.offset;
812  			} else {
813  				csum_start = ins.objectid +
814  					btrfs_file_extent_offset(eb, item);
815  				csum_end = csum_start +
816  					btrfs_file_extent_num_bytes(eb, item);
817  			}
818  
819  			ret = btrfs_lookup_csums_list(root->log_root,
820  						csum_start, csum_end - 1,
821  						&ordered_sums, 0, false);
822  			if (ret)
823  				goto out;
824  			/*
825  			 * Now delete all existing cums in the csum root that
826  			 * cover our range. We do this because we can have an
827  			 * extent that is completely referenced by one file
828  			 * extent item and partially referenced by another
829  			 * file extent item (like after using the clone or
830  			 * extent_same ioctls). In this case if we end up doing
831  			 * the replay of the one that partially references the
832  			 * extent first, and we do not do the csum deletion
833  			 * below, we can get 2 csum items in the csum tree that
834  			 * overlap each other. For example, imagine our log has
835  			 * the two following file extent items:
836  			 *
837  			 * key (257 EXTENT_DATA 409600)
838  			 *     extent data disk byte 12845056 nr 102400
839  			 *     extent data offset 20480 nr 20480 ram 102400
840  			 *
841  			 * key (257 EXTENT_DATA 819200)
842  			 *     extent data disk byte 12845056 nr 102400
843  			 *     extent data offset 0 nr 102400 ram 102400
844  			 *
845  			 * Where the second one fully references the 100K extent
846  			 * that starts at disk byte 12845056, and the log tree
847  			 * has a single csum item that covers the entire range
848  			 * of the extent:
849  			 *
850  			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
851  			 *
852  			 * After the first file extent item is replayed, the
853  			 * csum tree gets the following csum item:
854  			 *
855  			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
856  			 *
857  			 * Which covers the 20K sub-range starting at offset 20K
858  			 * of our extent. Now when we replay the second file
859  			 * extent item, if we do not delete existing csum items
860  			 * that cover any of its blocks, we end up getting two
861  			 * csum items in our csum tree that overlap each other:
862  			 *
863  			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
864  			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
865  			 *
866  			 * Which is a problem, because after this anyone trying
867  			 * to lookup up for the checksum of any block of our
868  			 * extent starting at an offset of 40K or higher, will
869  			 * end up looking at the second csum item only, which
870  			 * does not contain the checksum for any block starting
871  			 * at offset 40K or higher of our extent.
872  			 */
873  			while (!list_empty(&ordered_sums)) {
874  				struct btrfs_ordered_sum *sums;
875  				struct btrfs_root *csum_root;
876  
877  				sums = list_entry(ordered_sums.next,
878  						struct btrfs_ordered_sum,
879  						list);
880  				csum_root = btrfs_csum_root(fs_info,
881  							    sums->logical);
882  				if (!ret)
883  					ret = btrfs_del_csums(trans, csum_root,
884  							      sums->logical,
885  							      sums->len);
886  				if (!ret)
887  					ret = btrfs_csum_file_blocks(trans,
888  								     csum_root,
889  								     sums);
890  				list_del(&sums->list);
891  				kfree(sums);
892  			}
893  			if (ret)
894  				goto out;
895  		} else {
896  			btrfs_release_path(path);
897  		}
898  	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
899  		/* inline extents are easy, we just overwrite them */
900  		ret = overwrite_item(trans, root, path, eb, slot, key);
901  		if (ret)
902  			goto out;
903  	}
904  
905  	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
906  						extent_end - start);
907  	if (ret)
908  		goto out;
909  
910  update_inode:
911  	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
912  	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
913  out:
914  	iput(inode);
915  	return ret;
916  }
917  
unlink_inode_for_log_replay(struct btrfs_trans_handle * trans,struct btrfs_inode * dir,struct btrfs_inode * inode,const struct fscrypt_str * name)918  static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
919  				       struct btrfs_inode *dir,
920  				       struct btrfs_inode *inode,
921  				       const struct fscrypt_str *name)
922  {
923  	int ret;
924  
925  	ret = btrfs_unlink_inode(trans, dir, inode, name);
926  	if (ret)
927  		return ret;
928  	/*
929  	 * Whenever we need to check if a name exists or not, we check the
930  	 * fs/subvolume tree. So after an unlink we must run delayed items, so
931  	 * that future checks for a name during log replay see that the name
932  	 * does not exists anymore.
933  	 */
934  	return btrfs_run_delayed_items(trans);
935  }
936  
937  /*
938   * when cleaning up conflicts between the directory names in the
939   * subvolume, directory names in the log and directory names in the
940   * inode back references, we may have to unlink inodes from directories.
941   *
942   * This is a helper function to do the unlink of a specific directory
943   * item
944   */
drop_one_dir_item(struct btrfs_trans_handle * trans,struct btrfs_path * path,struct btrfs_inode * dir,struct btrfs_dir_item * di)945  static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
946  				      struct btrfs_path *path,
947  				      struct btrfs_inode *dir,
948  				      struct btrfs_dir_item *di)
949  {
950  	struct btrfs_root *root = dir->root;
951  	struct inode *inode;
952  	struct fscrypt_str name;
953  	struct extent_buffer *leaf;
954  	struct btrfs_key location;
955  	int ret;
956  
957  	leaf = path->nodes[0];
958  
959  	btrfs_dir_item_key_to_cpu(leaf, di, &location);
960  	ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
961  	if (ret)
962  		return -ENOMEM;
963  
964  	btrfs_release_path(path);
965  
966  	inode = read_one_inode(root, location.objectid);
967  	if (!inode) {
968  		ret = -EIO;
969  		goto out;
970  	}
971  
972  	ret = link_to_fixup_dir(trans, root, path, location.objectid);
973  	if (ret)
974  		goto out;
975  
976  	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
977  out:
978  	kfree(name.name);
979  	iput(inode);
980  	return ret;
981  }
982  
983  /*
984   * See if a given name and sequence number found in an inode back reference are
985   * already in a directory and correctly point to this inode.
986   *
987   * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
988   * exists.
989   */
inode_in_dir(struct btrfs_root * root,struct btrfs_path * path,u64 dirid,u64 objectid,u64 index,struct fscrypt_str * name)990  static noinline int inode_in_dir(struct btrfs_root *root,
991  				 struct btrfs_path *path,
992  				 u64 dirid, u64 objectid, u64 index,
993  				 struct fscrypt_str *name)
994  {
995  	struct btrfs_dir_item *di;
996  	struct btrfs_key location;
997  	int ret = 0;
998  
999  	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
1000  					 index, name, 0);
1001  	if (IS_ERR(di)) {
1002  		ret = PTR_ERR(di);
1003  		goto out;
1004  	} else if (di) {
1005  		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1006  		if (location.objectid != objectid)
1007  			goto out;
1008  	} else {
1009  		goto out;
1010  	}
1011  
1012  	btrfs_release_path(path);
1013  	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
1014  	if (IS_ERR(di)) {
1015  		ret = PTR_ERR(di);
1016  		goto out;
1017  	} else if (di) {
1018  		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1019  		if (location.objectid == objectid)
1020  			ret = 1;
1021  	}
1022  out:
1023  	btrfs_release_path(path);
1024  	return ret;
1025  }
1026  
1027  /*
1028   * helper function to check a log tree for a named back reference in
1029   * an inode.  This is used to decide if a back reference that is
1030   * found in the subvolume conflicts with what we find in the log.
1031   *
1032   * inode backreferences may have multiple refs in a single item,
1033   * during replay we process one reference at a time, and we don't
1034   * want to delete valid links to a file from the subvolume if that
1035   * link is also in the log.
1036   */
backref_in_log(struct btrfs_root * log,struct btrfs_key * key,u64 ref_objectid,const struct fscrypt_str * name)1037  static noinline int backref_in_log(struct btrfs_root *log,
1038  				   struct btrfs_key *key,
1039  				   u64 ref_objectid,
1040  				   const struct fscrypt_str *name)
1041  {
1042  	struct btrfs_path *path;
1043  	int ret;
1044  
1045  	path = btrfs_alloc_path();
1046  	if (!path)
1047  		return -ENOMEM;
1048  
1049  	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1050  	if (ret < 0) {
1051  		goto out;
1052  	} else if (ret == 1) {
1053  		ret = 0;
1054  		goto out;
1055  	}
1056  
1057  	if (key->type == BTRFS_INODE_EXTREF_KEY)
1058  		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1059  						       path->slots[0],
1060  						       ref_objectid, name);
1061  	else
1062  		ret = !!btrfs_find_name_in_backref(path->nodes[0],
1063  						   path->slots[0], name);
1064  out:
1065  	btrfs_free_path(path);
1066  	return ret;
1067  }
1068  
__add_inode_ref(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct btrfs_root * log_root,struct btrfs_inode * dir,struct btrfs_inode * inode,u64 inode_objectid,u64 parent_objectid,u64 ref_index,struct fscrypt_str * name)1069  static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1070  				  struct btrfs_root *root,
1071  				  struct btrfs_path *path,
1072  				  struct btrfs_root *log_root,
1073  				  struct btrfs_inode *dir,
1074  				  struct btrfs_inode *inode,
1075  				  u64 inode_objectid, u64 parent_objectid,
1076  				  u64 ref_index, struct fscrypt_str *name)
1077  {
1078  	int ret;
1079  	struct extent_buffer *leaf;
1080  	struct btrfs_dir_item *di;
1081  	struct btrfs_key search_key;
1082  	struct btrfs_inode_extref *extref;
1083  
1084  again:
1085  	/* Search old style refs */
1086  	search_key.objectid = inode_objectid;
1087  	search_key.type = BTRFS_INODE_REF_KEY;
1088  	search_key.offset = parent_objectid;
1089  	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1090  	if (ret == 0) {
1091  		struct btrfs_inode_ref *victim_ref;
1092  		unsigned long ptr;
1093  		unsigned long ptr_end;
1094  
1095  		leaf = path->nodes[0];
1096  
1097  		/* are we trying to overwrite a back ref for the root directory
1098  		 * if so, just jump out, we're done
1099  		 */
1100  		if (search_key.objectid == search_key.offset)
1101  			return 1;
1102  
1103  		/* check all the names in this back reference to see
1104  		 * if they are in the log.  if so, we allow them to stay
1105  		 * otherwise they must be unlinked as a conflict
1106  		 */
1107  		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1108  		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
1109  		while (ptr < ptr_end) {
1110  			struct fscrypt_str victim_name;
1111  
1112  			victim_ref = (struct btrfs_inode_ref *)ptr;
1113  			ret = read_alloc_one_name(leaf, (victim_ref + 1),
1114  				 btrfs_inode_ref_name_len(leaf, victim_ref),
1115  				 &victim_name);
1116  			if (ret)
1117  				return ret;
1118  
1119  			ret = backref_in_log(log_root, &search_key,
1120  					     parent_objectid, &victim_name);
1121  			if (ret < 0) {
1122  				kfree(victim_name.name);
1123  				return ret;
1124  			} else if (!ret) {
1125  				inc_nlink(&inode->vfs_inode);
1126  				btrfs_release_path(path);
1127  
1128  				ret = unlink_inode_for_log_replay(trans, dir, inode,
1129  						&victim_name);
1130  				kfree(victim_name.name);
1131  				if (ret)
1132  					return ret;
1133  				goto again;
1134  			}
1135  			kfree(victim_name.name);
1136  
1137  			ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
1138  		}
1139  	}
1140  	btrfs_release_path(path);
1141  
1142  	/* Same search but for extended refs */
1143  	extref = btrfs_lookup_inode_extref(NULL, root, path, name,
1144  					   inode_objectid, parent_objectid, 0,
1145  					   0);
1146  	if (IS_ERR(extref)) {
1147  		return PTR_ERR(extref);
1148  	} else if (extref) {
1149  		u32 item_size;
1150  		u32 cur_offset = 0;
1151  		unsigned long base;
1152  		struct inode *victim_parent;
1153  
1154  		leaf = path->nodes[0];
1155  
1156  		item_size = btrfs_item_size(leaf, path->slots[0]);
1157  		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1158  
1159  		while (cur_offset < item_size) {
1160  			struct fscrypt_str victim_name;
1161  
1162  			extref = (struct btrfs_inode_extref *)(base + cur_offset);
1163  
1164  			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1165  				goto next;
1166  
1167  			ret = read_alloc_one_name(leaf, &extref->name,
1168  				 btrfs_inode_extref_name_len(leaf, extref),
1169  				 &victim_name);
1170  			if (ret)
1171  				return ret;
1172  
1173  			search_key.objectid = inode_objectid;
1174  			search_key.type = BTRFS_INODE_EXTREF_KEY;
1175  			search_key.offset = btrfs_extref_hash(parent_objectid,
1176  							      victim_name.name,
1177  							      victim_name.len);
1178  			ret = backref_in_log(log_root, &search_key,
1179  					     parent_objectid, &victim_name);
1180  			if (ret < 0) {
1181  				kfree(victim_name.name);
1182  				return ret;
1183  			} else if (!ret) {
1184  				ret = -ENOENT;
1185  				victim_parent = read_one_inode(root,
1186  						parent_objectid);
1187  				if (victim_parent) {
1188  					inc_nlink(&inode->vfs_inode);
1189  					btrfs_release_path(path);
1190  
1191  					ret = unlink_inode_for_log_replay(trans,
1192  							BTRFS_I(victim_parent),
1193  							inode, &victim_name);
1194  				}
1195  				iput(victim_parent);
1196  				kfree(victim_name.name);
1197  				if (ret)
1198  					return ret;
1199  				goto again;
1200  			}
1201  			kfree(victim_name.name);
1202  next:
1203  			cur_offset += victim_name.len + sizeof(*extref);
1204  		}
1205  	}
1206  	btrfs_release_path(path);
1207  
1208  	/* look for a conflicting sequence number */
1209  	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1210  					 ref_index, name, 0);
1211  	if (IS_ERR(di)) {
1212  		return PTR_ERR(di);
1213  	} else if (di) {
1214  		ret = drop_one_dir_item(trans, path, dir, di);
1215  		if (ret)
1216  			return ret;
1217  	}
1218  	btrfs_release_path(path);
1219  
1220  	/* look for a conflicting name */
1221  	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
1222  	if (IS_ERR(di)) {
1223  		return PTR_ERR(di);
1224  	} else if (di) {
1225  		ret = drop_one_dir_item(trans, path, dir, di);
1226  		if (ret)
1227  			return ret;
1228  	}
1229  	btrfs_release_path(path);
1230  
1231  	return 0;
1232  }
1233  
extref_get_fields(struct extent_buffer * eb,unsigned long ref_ptr,struct fscrypt_str * name,u64 * index,u64 * parent_objectid)1234  static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1235  			     struct fscrypt_str *name, u64 *index,
1236  			     u64 *parent_objectid)
1237  {
1238  	struct btrfs_inode_extref *extref;
1239  	int ret;
1240  
1241  	extref = (struct btrfs_inode_extref *)ref_ptr;
1242  
1243  	ret = read_alloc_one_name(eb, &extref->name,
1244  				  btrfs_inode_extref_name_len(eb, extref), name);
1245  	if (ret)
1246  		return ret;
1247  
1248  	if (index)
1249  		*index = btrfs_inode_extref_index(eb, extref);
1250  	if (parent_objectid)
1251  		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1252  
1253  	return 0;
1254  }
1255  
ref_get_fields(struct extent_buffer * eb,unsigned long ref_ptr,struct fscrypt_str * name,u64 * index)1256  static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1257  			  struct fscrypt_str *name, u64 *index)
1258  {
1259  	struct btrfs_inode_ref *ref;
1260  	int ret;
1261  
1262  	ref = (struct btrfs_inode_ref *)ref_ptr;
1263  
1264  	ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
1265  				  name);
1266  	if (ret)
1267  		return ret;
1268  
1269  	if (index)
1270  		*index = btrfs_inode_ref_index(eb, ref);
1271  
1272  	return 0;
1273  }
1274  
1275  /*
1276   * Take an inode reference item from the log tree and iterate all names from the
1277   * inode reference item in the subvolume tree with the same key (if it exists).
1278   * For any name that is not in the inode reference item from the log tree, do a
1279   * proper unlink of that name (that is, remove its entry from the inode
1280   * reference item and both dir index keys).
1281   */
unlink_old_inode_refs(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct btrfs_inode * inode,struct extent_buffer * log_eb,int log_slot,struct btrfs_key * key)1282  static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1283  				 struct btrfs_root *root,
1284  				 struct btrfs_path *path,
1285  				 struct btrfs_inode *inode,
1286  				 struct extent_buffer *log_eb,
1287  				 int log_slot,
1288  				 struct btrfs_key *key)
1289  {
1290  	int ret;
1291  	unsigned long ref_ptr;
1292  	unsigned long ref_end;
1293  	struct extent_buffer *eb;
1294  
1295  again:
1296  	btrfs_release_path(path);
1297  	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1298  	if (ret > 0) {
1299  		ret = 0;
1300  		goto out;
1301  	}
1302  	if (ret < 0)
1303  		goto out;
1304  
1305  	eb = path->nodes[0];
1306  	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1307  	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
1308  	while (ref_ptr < ref_end) {
1309  		struct fscrypt_str name;
1310  		u64 parent_id;
1311  
1312  		if (key->type == BTRFS_INODE_EXTREF_KEY) {
1313  			ret = extref_get_fields(eb, ref_ptr, &name,
1314  						NULL, &parent_id);
1315  		} else {
1316  			parent_id = key->offset;
1317  			ret = ref_get_fields(eb, ref_ptr, &name, NULL);
1318  		}
1319  		if (ret)
1320  			goto out;
1321  
1322  		if (key->type == BTRFS_INODE_EXTREF_KEY)
1323  			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1324  							       parent_id, &name);
1325  		else
1326  			ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
1327  
1328  		if (!ret) {
1329  			struct inode *dir;
1330  
1331  			btrfs_release_path(path);
1332  			dir = read_one_inode(root, parent_id);
1333  			if (!dir) {
1334  				ret = -ENOENT;
1335  				kfree(name.name);
1336  				goto out;
1337  			}
1338  			ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
1339  						 inode, &name);
1340  			kfree(name.name);
1341  			iput(dir);
1342  			if (ret)
1343  				goto out;
1344  			goto again;
1345  		}
1346  
1347  		kfree(name.name);
1348  		ref_ptr += name.len;
1349  		if (key->type == BTRFS_INODE_EXTREF_KEY)
1350  			ref_ptr += sizeof(struct btrfs_inode_extref);
1351  		else
1352  			ref_ptr += sizeof(struct btrfs_inode_ref);
1353  	}
1354  	ret = 0;
1355   out:
1356  	btrfs_release_path(path);
1357  	return ret;
1358  }
1359  
1360  /*
1361   * replay one inode back reference item found in the log tree.
1362   * eb, slot and key refer to the buffer and key found in the log tree.
1363   * root is the destination we are replaying into, and path is for temp
1364   * use by this function.  (it should be released on return).
1365   */
add_inode_ref(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_root * log,struct btrfs_path * path,struct extent_buffer * eb,int slot,struct btrfs_key * key)1366  static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1367  				  struct btrfs_root *root,
1368  				  struct btrfs_root *log,
1369  				  struct btrfs_path *path,
1370  				  struct extent_buffer *eb, int slot,
1371  				  struct btrfs_key *key)
1372  {
1373  	struct inode *dir = NULL;
1374  	struct inode *inode = NULL;
1375  	unsigned long ref_ptr;
1376  	unsigned long ref_end;
1377  	struct fscrypt_str name = { 0 };
1378  	int ret;
1379  	int log_ref_ver = 0;
1380  	u64 parent_objectid;
1381  	u64 inode_objectid;
1382  	u64 ref_index = 0;
1383  	int ref_struct_size;
1384  
1385  	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1386  	ref_end = ref_ptr + btrfs_item_size(eb, slot);
1387  
1388  	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1389  		struct btrfs_inode_extref *r;
1390  
1391  		ref_struct_size = sizeof(struct btrfs_inode_extref);
1392  		log_ref_ver = 1;
1393  		r = (struct btrfs_inode_extref *)ref_ptr;
1394  		parent_objectid = btrfs_inode_extref_parent(eb, r);
1395  	} else {
1396  		ref_struct_size = sizeof(struct btrfs_inode_ref);
1397  		parent_objectid = key->offset;
1398  	}
1399  	inode_objectid = key->objectid;
1400  
1401  	/*
1402  	 * it is possible that we didn't log all the parent directories
1403  	 * for a given inode.  If we don't find the dir, just don't
1404  	 * copy the back ref in.  The link count fixup code will take
1405  	 * care of the rest
1406  	 */
1407  	dir = read_one_inode(root, parent_objectid);
1408  	if (!dir) {
1409  		ret = -ENOENT;
1410  		goto out;
1411  	}
1412  
1413  	inode = read_one_inode(root, inode_objectid);
1414  	if (!inode) {
1415  		ret = -EIO;
1416  		goto out;
1417  	}
1418  
1419  	while (ref_ptr < ref_end) {
1420  		if (log_ref_ver) {
1421  			ret = extref_get_fields(eb, ref_ptr, &name,
1422  						&ref_index, &parent_objectid);
1423  			/*
1424  			 * parent object can change from one array
1425  			 * item to another.
1426  			 */
1427  			if (!dir)
1428  				dir = read_one_inode(root, parent_objectid);
1429  			if (!dir) {
1430  				ret = -ENOENT;
1431  				goto out;
1432  			}
1433  		} else {
1434  			ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
1435  		}
1436  		if (ret)
1437  			goto out;
1438  
1439  		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1440  				   btrfs_ino(BTRFS_I(inode)), ref_index, &name);
1441  		if (ret < 0) {
1442  			goto out;
1443  		} else if (ret == 0) {
1444  			/*
1445  			 * look for a conflicting back reference in the
1446  			 * metadata. if we find one we have to unlink that name
1447  			 * of the file before we add our new link.  Later on, we
1448  			 * overwrite any existing back reference, and we don't
1449  			 * want to create dangling pointers in the directory.
1450  			 */
1451  			ret = __add_inode_ref(trans, root, path, log,
1452  					      BTRFS_I(dir), BTRFS_I(inode),
1453  					      inode_objectid, parent_objectid,
1454  					      ref_index, &name);
1455  			if (ret) {
1456  				if (ret == 1)
1457  					ret = 0;
1458  				goto out;
1459  			}
1460  
1461  			/* insert our name */
1462  			ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1463  					     &name, 0, ref_index);
1464  			if (ret)
1465  				goto out;
1466  
1467  			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1468  			if (ret)
1469  				goto out;
1470  		}
1471  		/* Else, ret == 1, we already have a perfect match, we're done. */
1472  
1473  		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
1474  		kfree(name.name);
1475  		name.name = NULL;
1476  		if (log_ref_ver) {
1477  			iput(dir);
1478  			dir = NULL;
1479  		}
1480  	}
1481  
1482  	/*
1483  	 * Before we overwrite the inode reference item in the subvolume tree
1484  	 * with the item from the log tree, we must unlink all names from the
1485  	 * parent directory that are in the subvolume's tree inode reference
1486  	 * item, otherwise we end up with an inconsistent subvolume tree where
1487  	 * dir index entries exist for a name but there is no inode reference
1488  	 * item with the same name.
1489  	 */
1490  	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1491  				    key);
1492  	if (ret)
1493  		goto out;
1494  
1495  	/* finally write the back reference in the inode */
1496  	ret = overwrite_item(trans, root, path, eb, slot, key);
1497  out:
1498  	btrfs_release_path(path);
1499  	kfree(name.name);
1500  	iput(dir);
1501  	iput(inode);
1502  	return ret;
1503  }
1504  
count_inode_extrefs(struct btrfs_root * root,struct btrfs_inode * inode,struct btrfs_path * path)1505  static int count_inode_extrefs(struct btrfs_root *root,
1506  		struct btrfs_inode *inode, struct btrfs_path *path)
1507  {
1508  	int ret = 0;
1509  	int name_len;
1510  	unsigned int nlink = 0;
1511  	u32 item_size;
1512  	u32 cur_offset = 0;
1513  	u64 inode_objectid = btrfs_ino(inode);
1514  	u64 offset = 0;
1515  	unsigned long ptr;
1516  	struct btrfs_inode_extref *extref;
1517  	struct extent_buffer *leaf;
1518  
1519  	while (1) {
1520  		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1521  					    &extref, &offset);
1522  		if (ret)
1523  			break;
1524  
1525  		leaf = path->nodes[0];
1526  		item_size = btrfs_item_size(leaf, path->slots[0]);
1527  		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1528  		cur_offset = 0;
1529  
1530  		while (cur_offset < item_size) {
1531  			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1532  			name_len = btrfs_inode_extref_name_len(leaf, extref);
1533  
1534  			nlink++;
1535  
1536  			cur_offset += name_len + sizeof(*extref);
1537  		}
1538  
1539  		offset++;
1540  		btrfs_release_path(path);
1541  	}
1542  	btrfs_release_path(path);
1543  
1544  	if (ret < 0 && ret != -ENOENT)
1545  		return ret;
1546  	return nlink;
1547  }
1548  
count_inode_refs(struct btrfs_root * root,struct btrfs_inode * inode,struct btrfs_path * path)1549  static int count_inode_refs(struct btrfs_root *root,
1550  			struct btrfs_inode *inode, struct btrfs_path *path)
1551  {
1552  	int ret;
1553  	struct btrfs_key key;
1554  	unsigned int nlink = 0;
1555  	unsigned long ptr;
1556  	unsigned long ptr_end;
1557  	int name_len;
1558  	u64 ino = btrfs_ino(inode);
1559  
1560  	key.objectid = ino;
1561  	key.type = BTRFS_INODE_REF_KEY;
1562  	key.offset = (u64)-1;
1563  
1564  	while (1) {
1565  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1566  		if (ret < 0)
1567  			break;
1568  		if (ret > 0) {
1569  			if (path->slots[0] == 0)
1570  				break;
1571  			path->slots[0]--;
1572  		}
1573  process_slot:
1574  		btrfs_item_key_to_cpu(path->nodes[0], &key,
1575  				      path->slots[0]);
1576  		if (key.objectid != ino ||
1577  		    key.type != BTRFS_INODE_REF_KEY)
1578  			break;
1579  		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1580  		ptr_end = ptr + btrfs_item_size(path->nodes[0],
1581  						   path->slots[0]);
1582  		while (ptr < ptr_end) {
1583  			struct btrfs_inode_ref *ref;
1584  
1585  			ref = (struct btrfs_inode_ref *)ptr;
1586  			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1587  							    ref);
1588  			ptr = (unsigned long)(ref + 1) + name_len;
1589  			nlink++;
1590  		}
1591  
1592  		if (key.offset == 0)
1593  			break;
1594  		if (path->slots[0] > 0) {
1595  			path->slots[0]--;
1596  			goto process_slot;
1597  		}
1598  		key.offset--;
1599  		btrfs_release_path(path);
1600  	}
1601  	btrfs_release_path(path);
1602  
1603  	return nlink;
1604  }
1605  
1606  /*
1607   * There are a few corners where the link count of the file can't
1608   * be properly maintained during replay.  So, instead of adding
1609   * lots of complexity to the log code, we just scan the backrefs
1610   * for any file that has been through replay.
1611   *
1612   * The scan will update the link count on the inode to reflect the
1613   * number of back refs found.  If it goes down to zero, the iput
1614   * will free the inode.
1615   */
fixup_inode_link_count(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct inode * inode)1616  static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1617  					   struct btrfs_root *root,
1618  					   struct inode *inode)
1619  {
1620  	struct btrfs_path *path;
1621  	int ret;
1622  	u64 nlink = 0;
1623  	u64 ino = btrfs_ino(BTRFS_I(inode));
1624  
1625  	path = btrfs_alloc_path();
1626  	if (!path)
1627  		return -ENOMEM;
1628  
1629  	ret = count_inode_refs(root, BTRFS_I(inode), path);
1630  	if (ret < 0)
1631  		goto out;
1632  
1633  	nlink = ret;
1634  
1635  	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1636  	if (ret < 0)
1637  		goto out;
1638  
1639  	nlink += ret;
1640  
1641  	ret = 0;
1642  
1643  	if (nlink != inode->i_nlink) {
1644  		set_nlink(inode, nlink);
1645  		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1646  		if (ret)
1647  			goto out;
1648  	}
1649  	BTRFS_I(inode)->index_cnt = (u64)-1;
1650  
1651  	if (inode->i_nlink == 0) {
1652  		if (S_ISDIR(inode->i_mode)) {
1653  			ret = replay_dir_deletes(trans, root, NULL, path,
1654  						 ino, 1);
1655  			if (ret)
1656  				goto out;
1657  		}
1658  		ret = btrfs_insert_orphan_item(trans, root, ino);
1659  		if (ret == -EEXIST)
1660  			ret = 0;
1661  	}
1662  
1663  out:
1664  	btrfs_free_path(path);
1665  	return ret;
1666  }
1667  
fixup_inode_link_counts(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path)1668  static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1669  					    struct btrfs_root *root,
1670  					    struct btrfs_path *path)
1671  {
1672  	int ret;
1673  	struct btrfs_key key;
1674  	struct inode *inode;
1675  
1676  	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1677  	key.type = BTRFS_ORPHAN_ITEM_KEY;
1678  	key.offset = (u64)-1;
1679  	while (1) {
1680  		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1681  		if (ret < 0)
1682  			break;
1683  
1684  		if (ret == 1) {
1685  			ret = 0;
1686  			if (path->slots[0] == 0)
1687  				break;
1688  			path->slots[0]--;
1689  		}
1690  
1691  		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1692  		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1693  		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1694  			break;
1695  
1696  		ret = btrfs_del_item(trans, root, path);
1697  		if (ret)
1698  			break;
1699  
1700  		btrfs_release_path(path);
1701  		inode = read_one_inode(root, key.offset);
1702  		if (!inode) {
1703  			ret = -EIO;
1704  			break;
1705  		}
1706  
1707  		ret = fixup_inode_link_count(trans, root, inode);
1708  		iput(inode);
1709  		if (ret)
1710  			break;
1711  
1712  		/*
1713  		 * fixup on a directory may create new entries,
1714  		 * make sure we always look for the highset possible
1715  		 * offset
1716  		 */
1717  		key.offset = (u64)-1;
1718  	}
1719  	btrfs_release_path(path);
1720  	return ret;
1721  }
1722  
1723  
1724  /*
1725   * record a given inode in the fixup dir so we can check its link
1726   * count when replay is done.  The link count is incremented here
1727   * so the inode won't go away until we check it
1728   */
link_to_fixup_dir(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,u64 objectid)1729  static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1730  				      struct btrfs_root *root,
1731  				      struct btrfs_path *path,
1732  				      u64 objectid)
1733  {
1734  	struct btrfs_key key;
1735  	int ret = 0;
1736  	struct inode *inode;
1737  
1738  	inode = read_one_inode(root, objectid);
1739  	if (!inode)
1740  		return -EIO;
1741  
1742  	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1743  	key.type = BTRFS_ORPHAN_ITEM_KEY;
1744  	key.offset = objectid;
1745  
1746  	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1747  
1748  	btrfs_release_path(path);
1749  	if (ret == 0) {
1750  		if (!inode->i_nlink)
1751  			set_nlink(inode, 1);
1752  		else
1753  			inc_nlink(inode);
1754  		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1755  	} else if (ret == -EEXIST) {
1756  		ret = 0;
1757  	}
1758  	iput(inode);
1759  
1760  	return ret;
1761  }
1762  
1763  /*
1764   * when replaying the log for a directory, we only insert names
1765   * for inodes that actually exist.  This means an fsync on a directory
1766   * does not implicitly fsync all the new files in it
1767   */
insert_one_name(struct btrfs_trans_handle * trans,struct btrfs_root * root,u64 dirid,u64 index,const struct fscrypt_str * name,struct btrfs_key * location)1768  static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1769  				    struct btrfs_root *root,
1770  				    u64 dirid, u64 index,
1771  				    const struct fscrypt_str *name,
1772  				    struct btrfs_key *location)
1773  {
1774  	struct inode *inode;
1775  	struct inode *dir;
1776  	int ret;
1777  
1778  	inode = read_one_inode(root, location->objectid);
1779  	if (!inode)
1780  		return -ENOENT;
1781  
1782  	dir = read_one_inode(root, dirid);
1783  	if (!dir) {
1784  		iput(inode);
1785  		return -EIO;
1786  	}
1787  
1788  	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1789  			     1, index);
1790  
1791  	/* FIXME, put inode into FIXUP list */
1792  
1793  	iput(inode);
1794  	iput(dir);
1795  	return ret;
1796  }
1797  
delete_conflicting_dir_entry(struct btrfs_trans_handle * trans,struct btrfs_inode * dir,struct btrfs_path * path,struct btrfs_dir_item * dst_di,const struct btrfs_key * log_key,u8 log_flags,bool exists)1798  static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
1799  					struct btrfs_inode *dir,
1800  					struct btrfs_path *path,
1801  					struct btrfs_dir_item *dst_di,
1802  					const struct btrfs_key *log_key,
1803  					u8 log_flags,
1804  					bool exists)
1805  {
1806  	struct btrfs_key found_key;
1807  
1808  	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1809  	/* The existing dentry points to the same inode, don't delete it. */
1810  	if (found_key.objectid == log_key->objectid &&
1811  	    found_key.type == log_key->type &&
1812  	    found_key.offset == log_key->offset &&
1813  	    btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
1814  		return 1;
1815  
1816  	/*
1817  	 * Don't drop the conflicting directory entry if the inode for the new
1818  	 * entry doesn't exist.
1819  	 */
1820  	if (!exists)
1821  		return 0;
1822  
1823  	return drop_one_dir_item(trans, path, dir, dst_di);
1824  }
1825  
1826  /*
1827   * take a single entry in a log directory item and replay it into
1828   * the subvolume.
1829   *
1830   * if a conflicting item exists in the subdirectory already,
1831   * the inode it points to is unlinked and put into the link count
1832   * fix up tree.
1833   *
1834   * If a name from the log points to a file or directory that does
1835   * not exist in the FS, it is skipped.  fsyncs on directories
1836   * do not force down inodes inside that directory, just changes to the
1837   * names or unlinks in a directory.
1838   *
1839   * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1840   * non-existing inode) and 1 if the name was replayed.
1841   */
replay_one_name(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct extent_buffer * eb,struct btrfs_dir_item * di,struct btrfs_key * key)1842  static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1843  				    struct btrfs_root *root,
1844  				    struct btrfs_path *path,
1845  				    struct extent_buffer *eb,
1846  				    struct btrfs_dir_item *di,
1847  				    struct btrfs_key *key)
1848  {
1849  	struct fscrypt_str name = { 0 };
1850  	struct btrfs_dir_item *dir_dst_di;
1851  	struct btrfs_dir_item *index_dst_di;
1852  	bool dir_dst_matches = false;
1853  	bool index_dst_matches = false;
1854  	struct btrfs_key log_key;
1855  	struct btrfs_key search_key;
1856  	struct inode *dir;
1857  	u8 log_flags;
1858  	bool exists;
1859  	int ret;
1860  	bool update_size = true;
1861  	bool name_added = false;
1862  
1863  	dir = read_one_inode(root, key->objectid);
1864  	if (!dir)
1865  		return -EIO;
1866  
1867  	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
1868  	if (ret)
1869  		goto out;
1870  
1871  	log_flags = btrfs_dir_flags(eb, di);
1872  	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1873  	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1874  	btrfs_release_path(path);
1875  	if (ret < 0)
1876  		goto out;
1877  	exists = (ret == 0);
1878  	ret = 0;
1879  
1880  	dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1881  					   &name, 1);
1882  	if (IS_ERR(dir_dst_di)) {
1883  		ret = PTR_ERR(dir_dst_di);
1884  		goto out;
1885  	} else if (dir_dst_di) {
1886  		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
1887  						   dir_dst_di, &log_key,
1888  						   log_flags, exists);
1889  		if (ret < 0)
1890  			goto out;
1891  		dir_dst_matches = (ret == 1);
1892  	}
1893  
1894  	btrfs_release_path(path);
1895  
1896  	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1897  						   key->objectid, key->offset,
1898  						   &name, 1);
1899  	if (IS_ERR(index_dst_di)) {
1900  		ret = PTR_ERR(index_dst_di);
1901  		goto out;
1902  	} else if (index_dst_di) {
1903  		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
1904  						   index_dst_di, &log_key,
1905  						   log_flags, exists);
1906  		if (ret < 0)
1907  			goto out;
1908  		index_dst_matches = (ret == 1);
1909  	}
1910  
1911  	btrfs_release_path(path);
1912  
1913  	if (dir_dst_matches && index_dst_matches) {
1914  		ret = 0;
1915  		update_size = false;
1916  		goto out;
1917  	}
1918  
1919  	/*
1920  	 * Check if the inode reference exists in the log for the given name,
1921  	 * inode and parent inode
1922  	 */
1923  	search_key.objectid = log_key.objectid;
1924  	search_key.type = BTRFS_INODE_REF_KEY;
1925  	search_key.offset = key->objectid;
1926  	ret = backref_in_log(root->log_root, &search_key, 0, &name);
1927  	if (ret < 0) {
1928  	        goto out;
1929  	} else if (ret) {
1930  	        /* The dentry will be added later. */
1931  	        ret = 0;
1932  	        update_size = false;
1933  	        goto out;
1934  	}
1935  
1936  	search_key.objectid = log_key.objectid;
1937  	search_key.type = BTRFS_INODE_EXTREF_KEY;
1938  	search_key.offset = key->objectid;
1939  	ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
1940  	if (ret < 0) {
1941  		goto out;
1942  	} else if (ret) {
1943  		/* The dentry will be added later. */
1944  		ret = 0;
1945  		update_size = false;
1946  		goto out;
1947  	}
1948  	btrfs_release_path(path);
1949  	ret = insert_one_name(trans, root, key->objectid, key->offset,
1950  			      &name, &log_key);
1951  	if (ret && ret != -ENOENT && ret != -EEXIST)
1952  		goto out;
1953  	if (!ret)
1954  		name_added = true;
1955  	update_size = false;
1956  	ret = 0;
1957  
1958  out:
1959  	if (!ret && update_size) {
1960  		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
1961  		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
1962  	}
1963  	kfree(name.name);
1964  	iput(dir);
1965  	if (!ret && name_added)
1966  		ret = 1;
1967  	return ret;
1968  }
1969  
1970  /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
replay_one_dir_item(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct extent_buffer * eb,int slot,struct btrfs_key * key)1971  static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1972  					struct btrfs_root *root,
1973  					struct btrfs_path *path,
1974  					struct extent_buffer *eb, int slot,
1975  					struct btrfs_key *key)
1976  {
1977  	int ret;
1978  	struct btrfs_dir_item *di;
1979  
1980  	/* We only log dir index keys, which only contain a single dir item. */
1981  	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
1982  
1983  	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1984  	ret = replay_one_name(trans, root, path, eb, di, key);
1985  	if (ret < 0)
1986  		return ret;
1987  
1988  	/*
1989  	 * If this entry refers to a non-directory (directories can not have a
1990  	 * link count > 1) and it was added in the transaction that was not
1991  	 * committed, make sure we fixup the link count of the inode the entry
1992  	 * points to. Otherwise something like the following would result in a
1993  	 * directory pointing to an inode with a wrong link that does not account
1994  	 * for this dir entry:
1995  	 *
1996  	 * mkdir testdir
1997  	 * touch testdir/foo
1998  	 * touch testdir/bar
1999  	 * sync
2000  	 *
2001  	 * ln testdir/bar testdir/bar_link
2002  	 * ln testdir/foo testdir/foo_link
2003  	 * xfs_io -c "fsync" testdir/bar
2004  	 *
2005  	 * <power failure>
2006  	 *
2007  	 * mount fs, log replay happens
2008  	 *
2009  	 * File foo would remain with a link count of 1 when it has two entries
2010  	 * pointing to it in the directory testdir. This would make it impossible
2011  	 * to ever delete the parent directory has it would result in stale
2012  	 * dentries that can never be deleted.
2013  	 */
2014  	if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
2015  		struct btrfs_path *fixup_path;
2016  		struct btrfs_key di_key;
2017  
2018  		fixup_path = btrfs_alloc_path();
2019  		if (!fixup_path)
2020  			return -ENOMEM;
2021  
2022  		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2023  		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2024  		btrfs_free_path(fixup_path);
2025  	}
2026  
2027  	return ret;
2028  }
2029  
2030  /*
2031   * directory replay has two parts.  There are the standard directory
2032   * items in the log copied from the subvolume, and range items
2033   * created in the log while the subvolume was logged.
2034   *
2035   * The range items tell us which parts of the key space the log
2036   * is authoritative for.  During replay, if a key in the subvolume
2037   * directory is in a logged range item, but not actually in the log
2038   * that means it was deleted from the directory before the fsync
2039   * and should be removed.
2040   */
find_dir_range(struct btrfs_root * root,struct btrfs_path * path,u64 dirid,u64 * start_ret,u64 * end_ret)2041  static noinline int find_dir_range(struct btrfs_root *root,
2042  				   struct btrfs_path *path,
2043  				   u64 dirid,
2044  				   u64 *start_ret, u64 *end_ret)
2045  {
2046  	struct btrfs_key key;
2047  	u64 found_end;
2048  	struct btrfs_dir_log_item *item;
2049  	int ret;
2050  	int nritems;
2051  
2052  	if (*start_ret == (u64)-1)
2053  		return 1;
2054  
2055  	key.objectid = dirid;
2056  	key.type = BTRFS_DIR_LOG_INDEX_KEY;
2057  	key.offset = *start_ret;
2058  
2059  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2060  	if (ret < 0)
2061  		goto out;
2062  	if (ret > 0) {
2063  		if (path->slots[0] == 0)
2064  			goto out;
2065  		path->slots[0]--;
2066  	}
2067  	if (ret != 0)
2068  		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2069  
2070  	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2071  		ret = 1;
2072  		goto next;
2073  	}
2074  	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2075  			      struct btrfs_dir_log_item);
2076  	found_end = btrfs_dir_log_end(path->nodes[0], item);
2077  
2078  	if (*start_ret >= key.offset && *start_ret <= found_end) {
2079  		ret = 0;
2080  		*start_ret = key.offset;
2081  		*end_ret = found_end;
2082  		goto out;
2083  	}
2084  	ret = 1;
2085  next:
2086  	/* check the next slot in the tree to see if it is a valid item */
2087  	nritems = btrfs_header_nritems(path->nodes[0]);
2088  	path->slots[0]++;
2089  	if (path->slots[0] >= nritems) {
2090  		ret = btrfs_next_leaf(root, path);
2091  		if (ret)
2092  			goto out;
2093  	}
2094  
2095  	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2096  
2097  	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2098  		ret = 1;
2099  		goto out;
2100  	}
2101  	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2102  			      struct btrfs_dir_log_item);
2103  	found_end = btrfs_dir_log_end(path->nodes[0], item);
2104  	*start_ret = key.offset;
2105  	*end_ret = found_end;
2106  	ret = 0;
2107  out:
2108  	btrfs_release_path(path);
2109  	return ret;
2110  }
2111  
2112  /*
2113   * this looks for a given directory item in the log.  If the directory
2114   * item is not in the log, the item is removed and the inode it points
2115   * to is unlinked
2116   */
check_item_in_log(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,struct btrfs_path * log_path,struct inode * dir,struct btrfs_key * dir_key)2117  static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2118  				      struct btrfs_root *log,
2119  				      struct btrfs_path *path,
2120  				      struct btrfs_path *log_path,
2121  				      struct inode *dir,
2122  				      struct btrfs_key *dir_key)
2123  {
2124  	struct btrfs_root *root = BTRFS_I(dir)->root;
2125  	int ret;
2126  	struct extent_buffer *eb;
2127  	int slot;
2128  	struct btrfs_dir_item *di;
2129  	struct fscrypt_str name = { 0 };
2130  	struct inode *inode = NULL;
2131  	struct btrfs_key location;
2132  
2133  	/*
2134  	 * Currently we only log dir index keys. Even if we replay a log created
2135  	 * by an older kernel that logged both dir index and dir item keys, all
2136  	 * we need to do is process the dir index keys, we (and our caller) can
2137  	 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2138  	 */
2139  	ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
2140  
2141  	eb = path->nodes[0];
2142  	slot = path->slots[0];
2143  	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2144  	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
2145  	if (ret)
2146  		goto out;
2147  
2148  	if (log) {
2149  		struct btrfs_dir_item *log_di;
2150  
2151  		log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
2152  						     dir_key->objectid,
2153  						     dir_key->offset, &name, 0);
2154  		if (IS_ERR(log_di)) {
2155  			ret = PTR_ERR(log_di);
2156  			goto out;
2157  		} else if (log_di) {
2158  			/* The dentry exists in the log, we have nothing to do. */
2159  			ret = 0;
2160  			goto out;
2161  		}
2162  	}
2163  
2164  	btrfs_dir_item_key_to_cpu(eb, di, &location);
2165  	btrfs_release_path(path);
2166  	btrfs_release_path(log_path);
2167  	inode = read_one_inode(root, location.objectid);
2168  	if (!inode) {
2169  		ret = -EIO;
2170  		goto out;
2171  	}
2172  
2173  	ret = link_to_fixup_dir(trans, root, path, location.objectid);
2174  	if (ret)
2175  		goto out;
2176  
2177  	inc_nlink(inode);
2178  	ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
2179  					  &name);
2180  	/*
2181  	 * Unlike dir item keys, dir index keys can only have one name (entry) in
2182  	 * them, as there are no key collisions since each key has a unique offset
2183  	 * (an index number), so we're done.
2184  	 */
2185  out:
2186  	btrfs_release_path(path);
2187  	btrfs_release_path(log_path);
2188  	kfree(name.name);
2189  	iput(inode);
2190  	return ret;
2191  }
2192  
replay_xattr_deletes(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_root * log,struct btrfs_path * path,const u64 ino)2193  static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2194  			      struct btrfs_root *root,
2195  			      struct btrfs_root *log,
2196  			      struct btrfs_path *path,
2197  			      const u64 ino)
2198  {
2199  	struct btrfs_key search_key;
2200  	struct btrfs_path *log_path;
2201  	int i;
2202  	int nritems;
2203  	int ret;
2204  
2205  	log_path = btrfs_alloc_path();
2206  	if (!log_path)
2207  		return -ENOMEM;
2208  
2209  	search_key.objectid = ino;
2210  	search_key.type = BTRFS_XATTR_ITEM_KEY;
2211  	search_key.offset = 0;
2212  again:
2213  	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2214  	if (ret < 0)
2215  		goto out;
2216  process_leaf:
2217  	nritems = btrfs_header_nritems(path->nodes[0]);
2218  	for (i = path->slots[0]; i < nritems; i++) {
2219  		struct btrfs_key key;
2220  		struct btrfs_dir_item *di;
2221  		struct btrfs_dir_item *log_di;
2222  		u32 total_size;
2223  		u32 cur;
2224  
2225  		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2226  		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2227  			ret = 0;
2228  			goto out;
2229  		}
2230  
2231  		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2232  		total_size = btrfs_item_size(path->nodes[0], i);
2233  		cur = 0;
2234  		while (cur < total_size) {
2235  			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2236  			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2237  			u32 this_len = sizeof(*di) + name_len + data_len;
2238  			char *name;
2239  
2240  			name = kmalloc(name_len, GFP_NOFS);
2241  			if (!name) {
2242  				ret = -ENOMEM;
2243  				goto out;
2244  			}
2245  			read_extent_buffer(path->nodes[0], name,
2246  					   (unsigned long)(di + 1), name_len);
2247  
2248  			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2249  						    name, name_len, 0);
2250  			btrfs_release_path(log_path);
2251  			if (!log_di) {
2252  				/* Doesn't exist in log tree, so delete it. */
2253  				btrfs_release_path(path);
2254  				di = btrfs_lookup_xattr(trans, root, path, ino,
2255  							name, name_len, -1);
2256  				kfree(name);
2257  				if (IS_ERR(di)) {
2258  					ret = PTR_ERR(di);
2259  					goto out;
2260  				}
2261  				ASSERT(di);
2262  				ret = btrfs_delete_one_dir_name(trans, root,
2263  								path, di);
2264  				if (ret)
2265  					goto out;
2266  				btrfs_release_path(path);
2267  				search_key = key;
2268  				goto again;
2269  			}
2270  			kfree(name);
2271  			if (IS_ERR(log_di)) {
2272  				ret = PTR_ERR(log_di);
2273  				goto out;
2274  			}
2275  			cur += this_len;
2276  			di = (struct btrfs_dir_item *)((char *)di + this_len);
2277  		}
2278  	}
2279  	ret = btrfs_next_leaf(root, path);
2280  	if (ret > 0)
2281  		ret = 0;
2282  	else if (ret == 0)
2283  		goto process_leaf;
2284  out:
2285  	btrfs_free_path(log_path);
2286  	btrfs_release_path(path);
2287  	return ret;
2288  }
2289  
2290  
2291  /*
2292   * deletion replay happens before we copy any new directory items
2293   * out of the log or out of backreferences from inodes.  It
2294   * scans the log to find ranges of keys that log is authoritative for,
2295   * and then scans the directory to find items in those ranges that are
2296   * not present in the log.
2297   *
2298   * Anything we don't find in the log is unlinked and removed from the
2299   * directory.
2300   */
replay_dir_deletes(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_root * log,struct btrfs_path * path,u64 dirid,int del_all)2301  static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2302  				       struct btrfs_root *root,
2303  				       struct btrfs_root *log,
2304  				       struct btrfs_path *path,
2305  				       u64 dirid, int del_all)
2306  {
2307  	u64 range_start;
2308  	u64 range_end;
2309  	int ret = 0;
2310  	struct btrfs_key dir_key;
2311  	struct btrfs_key found_key;
2312  	struct btrfs_path *log_path;
2313  	struct inode *dir;
2314  
2315  	dir_key.objectid = dirid;
2316  	dir_key.type = BTRFS_DIR_INDEX_KEY;
2317  	log_path = btrfs_alloc_path();
2318  	if (!log_path)
2319  		return -ENOMEM;
2320  
2321  	dir = read_one_inode(root, dirid);
2322  	/* it isn't an error if the inode isn't there, that can happen
2323  	 * because we replay the deletes before we copy in the inode item
2324  	 * from the log
2325  	 */
2326  	if (!dir) {
2327  		btrfs_free_path(log_path);
2328  		return 0;
2329  	}
2330  
2331  	range_start = 0;
2332  	range_end = 0;
2333  	while (1) {
2334  		if (del_all)
2335  			range_end = (u64)-1;
2336  		else {
2337  			ret = find_dir_range(log, path, dirid,
2338  					     &range_start, &range_end);
2339  			if (ret < 0)
2340  				goto out;
2341  			else if (ret > 0)
2342  				break;
2343  		}
2344  
2345  		dir_key.offset = range_start;
2346  		while (1) {
2347  			int nritems;
2348  			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2349  						0, 0);
2350  			if (ret < 0)
2351  				goto out;
2352  
2353  			nritems = btrfs_header_nritems(path->nodes[0]);
2354  			if (path->slots[0] >= nritems) {
2355  				ret = btrfs_next_leaf(root, path);
2356  				if (ret == 1)
2357  					break;
2358  				else if (ret < 0)
2359  					goto out;
2360  			}
2361  			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2362  					      path->slots[0]);
2363  			if (found_key.objectid != dirid ||
2364  			    found_key.type != dir_key.type) {
2365  				ret = 0;
2366  				goto out;
2367  			}
2368  
2369  			if (found_key.offset > range_end)
2370  				break;
2371  
2372  			ret = check_item_in_log(trans, log, path,
2373  						log_path, dir,
2374  						&found_key);
2375  			if (ret)
2376  				goto out;
2377  			if (found_key.offset == (u64)-1)
2378  				break;
2379  			dir_key.offset = found_key.offset + 1;
2380  		}
2381  		btrfs_release_path(path);
2382  		if (range_end == (u64)-1)
2383  			break;
2384  		range_start = range_end + 1;
2385  	}
2386  	ret = 0;
2387  out:
2388  	btrfs_release_path(path);
2389  	btrfs_free_path(log_path);
2390  	iput(dir);
2391  	return ret;
2392  }
2393  
2394  /*
2395   * the process_func used to replay items from the log tree.  This
2396   * gets called in two different stages.  The first stage just looks
2397   * for inodes and makes sure they are all copied into the subvolume.
2398   *
2399   * The second stage copies all the other item types from the log into
2400   * the subvolume.  The two stage approach is slower, but gets rid of
2401   * lots of complexity around inodes referencing other inodes that exist
2402   * only in the log (references come from either directory items or inode
2403   * back refs).
2404   */
replay_one_buffer(struct btrfs_root * log,struct extent_buffer * eb,struct walk_control * wc,u64 gen,int level)2405  static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2406  			     struct walk_control *wc, u64 gen, int level)
2407  {
2408  	int nritems;
2409  	struct btrfs_tree_parent_check check = {
2410  		.transid = gen,
2411  		.level = level
2412  	};
2413  	struct btrfs_path *path;
2414  	struct btrfs_root *root = wc->replay_dest;
2415  	struct btrfs_key key;
2416  	int i;
2417  	int ret;
2418  
2419  	ret = btrfs_read_extent_buffer(eb, &check);
2420  	if (ret)
2421  		return ret;
2422  
2423  	level = btrfs_header_level(eb);
2424  
2425  	if (level != 0)
2426  		return 0;
2427  
2428  	path = btrfs_alloc_path();
2429  	if (!path)
2430  		return -ENOMEM;
2431  
2432  	nritems = btrfs_header_nritems(eb);
2433  	for (i = 0; i < nritems; i++) {
2434  		btrfs_item_key_to_cpu(eb, &key, i);
2435  
2436  		/* inode keys are done during the first stage */
2437  		if (key.type == BTRFS_INODE_ITEM_KEY &&
2438  		    wc->stage == LOG_WALK_REPLAY_INODES) {
2439  			struct btrfs_inode_item *inode_item;
2440  			u32 mode;
2441  
2442  			inode_item = btrfs_item_ptr(eb, i,
2443  					    struct btrfs_inode_item);
2444  			/*
2445  			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2446  			 * and never got linked before the fsync, skip it, as
2447  			 * replaying it is pointless since it would be deleted
2448  			 * later. We skip logging tmpfiles, but it's always
2449  			 * possible we are replaying a log created with a kernel
2450  			 * that used to log tmpfiles.
2451  			 */
2452  			if (btrfs_inode_nlink(eb, inode_item) == 0) {
2453  				wc->ignore_cur_inode = true;
2454  				continue;
2455  			} else {
2456  				wc->ignore_cur_inode = false;
2457  			}
2458  			ret = replay_xattr_deletes(wc->trans, root, log,
2459  						   path, key.objectid);
2460  			if (ret)
2461  				break;
2462  			mode = btrfs_inode_mode(eb, inode_item);
2463  			if (S_ISDIR(mode)) {
2464  				ret = replay_dir_deletes(wc->trans,
2465  					 root, log, path, key.objectid, 0);
2466  				if (ret)
2467  					break;
2468  			}
2469  			ret = overwrite_item(wc->trans, root, path,
2470  					     eb, i, &key);
2471  			if (ret)
2472  				break;
2473  
2474  			/*
2475  			 * Before replaying extents, truncate the inode to its
2476  			 * size. We need to do it now and not after log replay
2477  			 * because before an fsync we can have prealloc extents
2478  			 * added beyond the inode's i_size. If we did it after,
2479  			 * through orphan cleanup for example, we would drop
2480  			 * those prealloc extents just after replaying them.
2481  			 */
2482  			if (S_ISREG(mode)) {
2483  				struct btrfs_drop_extents_args drop_args = { 0 };
2484  				struct inode *inode;
2485  				u64 from;
2486  
2487  				inode = read_one_inode(root, key.objectid);
2488  				if (!inode) {
2489  					ret = -EIO;
2490  					break;
2491  				}
2492  				from = ALIGN(i_size_read(inode),
2493  					     root->fs_info->sectorsize);
2494  				drop_args.start = from;
2495  				drop_args.end = (u64)-1;
2496  				drop_args.drop_cache = true;
2497  				ret = btrfs_drop_extents(wc->trans, root,
2498  							 BTRFS_I(inode),
2499  							 &drop_args);
2500  				if (!ret) {
2501  					inode_sub_bytes(inode,
2502  							drop_args.bytes_found);
2503  					/* Update the inode's nbytes. */
2504  					ret = btrfs_update_inode(wc->trans,
2505  							root, BTRFS_I(inode));
2506  				}
2507  				iput(inode);
2508  				if (ret)
2509  					break;
2510  			}
2511  
2512  			ret = link_to_fixup_dir(wc->trans, root,
2513  						path, key.objectid);
2514  			if (ret)
2515  				break;
2516  		}
2517  
2518  		if (wc->ignore_cur_inode)
2519  			continue;
2520  
2521  		if (key.type == BTRFS_DIR_INDEX_KEY &&
2522  		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2523  			ret = replay_one_dir_item(wc->trans, root, path,
2524  						  eb, i, &key);
2525  			if (ret)
2526  				break;
2527  		}
2528  
2529  		if (wc->stage < LOG_WALK_REPLAY_ALL)
2530  			continue;
2531  
2532  		/* these keys are simply copied */
2533  		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2534  			ret = overwrite_item(wc->trans, root, path,
2535  					     eb, i, &key);
2536  			if (ret)
2537  				break;
2538  		} else if (key.type == BTRFS_INODE_REF_KEY ||
2539  			   key.type == BTRFS_INODE_EXTREF_KEY) {
2540  			ret = add_inode_ref(wc->trans, root, log, path,
2541  					    eb, i, &key);
2542  			if (ret && ret != -ENOENT)
2543  				break;
2544  			ret = 0;
2545  		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2546  			ret = replay_one_extent(wc->trans, root, path,
2547  						eb, i, &key);
2548  			if (ret)
2549  				break;
2550  		}
2551  		/*
2552  		 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2553  		 * BTRFS_DIR_INDEX_KEY items which we use to derive the
2554  		 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2555  		 * older kernel with such keys, ignore them.
2556  		 */
2557  	}
2558  	btrfs_free_path(path);
2559  	return ret;
2560  }
2561  
2562  /*
2563   * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2564   */
unaccount_log_buffer(struct btrfs_fs_info * fs_info,u64 start)2565  static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2566  {
2567  	struct btrfs_block_group *cache;
2568  
2569  	cache = btrfs_lookup_block_group(fs_info, start);
2570  	if (!cache) {
2571  		btrfs_err(fs_info, "unable to find block group for %llu", start);
2572  		return;
2573  	}
2574  
2575  	spin_lock(&cache->space_info->lock);
2576  	spin_lock(&cache->lock);
2577  	cache->reserved -= fs_info->nodesize;
2578  	cache->space_info->bytes_reserved -= fs_info->nodesize;
2579  	spin_unlock(&cache->lock);
2580  	spin_unlock(&cache->space_info->lock);
2581  
2582  	btrfs_put_block_group(cache);
2583  }
2584  
clean_log_buffer(struct btrfs_trans_handle * trans,struct extent_buffer * eb)2585  static int clean_log_buffer(struct btrfs_trans_handle *trans,
2586  			    struct extent_buffer *eb)
2587  {
2588  	int ret;
2589  
2590  	btrfs_tree_lock(eb);
2591  	btrfs_clear_buffer_dirty(trans, eb);
2592  	wait_on_extent_buffer_writeback(eb);
2593  	btrfs_tree_unlock(eb);
2594  
2595  	if (trans) {
2596  		ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
2597  		if (ret)
2598  			return ret;
2599  		btrfs_redirty_list_add(trans->transaction, eb);
2600  	} else {
2601  		unaccount_log_buffer(eb->fs_info, eb->start);
2602  	}
2603  
2604  	return 0;
2605  }
2606  
walk_down_log_tree(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,int * level,struct walk_control * wc)2607  static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2608  				   struct btrfs_root *root,
2609  				   struct btrfs_path *path, int *level,
2610  				   struct walk_control *wc)
2611  {
2612  	struct btrfs_fs_info *fs_info = root->fs_info;
2613  	u64 bytenr;
2614  	u64 ptr_gen;
2615  	struct extent_buffer *next;
2616  	struct extent_buffer *cur;
2617  	int ret = 0;
2618  
2619  	while (*level > 0) {
2620  		struct btrfs_tree_parent_check check = { 0 };
2621  
2622  		cur = path->nodes[*level];
2623  
2624  		WARN_ON(btrfs_header_level(cur) != *level);
2625  
2626  		if (path->slots[*level] >=
2627  		    btrfs_header_nritems(cur))
2628  			break;
2629  
2630  		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2631  		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2632  		check.transid = ptr_gen;
2633  		check.level = *level - 1;
2634  		check.has_first_key = true;
2635  		btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
2636  
2637  		next = btrfs_find_create_tree_block(fs_info, bytenr,
2638  						    btrfs_header_owner(cur),
2639  						    *level - 1);
2640  		if (IS_ERR(next))
2641  			return PTR_ERR(next);
2642  
2643  		if (*level == 1) {
2644  			ret = wc->process_func(root, next, wc, ptr_gen,
2645  					       *level - 1);
2646  			if (ret) {
2647  				free_extent_buffer(next);
2648  				return ret;
2649  			}
2650  
2651  			path->slots[*level]++;
2652  			if (wc->free) {
2653  				ret = btrfs_read_extent_buffer(next, &check);
2654  				if (ret) {
2655  					free_extent_buffer(next);
2656  					return ret;
2657  				}
2658  
2659  				ret = clean_log_buffer(trans, next);
2660  				if (ret) {
2661  					free_extent_buffer(next);
2662  					return ret;
2663  				}
2664  			}
2665  			free_extent_buffer(next);
2666  			continue;
2667  		}
2668  		ret = btrfs_read_extent_buffer(next, &check);
2669  		if (ret) {
2670  			free_extent_buffer(next);
2671  			return ret;
2672  		}
2673  
2674  		if (path->nodes[*level-1])
2675  			free_extent_buffer(path->nodes[*level-1]);
2676  		path->nodes[*level-1] = next;
2677  		*level = btrfs_header_level(next);
2678  		path->slots[*level] = 0;
2679  		cond_resched();
2680  	}
2681  	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2682  
2683  	cond_resched();
2684  	return 0;
2685  }
2686  
walk_up_log_tree(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,int * level,struct walk_control * wc)2687  static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2688  				 struct btrfs_root *root,
2689  				 struct btrfs_path *path, int *level,
2690  				 struct walk_control *wc)
2691  {
2692  	int i;
2693  	int slot;
2694  	int ret;
2695  
2696  	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2697  		slot = path->slots[i];
2698  		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2699  			path->slots[i]++;
2700  			*level = i;
2701  			WARN_ON(*level == 0);
2702  			return 0;
2703  		} else {
2704  			ret = wc->process_func(root, path->nodes[*level], wc,
2705  				 btrfs_header_generation(path->nodes[*level]),
2706  				 *level);
2707  			if (ret)
2708  				return ret;
2709  
2710  			if (wc->free) {
2711  				ret = clean_log_buffer(trans, path->nodes[*level]);
2712  				if (ret)
2713  					return ret;
2714  			}
2715  			free_extent_buffer(path->nodes[*level]);
2716  			path->nodes[*level] = NULL;
2717  			*level = i + 1;
2718  		}
2719  	}
2720  	return 1;
2721  }
2722  
2723  /*
2724   * drop the reference count on the tree rooted at 'snap'.  This traverses
2725   * the tree freeing any blocks that have a ref count of zero after being
2726   * decremented.
2727   */
walk_log_tree(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct walk_control * wc)2728  static int walk_log_tree(struct btrfs_trans_handle *trans,
2729  			 struct btrfs_root *log, struct walk_control *wc)
2730  {
2731  	int ret = 0;
2732  	int wret;
2733  	int level;
2734  	struct btrfs_path *path;
2735  	int orig_level;
2736  
2737  	path = btrfs_alloc_path();
2738  	if (!path)
2739  		return -ENOMEM;
2740  
2741  	level = btrfs_header_level(log->node);
2742  	orig_level = level;
2743  	path->nodes[level] = log->node;
2744  	atomic_inc(&log->node->refs);
2745  	path->slots[level] = 0;
2746  
2747  	while (1) {
2748  		wret = walk_down_log_tree(trans, log, path, &level, wc);
2749  		if (wret > 0)
2750  			break;
2751  		if (wret < 0) {
2752  			ret = wret;
2753  			goto out;
2754  		}
2755  
2756  		wret = walk_up_log_tree(trans, log, path, &level, wc);
2757  		if (wret > 0)
2758  			break;
2759  		if (wret < 0) {
2760  			ret = wret;
2761  			goto out;
2762  		}
2763  	}
2764  
2765  	/* was the root node processed? if not, catch it here */
2766  	if (path->nodes[orig_level]) {
2767  		ret = wc->process_func(log, path->nodes[orig_level], wc,
2768  			 btrfs_header_generation(path->nodes[orig_level]),
2769  			 orig_level);
2770  		if (ret)
2771  			goto out;
2772  		if (wc->free)
2773  			ret = clean_log_buffer(trans, path->nodes[orig_level]);
2774  	}
2775  
2776  out:
2777  	btrfs_free_path(path);
2778  	return ret;
2779  }
2780  
2781  /*
2782   * helper function to update the item for a given subvolumes log root
2783   * in the tree of log roots
2784   */
update_log_root(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_root_item * root_item)2785  static int update_log_root(struct btrfs_trans_handle *trans,
2786  			   struct btrfs_root *log,
2787  			   struct btrfs_root_item *root_item)
2788  {
2789  	struct btrfs_fs_info *fs_info = log->fs_info;
2790  	int ret;
2791  
2792  	if (log->log_transid == 1) {
2793  		/* insert root item on the first sync */
2794  		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2795  				&log->root_key, root_item);
2796  	} else {
2797  		ret = btrfs_update_root(trans, fs_info->log_root_tree,
2798  				&log->root_key, root_item);
2799  	}
2800  	return ret;
2801  }
2802  
wait_log_commit(struct btrfs_root * root,int transid)2803  static void wait_log_commit(struct btrfs_root *root, int transid)
2804  {
2805  	DEFINE_WAIT(wait);
2806  	int index = transid % 2;
2807  
2808  	/*
2809  	 * we only allow two pending log transactions at a time,
2810  	 * so we know that if ours is more than 2 older than the
2811  	 * current transaction, we're done
2812  	 */
2813  	for (;;) {
2814  		prepare_to_wait(&root->log_commit_wait[index],
2815  				&wait, TASK_UNINTERRUPTIBLE);
2816  
2817  		if (!(root->log_transid_committed < transid &&
2818  		      atomic_read(&root->log_commit[index])))
2819  			break;
2820  
2821  		mutex_unlock(&root->log_mutex);
2822  		schedule();
2823  		mutex_lock(&root->log_mutex);
2824  	}
2825  	finish_wait(&root->log_commit_wait[index], &wait);
2826  }
2827  
wait_for_writer(struct btrfs_root * root)2828  static void wait_for_writer(struct btrfs_root *root)
2829  {
2830  	DEFINE_WAIT(wait);
2831  
2832  	for (;;) {
2833  		prepare_to_wait(&root->log_writer_wait, &wait,
2834  				TASK_UNINTERRUPTIBLE);
2835  		if (!atomic_read(&root->log_writers))
2836  			break;
2837  
2838  		mutex_unlock(&root->log_mutex);
2839  		schedule();
2840  		mutex_lock(&root->log_mutex);
2841  	}
2842  	finish_wait(&root->log_writer_wait, &wait);
2843  }
2844  
btrfs_remove_log_ctx(struct btrfs_root * root,struct btrfs_log_ctx * ctx)2845  static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2846  					struct btrfs_log_ctx *ctx)
2847  {
2848  	mutex_lock(&root->log_mutex);
2849  	list_del_init(&ctx->list);
2850  	mutex_unlock(&root->log_mutex);
2851  }
2852  
2853  /*
2854   * Invoked in log mutex context, or be sure there is no other task which
2855   * can access the list.
2856   */
btrfs_remove_all_log_ctxs(struct btrfs_root * root,int index,int error)2857  static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2858  					     int index, int error)
2859  {
2860  	struct btrfs_log_ctx *ctx;
2861  	struct btrfs_log_ctx *safe;
2862  
2863  	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2864  		list_del_init(&ctx->list);
2865  		ctx->log_ret = error;
2866  	}
2867  }
2868  
2869  /*
2870   * btrfs_sync_log does sends a given tree log down to the disk and
2871   * updates the super blocks to record it.  When this call is done,
2872   * you know that any inodes previously logged are safely on disk only
2873   * if it returns 0.
2874   *
2875   * Any other return value means you need to call btrfs_commit_transaction.
2876   * Some of the edge cases for fsyncing directories that have had unlinks
2877   * or renames done in the past mean that sometimes the only safe
2878   * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2879   * that has happened.
2880   */
btrfs_sync_log(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_log_ctx * ctx)2881  int btrfs_sync_log(struct btrfs_trans_handle *trans,
2882  		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2883  {
2884  	int index1;
2885  	int index2;
2886  	int mark;
2887  	int ret;
2888  	struct btrfs_fs_info *fs_info = root->fs_info;
2889  	struct btrfs_root *log = root->log_root;
2890  	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2891  	struct btrfs_root_item new_root_item;
2892  	int log_transid = 0;
2893  	struct btrfs_log_ctx root_log_ctx;
2894  	struct blk_plug plug;
2895  	u64 log_root_start;
2896  	u64 log_root_level;
2897  
2898  	mutex_lock(&root->log_mutex);
2899  	log_transid = ctx->log_transid;
2900  	if (root->log_transid_committed >= log_transid) {
2901  		mutex_unlock(&root->log_mutex);
2902  		return ctx->log_ret;
2903  	}
2904  
2905  	index1 = log_transid % 2;
2906  	if (atomic_read(&root->log_commit[index1])) {
2907  		wait_log_commit(root, log_transid);
2908  		mutex_unlock(&root->log_mutex);
2909  		return ctx->log_ret;
2910  	}
2911  	ASSERT(log_transid == root->log_transid);
2912  	atomic_set(&root->log_commit[index1], 1);
2913  
2914  	/* wait for previous tree log sync to complete */
2915  	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2916  		wait_log_commit(root, log_transid - 1);
2917  
2918  	while (1) {
2919  		int batch = atomic_read(&root->log_batch);
2920  		/* when we're on an ssd, just kick the log commit out */
2921  		if (!btrfs_test_opt(fs_info, SSD) &&
2922  		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2923  			mutex_unlock(&root->log_mutex);
2924  			schedule_timeout_uninterruptible(1);
2925  			mutex_lock(&root->log_mutex);
2926  		}
2927  		wait_for_writer(root);
2928  		if (batch == atomic_read(&root->log_batch))
2929  			break;
2930  	}
2931  
2932  	/* bail out if we need to do a full commit */
2933  	if (btrfs_need_log_full_commit(trans)) {
2934  		ret = BTRFS_LOG_FORCE_COMMIT;
2935  		mutex_unlock(&root->log_mutex);
2936  		goto out;
2937  	}
2938  
2939  	if (log_transid % 2 == 0)
2940  		mark = EXTENT_DIRTY;
2941  	else
2942  		mark = EXTENT_NEW;
2943  
2944  	/* we start IO on  all the marked extents here, but we don't actually
2945  	 * wait for them until later.
2946  	 */
2947  	blk_start_plug(&plug);
2948  	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2949  	/*
2950  	 * -EAGAIN happens when someone, e.g., a concurrent transaction
2951  	 *  commit, writes a dirty extent in this tree-log commit. This
2952  	 *  concurrent write will create a hole writing out the extents,
2953  	 *  and we cannot proceed on a zoned filesystem, requiring
2954  	 *  sequential writing. While we can bail out to a full commit
2955  	 *  here, but we can continue hoping the concurrent writing fills
2956  	 *  the hole.
2957  	 */
2958  	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
2959  		ret = 0;
2960  	if (ret) {
2961  		blk_finish_plug(&plug);
2962  		btrfs_set_log_full_commit(trans);
2963  		mutex_unlock(&root->log_mutex);
2964  		goto out;
2965  	}
2966  
2967  	/*
2968  	 * We _must_ update under the root->log_mutex in order to make sure we
2969  	 * have a consistent view of the log root we are trying to commit at
2970  	 * this moment.
2971  	 *
2972  	 * We _must_ copy this into a local copy, because we are not holding the
2973  	 * log_root_tree->log_mutex yet.  This is important because when we
2974  	 * commit the log_root_tree we must have a consistent view of the
2975  	 * log_root_tree when we update the super block to point at the
2976  	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
2977  	 * with the commit and possibly point at the new block which we may not
2978  	 * have written out.
2979  	 */
2980  	btrfs_set_root_node(&log->root_item, log->node);
2981  	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
2982  
2983  	root->log_transid++;
2984  	log->log_transid = root->log_transid;
2985  	root->log_start_pid = 0;
2986  	/*
2987  	 * IO has been started, blocks of the log tree have WRITTEN flag set
2988  	 * in their headers. new modifications of the log will be written to
2989  	 * new positions. so it's safe to allow log writers to go in.
2990  	 */
2991  	mutex_unlock(&root->log_mutex);
2992  
2993  	if (btrfs_is_zoned(fs_info)) {
2994  		mutex_lock(&fs_info->tree_root->log_mutex);
2995  		if (!log_root_tree->node) {
2996  			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
2997  			if (ret) {
2998  				mutex_unlock(&fs_info->tree_root->log_mutex);
2999  				blk_finish_plug(&plug);
3000  				goto out;
3001  			}
3002  		}
3003  		mutex_unlock(&fs_info->tree_root->log_mutex);
3004  	}
3005  
3006  	btrfs_init_log_ctx(&root_log_ctx, NULL);
3007  
3008  	mutex_lock(&log_root_tree->log_mutex);
3009  
3010  	index2 = log_root_tree->log_transid % 2;
3011  	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3012  	root_log_ctx.log_transid = log_root_tree->log_transid;
3013  
3014  	/*
3015  	 * Now we are safe to update the log_root_tree because we're under the
3016  	 * log_mutex, and we're a current writer so we're holding the commit
3017  	 * open until we drop the log_mutex.
3018  	 */
3019  	ret = update_log_root(trans, log, &new_root_item);
3020  	if (ret) {
3021  		if (!list_empty(&root_log_ctx.list))
3022  			list_del_init(&root_log_ctx.list);
3023  
3024  		blk_finish_plug(&plug);
3025  		btrfs_set_log_full_commit(trans);
3026  		if (ret != -ENOSPC)
3027  			btrfs_err(fs_info,
3028  				  "failed to update log for root %llu ret %d",
3029  				  root->root_key.objectid, ret);
3030  		btrfs_wait_tree_log_extents(log, mark);
3031  		mutex_unlock(&log_root_tree->log_mutex);
3032  		goto out;
3033  	}
3034  
3035  	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3036  		blk_finish_plug(&plug);
3037  		list_del_init(&root_log_ctx.list);
3038  		mutex_unlock(&log_root_tree->log_mutex);
3039  		ret = root_log_ctx.log_ret;
3040  		goto out;
3041  	}
3042  
3043  	index2 = root_log_ctx.log_transid % 2;
3044  	if (atomic_read(&log_root_tree->log_commit[index2])) {
3045  		blk_finish_plug(&plug);
3046  		ret = btrfs_wait_tree_log_extents(log, mark);
3047  		wait_log_commit(log_root_tree,
3048  				root_log_ctx.log_transid);
3049  		mutex_unlock(&log_root_tree->log_mutex);
3050  		if (!ret)
3051  			ret = root_log_ctx.log_ret;
3052  		goto out;
3053  	}
3054  	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3055  	atomic_set(&log_root_tree->log_commit[index2], 1);
3056  
3057  	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3058  		wait_log_commit(log_root_tree,
3059  				root_log_ctx.log_transid - 1);
3060  	}
3061  
3062  	/*
3063  	 * now that we've moved on to the tree of log tree roots,
3064  	 * check the full commit flag again
3065  	 */
3066  	if (btrfs_need_log_full_commit(trans)) {
3067  		blk_finish_plug(&plug);
3068  		btrfs_wait_tree_log_extents(log, mark);
3069  		mutex_unlock(&log_root_tree->log_mutex);
3070  		ret = BTRFS_LOG_FORCE_COMMIT;
3071  		goto out_wake_log_root;
3072  	}
3073  
3074  	ret = btrfs_write_marked_extents(fs_info,
3075  					 &log_root_tree->dirty_log_pages,
3076  					 EXTENT_DIRTY | EXTENT_NEW);
3077  	blk_finish_plug(&plug);
3078  	/*
3079  	 * As described above, -EAGAIN indicates a hole in the extents. We
3080  	 * cannot wait for these write outs since the waiting cause a
3081  	 * deadlock. Bail out to the full commit instead.
3082  	 */
3083  	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3084  		btrfs_set_log_full_commit(trans);
3085  		btrfs_wait_tree_log_extents(log, mark);
3086  		mutex_unlock(&log_root_tree->log_mutex);
3087  		goto out_wake_log_root;
3088  	} else if (ret) {
3089  		btrfs_set_log_full_commit(trans);
3090  		mutex_unlock(&log_root_tree->log_mutex);
3091  		goto out_wake_log_root;
3092  	}
3093  	ret = btrfs_wait_tree_log_extents(log, mark);
3094  	if (!ret)
3095  		ret = btrfs_wait_tree_log_extents(log_root_tree,
3096  						  EXTENT_NEW | EXTENT_DIRTY);
3097  	if (ret) {
3098  		btrfs_set_log_full_commit(trans);
3099  		mutex_unlock(&log_root_tree->log_mutex);
3100  		goto out_wake_log_root;
3101  	}
3102  
3103  	log_root_start = log_root_tree->node->start;
3104  	log_root_level = btrfs_header_level(log_root_tree->node);
3105  	log_root_tree->log_transid++;
3106  	mutex_unlock(&log_root_tree->log_mutex);
3107  
3108  	/*
3109  	 * Here we are guaranteed that nobody is going to write the superblock
3110  	 * for the current transaction before us and that neither we do write
3111  	 * our superblock before the previous transaction finishes its commit
3112  	 * and writes its superblock, because:
3113  	 *
3114  	 * 1) We are holding a handle on the current transaction, so no body
3115  	 *    can commit it until we release the handle;
3116  	 *
3117  	 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3118  	 *    if the previous transaction is still committing, and hasn't yet
3119  	 *    written its superblock, we wait for it to do it, because a
3120  	 *    transaction commit acquires the tree_log_mutex when the commit
3121  	 *    begins and releases it only after writing its superblock.
3122  	 */
3123  	mutex_lock(&fs_info->tree_log_mutex);
3124  
3125  	/*
3126  	 * The previous transaction writeout phase could have failed, and thus
3127  	 * marked the fs in an error state.  We must not commit here, as we
3128  	 * could have updated our generation in the super_for_commit and
3129  	 * writing the super here would result in transid mismatches.  If there
3130  	 * is an error here just bail.
3131  	 */
3132  	if (BTRFS_FS_ERROR(fs_info)) {
3133  		ret = -EIO;
3134  		btrfs_set_log_full_commit(trans);
3135  		btrfs_abort_transaction(trans, ret);
3136  		mutex_unlock(&fs_info->tree_log_mutex);
3137  		goto out_wake_log_root;
3138  	}
3139  
3140  	btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3141  	btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3142  	ret = write_all_supers(fs_info, 1);
3143  	mutex_unlock(&fs_info->tree_log_mutex);
3144  	if (ret) {
3145  		btrfs_set_log_full_commit(trans);
3146  		btrfs_abort_transaction(trans, ret);
3147  		goto out_wake_log_root;
3148  	}
3149  
3150  	/*
3151  	 * We know there can only be one task here, since we have not yet set
3152  	 * root->log_commit[index1] to 0 and any task attempting to sync the
3153  	 * log must wait for the previous log transaction to commit if it's
3154  	 * still in progress or wait for the current log transaction commit if
3155  	 * someone else already started it. We use <= and not < because the
3156  	 * first log transaction has an ID of 0.
3157  	 */
3158  	ASSERT(root->last_log_commit <= log_transid);
3159  	root->last_log_commit = log_transid;
3160  
3161  out_wake_log_root:
3162  	mutex_lock(&log_root_tree->log_mutex);
3163  	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3164  
3165  	log_root_tree->log_transid_committed++;
3166  	atomic_set(&log_root_tree->log_commit[index2], 0);
3167  	mutex_unlock(&log_root_tree->log_mutex);
3168  
3169  	/*
3170  	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3171  	 * all the updates above are seen by the woken threads. It might not be
3172  	 * necessary, but proving that seems to be hard.
3173  	 */
3174  	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3175  out:
3176  	mutex_lock(&root->log_mutex);
3177  	btrfs_remove_all_log_ctxs(root, index1, ret);
3178  	root->log_transid_committed++;
3179  	atomic_set(&root->log_commit[index1], 0);
3180  	mutex_unlock(&root->log_mutex);
3181  
3182  	/*
3183  	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3184  	 * all the updates above are seen by the woken threads. It might not be
3185  	 * necessary, but proving that seems to be hard.
3186  	 */
3187  	cond_wake_up(&root->log_commit_wait[index1]);
3188  	return ret;
3189  }
3190  
free_log_tree(struct btrfs_trans_handle * trans,struct btrfs_root * log)3191  static void free_log_tree(struct btrfs_trans_handle *trans,
3192  			  struct btrfs_root *log)
3193  {
3194  	int ret;
3195  	struct walk_control wc = {
3196  		.free = 1,
3197  		.process_func = process_one_buffer
3198  	};
3199  
3200  	if (log->node) {
3201  		ret = walk_log_tree(trans, log, &wc);
3202  		if (ret) {
3203  			/*
3204  			 * We weren't able to traverse the entire log tree, the
3205  			 * typical scenario is getting an -EIO when reading an
3206  			 * extent buffer of the tree, due to a previous writeback
3207  			 * failure of it.
3208  			 */
3209  			set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3210  				&log->fs_info->fs_state);
3211  
3212  			/*
3213  			 * Some extent buffers of the log tree may still be dirty
3214  			 * and not yet written back to storage, because we may
3215  			 * have updates to a log tree without syncing a log tree,
3216  			 * such as during rename and link operations. So flush
3217  			 * them out and wait for their writeback to complete, so
3218  			 * that we properly cleanup their state and pages.
3219  			 */
3220  			btrfs_write_marked_extents(log->fs_info,
3221  						   &log->dirty_log_pages,
3222  						   EXTENT_DIRTY | EXTENT_NEW);
3223  			btrfs_wait_tree_log_extents(log,
3224  						    EXTENT_DIRTY | EXTENT_NEW);
3225  
3226  			if (trans)
3227  				btrfs_abort_transaction(trans, ret);
3228  			else
3229  				btrfs_handle_fs_error(log->fs_info, ret, NULL);
3230  		}
3231  	}
3232  
3233  	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3234  			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3235  	extent_io_tree_release(&log->log_csum_range);
3236  
3237  	btrfs_put_root(log);
3238  }
3239  
3240  /*
3241   * free all the extents used by the tree log.  This should be called
3242   * at commit time of the full transaction
3243   */
btrfs_free_log(struct btrfs_trans_handle * trans,struct btrfs_root * root)3244  int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3245  {
3246  	if (root->log_root) {
3247  		free_log_tree(trans, root->log_root);
3248  		root->log_root = NULL;
3249  		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3250  	}
3251  	return 0;
3252  }
3253  
btrfs_free_log_root_tree(struct btrfs_trans_handle * trans,struct btrfs_fs_info * fs_info)3254  int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3255  			     struct btrfs_fs_info *fs_info)
3256  {
3257  	if (fs_info->log_root_tree) {
3258  		free_log_tree(trans, fs_info->log_root_tree);
3259  		fs_info->log_root_tree = NULL;
3260  		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3261  	}
3262  	return 0;
3263  }
3264  
3265  /*
3266   * Check if an inode was logged in the current transaction. This correctly deals
3267   * with the case where the inode was logged but has a logged_trans of 0, which
3268   * happens if the inode is evicted and loaded again, as logged_trans is an in
3269   * memory only field (not persisted).
3270   *
3271   * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3272   * and < 0 on error.
3273   */
inode_logged(const struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path_in)3274  static int inode_logged(const struct btrfs_trans_handle *trans,
3275  			struct btrfs_inode *inode,
3276  			struct btrfs_path *path_in)
3277  {
3278  	struct btrfs_path *path = path_in;
3279  	struct btrfs_key key;
3280  	int ret;
3281  
3282  	if (inode->logged_trans == trans->transid)
3283  		return 1;
3284  
3285  	/*
3286  	 * If logged_trans is not 0, then we know the inode logged was not logged
3287  	 * in this transaction, so we can return false right away.
3288  	 */
3289  	if (inode->logged_trans > 0)
3290  		return 0;
3291  
3292  	/*
3293  	 * If no log tree was created for this root in this transaction, then
3294  	 * the inode can not have been logged in this transaction. In that case
3295  	 * set logged_trans to anything greater than 0 and less than the current
3296  	 * transaction's ID, to avoid the search below in a future call in case
3297  	 * a log tree gets created after this.
3298  	 */
3299  	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
3300  		inode->logged_trans = trans->transid - 1;
3301  		return 0;
3302  	}
3303  
3304  	/*
3305  	 * We have a log tree and the inode's logged_trans is 0. We can't tell
3306  	 * for sure if the inode was logged before in this transaction by looking
3307  	 * only at logged_trans. We could be pessimistic and assume it was, but
3308  	 * that can lead to unnecessarily logging an inode during rename and link
3309  	 * operations, and then further updating the log in followup rename and
3310  	 * link operations, specially if it's a directory, which adds latency
3311  	 * visible to applications doing a series of rename or link operations.
3312  	 *
3313  	 * A logged_trans of 0 here can mean several things:
3314  	 *
3315  	 * 1) The inode was never logged since the filesystem was mounted, and may
3316  	 *    or may have not been evicted and loaded again;
3317  	 *
3318  	 * 2) The inode was logged in a previous transaction, then evicted and
3319  	 *    then loaded again;
3320  	 *
3321  	 * 3) The inode was logged in the current transaction, then evicted and
3322  	 *    then loaded again.
3323  	 *
3324  	 * For cases 1) and 2) we don't want to return true, but we need to detect
3325  	 * case 3) and return true. So we do a search in the log root for the inode
3326  	 * item.
3327  	 */
3328  	key.objectid = btrfs_ino(inode);
3329  	key.type = BTRFS_INODE_ITEM_KEY;
3330  	key.offset = 0;
3331  
3332  	if (!path) {
3333  		path = btrfs_alloc_path();
3334  		if (!path)
3335  			return -ENOMEM;
3336  	}
3337  
3338  	ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3339  
3340  	if (path_in)
3341  		btrfs_release_path(path);
3342  	else
3343  		btrfs_free_path(path);
3344  
3345  	/*
3346  	 * Logging an inode always results in logging its inode item. So if we
3347  	 * did not find the item we know the inode was not logged for sure.
3348  	 */
3349  	if (ret < 0) {
3350  		return ret;
3351  	} else if (ret > 0) {
3352  		/*
3353  		 * Set logged_trans to a value greater than 0 and less then the
3354  		 * current transaction to avoid doing the search in future calls.
3355  		 */
3356  		inode->logged_trans = trans->transid - 1;
3357  		return 0;
3358  	}
3359  
3360  	/*
3361  	 * The inode was previously logged and then evicted, set logged_trans to
3362  	 * the current transacion's ID, to avoid future tree searches as long as
3363  	 * the inode is not evicted again.
3364  	 */
3365  	inode->logged_trans = trans->transid;
3366  
3367  	/*
3368  	 * If it's a directory, then we must set last_dir_index_offset to the
3369  	 * maximum possible value, so that the next attempt to log the inode does
3370  	 * not skip checking if dir index keys found in modified subvolume tree
3371  	 * leaves have been logged before, otherwise it would result in attempts
3372  	 * to insert duplicate dir index keys in the log tree. This must be done
3373  	 * because last_dir_index_offset is an in-memory only field, not persisted
3374  	 * in the inode item or any other on-disk structure, so its value is lost
3375  	 * once the inode is evicted.
3376  	 */
3377  	if (S_ISDIR(inode->vfs_inode.i_mode))
3378  		inode->last_dir_index_offset = (u64)-1;
3379  
3380  	return 1;
3381  }
3382  
3383  /*
3384   * Delete a directory entry from the log if it exists.
3385   *
3386   * Returns < 0 on error
3387   *           1 if the entry does not exists
3388   *           0 if the entry existed and was successfully deleted
3389   */
del_logged_dentry(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,u64 dir_ino,const struct fscrypt_str * name,u64 index)3390  static int del_logged_dentry(struct btrfs_trans_handle *trans,
3391  			     struct btrfs_root *log,
3392  			     struct btrfs_path *path,
3393  			     u64 dir_ino,
3394  			     const struct fscrypt_str *name,
3395  			     u64 index)
3396  {
3397  	struct btrfs_dir_item *di;
3398  
3399  	/*
3400  	 * We only log dir index items of a directory, so we don't need to look
3401  	 * for dir item keys.
3402  	 */
3403  	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3404  					 index, name, -1);
3405  	if (IS_ERR(di))
3406  		return PTR_ERR(di);
3407  	else if (!di)
3408  		return 1;
3409  
3410  	/*
3411  	 * We do not need to update the size field of the directory's
3412  	 * inode item because on log replay we update the field to reflect
3413  	 * all existing entries in the directory (see overwrite_item()).
3414  	 */
3415  	return btrfs_delete_one_dir_name(trans, log, path, di);
3416  }
3417  
3418  /*
3419   * If both a file and directory are logged, and unlinks or renames are
3420   * mixed in, we have a few interesting corners:
3421   *
3422   * create file X in dir Y
3423   * link file X to X.link in dir Y
3424   * fsync file X
3425   * unlink file X but leave X.link
3426   * fsync dir Y
3427   *
3428   * After a crash we would expect only X.link to exist.  But file X
3429   * didn't get fsync'd again so the log has back refs for X and X.link.
3430   *
3431   * We solve this by removing directory entries and inode backrefs from the
3432   * log when a file that was logged in the current transaction is
3433   * unlinked.  Any later fsync will include the updated log entries, and
3434   * we'll be able to reconstruct the proper directory items from backrefs.
3435   *
3436   * This optimizations allows us to avoid relogging the entire inode
3437   * or the entire directory.
3438   */
btrfs_del_dir_entries_in_log(struct btrfs_trans_handle * trans,struct btrfs_root * root,const struct fscrypt_str * name,struct btrfs_inode * dir,u64 index)3439  void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3440  				  struct btrfs_root *root,
3441  				  const struct fscrypt_str *name,
3442  				  struct btrfs_inode *dir, u64 index)
3443  {
3444  	struct btrfs_path *path;
3445  	int ret;
3446  
3447  	ret = inode_logged(trans, dir, NULL);
3448  	if (ret == 0)
3449  		return;
3450  	else if (ret < 0) {
3451  		btrfs_set_log_full_commit(trans);
3452  		return;
3453  	}
3454  
3455  	ret = join_running_log_trans(root);
3456  	if (ret)
3457  		return;
3458  
3459  	mutex_lock(&dir->log_mutex);
3460  
3461  	path = btrfs_alloc_path();
3462  	if (!path) {
3463  		ret = -ENOMEM;
3464  		goto out_unlock;
3465  	}
3466  
3467  	ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
3468  				name, index);
3469  	btrfs_free_path(path);
3470  out_unlock:
3471  	mutex_unlock(&dir->log_mutex);
3472  	if (ret < 0)
3473  		btrfs_set_log_full_commit(trans);
3474  	btrfs_end_log_trans(root);
3475  }
3476  
3477  /* see comments for btrfs_del_dir_entries_in_log */
btrfs_del_inode_ref_in_log(struct btrfs_trans_handle * trans,struct btrfs_root * root,const struct fscrypt_str * name,struct btrfs_inode * inode,u64 dirid)3478  void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3479  				struct btrfs_root *root,
3480  				const struct fscrypt_str *name,
3481  				struct btrfs_inode *inode, u64 dirid)
3482  {
3483  	struct btrfs_root *log;
3484  	u64 index;
3485  	int ret;
3486  
3487  	ret = inode_logged(trans, inode, NULL);
3488  	if (ret == 0)
3489  		return;
3490  	else if (ret < 0) {
3491  		btrfs_set_log_full_commit(trans);
3492  		return;
3493  	}
3494  
3495  	ret = join_running_log_trans(root);
3496  	if (ret)
3497  		return;
3498  	log = root->log_root;
3499  	mutex_lock(&inode->log_mutex);
3500  
3501  	ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
3502  				  dirid, &index);
3503  	mutex_unlock(&inode->log_mutex);
3504  	if (ret < 0 && ret != -ENOENT)
3505  		btrfs_set_log_full_commit(trans);
3506  	btrfs_end_log_trans(root);
3507  }
3508  
3509  /*
3510   * creates a range item in the log for 'dirid'.  first_offset and
3511   * last_offset tell us which parts of the key space the log should
3512   * be considered authoritative for.
3513   */
insert_dir_log_key(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,u64 dirid,u64 first_offset,u64 last_offset)3514  static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3515  				       struct btrfs_root *log,
3516  				       struct btrfs_path *path,
3517  				       u64 dirid,
3518  				       u64 first_offset, u64 last_offset)
3519  {
3520  	int ret;
3521  	struct btrfs_key key;
3522  	struct btrfs_dir_log_item *item;
3523  
3524  	key.objectid = dirid;
3525  	key.offset = first_offset;
3526  	key.type = BTRFS_DIR_LOG_INDEX_KEY;
3527  	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3528  	/*
3529  	 * -EEXIST is fine and can happen sporadically when we are logging a
3530  	 * directory and have concurrent insertions in the subvolume's tree for
3531  	 * items from other inodes and that result in pushing off some dir items
3532  	 * from one leaf to another in order to accommodate for the new items.
3533  	 * This results in logging the same dir index range key.
3534  	 */
3535  	if (ret && ret != -EEXIST)
3536  		return ret;
3537  
3538  	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3539  			      struct btrfs_dir_log_item);
3540  	if (ret == -EEXIST) {
3541  		const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
3542  
3543  		/*
3544  		 * btrfs_del_dir_entries_in_log() might have been called during
3545  		 * an unlink between the initial insertion of this key and the
3546  		 * current update, or we might be logging a single entry deletion
3547  		 * during a rename, so set the new last_offset to the max value.
3548  		 */
3549  		last_offset = max(last_offset, curr_end);
3550  	}
3551  	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3552  	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
3553  	btrfs_release_path(path);
3554  	return 0;
3555  }
3556  
flush_dir_items_batch(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct extent_buffer * src,struct btrfs_path * dst_path,int start_slot,int count)3557  static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
3558  				 struct btrfs_inode *inode,
3559  				 struct extent_buffer *src,
3560  				 struct btrfs_path *dst_path,
3561  				 int start_slot,
3562  				 int count)
3563  {
3564  	struct btrfs_root *log = inode->root->log_root;
3565  	char *ins_data = NULL;
3566  	struct btrfs_item_batch batch;
3567  	struct extent_buffer *dst;
3568  	unsigned long src_offset;
3569  	unsigned long dst_offset;
3570  	u64 last_index;
3571  	struct btrfs_key key;
3572  	u32 item_size;
3573  	int ret;
3574  	int i;
3575  
3576  	ASSERT(count > 0);
3577  	batch.nr = count;
3578  
3579  	if (count == 1) {
3580  		btrfs_item_key_to_cpu(src, &key, start_slot);
3581  		item_size = btrfs_item_size(src, start_slot);
3582  		batch.keys = &key;
3583  		batch.data_sizes = &item_size;
3584  		batch.total_data_size = item_size;
3585  	} else {
3586  		struct btrfs_key *ins_keys;
3587  		u32 *ins_sizes;
3588  
3589  		ins_data = kmalloc(count * sizeof(u32) +
3590  				   count * sizeof(struct btrfs_key), GFP_NOFS);
3591  		if (!ins_data)
3592  			return -ENOMEM;
3593  
3594  		ins_sizes = (u32 *)ins_data;
3595  		ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
3596  		batch.keys = ins_keys;
3597  		batch.data_sizes = ins_sizes;
3598  		batch.total_data_size = 0;
3599  
3600  		for (i = 0; i < count; i++) {
3601  			const int slot = start_slot + i;
3602  
3603  			btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
3604  			ins_sizes[i] = btrfs_item_size(src, slot);
3605  			batch.total_data_size += ins_sizes[i];
3606  		}
3607  	}
3608  
3609  	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3610  	if (ret)
3611  		goto out;
3612  
3613  	dst = dst_path->nodes[0];
3614  	/*
3615  	 * Copy all the items in bulk, in a single copy operation. Item data is
3616  	 * organized such that it's placed at the end of a leaf and from right
3617  	 * to left. For example, the data for the second item ends at an offset
3618  	 * that matches the offset where the data for the first item starts, the
3619  	 * data for the third item ends at an offset that matches the offset
3620  	 * where the data of the second items starts, and so on.
3621  	 * Therefore our source and destination start offsets for copy match the
3622  	 * offsets of the last items (highest slots).
3623  	 */
3624  	dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3625  	src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3626  	copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3627  	btrfs_release_path(dst_path);
3628  
3629  	last_index = batch.keys[count - 1].offset;
3630  	ASSERT(last_index > inode->last_dir_index_offset);
3631  
3632  	/*
3633  	 * If for some unexpected reason the last item's index is not greater
3634  	 * than the last index we logged, warn and force a transaction commit.
3635  	 */
3636  	if (WARN_ON(last_index <= inode->last_dir_index_offset))
3637  		ret = BTRFS_LOG_FORCE_COMMIT;
3638  	else
3639  		inode->last_dir_index_offset = last_index;
3640  
3641  	if (btrfs_get_first_dir_index_to_log(inode) == 0)
3642  		btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
3643  out:
3644  	kfree(ins_data);
3645  
3646  	return ret;
3647  }
3648  
process_dir_items_leaf(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path,struct btrfs_log_ctx * ctx,u64 * last_old_dentry_offset)3649  static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3650  				  struct btrfs_inode *inode,
3651  				  struct btrfs_path *path,
3652  				  struct btrfs_path *dst_path,
3653  				  struct btrfs_log_ctx *ctx,
3654  				  u64 *last_old_dentry_offset)
3655  {
3656  	struct btrfs_root *log = inode->root->log_root;
3657  	struct extent_buffer *src;
3658  	const int nritems = btrfs_header_nritems(path->nodes[0]);
3659  	const u64 ino = btrfs_ino(inode);
3660  	bool last_found = false;
3661  	int batch_start = 0;
3662  	int batch_size = 0;
3663  	int i;
3664  
3665  	/*
3666  	 * We need to clone the leaf, release the read lock on it, and use the
3667  	 * clone before modifying the log tree. See the comment at copy_items()
3668  	 * about why we need to do this.
3669  	 */
3670  	src = btrfs_clone_extent_buffer(path->nodes[0]);
3671  	if (!src)
3672  		return -ENOMEM;
3673  
3674  	i = path->slots[0];
3675  	btrfs_release_path(path);
3676  	path->nodes[0] = src;
3677  	path->slots[0] = i;
3678  
3679  	for (; i < nritems; i++) {
3680  		struct btrfs_dir_item *di;
3681  		struct btrfs_key key;
3682  		int ret;
3683  
3684  		btrfs_item_key_to_cpu(src, &key, i);
3685  
3686  		if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3687  			last_found = true;
3688  			break;
3689  		}
3690  
3691  		di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3692  
3693  		/*
3694  		 * Skip ranges of items that consist only of dir item keys created
3695  		 * in past transactions. However if we find a gap, we must log a
3696  		 * dir index range item for that gap, so that index keys in that
3697  		 * gap are deleted during log replay.
3698  		 */
3699  		if (btrfs_dir_transid(src, di) < trans->transid) {
3700  			if (key.offset > *last_old_dentry_offset + 1) {
3701  				ret = insert_dir_log_key(trans, log, dst_path,
3702  						 ino, *last_old_dentry_offset + 1,
3703  						 key.offset - 1);
3704  				if (ret < 0)
3705  					return ret;
3706  			}
3707  
3708  			*last_old_dentry_offset = key.offset;
3709  			continue;
3710  		}
3711  
3712  		/* If we logged this dir index item before, we can skip it. */
3713  		if (key.offset <= inode->last_dir_index_offset)
3714  			continue;
3715  
3716  		/*
3717  		 * We must make sure that when we log a directory entry, the
3718  		 * corresponding inode, after log replay, has a matching link
3719  		 * count. For example:
3720  		 *
3721  		 * touch foo
3722  		 * mkdir mydir
3723  		 * sync
3724  		 * ln foo mydir/bar
3725  		 * xfs_io -c "fsync" mydir
3726  		 * <crash>
3727  		 * <mount fs and log replay>
3728  		 *
3729  		 * Would result in a fsync log that when replayed, our file inode
3730  		 * would have a link count of 1, but we get two directory entries
3731  		 * pointing to the same inode. After removing one of the names,
3732  		 * it would not be possible to remove the other name, which
3733  		 * resulted always in stale file handle errors, and would not be
3734  		 * possible to rmdir the parent directory, since its i_size could
3735  		 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3736  		 * resulting in -ENOTEMPTY errors.
3737  		 */
3738  		if (!ctx->log_new_dentries) {
3739  			struct btrfs_key di_key;
3740  
3741  			btrfs_dir_item_key_to_cpu(src, di, &di_key);
3742  			if (di_key.type != BTRFS_ROOT_ITEM_KEY)
3743  				ctx->log_new_dentries = true;
3744  		}
3745  
3746  		if (batch_size == 0)
3747  			batch_start = i;
3748  		batch_size++;
3749  	}
3750  
3751  	if (batch_size > 0) {
3752  		int ret;
3753  
3754  		ret = flush_dir_items_batch(trans, inode, src, dst_path,
3755  					    batch_start, batch_size);
3756  		if (ret < 0)
3757  			return ret;
3758  	}
3759  
3760  	return last_found ? 1 : 0;
3761  }
3762  
3763  /*
3764   * log all the items included in the current transaction for a given
3765   * directory.  This also creates the range items in the log tree required
3766   * to replay anything deleted before the fsync
3767   */
log_dir_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path,struct btrfs_log_ctx * ctx,u64 min_offset,u64 * last_offset_ret)3768  static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3769  			  struct btrfs_inode *inode,
3770  			  struct btrfs_path *path,
3771  			  struct btrfs_path *dst_path,
3772  			  struct btrfs_log_ctx *ctx,
3773  			  u64 min_offset, u64 *last_offset_ret)
3774  {
3775  	struct btrfs_key min_key;
3776  	struct btrfs_root *root = inode->root;
3777  	struct btrfs_root *log = root->log_root;
3778  	int ret;
3779  	u64 last_old_dentry_offset = min_offset - 1;
3780  	u64 last_offset = (u64)-1;
3781  	u64 ino = btrfs_ino(inode);
3782  
3783  	min_key.objectid = ino;
3784  	min_key.type = BTRFS_DIR_INDEX_KEY;
3785  	min_key.offset = min_offset;
3786  
3787  	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3788  
3789  	/*
3790  	 * we didn't find anything from this transaction, see if there
3791  	 * is anything at all
3792  	 */
3793  	if (ret != 0 || min_key.objectid != ino ||
3794  	    min_key.type != BTRFS_DIR_INDEX_KEY) {
3795  		min_key.objectid = ino;
3796  		min_key.type = BTRFS_DIR_INDEX_KEY;
3797  		min_key.offset = (u64)-1;
3798  		btrfs_release_path(path);
3799  		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3800  		if (ret < 0) {
3801  			btrfs_release_path(path);
3802  			return ret;
3803  		}
3804  		ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3805  
3806  		/* if ret == 0 there are items for this type,
3807  		 * create a range to tell us the last key of this type.
3808  		 * otherwise, there are no items in this directory after
3809  		 * *min_offset, and we create a range to indicate that.
3810  		 */
3811  		if (ret == 0) {
3812  			struct btrfs_key tmp;
3813  
3814  			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3815  					      path->slots[0]);
3816  			if (tmp.type == BTRFS_DIR_INDEX_KEY)
3817  				last_old_dentry_offset = tmp.offset;
3818  		} else if (ret > 0) {
3819  			ret = 0;
3820  		}
3821  
3822  		goto done;
3823  	}
3824  
3825  	/* go backward to find any previous key */
3826  	ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3827  	if (ret == 0) {
3828  		struct btrfs_key tmp;
3829  
3830  		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3831  		/*
3832  		 * The dir index key before the first one we found that needs to
3833  		 * be logged might be in a previous leaf, and there might be a
3834  		 * gap between these keys, meaning that we had deletions that
3835  		 * happened. So the key range item we log (key type
3836  		 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
3837  		 * previous key's offset plus 1, so that those deletes are replayed.
3838  		 */
3839  		if (tmp.type == BTRFS_DIR_INDEX_KEY)
3840  			last_old_dentry_offset = tmp.offset;
3841  	} else if (ret < 0) {
3842  		goto done;
3843  	}
3844  
3845  	btrfs_release_path(path);
3846  
3847  	/*
3848  	 * Find the first key from this transaction again or the one we were at
3849  	 * in the loop below in case we had to reschedule. We may be logging the
3850  	 * directory without holding its VFS lock, which happen when logging new
3851  	 * dentries (through log_new_dir_dentries()) or in some cases when we
3852  	 * need to log the parent directory of an inode. This means a dir index
3853  	 * key might be deleted from the inode's root, and therefore we may not
3854  	 * find it anymore. If we can't find it, just move to the next key. We
3855  	 * can not bail out and ignore, because if we do that we will simply
3856  	 * not log dir index keys that come after the one that was just deleted
3857  	 * and we can end up logging a dir index range that ends at (u64)-1
3858  	 * (@last_offset is initialized to that), resulting in removing dir
3859  	 * entries we should not remove at log replay time.
3860  	 */
3861  search:
3862  	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3863  	if (ret > 0) {
3864  		ret = btrfs_next_item(root, path);
3865  		if (ret > 0) {
3866  			/* There are no more keys in the inode's root. */
3867  			ret = 0;
3868  			goto done;
3869  		}
3870  	}
3871  	if (ret < 0)
3872  		goto done;
3873  
3874  	/*
3875  	 * we have a block from this transaction, log every item in it
3876  	 * from our directory
3877  	 */
3878  	while (1) {
3879  		ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
3880  					     &last_old_dentry_offset);
3881  		if (ret != 0) {
3882  			if (ret > 0)
3883  				ret = 0;
3884  			goto done;
3885  		}
3886  		path->slots[0] = btrfs_header_nritems(path->nodes[0]);
3887  
3888  		/*
3889  		 * look ahead to the next item and see if it is also
3890  		 * from this directory and from this transaction
3891  		 */
3892  		ret = btrfs_next_leaf(root, path);
3893  		if (ret) {
3894  			if (ret == 1) {
3895  				last_offset = (u64)-1;
3896  				ret = 0;
3897  			}
3898  			goto done;
3899  		}
3900  		btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
3901  		if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
3902  			last_offset = (u64)-1;
3903  			goto done;
3904  		}
3905  		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3906  			/*
3907  			 * The next leaf was not changed in the current transaction
3908  			 * and has at least one dir index key.
3909  			 * We check for the next key because there might have been
3910  			 * one or more deletions between the last key we logged and
3911  			 * that next key. So the key range item we log (key type
3912  			 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
3913  			 * offset minus 1, so that those deletes are replayed.
3914  			 */
3915  			last_offset = min_key.offset - 1;
3916  			goto done;
3917  		}
3918  		if (need_resched()) {
3919  			btrfs_release_path(path);
3920  			cond_resched();
3921  			goto search;
3922  		}
3923  	}
3924  done:
3925  	btrfs_release_path(path);
3926  	btrfs_release_path(dst_path);
3927  
3928  	if (ret == 0) {
3929  		*last_offset_ret = last_offset;
3930  		/*
3931  		 * In case the leaf was changed in the current transaction but
3932  		 * all its dir items are from a past transaction, the last item
3933  		 * in the leaf is a dir item and there's no gap between that last
3934  		 * dir item and the first one on the next leaf (which did not
3935  		 * change in the current transaction), then we don't need to log
3936  		 * a range, last_old_dentry_offset is == to last_offset.
3937  		 */
3938  		ASSERT(last_old_dentry_offset <= last_offset);
3939  		if (last_old_dentry_offset < last_offset)
3940  			ret = insert_dir_log_key(trans, log, path, ino,
3941  						 last_old_dentry_offset + 1,
3942  						 last_offset);
3943  	}
3944  
3945  	return ret;
3946  }
3947  
3948  /*
3949   * If the inode was logged before and it was evicted, then its
3950   * last_dir_index_offset is (u64)-1, so we don't the value of the last index
3951   * key offset. If that's the case, search for it and update the inode. This
3952   * is to avoid lookups in the log tree every time we try to insert a dir index
3953   * key from a leaf changed in the current transaction, and to allow us to always
3954   * do batch insertions of dir index keys.
3955   */
update_last_dir_index_offset(struct btrfs_inode * inode,struct btrfs_path * path,const struct btrfs_log_ctx * ctx)3956  static int update_last_dir_index_offset(struct btrfs_inode *inode,
3957  					struct btrfs_path *path,
3958  					const struct btrfs_log_ctx *ctx)
3959  {
3960  	const u64 ino = btrfs_ino(inode);
3961  	struct btrfs_key key;
3962  	int ret;
3963  
3964  	lockdep_assert_held(&inode->log_mutex);
3965  
3966  	if (inode->last_dir_index_offset != (u64)-1)
3967  		return 0;
3968  
3969  	if (!ctx->logged_before) {
3970  		inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
3971  		return 0;
3972  	}
3973  
3974  	key.objectid = ino;
3975  	key.type = BTRFS_DIR_INDEX_KEY;
3976  	key.offset = (u64)-1;
3977  
3978  	ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3979  	/*
3980  	 * An error happened or we actually have an index key with an offset
3981  	 * value of (u64)-1. Bail out, we're done.
3982  	 */
3983  	if (ret <= 0)
3984  		goto out;
3985  
3986  	ret = 0;
3987  	inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
3988  
3989  	/*
3990  	 * No dir index items, bail out and leave last_dir_index_offset with
3991  	 * the value right before the first valid index value.
3992  	 */
3993  	if (path->slots[0] == 0)
3994  		goto out;
3995  
3996  	/*
3997  	 * btrfs_search_slot() left us at one slot beyond the slot with the last
3998  	 * index key, or beyond the last key of the directory that is not an
3999  	 * index key. If we have an index key before, set last_dir_index_offset
4000  	 * to its offset value, otherwise leave it with a value right before the
4001  	 * first valid index value, as it means we have an empty directory.
4002  	 */
4003  	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4004  	if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4005  		inode->last_dir_index_offset = key.offset;
4006  
4007  out:
4008  	btrfs_release_path(path);
4009  
4010  	return ret;
4011  }
4012  
4013  /*
4014   * logging directories is very similar to logging inodes, We find all the items
4015   * from the current transaction and write them to the log.
4016   *
4017   * The recovery code scans the directory in the subvolume, and if it finds a
4018   * key in the range logged that is not present in the log tree, then it means
4019   * that dir entry was unlinked during the transaction.
4020   *
4021   * In order for that scan to work, we must include one key smaller than
4022   * the smallest logged by this transaction and one key larger than the largest
4023   * key logged by this transaction.
4024   */
log_directory_changes(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path,struct btrfs_log_ctx * ctx)4025  static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4026  			  struct btrfs_inode *inode,
4027  			  struct btrfs_path *path,
4028  			  struct btrfs_path *dst_path,
4029  			  struct btrfs_log_ctx *ctx)
4030  {
4031  	u64 min_key;
4032  	u64 max_key;
4033  	int ret;
4034  
4035  	ret = update_last_dir_index_offset(inode, path, ctx);
4036  	if (ret)
4037  		return ret;
4038  
4039  	min_key = BTRFS_DIR_START_INDEX;
4040  	max_key = 0;
4041  
4042  	while (1) {
4043  		ret = log_dir_items(trans, inode, path, dst_path,
4044  				ctx, min_key, &max_key);
4045  		if (ret)
4046  			return ret;
4047  		if (max_key == (u64)-1)
4048  			break;
4049  		min_key = max_key + 1;
4050  	}
4051  
4052  	return 0;
4053  }
4054  
4055  /*
4056   * a helper function to drop items from the log before we relog an
4057   * inode.  max_key_type indicates the highest item type to remove.
4058   * This cannot be run for file data extents because it does not
4059   * free the extents they point to.
4060   */
drop_inode_items(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,struct btrfs_inode * inode,int max_key_type)4061  static int drop_inode_items(struct btrfs_trans_handle *trans,
4062  				  struct btrfs_root *log,
4063  				  struct btrfs_path *path,
4064  				  struct btrfs_inode *inode,
4065  				  int max_key_type)
4066  {
4067  	int ret;
4068  	struct btrfs_key key;
4069  	struct btrfs_key found_key;
4070  	int start_slot;
4071  
4072  	key.objectid = btrfs_ino(inode);
4073  	key.type = max_key_type;
4074  	key.offset = (u64)-1;
4075  
4076  	while (1) {
4077  		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4078  		if (ret < 0) {
4079  			break;
4080  		} else if (ret > 0) {
4081  			if (path->slots[0] == 0)
4082  				break;
4083  			path->slots[0]--;
4084  		}
4085  
4086  		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4087  				      path->slots[0]);
4088  
4089  		if (found_key.objectid != key.objectid)
4090  			break;
4091  
4092  		found_key.offset = 0;
4093  		found_key.type = 0;
4094  		ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
4095  		if (ret < 0)
4096  			break;
4097  
4098  		ret = btrfs_del_items(trans, log, path, start_slot,
4099  				      path->slots[0] - start_slot + 1);
4100  		/*
4101  		 * If start slot isn't 0 then we don't need to re-search, we've
4102  		 * found the last guy with the objectid in this tree.
4103  		 */
4104  		if (ret || start_slot != 0)
4105  			break;
4106  		btrfs_release_path(path);
4107  	}
4108  	btrfs_release_path(path);
4109  	if (ret > 0)
4110  		ret = 0;
4111  	return ret;
4112  }
4113  
truncate_inode_items(struct btrfs_trans_handle * trans,struct btrfs_root * log_root,struct btrfs_inode * inode,u64 new_size,u32 min_type)4114  static int truncate_inode_items(struct btrfs_trans_handle *trans,
4115  				struct btrfs_root *log_root,
4116  				struct btrfs_inode *inode,
4117  				u64 new_size, u32 min_type)
4118  {
4119  	struct btrfs_truncate_control control = {
4120  		.new_size = new_size,
4121  		.ino = btrfs_ino(inode),
4122  		.min_type = min_type,
4123  		.skip_ref_updates = true,
4124  	};
4125  
4126  	return btrfs_truncate_inode_items(trans, log_root, &control);
4127  }
4128  
fill_inode_item(struct btrfs_trans_handle * trans,struct extent_buffer * leaf,struct btrfs_inode_item * item,struct inode * inode,int log_inode_only,u64 logged_isize)4129  static void fill_inode_item(struct btrfs_trans_handle *trans,
4130  			    struct extent_buffer *leaf,
4131  			    struct btrfs_inode_item *item,
4132  			    struct inode *inode, int log_inode_only,
4133  			    u64 logged_isize)
4134  {
4135  	struct btrfs_map_token token;
4136  	u64 flags;
4137  
4138  	btrfs_init_map_token(&token, leaf);
4139  
4140  	if (log_inode_only) {
4141  		/* set the generation to zero so the recover code
4142  		 * can tell the difference between an logging
4143  		 * just to say 'this inode exists' and a logging
4144  		 * to say 'update this inode with these values'
4145  		 */
4146  		btrfs_set_token_inode_generation(&token, item, 0);
4147  		btrfs_set_token_inode_size(&token, item, logged_isize);
4148  	} else {
4149  		btrfs_set_token_inode_generation(&token, item,
4150  						 BTRFS_I(inode)->generation);
4151  		btrfs_set_token_inode_size(&token, item, inode->i_size);
4152  	}
4153  
4154  	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4155  	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4156  	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4157  	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4158  
4159  	btrfs_set_token_timespec_sec(&token, &item->atime,
4160  				     inode->i_atime.tv_sec);
4161  	btrfs_set_token_timespec_nsec(&token, &item->atime,
4162  				      inode->i_atime.tv_nsec);
4163  
4164  	btrfs_set_token_timespec_sec(&token, &item->mtime,
4165  				     inode->i_mtime.tv_sec);
4166  	btrfs_set_token_timespec_nsec(&token, &item->mtime,
4167  				      inode->i_mtime.tv_nsec);
4168  
4169  	btrfs_set_token_timespec_sec(&token, &item->ctime,
4170  				     inode_get_ctime(inode).tv_sec);
4171  	btrfs_set_token_timespec_nsec(&token, &item->ctime,
4172  				      inode_get_ctime(inode).tv_nsec);
4173  
4174  	/*
4175  	 * We do not need to set the nbytes field, in fact during a fast fsync
4176  	 * its value may not even be correct, since a fast fsync does not wait
4177  	 * for ordered extent completion, which is where we update nbytes, it
4178  	 * only waits for writeback to complete. During log replay as we find
4179  	 * file extent items and replay them, we adjust the nbytes field of the
4180  	 * inode item in subvolume tree as needed (see overwrite_item()).
4181  	 */
4182  
4183  	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4184  	btrfs_set_token_inode_transid(&token, item, trans->transid);
4185  	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4186  	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4187  					  BTRFS_I(inode)->ro_flags);
4188  	btrfs_set_token_inode_flags(&token, item, flags);
4189  	btrfs_set_token_inode_block_group(&token, item, 0);
4190  }
4191  
log_inode_item(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,struct btrfs_inode * inode,bool inode_item_dropped)4192  static int log_inode_item(struct btrfs_trans_handle *trans,
4193  			  struct btrfs_root *log, struct btrfs_path *path,
4194  			  struct btrfs_inode *inode, bool inode_item_dropped)
4195  {
4196  	struct btrfs_inode_item *inode_item;
4197  	int ret;
4198  
4199  	/*
4200  	 * If we are doing a fast fsync and the inode was logged before in the
4201  	 * current transaction, then we know the inode was previously logged and
4202  	 * it exists in the log tree. For performance reasons, in this case use
4203  	 * btrfs_search_slot() directly with ins_len set to 0 so that we never
4204  	 * attempt a write lock on the leaf's parent, which adds unnecessary lock
4205  	 * contention in case there are concurrent fsyncs for other inodes of the
4206  	 * same subvolume. Using btrfs_insert_empty_item() when the inode item
4207  	 * already exists can also result in unnecessarily splitting a leaf.
4208  	 */
4209  	if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4210  		ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
4211  		ASSERT(ret <= 0);
4212  		if (ret > 0)
4213  			ret = -ENOENT;
4214  	} else {
4215  		/*
4216  		 * This means it is the first fsync in the current transaction,
4217  		 * so the inode item is not in the log and we need to insert it.
4218  		 * We can never get -EEXIST because we are only called for a fast
4219  		 * fsync and in case an inode eviction happens after the inode was
4220  		 * logged before in the current transaction, when we load again
4221  		 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4222  		 * flags and set ->logged_trans to 0.
4223  		 */
4224  		ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4225  					      sizeof(*inode_item));
4226  		ASSERT(ret != -EEXIST);
4227  	}
4228  	if (ret)
4229  		return ret;
4230  	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4231  				    struct btrfs_inode_item);
4232  	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4233  			0, 0);
4234  	btrfs_release_path(path);
4235  	return 0;
4236  }
4237  
log_csums(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_root * log_root,struct btrfs_ordered_sum * sums)4238  static int log_csums(struct btrfs_trans_handle *trans,
4239  		     struct btrfs_inode *inode,
4240  		     struct btrfs_root *log_root,
4241  		     struct btrfs_ordered_sum *sums)
4242  {
4243  	const u64 lock_end = sums->logical + sums->len - 1;
4244  	struct extent_state *cached_state = NULL;
4245  	int ret;
4246  
4247  	/*
4248  	 * If this inode was not used for reflink operations in the current
4249  	 * transaction with new extents, then do the fast path, no need to
4250  	 * worry about logging checksum items with overlapping ranges.
4251  	 */
4252  	if (inode->last_reflink_trans < trans->transid)
4253  		return btrfs_csum_file_blocks(trans, log_root, sums);
4254  
4255  	/*
4256  	 * Serialize logging for checksums. This is to avoid racing with the
4257  	 * same checksum being logged by another task that is logging another
4258  	 * file which happens to refer to the same extent as well. Such races
4259  	 * can leave checksum items in the log with overlapping ranges.
4260  	 */
4261  	ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4262  			  &cached_state);
4263  	if (ret)
4264  		return ret;
4265  	/*
4266  	 * Due to extent cloning, we might have logged a csum item that covers a
4267  	 * subrange of a cloned extent, and later we can end up logging a csum
4268  	 * item for a larger subrange of the same extent or the entire range.
4269  	 * This would leave csum items in the log tree that cover the same range
4270  	 * and break the searches for checksums in the log tree, resulting in
4271  	 * some checksums missing in the fs/subvolume tree. So just delete (or
4272  	 * trim and adjust) any existing csum items in the log for this range.
4273  	 */
4274  	ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
4275  	if (!ret)
4276  		ret = btrfs_csum_file_blocks(trans, log_root, sums);
4277  
4278  	unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4279  		      &cached_state);
4280  
4281  	return ret;
4282  }
4283  
copy_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * dst_path,struct btrfs_path * src_path,int start_slot,int nr,int inode_only,u64 logged_isize)4284  static noinline int copy_items(struct btrfs_trans_handle *trans,
4285  			       struct btrfs_inode *inode,
4286  			       struct btrfs_path *dst_path,
4287  			       struct btrfs_path *src_path,
4288  			       int start_slot, int nr, int inode_only,
4289  			       u64 logged_isize)
4290  {
4291  	struct btrfs_root *log = inode->root->log_root;
4292  	struct btrfs_file_extent_item *extent;
4293  	struct extent_buffer *src;
4294  	int ret = 0;
4295  	struct btrfs_key *ins_keys;
4296  	u32 *ins_sizes;
4297  	struct btrfs_item_batch batch;
4298  	char *ins_data;
4299  	int i;
4300  	int dst_index;
4301  	const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4302  	const u64 i_size = i_size_read(&inode->vfs_inode);
4303  
4304  	/*
4305  	 * To keep lockdep happy and avoid deadlocks, clone the source leaf and
4306  	 * use the clone. This is because otherwise we would be changing the log
4307  	 * tree, to insert items from the subvolume tree or insert csum items,
4308  	 * while holding a read lock on a leaf from the subvolume tree, which
4309  	 * creates a nasty lock dependency when COWing log tree nodes/leaves:
4310  	 *
4311  	 * 1) Modifying the log tree triggers an extent buffer allocation while
4312  	 *    holding a write lock on a parent extent buffer from the log tree.
4313  	 *    Allocating the pages for an extent buffer, or the extent buffer
4314  	 *    struct, can trigger inode eviction and finally the inode eviction
4315  	 *    will trigger a release/remove of a delayed node, which requires
4316  	 *    taking the delayed node's mutex;
4317  	 *
4318  	 * 2) Allocating a metadata extent for a log tree can trigger the async
4319  	 *    reclaim thread and make us wait for it to release enough space and
4320  	 *    unblock our reservation ticket. The reclaim thread can start
4321  	 *    flushing delayed items, and that in turn results in the need to
4322  	 *    lock delayed node mutexes and in the need to write lock extent
4323  	 *    buffers of a subvolume tree - all this while holding a write lock
4324  	 *    on the parent extent buffer in the log tree.
4325  	 *
4326  	 * So one task in scenario 1) running in parallel with another task in
4327  	 * scenario 2) could lead to a deadlock, one wanting to lock a delayed
4328  	 * node mutex while having a read lock on a leaf from the subvolume,
4329  	 * while the other is holding the delayed node's mutex and wants to
4330  	 * write lock the same subvolume leaf for flushing delayed items.
4331  	 */
4332  	src = btrfs_clone_extent_buffer(src_path->nodes[0]);
4333  	if (!src)
4334  		return -ENOMEM;
4335  
4336  	i = src_path->slots[0];
4337  	btrfs_release_path(src_path);
4338  	src_path->nodes[0] = src;
4339  	src_path->slots[0] = i;
4340  
4341  	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4342  			   nr * sizeof(u32), GFP_NOFS);
4343  	if (!ins_data)
4344  		return -ENOMEM;
4345  
4346  	ins_sizes = (u32 *)ins_data;
4347  	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4348  	batch.keys = ins_keys;
4349  	batch.data_sizes = ins_sizes;
4350  	batch.total_data_size = 0;
4351  	batch.nr = 0;
4352  
4353  	dst_index = 0;
4354  	for (i = 0; i < nr; i++) {
4355  		const int src_slot = start_slot + i;
4356  		struct btrfs_root *csum_root;
4357  		struct btrfs_ordered_sum *sums;
4358  		struct btrfs_ordered_sum *sums_next;
4359  		LIST_HEAD(ordered_sums);
4360  		u64 disk_bytenr;
4361  		u64 disk_num_bytes;
4362  		u64 extent_offset;
4363  		u64 extent_num_bytes;
4364  		bool is_old_extent;
4365  
4366  		btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4367  
4368  		if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4369  			goto add_to_batch;
4370  
4371  		extent = btrfs_item_ptr(src, src_slot,
4372  					struct btrfs_file_extent_item);
4373  
4374  		is_old_extent = (btrfs_file_extent_generation(src, extent) <
4375  				 trans->transid);
4376  
4377  		/*
4378  		 * Don't copy extents from past generations. That would make us
4379  		 * log a lot more metadata for common cases like doing only a
4380  		 * few random writes into a file and then fsync it for the first
4381  		 * time or after the full sync flag is set on the inode. We can
4382  		 * get leaves full of extent items, most of which are from past
4383  		 * generations, so we can skip them - as long as the inode has
4384  		 * not been the target of a reflink operation in this transaction,
4385  		 * as in that case it might have had file extent items with old
4386  		 * generations copied into it. We also must always log prealloc
4387  		 * extents that start at or beyond eof, otherwise we would lose
4388  		 * them on log replay.
4389  		 */
4390  		if (is_old_extent &&
4391  		    ins_keys[dst_index].offset < i_size &&
4392  		    inode->last_reflink_trans < trans->transid)
4393  			continue;
4394  
4395  		if (skip_csum)
4396  			goto add_to_batch;
4397  
4398  		/* Only regular extents have checksums. */
4399  		if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4400  			goto add_to_batch;
4401  
4402  		/*
4403  		 * If it's an extent created in a past transaction, then its
4404  		 * checksums are already accessible from the committed csum tree,
4405  		 * no need to log them.
4406  		 */
4407  		if (is_old_extent)
4408  			goto add_to_batch;
4409  
4410  		disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4411  		/* If it's an explicit hole, there are no checksums. */
4412  		if (disk_bytenr == 0)
4413  			goto add_to_batch;
4414  
4415  		disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4416  
4417  		if (btrfs_file_extent_compression(src, extent)) {
4418  			extent_offset = 0;
4419  			extent_num_bytes = disk_num_bytes;
4420  		} else {
4421  			extent_offset = btrfs_file_extent_offset(src, extent);
4422  			extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4423  		}
4424  
4425  		csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4426  		disk_bytenr += extent_offset;
4427  		ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4428  					      disk_bytenr + extent_num_bytes - 1,
4429  					      &ordered_sums, 0, false);
4430  		if (ret)
4431  			goto out;
4432  
4433  		list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4434  			if (!ret)
4435  				ret = log_csums(trans, inode, log, sums);
4436  			list_del(&sums->list);
4437  			kfree(sums);
4438  		}
4439  		if (ret)
4440  			goto out;
4441  
4442  add_to_batch:
4443  		ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4444  		batch.total_data_size += ins_sizes[dst_index];
4445  		batch.nr++;
4446  		dst_index++;
4447  	}
4448  
4449  	/*
4450  	 * We have a leaf full of old extent items that don't need to be logged,
4451  	 * so we don't need to do anything.
4452  	 */
4453  	if (batch.nr == 0)
4454  		goto out;
4455  
4456  	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4457  	if (ret)
4458  		goto out;
4459  
4460  	dst_index = 0;
4461  	for (i = 0; i < nr; i++) {
4462  		const int src_slot = start_slot + i;
4463  		const int dst_slot = dst_path->slots[0] + dst_index;
4464  		struct btrfs_key key;
4465  		unsigned long src_offset;
4466  		unsigned long dst_offset;
4467  
4468  		/*
4469  		 * We're done, all the remaining items in the source leaf
4470  		 * correspond to old file extent items.
4471  		 */
4472  		if (dst_index >= batch.nr)
4473  			break;
4474  
4475  		btrfs_item_key_to_cpu(src, &key, src_slot);
4476  
4477  		if (key.type != BTRFS_EXTENT_DATA_KEY)
4478  			goto copy_item;
4479  
4480  		extent = btrfs_item_ptr(src, src_slot,
4481  					struct btrfs_file_extent_item);
4482  
4483  		/* See the comment in the previous loop, same logic. */
4484  		if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4485  		    key.offset < i_size &&
4486  		    inode->last_reflink_trans < trans->transid)
4487  			continue;
4488  
4489  copy_item:
4490  		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4491  		src_offset = btrfs_item_ptr_offset(src, src_slot);
4492  
4493  		if (key.type == BTRFS_INODE_ITEM_KEY) {
4494  			struct btrfs_inode_item *inode_item;
4495  
4496  			inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
4497  						    struct btrfs_inode_item);
4498  			fill_inode_item(trans, dst_path->nodes[0], inode_item,
4499  					&inode->vfs_inode,
4500  					inode_only == LOG_INODE_EXISTS,
4501  					logged_isize);
4502  		} else {
4503  			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4504  					   src_offset, ins_sizes[dst_index]);
4505  		}
4506  
4507  		dst_index++;
4508  	}
4509  
4510  	btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
4511  	btrfs_release_path(dst_path);
4512  out:
4513  	kfree(ins_data);
4514  
4515  	return ret;
4516  }
4517  
extent_cmp(void * priv,const struct list_head * a,const struct list_head * b)4518  static int extent_cmp(void *priv, const struct list_head *a,
4519  		      const struct list_head *b)
4520  {
4521  	const struct extent_map *em1, *em2;
4522  
4523  	em1 = list_entry(a, struct extent_map, list);
4524  	em2 = list_entry(b, struct extent_map, list);
4525  
4526  	if (em1->start < em2->start)
4527  		return -1;
4528  	else if (em1->start > em2->start)
4529  		return 1;
4530  	return 0;
4531  }
4532  
log_extent_csums(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_root * log_root,const struct extent_map * em,struct btrfs_log_ctx * ctx)4533  static int log_extent_csums(struct btrfs_trans_handle *trans,
4534  			    struct btrfs_inode *inode,
4535  			    struct btrfs_root *log_root,
4536  			    const struct extent_map *em,
4537  			    struct btrfs_log_ctx *ctx)
4538  {
4539  	struct btrfs_ordered_extent *ordered;
4540  	struct btrfs_root *csum_root;
4541  	u64 csum_offset;
4542  	u64 csum_len;
4543  	u64 mod_start = em->mod_start;
4544  	u64 mod_len = em->mod_len;
4545  	LIST_HEAD(ordered_sums);
4546  	int ret = 0;
4547  
4548  	if (inode->flags & BTRFS_INODE_NODATASUM ||
4549  	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4550  	    em->block_start == EXTENT_MAP_HOLE)
4551  		return 0;
4552  
4553  	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4554  		const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4555  		const u64 mod_end = mod_start + mod_len;
4556  		struct btrfs_ordered_sum *sums;
4557  
4558  		if (mod_len == 0)
4559  			break;
4560  
4561  		if (ordered_end <= mod_start)
4562  			continue;
4563  		if (mod_end <= ordered->file_offset)
4564  			break;
4565  
4566  		/*
4567  		 * We are going to copy all the csums on this ordered extent, so
4568  		 * go ahead and adjust mod_start and mod_len in case this ordered
4569  		 * extent has already been logged.
4570  		 */
4571  		if (ordered->file_offset > mod_start) {
4572  			if (ordered_end >= mod_end)
4573  				mod_len = ordered->file_offset - mod_start;
4574  			/*
4575  			 * If we have this case
4576  			 *
4577  			 * |--------- logged extent ---------|
4578  			 *       |----- ordered extent ----|
4579  			 *
4580  			 * Just don't mess with mod_start and mod_len, we'll
4581  			 * just end up logging more csums than we need and it
4582  			 * will be ok.
4583  			 */
4584  		} else {
4585  			if (ordered_end < mod_end) {
4586  				mod_len = mod_end - ordered_end;
4587  				mod_start = ordered_end;
4588  			} else {
4589  				mod_len = 0;
4590  			}
4591  		}
4592  
4593  		/*
4594  		 * To keep us from looping for the above case of an ordered
4595  		 * extent that falls inside of the logged extent.
4596  		 */
4597  		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4598  			continue;
4599  
4600  		list_for_each_entry(sums, &ordered->list, list) {
4601  			ret = log_csums(trans, inode, log_root, sums);
4602  			if (ret)
4603  				return ret;
4604  		}
4605  	}
4606  
4607  	/* We're done, found all csums in the ordered extents. */
4608  	if (mod_len == 0)
4609  		return 0;
4610  
4611  	/* If we're compressed we have to save the entire range of csums. */
4612  	if (em->compress_type) {
4613  		csum_offset = 0;
4614  		csum_len = max(em->block_len, em->orig_block_len);
4615  	} else {
4616  		csum_offset = mod_start - em->start;
4617  		csum_len = mod_len;
4618  	}
4619  
4620  	/* block start is already adjusted for the file extent offset. */
4621  	csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
4622  	ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
4623  				      em->block_start + csum_offset +
4624  				      csum_len - 1, &ordered_sums, 0, false);
4625  	if (ret)
4626  		return ret;
4627  
4628  	while (!list_empty(&ordered_sums)) {
4629  		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4630  						   struct btrfs_ordered_sum,
4631  						   list);
4632  		if (!ret)
4633  			ret = log_csums(trans, inode, log_root, sums);
4634  		list_del(&sums->list);
4635  		kfree(sums);
4636  	}
4637  
4638  	return ret;
4639  }
4640  
log_one_extent(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,const struct extent_map * em,struct btrfs_path * path,struct btrfs_log_ctx * ctx)4641  static int log_one_extent(struct btrfs_trans_handle *trans,
4642  			  struct btrfs_inode *inode,
4643  			  const struct extent_map *em,
4644  			  struct btrfs_path *path,
4645  			  struct btrfs_log_ctx *ctx)
4646  {
4647  	struct btrfs_drop_extents_args drop_args = { 0 };
4648  	struct btrfs_root *log = inode->root->log_root;
4649  	struct btrfs_file_extent_item fi = { 0 };
4650  	struct extent_buffer *leaf;
4651  	struct btrfs_key key;
4652  	u64 extent_offset = em->start - em->orig_start;
4653  	u64 block_len;
4654  	int ret;
4655  
4656  	btrfs_set_stack_file_extent_generation(&fi, trans->transid);
4657  	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4658  		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
4659  	else
4660  		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
4661  
4662  	block_len = max(em->block_len, em->orig_block_len);
4663  	if (em->compress_type != BTRFS_COMPRESS_NONE) {
4664  		btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
4665  		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4666  	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4667  		btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
4668  							extent_offset);
4669  		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4670  	}
4671  
4672  	btrfs_set_stack_file_extent_offset(&fi, extent_offset);
4673  	btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
4674  	btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
4675  	btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
4676  
4677  	ret = log_extent_csums(trans, inode, log, em, ctx);
4678  	if (ret)
4679  		return ret;
4680  
4681  	/*
4682  	 * If this is the first time we are logging the inode in the current
4683  	 * transaction, we can avoid btrfs_drop_extents(), which is expensive
4684  	 * because it does a deletion search, which always acquires write locks
4685  	 * for extent buffers at levels 2, 1 and 0. This not only wastes time
4686  	 * but also adds significant contention in a log tree, since log trees
4687  	 * are small, with a root at level 2 or 3 at most, due to their short
4688  	 * life span.
4689  	 */
4690  	if (ctx->logged_before) {
4691  		drop_args.path = path;
4692  		drop_args.start = em->start;
4693  		drop_args.end = em->start + em->len;
4694  		drop_args.replace_extent = true;
4695  		drop_args.extent_item_size = sizeof(fi);
4696  		ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4697  		if (ret)
4698  			return ret;
4699  	}
4700  
4701  	if (!drop_args.extent_inserted) {
4702  		key.objectid = btrfs_ino(inode);
4703  		key.type = BTRFS_EXTENT_DATA_KEY;
4704  		key.offset = em->start;
4705  
4706  		ret = btrfs_insert_empty_item(trans, log, path, &key,
4707  					      sizeof(fi));
4708  		if (ret)
4709  			return ret;
4710  	}
4711  	leaf = path->nodes[0];
4712  	write_extent_buffer(leaf, &fi,
4713  			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4714  			    sizeof(fi));
4715  	btrfs_mark_buffer_dirty(trans, leaf);
4716  
4717  	btrfs_release_path(path);
4718  
4719  	return ret;
4720  }
4721  
4722  /*
4723   * Log all prealloc extents beyond the inode's i_size to make sure we do not
4724   * lose them after doing a full/fast fsync and replaying the log. We scan the
4725   * subvolume's root instead of iterating the inode's extent map tree because
4726   * otherwise we can log incorrect extent items based on extent map conversion.
4727   * That can happen due to the fact that extent maps are merged when they
4728   * are not in the extent map tree's list of modified extents.
4729   */
btrfs_log_prealloc_extents(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path)4730  static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4731  				      struct btrfs_inode *inode,
4732  				      struct btrfs_path *path)
4733  {
4734  	struct btrfs_root *root = inode->root;
4735  	struct btrfs_key key;
4736  	const u64 i_size = i_size_read(&inode->vfs_inode);
4737  	const u64 ino = btrfs_ino(inode);
4738  	struct btrfs_path *dst_path = NULL;
4739  	bool dropped_extents = false;
4740  	u64 truncate_offset = i_size;
4741  	struct extent_buffer *leaf;
4742  	int slot;
4743  	int ins_nr = 0;
4744  	int start_slot = 0;
4745  	int ret;
4746  
4747  	if (!(inode->flags & BTRFS_INODE_PREALLOC))
4748  		return 0;
4749  
4750  	key.objectid = ino;
4751  	key.type = BTRFS_EXTENT_DATA_KEY;
4752  	key.offset = i_size;
4753  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4754  	if (ret < 0)
4755  		goto out;
4756  
4757  	/*
4758  	 * We must check if there is a prealloc extent that starts before the
4759  	 * i_size and crosses the i_size boundary. This is to ensure later we
4760  	 * truncate down to the end of that extent and not to the i_size, as
4761  	 * otherwise we end up losing part of the prealloc extent after a log
4762  	 * replay and with an implicit hole if there is another prealloc extent
4763  	 * that starts at an offset beyond i_size.
4764  	 */
4765  	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4766  	if (ret < 0)
4767  		goto out;
4768  
4769  	if (ret == 0) {
4770  		struct btrfs_file_extent_item *ei;
4771  
4772  		leaf = path->nodes[0];
4773  		slot = path->slots[0];
4774  		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4775  
4776  		if (btrfs_file_extent_type(leaf, ei) ==
4777  		    BTRFS_FILE_EXTENT_PREALLOC) {
4778  			u64 extent_end;
4779  
4780  			btrfs_item_key_to_cpu(leaf, &key, slot);
4781  			extent_end = key.offset +
4782  				btrfs_file_extent_num_bytes(leaf, ei);
4783  
4784  			if (extent_end > i_size)
4785  				truncate_offset = extent_end;
4786  		}
4787  	} else {
4788  		ret = 0;
4789  	}
4790  
4791  	while (true) {
4792  		leaf = path->nodes[0];
4793  		slot = path->slots[0];
4794  
4795  		if (slot >= btrfs_header_nritems(leaf)) {
4796  			if (ins_nr > 0) {
4797  				ret = copy_items(trans, inode, dst_path, path,
4798  						 start_slot, ins_nr, 1, 0);
4799  				if (ret < 0)
4800  					goto out;
4801  				ins_nr = 0;
4802  			}
4803  			ret = btrfs_next_leaf(root, path);
4804  			if (ret < 0)
4805  				goto out;
4806  			if (ret > 0) {
4807  				ret = 0;
4808  				break;
4809  			}
4810  			continue;
4811  		}
4812  
4813  		btrfs_item_key_to_cpu(leaf, &key, slot);
4814  		if (key.objectid > ino)
4815  			break;
4816  		if (WARN_ON_ONCE(key.objectid < ino) ||
4817  		    key.type < BTRFS_EXTENT_DATA_KEY ||
4818  		    key.offset < i_size) {
4819  			path->slots[0]++;
4820  			continue;
4821  		}
4822  		/*
4823  		 * Avoid overlapping items in the log tree. The first time we
4824  		 * get here, get rid of everything from a past fsync. After
4825  		 * that, if the current extent starts before the end of the last
4826  		 * extent we copied, truncate the last one. This can happen if
4827  		 * an ordered extent completion modifies the subvolume tree
4828  		 * while btrfs_next_leaf() has the tree unlocked.
4829  		 */
4830  		if (!dropped_extents || key.offset < truncate_offset) {
4831  			ret = truncate_inode_items(trans, root->log_root, inode,
4832  						   min(key.offset, truncate_offset),
4833  						   BTRFS_EXTENT_DATA_KEY);
4834  			if (ret)
4835  				goto out;
4836  			dropped_extents = true;
4837  		}
4838  		truncate_offset = btrfs_file_extent_end(path);
4839  		if (ins_nr == 0)
4840  			start_slot = slot;
4841  		ins_nr++;
4842  		path->slots[0]++;
4843  		if (!dst_path) {
4844  			dst_path = btrfs_alloc_path();
4845  			if (!dst_path) {
4846  				ret = -ENOMEM;
4847  				goto out;
4848  			}
4849  		}
4850  	}
4851  	if (ins_nr > 0)
4852  		ret = copy_items(trans, inode, dst_path, path,
4853  				 start_slot, ins_nr, 1, 0);
4854  out:
4855  	btrfs_release_path(path);
4856  	btrfs_free_path(dst_path);
4857  	return ret;
4858  }
4859  
btrfs_log_changed_extents(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_log_ctx * ctx)4860  static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4861  				     struct btrfs_inode *inode,
4862  				     struct btrfs_path *path,
4863  				     struct btrfs_log_ctx *ctx)
4864  {
4865  	struct btrfs_ordered_extent *ordered;
4866  	struct btrfs_ordered_extent *tmp;
4867  	struct extent_map *em, *n;
4868  	LIST_HEAD(extents);
4869  	struct extent_map_tree *tree = &inode->extent_tree;
4870  	int ret = 0;
4871  	int num = 0;
4872  
4873  	write_lock(&tree->lock);
4874  
4875  	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4876  		list_del_init(&em->list);
4877  		/*
4878  		 * Just an arbitrary number, this can be really CPU intensive
4879  		 * once we start getting a lot of extents, and really once we
4880  		 * have a bunch of extents we just want to commit since it will
4881  		 * be faster.
4882  		 */
4883  		if (++num > 32768) {
4884  			list_del_init(&tree->modified_extents);
4885  			ret = -EFBIG;
4886  			goto process;
4887  		}
4888  
4889  		if (em->generation < trans->transid)
4890  			continue;
4891  
4892  		/* We log prealloc extents beyond eof later. */
4893  		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4894  		    em->start >= i_size_read(&inode->vfs_inode))
4895  			continue;
4896  
4897  		/* Need a ref to keep it from getting evicted from cache */
4898  		refcount_inc(&em->refs);
4899  		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4900  		list_add_tail(&em->list, &extents);
4901  		num++;
4902  	}
4903  
4904  	list_sort(NULL, &extents, extent_cmp);
4905  process:
4906  	while (!list_empty(&extents)) {
4907  		em = list_entry(extents.next, struct extent_map, list);
4908  
4909  		list_del_init(&em->list);
4910  
4911  		/*
4912  		 * If we had an error we just need to delete everybody from our
4913  		 * private list.
4914  		 */
4915  		if (ret) {
4916  			clear_em_logging(tree, em);
4917  			free_extent_map(em);
4918  			continue;
4919  		}
4920  
4921  		write_unlock(&tree->lock);
4922  
4923  		ret = log_one_extent(trans, inode, em, path, ctx);
4924  		write_lock(&tree->lock);
4925  		clear_em_logging(tree, em);
4926  		free_extent_map(em);
4927  	}
4928  	WARN_ON(!list_empty(&extents));
4929  	write_unlock(&tree->lock);
4930  
4931  	if (!ret)
4932  		ret = btrfs_log_prealloc_extents(trans, inode, path);
4933  	if (ret)
4934  		return ret;
4935  
4936  	/*
4937  	 * We have logged all extents successfully, now make sure the commit of
4938  	 * the current transaction waits for the ordered extents to complete
4939  	 * before it commits and wipes out the log trees, otherwise we would
4940  	 * lose data if an ordered extents completes after the transaction
4941  	 * commits and a power failure happens after the transaction commit.
4942  	 */
4943  	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4944  		list_del_init(&ordered->log_list);
4945  		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4946  
4947  		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4948  			spin_lock_irq(&inode->ordered_tree.lock);
4949  			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4950  				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4951  				atomic_inc(&trans->transaction->pending_ordered);
4952  			}
4953  			spin_unlock_irq(&inode->ordered_tree.lock);
4954  		}
4955  		btrfs_put_ordered_extent(ordered);
4956  	}
4957  
4958  	return 0;
4959  }
4960  
logged_inode_size(struct btrfs_root * log,struct btrfs_inode * inode,struct btrfs_path * path,u64 * size_ret)4961  static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4962  			     struct btrfs_path *path, u64 *size_ret)
4963  {
4964  	struct btrfs_key key;
4965  	int ret;
4966  
4967  	key.objectid = btrfs_ino(inode);
4968  	key.type = BTRFS_INODE_ITEM_KEY;
4969  	key.offset = 0;
4970  
4971  	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4972  	if (ret < 0) {
4973  		return ret;
4974  	} else if (ret > 0) {
4975  		*size_ret = 0;
4976  	} else {
4977  		struct btrfs_inode_item *item;
4978  
4979  		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4980  				      struct btrfs_inode_item);
4981  		*size_ret = btrfs_inode_size(path->nodes[0], item);
4982  		/*
4983  		 * If the in-memory inode's i_size is smaller then the inode
4984  		 * size stored in the btree, return the inode's i_size, so
4985  		 * that we get a correct inode size after replaying the log
4986  		 * when before a power failure we had a shrinking truncate
4987  		 * followed by addition of a new name (rename / new hard link).
4988  		 * Otherwise return the inode size from the btree, to avoid
4989  		 * data loss when replaying a log due to previously doing a
4990  		 * write that expands the inode's size and logging a new name
4991  		 * immediately after.
4992  		 */
4993  		if (*size_ret > inode->vfs_inode.i_size)
4994  			*size_ret = inode->vfs_inode.i_size;
4995  	}
4996  
4997  	btrfs_release_path(path);
4998  	return 0;
4999  }
5000  
5001  /*
5002   * At the moment we always log all xattrs. This is to figure out at log replay
5003   * time which xattrs must have their deletion replayed. If a xattr is missing
5004   * in the log tree and exists in the fs/subvol tree, we delete it. This is
5005   * because if a xattr is deleted, the inode is fsynced and a power failure
5006   * happens, causing the log to be replayed the next time the fs is mounted,
5007   * we want the xattr to not exist anymore (same behaviour as other filesystems
5008   * with a journal, ext3/4, xfs, f2fs, etc).
5009   */
btrfs_log_all_xattrs(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path)5010  static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
5011  				struct btrfs_inode *inode,
5012  				struct btrfs_path *path,
5013  				struct btrfs_path *dst_path)
5014  {
5015  	struct btrfs_root *root = inode->root;
5016  	int ret;
5017  	struct btrfs_key key;
5018  	const u64 ino = btrfs_ino(inode);
5019  	int ins_nr = 0;
5020  	int start_slot = 0;
5021  	bool found_xattrs = false;
5022  
5023  	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5024  		return 0;
5025  
5026  	key.objectid = ino;
5027  	key.type = BTRFS_XATTR_ITEM_KEY;
5028  	key.offset = 0;
5029  
5030  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5031  	if (ret < 0)
5032  		return ret;
5033  
5034  	while (true) {
5035  		int slot = path->slots[0];
5036  		struct extent_buffer *leaf = path->nodes[0];
5037  		int nritems = btrfs_header_nritems(leaf);
5038  
5039  		if (slot >= nritems) {
5040  			if (ins_nr > 0) {
5041  				ret = copy_items(trans, inode, dst_path, path,
5042  						 start_slot, ins_nr, 1, 0);
5043  				if (ret < 0)
5044  					return ret;
5045  				ins_nr = 0;
5046  			}
5047  			ret = btrfs_next_leaf(root, path);
5048  			if (ret < 0)
5049  				return ret;
5050  			else if (ret > 0)
5051  				break;
5052  			continue;
5053  		}
5054  
5055  		btrfs_item_key_to_cpu(leaf, &key, slot);
5056  		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5057  			break;
5058  
5059  		if (ins_nr == 0)
5060  			start_slot = slot;
5061  		ins_nr++;
5062  		path->slots[0]++;
5063  		found_xattrs = true;
5064  		cond_resched();
5065  	}
5066  	if (ins_nr > 0) {
5067  		ret = copy_items(trans, inode, dst_path, path,
5068  				 start_slot, ins_nr, 1, 0);
5069  		if (ret < 0)
5070  			return ret;
5071  	}
5072  
5073  	if (!found_xattrs)
5074  		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5075  
5076  	return 0;
5077  }
5078  
5079  /*
5080   * When using the NO_HOLES feature if we punched a hole that causes the
5081   * deletion of entire leafs or all the extent items of the first leaf (the one
5082   * that contains the inode item and references) we may end up not processing
5083   * any extents, because there are no leafs with a generation matching the
5084   * current transaction that have extent items for our inode. So we need to find
5085   * if any holes exist and then log them. We also need to log holes after any
5086   * truncate operation that changes the inode's size.
5087   */
btrfs_log_holes(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path)5088  static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5089  			   struct btrfs_inode *inode,
5090  			   struct btrfs_path *path)
5091  {
5092  	struct btrfs_root *root = inode->root;
5093  	struct btrfs_fs_info *fs_info = root->fs_info;
5094  	struct btrfs_key key;
5095  	const u64 ino = btrfs_ino(inode);
5096  	const u64 i_size = i_size_read(&inode->vfs_inode);
5097  	u64 prev_extent_end = 0;
5098  	int ret;
5099  
5100  	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5101  		return 0;
5102  
5103  	key.objectid = ino;
5104  	key.type = BTRFS_EXTENT_DATA_KEY;
5105  	key.offset = 0;
5106  
5107  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5108  	if (ret < 0)
5109  		return ret;
5110  
5111  	while (true) {
5112  		struct extent_buffer *leaf = path->nodes[0];
5113  
5114  		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5115  			ret = btrfs_next_leaf(root, path);
5116  			if (ret < 0)
5117  				return ret;
5118  			if (ret > 0) {
5119  				ret = 0;
5120  				break;
5121  			}
5122  			leaf = path->nodes[0];
5123  		}
5124  
5125  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5126  		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5127  			break;
5128  
5129  		/* We have a hole, log it. */
5130  		if (prev_extent_end < key.offset) {
5131  			const u64 hole_len = key.offset - prev_extent_end;
5132  
5133  			/*
5134  			 * Release the path to avoid deadlocks with other code
5135  			 * paths that search the root while holding locks on
5136  			 * leafs from the log root.
5137  			 */
5138  			btrfs_release_path(path);
5139  			ret = btrfs_insert_hole_extent(trans, root->log_root,
5140  						       ino, prev_extent_end,
5141  						       hole_len);
5142  			if (ret < 0)
5143  				return ret;
5144  
5145  			/*
5146  			 * Search for the same key again in the root. Since it's
5147  			 * an extent item and we are holding the inode lock, the
5148  			 * key must still exist. If it doesn't just emit warning
5149  			 * and return an error to fall back to a transaction
5150  			 * commit.
5151  			 */
5152  			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5153  			if (ret < 0)
5154  				return ret;
5155  			if (WARN_ON(ret > 0))
5156  				return -ENOENT;
5157  			leaf = path->nodes[0];
5158  		}
5159  
5160  		prev_extent_end = btrfs_file_extent_end(path);
5161  		path->slots[0]++;
5162  		cond_resched();
5163  	}
5164  
5165  	if (prev_extent_end < i_size) {
5166  		u64 hole_len;
5167  
5168  		btrfs_release_path(path);
5169  		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5170  		ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
5171  					       prev_extent_end, hole_len);
5172  		if (ret < 0)
5173  			return ret;
5174  	}
5175  
5176  	return 0;
5177  }
5178  
5179  /*
5180   * When we are logging a new inode X, check if it doesn't have a reference that
5181   * matches the reference from some other inode Y created in a past transaction
5182   * and that was renamed in the current transaction. If we don't do this, then at
5183   * log replay time we can lose inode Y (and all its files if it's a directory):
5184   *
5185   * mkdir /mnt/x
5186   * echo "hello world" > /mnt/x/foobar
5187   * sync
5188   * mv /mnt/x /mnt/y
5189   * mkdir /mnt/x                 # or touch /mnt/x
5190   * xfs_io -c fsync /mnt/x
5191   * <power fail>
5192   * mount fs, trigger log replay
5193   *
5194   * After the log replay procedure, we would lose the first directory and all its
5195   * files (file foobar).
5196   * For the case where inode Y is not a directory we simply end up losing it:
5197   *
5198   * echo "123" > /mnt/foo
5199   * sync
5200   * mv /mnt/foo /mnt/bar
5201   * echo "abc" > /mnt/foo
5202   * xfs_io -c fsync /mnt/foo
5203   * <power fail>
5204   *
5205   * We also need this for cases where a snapshot entry is replaced by some other
5206   * entry (file or directory) otherwise we end up with an unreplayable log due to
5207   * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5208   * if it were a regular entry:
5209   *
5210   * mkdir /mnt/x
5211   * btrfs subvolume snapshot /mnt /mnt/x/snap
5212   * btrfs subvolume delete /mnt/x/snap
5213   * rmdir /mnt/x
5214   * mkdir /mnt/x
5215   * fsync /mnt/x or fsync some new file inside it
5216   * <power fail>
5217   *
5218   * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5219   * the same transaction.
5220   */
btrfs_check_ref_name_override(struct extent_buffer * eb,const int slot,const struct btrfs_key * key,struct btrfs_inode * inode,u64 * other_ino,u64 * other_parent)5221  static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5222  					 const int slot,
5223  					 const struct btrfs_key *key,
5224  					 struct btrfs_inode *inode,
5225  					 u64 *other_ino, u64 *other_parent)
5226  {
5227  	int ret;
5228  	struct btrfs_path *search_path;
5229  	char *name = NULL;
5230  	u32 name_len = 0;
5231  	u32 item_size = btrfs_item_size(eb, slot);
5232  	u32 cur_offset = 0;
5233  	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5234  
5235  	search_path = btrfs_alloc_path();
5236  	if (!search_path)
5237  		return -ENOMEM;
5238  	search_path->search_commit_root = 1;
5239  	search_path->skip_locking = 1;
5240  
5241  	while (cur_offset < item_size) {
5242  		u64 parent;
5243  		u32 this_name_len;
5244  		u32 this_len;
5245  		unsigned long name_ptr;
5246  		struct btrfs_dir_item *di;
5247  		struct fscrypt_str name_str;
5248  
5249  		if (key->type == BTRFS_INODE_REF_KEY) {
5250  			struct btrfs_inode_ref *iref;
5251  
5252  			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5253  			parent = key->offset;
5254  			this_name_len = btrfs_inode_ref_name_len(eb, iref);
5255  			name_ptr = (unsigned long)(iref + 1);
5256  			this_len = sizeof(*iref) + this_name_len;
5257  		} else {
5258  			struct btrfs_inode_extref *extref;
5259  
5260  			extref = (struct btrfs_inode_extref *)(ptr +
5261  							       cur_offset);
5262  			parent = btrfs_inode_extref_parent(eb, extref);
5263  			this_name_len = btrfs_inode_extref_name_len(eb, extref);
5264  			name_ptr = (unsigned long)&extref->name;
5265  			this_len = sizeof(*extref) + this_name_len;
5266  		}
5267  
5268  		if (this_name_len > name_len) {
5269  			char *new_name;
5270  
5271  			new_name = krealloc(name, this_name_len, GFP_NOFS);
5272  			if (!new_name) {
5273  				ret = -ENOMEM;
5274  				goto out;
5275  			}
5276  			name_len = this_name_len;
5277  			name = new_name;
5278  		}
5279  
5280  		read_extent_buffer(eb, name, name_ptr, this_name_len);
5281  
5282  		name_str.name = name;
5283  		name_str.len = this_name_len;
5284  		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5285  				parent, &name_str, 0);
5286  		if (di && !IS_ERR(di)) {
5287  			struct btrfs_key di_key;
5288  
5289  			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5290  						  di, &di_key);
5291  			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5292  				if (di_key.objectid != key->objectid) {
5293  					ret = 1;
5294  					*other_ino = di_key.objectid;
5295  					*other_parent = parent;
5296  				} else {
5297  					ret = 0;
5298  				}
5299  			} else {
5300  				ret = -EAGAIN;
5301  			}
5302  			goto out;
5303  		} else if (IS_ERR(di)) {
5304  			ret = PTR_ERR(di);
5305  			goto out;
5306  		}
5307  		btrfs_release_path(search_path);
5308  
5309  		cur_offset += this_len;
5310  	}
5311  	ret = 0;
5312  out:
5313  	btrfs_free_path(search_path);
5314  	kfree(name);
5315  	return ret;
5316  }
5317  
5318  /*
5319   * Check if we need to log an inode. This is used in contexts where while
5320   * logging an inode we need to log another inode (either that it exists or in
5321   * full mode). This is used instead of btrfs_inode_in_log() because the later
5322   * requires the inode to be in the log and have the log transaction committed,
5323   * while here we do not care if the log transaction was already committed - our
5324   * caller will commit the log later - and we want to avoid logging an inode
5325   * multiple times when multiple tasks have joined the same log transaction.
5326   */
need_log_inode(const struct btrfs_trans_handle * trans,struct btrfs_inode * inode)5327  static bool need_log_inode(const struct btrfs_trans_handle *trans,
5328  			   struct btrfs_inode *inode)
5329  {
5330  	/*
5331  	 * If a directory was not modified, no dentries added or removed, we can
5332  	 * and should avoid logging it.
5333  	 */
5334  	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5335  		return false;
5336  
5337  	/*
5338  	 * If this inode does not have new/updated/deleted xattrs since the last
5339  	 * time it was logged and is flagged as logged in the current transaction,
5340  	 * we can skip logging it. As for new/deleted names, those are updated in
5341  	 * the log by link/unlink/rename operations.
5342  	 * In case the inode was logged and then evicted and reloaded, its
5343  	 * logged_trans will be 0, in which case we have to fully log it since
5344  	 * logged_trans is a transient field, not persisted.
5345  	 */
5346  	if (inode_logged(trans, inode, NULL) == 1 &&
5347  	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5348  		return false;
5349  
5350  	return true;
5351  }
5352  
5353  struct btrfs_dir_list {
5354  	u64 ino;
5355  	struct list_head list;
5356  };
5357  
5358  /*
5359   * Log the inodes of the new dentries of a directory.
5360   * See process_dir_items_leaf() for details about why it is needed.
5361   * This is a recursive operation - if an existing dentry corresponds to a
5362   * directory, that directory's new entries are logged too (same behaviour as
5363   * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5364   * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5365   * complains about the following circular lock dependency / possible deadlock:
5366   *
5367   *        CPU0                                        CPU1
5368   *        ----                                        ----
5369   * lock(&type->i_mutex_dir_key#3/2);
5370   *                                            lock(sb_internal#2);
5371   *                                            lock(&type->i_mutex_dir_key#3/2);
5372   * lock(&sb->s_type->i_mutex_key#14);
5373   *
5374   * Where sb_internal is the lock (a counter that works as a lock) acquired by
5375   * sb_start_intwrite() in btrfs_start_transaction().
5376   * Not acquiring the VFS lock of the inodes is still safe because:
5377   *
5378   * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5379   *    that while logging the inode new references (names) are added or removed
5380   *    from the inode, leaving the logged inode item with a link count that does
5381   *    not match the number of logged inode reference items. This is fine because
5382   *    at log replay time we compute the real number of links and correct the
5383   *    link count in the inode item (see replay_one_buffer() and
5384   *    link_to_fixup_dir());
5385   *
5386   * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5387   *    while logging the inode's items new index items (key type
5388   *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5389   *    has a size that doesn't match the sum of the lengths of all the logged
5390   *    names - this is ok, not a problem, because at log replay time we set the
5391   *    directory's i_size to the correct value (see replay_one_name() and
5392   *    overwrite_item()).
5393   */
log_new_dir_dentries(struct btrfs_trans_handle * trans,struct btrfs_inode * start_inode,struct btrfs_log_ctx * ctx)5394  static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5395  				struct btrfs_inode *start_inode,
5396  				struct btrfs_log_ctx *ctx)
5397  {
5398  	struct btrfs_root *root = start_inode->root;
5399  	struct btrfs_path *path;
5400  	LIST_HEAD(dir_list);
5401  	struct btrfs_dir_list *dir_elem;
5402  	u64 ino = btrfs_ino(start_inode);
5403  	struct btrfs_inode *curr_inode = start_inode;
5404  	int ret = 0;
5405  
5406  	/*
5407  	 * If we are logging a new name, as part of a link or rename operation,
5408  	 * don't bother logging new dentries, as we just want to log the names
5409  	 * of an inode and that any new parents exist.
5410  	 */
5411  	if (ctx->logging_new_name)
5412  		return 0;
5413  
5414  	path = btrfs_alloc_path();
5415  	if (!path)
5416  		return -ENOMEM;
5417  
5418  	/* Pairs with btrfs_add_delayed_iput below. */
5419  	ihold(&curr_inode->vfs_inode);
5420  
5421  	while (true) {
5422  		struct inode *vfs_inode;
5423  		struct btrfs_key key;
5424  		struct btrfs_key found_key;
5425  		u64 next_index;
5426  		bool continue_curr_inode = true;
5427  		int iter_ret;
5428  
5429  		key.objectid = ino;
5430  		key.type = BTRFS_DIR_INDEX_KEY;
5431  		key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
5432  		next_index = key.offset;
5433  again:
5434  		btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
5435  			struct extent_buffer *leaf = path->nodes[0];
5436  			struct btrfs_dir_item *di;
5437  			struct btrfs_key di_key;
5438  			struct inode *di_inode;
5439  			int log_mode = LOG_INODE_EXISTS;
5440  			int type;
5441  
5442  			if (found_key.objectid != ino ||
5443  			    found_key.type != BTRFS_DIR_INDEX_KEY) {
5444  				continue_curr_inode = false;
5445  				break;
5446  			}
5447  
5448  			next_index = found_key.offset + 1;
5449  
5450  			di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5451  			type = btrfs_dir_ftype(leaf, di);
5452  			if (btrfs_dir_transid(leaf, di) < trans->transid)
5453  				continue;
5454  			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5455  			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5456  				continue;
5457  
5458  			btrfs_release_path(path);
5459  			di_inode = btrfs_iget_logging(di_key.objectid, root);
5460  			if (IS_ERR(di_inode)) {
5461  				ret = PTR_ERR(di_inode);
5462  				goto out;
5463  			}
5464  
5465  			if (!need_log_inode(trans, BTRFS_I(di_inode))) {
5466  				btrfs_add_delayed_iput(BTRFS_I(di_inode));
5467  				break;
5468  			}
5469  
5470  			ctx->log_new_dentries = false;
5471  			if (type == BTRFS_FT_DIR)
5472  				log_mode = LOG_INODE_ALL;
5473  			ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
5474  					      log_mode, ctx);
5475  			btrfs_add_delayed_iput(BTRFS_I(di_inode));
5476  			if (ret)
5477  				goto out;
5478  			if (ctx->log_new_dentries) {
5479  				dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5480  				if (!dir_elem) {
5481  					ret = -ENOMEM;
5482  					goto out;
5483  				}
5484  				dir_elem->ino = di_key.objectid;
5485  				list_add_tail(&dir_elem->list, &dir_list);
5486  			}
5487  			break;
5488  		}
5489  
5490  		btrfs_release_path(path);
5491  
5492  		if (iter_ret < 0) {
5493  			ret = iter_ret;
5494  			goto out;
5495  		} else if (iter_ret > 0) {
5496  			continue_curr_inode = false;
5497  		} else {
5498  			key = found_key;
5499  		}
5500  
5501  		if (continue_curr_inode && key.offset < (u64)-1) {
5502  			key.offset++;
5503  			goto again;
5504  		}
5505  
5506  		btrfs_set_first_dir_index_to_log(curr_inode, next_index);
5507  
5508  		if (list_empty(&dir_list))
5509  			break;
5510  
5511  		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5512  		ino = dir_elem->ino;
5513  		list_del(&dir_elem->list);
5514  		kfree(dir_elem);
5515  
5516  		btrfs_add_delayed_iput(curr_inode);
5517  		curr_inode = NULL;
5518  
5519  		vfs_inode = btrfs_iget_logging(ino, root);
5520  		if (IS_ERR(vfs_inode)) {
5521  			ret = PTR_ERR(vfs_inode);
5522  			break;
5523  		}
5524  		curr_inode = BTRFS_I(vfs_inode);
5525  	}
5526  out:
5527  	btrfs_free_path(path);
5528  	if (curr_inode)
5529  		btrfs_add_delayed_iput(curr_inode);
5530  
5531  	if (ret) {
5532  		struct btrfs_dir_list *next;
5533  
5534  		list_for_each_entry_safe(dir_elem, next, &dir_list, list)
5535  			kfree(dir_elem);
5536  	}
5537  
5538  	return ret;
5539  }
5540  
5541  struct btrfs_ino_list {
5542  	u64 ino;
5543  	u64 parent;
5544  	struct list_head list;
5545  };
5546  
free_conflicting_inodes(struct btrfs_log_ctx * ctx)5547  static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
5548  {
5549  	struct btrfs_ino_list *curr;
5550  	struct btrfs_ino_list *next;
5551  
5552  	list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
5553  		list_del(&curr->list);
5554  		kfree(curr);
5555  	}
5556  }
5557  
conflicting_inode_is_dir(struct btrfs_root * root,u64 ino,struct btrfs_path * path)5558  static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
5559  				    struct btrfs_path *path)
5560  {
5561  	struct btrfs_key key;
5562  	int ret;
5563  
5564  	key.objectid = ino;
5565  	key.type = BTRFS_INODE_ITEM_KEY;
5566  	key.offset = 0;
5567  
5568  	path->search_commit_root = 1;
5569  	path->skip_locking = 1;
5570  
5571  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5572  	if (WARN_ON_ONCE(ret > 0)) {
5573  		/*
5574  		 * We have previously found the inode through the commit root
5575  		 * so this should not happen. If it does, just error out and
5576  		 * fallback to a transaction commit.
5577  		 */
5578  		ret = -ENOENT;
5579  	} else if (ret == 0) {
5580  		struct btrfs_inode_item *item;
5581  
5582  		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5583  				      struct btrfs_inode_item);
5584  		if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
5585  			ret = 1;
5586  	}
5587  
5588  	btrfs_release_path(path);
5589  	path->search_commit_root = 0;
5590  	path->skip_locking = 0;
5591  
5592  	return ret;
5593  }
5594  
add_conflicting_inode(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,u64 ino,u64 parent,struct btrfs_log_ctx * ctx)5595  static int add_conflicting_inode(struct btrfs_trans_handle *trans,
5596  				 struct btrfs_root *root,
5597  				 struct btrfs_path *path,
5598  				 u64 ino, u64 parent,
5599  				 struct btrfs_log_ctx *ctx)
5600  {
5601  	struct btrfs_ino_list *ino_elem;
5602  	struct inode *inode;
5603  
5604  	/*
5605  	 * It's rare to have a lot of conflicting inodes, in practice it is not
5606  	 * common to have more than 1 or 2. We don't want to collect too many,
5607  	 * as we could end up logging too many inodes (even if only in
5608  	 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
5609  	 * commits.
5610  	 */
5611  	if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
5612  		return BTRFS_LOG_FORCE_COMMIT;
5613  
5614  	inode = btrfs_iget_logging(ino, root);
5615  	/*
5616  	 * If the other inode that had a conflicting dir entry was deleted in
5617  	 * the current transaction then we either:
5618  	 *
5619  	 * 1) Log the parent directory (later after adding it to the list) if
5620  	 *    the inode is a directory. This is because it may be a deleted
5621  	 *    subvolume/snapshot or it may be a regular directory that had
5622  	 *    deleted subvolumes/snapshots (or subdirectories that had them),
5623  	 *    and at the moment we can't deal with dropping subvolumes/snapshots
5624  	 *    during log replay. So we just log the parent, which will result in
5625  	 *    a fallback to a transaction commit if we are dealing with those
5626  	 *    cases (last_unlink_trans will match the current transaction);
5627  	 *
5628  	 * 2) Do nothing if it's not a directory. During log replay we simply
5629  	 *    unlink the conflicting dentry from the parent directory and then
5630  	 *    add the dentry for our inode. Like this we can avoid logging the
5631  	 *    parent directory (and maybe fallback to a transaction commit in
5632  	 *    case it has a last_unlink_trans == trans->transid, due to moving
5633  	 *    some inode from it to some other directory).
5634  	 */
5635  	if (IS_ERR(inode)) {
5636  		int ret = PTR_ERR(inode);
5637  
5638  		if (ret != -ENOENT)
5639  			return ret;
5640  
5641  		ret = conflicting_inode_is_dir(root, ino, path);
5642  		/* Not a directory or we got an error. */
5643  		if (ret <= 0)
5644  			return ret;
5645  
5646  		/* Conflicting inode is a directory, so we'll log its parent. */
5647  		ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5648  		if (!ino_elem)
5649  			return -ENOMEM;
5650  		ino_elem->ino = ino;
5651  		ino_elem->parent = parent;
5652  		list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5653  		ctx->num_conflict_inodes++;
5654  
5655  		return 0;
5656  	}
5657  
5658  	/*
5659  	 * If the inode was already logged skip it - otherwise we can hit an
5660  	 * infinite loop. Example:
5661  	 *
5662  	 * From the commit root (previous transaction) we have the following
5663  	 * inodes:
5664  	 *
5665  	 * inode 257 a directory
5666  	 * inode 258 with references "zz" and "zz_link" on inode 257
5667  	 * inode 259 with reference "a" on inode 257
5668  	 *
5669  	 * And in the current (uncommitted) transaction we have:
5670  	 *
5671  	 * inode 257 a directory, unchanged
5672  	 * inode 258 with references "a" and "a2" on inode 257
5673  	 * inode 259 with reference "zz_link" on inode 257
5674  	 * inode 261 with reference "zz" on inode 257
5675  	 *
5676  	 * When logging inode 261 the following infinite loop could
5677  	 * happen if we don't skip already logged inodes:
5678  	 *
5679  	 * - we detect inode 258 as a conflicting inode, with inode 261
5680  	 *   on reference "zz", and log it;
5681  	 *
5682  	 * - we detect inode 259 as a conflicting inode, with inode 258
5683  	 *   on reference "a", and log it;
5684  	 *
5685  	 * - we detect inode 258 as a conflicting inode, with inode 259
5686  	 *   on reference "zz_link", and log it - again! After this we
5687  	 *   repeat the above steps forever.
5688  	 *
5689  	 * Here we can use need_log_inode() because we only need to log the
5690  	 * inode in LOG_INODE_EXISTS mode and rename operations update the log,
5691  	 * so that the log ends up with the new name and without the old name.
5692  	 */
5693  	if (!need_log_inode(trans, BTRFS_I(inode))) {
5694  		btrfs_add_delayed_iput(BTRFS_I(inode));
5695  		return 0;
5696  	}
5697  
5698  	btrfs_add_delayed_iput(BTRFS_I(inode));
5699  
5700  	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5701  	if (!ino_elem)
5702  		return -ENOMEM;
5703  	ino_elem->ino = ino;
5704  	ino_elem->parent = parent;
5705  	list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5706  	ctx->num_conflict_inodes++;
5707  
5708  	return 0;
5709  }
5710  
log_conflicting_inodes(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_log_ctx * ctx)5711  static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5712  				  struct btrfs_root *root,
5713  				  struct btrfs_log_ctx *ctx)
5714  {
5715  	int ret = 0;
5716  
5717  	/*
5718  	 * Conflicting inodes are logged by the first call to btrfs_log_inode(),
5719  	 * otherwise we could have unbounded recursion of btrfs_log_inode()
5720  	 * calls. This check guarantees we can have only 1 level of recursion.
5721  	 */
5722  	if (ctx->logging_conflict_inodes)
5723  		return 0;
5724  
5725  	ctx->logging_conflict_inodes = true;
5726  
5727  	/*
5728  	 * New conflicting inodes may be found and added to the list while we
5729  	 * are logging a conflicting inode, so keep iterating while the list is
5730  	 * not empty.
5731  	 */
5732  	while (!list_empty(&ctx->conflict_inodes)) {
5733  		struct btrfs_ino_list *curr;
5734  		struct inode *inode;
5735  		u64 ino;
5736  		u64 parent;
5737  
5738  		curr = list_first_entry(&ctx->conflict_inodes,
5739  					struct btrfs_ino_list, list);
5740  		ino = curr->ino;
5741  		parent = curr->parent;
5742  		list_del(&curr->list);
5743  		kfree(curr);
5744  
5745  		inode = btrfs_iget_logging(ino, root);
5746  		/*
5747  		 * If the other inode that had a conflicting dir entry was
5748  		 * deleted in the current transaction, we need to log its parent
5749  		 * directory. See the comment at add_conflicting_inode().
5750  		 */
5751  		if (IS_ERR(inode)) {
5752  			ret = PTR_ERR(inode);
5753  			if (ret != -ENOENT)
5754  				break;
5755  
5756  			inode = btrfs_iget_logging(parent, root);
5757  			if (IS_ERR(inode)) {
5758  				ret = PTR_ERR(inode);
5759  				break;
5760  			}
5761  
5762  			/*
5763  			 * Always log the directory, we cannot make this
5764  			 * conditional on need_log_inode() because the directory
5765  			 * might have been logged in LOG_INODE_EXISTS mode or
5766  			 * the dir index of the conflicting inode is not in a
5767  			 * dir index key range logged for the directory. So we
5768  			 * must make sure the deletion is recorded.
5769  			 */
5770  			ret = btrfs_log_inode(trans, BTRFS_I(inode),
5771  					      LOG_INODE_ALL, ctx);
5772  			btrfs_add_delayed_iput(BTRFS_I(inode));
5773  			if (ret)
5774  				break;
5775  			continue;
5776  		}
5777  
5778  		/*
5779  		 * Here we can use need_log_inode() because we only need to log
5780  		 * the inode in LOG_INODE_EXISTS mode and rename operations
5781  		 * update the log, so that the log ends up with the new name and
5782  		 * without the old name.
5783  		 *
5784  		 * We did this check at add_conflicting_inode(), but here we do
5785  		 * it again because if some other task logged the inode after
5786  		 * that, we can avoid doing it again.
5787  		 */
5788  		if (!need_log_inode(trans, BTRFS_I(inode))) {
5789  			btrfs_add_delayed_iput(BTRFS_I(inode));
5790  			continue;
5791  		}
5792  
5793  		/*
5794  		 * We are safe logging the other inode without acquiring its
5795  		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5796  		 * are safe against concurrent renames of the other inode as
5797  		 * well because during a rename we pin the log and update the
5798  		 * log with the new name before we unpin it.
5799  		 */
5800  		ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
5801  		btrfs_add_delayed_iput(BTRFS_I(inode));
5802  		if (ret)
5803  			break;
5804  	}
5805  
5806  	ctx->logging_conflict_inodes = false;
5807  	if (ret)
5808  		free_conflicting_inodes(ctx);
5809  
5810  	return ret;
5811  }
5812  
copy_inode_items_to_log(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_key * min_key,const struct btrfs_key * max_key,struct btrfs_path * path,struct btrfs_path * dst_path,const u64 logged_isize,const int inode_only,struct btrfs_log_ctx * ctx,bool * need_log_inode_item)5813  static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5814  				   struct btrfs_inode *inode,
5815  				   struct btrfs_key *min_key,
5816  				   const struct btrfs_key *max_key,
5817  				   struct btrfs_path *path,
5818  				   struct btrfs_path *dst_path,
5819  				   const u64 logged_isize,
5820  				   const int inode_only,
5821  				   struct btrfs_log_ctx *ctx,
5822  				   bool *need_log_inode_item)
5823  {
5824  	const u64 i_size = i_size_read(&inode->vfs_inode);
5825  	struct btrfs_root *root = inode->root;
5826  	int ins_start_slot = 0;
5827  	int ins_nr = 0;
5828  	int ret;
5829  
5830  	while (1) {
5831  		ret = btrfs_search_forward(root, min_key, path, trans->transid);
5832  		if (ret < 0)
5833  			return ret;
5834  		if (ret > 0) {
5835  			ret = 0;
5836  			break;
5837  		}
5838  again:
5839  		/* Note, ins_nr might be > 0 here, cleanup outside the loop */
5840  		if (min_key->objectid != max_key->objectid)
5841  			break;
5842  		if (min_key->type > max_key->type)
5843  			break;
5844  
5845  		if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5846  			*need_log_inode_item = false;
5847  		} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5848  			   min_key->offset >= i_size) {
5849  			/*
5850  			 * Extents at and beyond eof are logged with
5851  			 * btrfs_log_prealloc_extents().
5852  			 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5853  			 * and no keys greater than that, so bail out.
5854  			 */
5855  			break;
5856  		} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5857  			    min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5858  			   (inode->generation == trans->transid ||
5859  			    ctx->logging_conflict_inodes)) {
5860  			u64 other_ino = 0;
5861  			u64 other_parent = 0;
5862  
5863  			ret = btrfs_check_ref_name_override(path->nodes[0],
5864  					path->slots[0], min_key, inode,
5865  					&other_ino, &other_parent);
5866  			if (ret < 0) {
5867  				return ret;
5868  			} else if (ret > 0 &&
5869  				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5870  				if (ins_nr > 0) {
5871  					ins_nr++;
5872  				} else {
5873  					ins_nr = 1;
5874  					ins_start_slot = path->slots[0];
5875  				}
5876  				ret = copy_items(trans, inode, dst_path, path,
5877  						 ins_start_slot, ins_nr,
5878  						 inode_only, logged_isize);
5879  				if (ret < 0)
5880  					return ret;
5881  				ins_nr = 0;
5882  
5883  				btrfs_release_path(path);
5884  				ret = add_conflicting_inode(trans, root, path,
5885  							    other_ino,
5886  							    other_parent, ctx);
5887  				if (ret)
5888  					return ret;
5889  				goto next_key;
5890  			}
5891  		} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5892  			/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5893  			if (ins_nr == 0)
5894  				goto next_slot;
5895  			ret = copy_items(trans, inode, dst_path, path,
5896  					 ins_start_slot,
5897  					 ins_nr, inode_only, logged_isize);
5898  			if (ret < 0)
5899  				return ret;
5900  			ins_nr = 0;
5901  			goto next_slot;
5902  		}
5903  
5904  		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5905  			ins_nr++;
5906  			goto next_slot;
5907  		} else if (!ins_nr) {
5908  			ins_start_slot = path->slots[0];
5909  			ins_nr = 1;
5910  			goto next_slot;
5911  		}
5912  
5913  		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5914  				 ins_nr, inode_only, logged_isize);
5915  		if (ret < 0)
5916  			return ret;
5917  		ins_nr = 1;
5918  		ins_start_slot = path->slots[0];
5919  next_slot:
5920  		path->slots[0]++;
5921  		if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5922  			btrfs_item_key_to_cpu(path->nodes[0], min_key,
5923  					      path->slots[0]);
5924  			goto again;
5925  		}
5926  		if (ins_nr) {
5927  			ret = copy_items(trans, inode, dst_path, path,
5928  					 ins_start_slot, ins_nr, inode_only,
5929  					 logged_isize);
5930  			if (ret < 0)
5931  				return ret;
5932  			ins_nr = 0;
5933  		}
5934  		btrfs_release_path(path);
5935  next_key:
5936  		if (min_key->offset < (u64)-1) {
5937  			min_key->offset++;
5938  		} else if (min_key->type < max_key->type) {
5939  			min_key->type++;
5940  			min_key->offset = 0;
5941  		} else {
5942  			break;
5943  		}
5944  
5945  		/*
5946  		 * We may process many leaves full of items for our inode, so
5947  		 * avoid monopolizing a cpu for too long by rescheduling while
5948  		 * not holding locks on any tree.
5949  		 */
5950  		cond_resched();
5951  	}
5952  	if (ins_nr) {
5953  		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5954  				 ins_nr, inode_only, logged_isize);
5955  		if (ret)
5956  			return ret;
5957  	}
5958  
5959  	if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5960  		/*
5961  		 * Release the path because otherwise we might attempt to double
5962  		 * lock the same leaf with btrfs_log_prealloc_extents() below.
5963  		 */
5964  		btrfs_release_path(path);
5965  		ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5966  	}
5967  
5968  	return ret;
5969  }
5970  
insert_delayed_items_batch(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,const struct btrfs_item_batch * batch,const struct btrfs_delayed_item * first_item)5971  static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
5972  				      struct btrfs_root *log,
5973  				      struct btrfs_path *path,
5974  				      const struct btrfs_item_batch *batch,
5975  				      const struct btrfs_delayed_item *first_item)
5976  {
5977  	const struct btrfs_delayed_item *curr = first_item;
5978  	int ret;
5979  
5980  	ret = btrfs_insert_empty_items(trans, log, path, batch);
5981  	if (ret)
5982  		return ret;
5983  
5984  	for (int i = 0; i < batch->nr; i++) {
5985  		char *data_ptr;
5986  
5987  		data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
5988  		write_extent_buffer(path->nodes[0], &curr->data,
5989  				    (unsigned long)data_ptr, curr->data_len);
5990  		curr = list_next_entry(curr, log_list);
5991  		path->slots[0]++;
5992  	}
5993  
5994  	btrfs_release_path(path);
5995  
5996  	return 0;
5997  }
5998  
log_delayed_insertion_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,const struct list_head * delayed_ins_list,struct btrfs_log_ctx * ctx)5999  static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
6000  				       struct btrfs_inode *inode,
6001  				       struct btrfs_path *path,
6002  				       const struct list_head *delayed_ins_list,
6003  				       struct btrfs_log_ctx *ctx)
6004  {
6005  	/* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
6006  	const int max_batch_size = 195;
6007  	const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
6008  	const u64 ino = btrfs_ino(inode);
6009  	struct btrfs_root *log = inode->root->log_root;
6010  	struct btrfs_item_batch batch = {
6011  		.nr = 0,
6012  		.total_data_size = 0,
6013  	};
6014  	const struct btrfs_delayed_item *first = NULL;
6015  	const struct btrfs_delayed_item *curr;
6016  	char *ins_data;
6017  	struct btrfs_key *ins_keys;
6018  	u32 *ins_sizes;
6019  	u64 curr_batch_size = 0;
6020  	int batch_idx = 0;
6021  	int ret;
6022  
6023  	/* We are adding dir index items to the log tree. */
6024  	lockdep_assert_held(&inode->log_mutex);
6025  
6026  	/*
6027  	 * We collect delayed items before copying index keys from the subvolume
6028  	 * to the log tree. However just after we collected them, they may have
6029  	 * been flushed (all of them or just some of them), and therefore we
6030  	 * could have copied them from the subvolume tree to the log tree.
6031  	 * So find the first delayed item that was not yet logged (they are
6032  	 * sorted by index number).
6033  	 */
6034  	list_for_each_entry(curr, delayed_ins_list, log_list) {
6035  		if (curr->index > inode->last_dir_index_offset) {
6036  			first = curr;
6037  			break;
6038  		}
6039  	}
6040  
6041  	/* Empty list or all delayed items were already logged. */
6042  	if (!first)
6043  		return 0;
6044  
6045  	ins_data = kmalloc(max_batch_size * sizeof(u32) +
6046  			   max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
6047  	if (!ins_data)
6048  		return -ENOMEM;
6049  	ins_sizes = (u32 *)ins_data;
6050  	batch.data_sizes = ins_sizes;
6051  	ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
6052  	batch.keys = ins_keys;
6053  
6054  	curr = first;
6055  	while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
6056  		const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
6057  
6058  		if (curr_batch_size + curr_size > leaf_data_size ||
6059  		    batch.nr == max_batch_size) {
6060  			ret = insert_delayed_items_batch(trans, log, path,
6061  							 &batch, first);
6062  			if (ret)
6063  				goto out;
6064  			batch_idx = 0;
6065  			batch.nr = 0;
6066  			batch.total_data_size = 0;
6067  			curr_batch_size = 0;
6068  			first = curr;
6069  		}
6070  
6071  		ins_sizes[batch_idx] = curr->data_len;
6072  		ins_keys[batch_idx].objectid = ino;
6073  		ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
6074  		ins_keys[batch_idx].offset = curr->index;
6075  		curr_batch_size += curr_size;
6076  		batch.total_data_size += curr->data_len;
6077  		batch.nr++;
6078  		batch_idx++;
6079  		curr = list_next_entry(curr, log_list);
6080  	}
6081  
6082  	ASSERT(batch.nr >= 1);
6083  	ret = insert_delayed_items_batch(trans, log, path, &batch, first);
6084  
6085  	curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6086  			       log_list);
6087  	inode->last_dir_index_offset = curr->index;
6088  out:
6089  	kfree(ins_data);
6090  
6091  	return ret;
6092  }
6093  
log_delayed_deletions_full(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,const struct list_head * delayed_del_list,struct btrfs_log_ctx * ctx)6094  static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6095  				      struct btrfs_inode *inode,
6096  				      struct btrfs_path *path,
6097  				      const struct list_head *delayed_del_list,
6098  				      struct btrfs_log_ctx *ctx)
6099  {
6100  	const u64 ino = btrfs_ino(inode);
6101  	const struct btrfs_delayed_item *curr;
6102  
6103  	curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6104  				log_list);
6105  
6106  	while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6107  		u64 first_dir_index = curr->index;
6108  		u64 last_dir_index;
6109  		const struct btrfs_delayed_item *next;
6110  		int ret;
6111  
6112  		/*
6113  		 * Find a range of consecutive dir index items to delete. Like
6114  		 * this we log a single dir range item spanning several contiguous
6115  		 * dir items instead of logging one range item per dir index item.
6116  		 */
6117  		next = list_next_entry(curr, log_list);
6118  		while (!list_entry_is_head(next, delayed_del_list, log_list)) {
6119  			if (next->index != curr->index + 1)
6120  				break;
6121  			curr = next;
6122  			next = list_next_entry(next, log_list);
6123  		}
6124  
6125  		last_dir_index = curr->index;
6126  		ASSERT(last_dir_index >= first_dir_index);
6127  
6128  		ret = insert_dir_log_key(trans, inode->root->log_root, path,
6129  					 ino, first_dir_index, last_dir_index);
6130  		if (ret)
6131  			return ret;
6132  		curr = list_next_entry(curr, log_list);
6133  	}
6134  
6135  	return 0;
6136  }
6137  
batch_delete_dir_index_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_log_ctx * ctx,const struct list_head * delayed_del_list,const struct btrfs_delayed_item * first,const struct btrfs_delayed_item ** last_ret)6138  static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6139  					struct btrfs_inode *inode,
6140  					struct btrfs_path *path,
6141  					struct btrfs_log_ctx *ctx,
6142  					const struct list_head *delayed_del_list,
6143  					const struct btrfs_delayed_item *first,
6144  					const struct btrfs_delayed_item **last_ret)
6145  {
6146  	const struct btrfs_delayed_item *next;
6147  	struct extent_buffer *leaf = path->nodes[0];
6148  	const int last_slot = btrfs_header_nritems(leaf) - 1;
6149  	int slot = path->slots[0] + 1;
6150  	const u64 ino = btrfs_ino(inode);
6151  
6152  	next = list_next_entry(first, log_list);
6153  
6154  	while (slot < last_slot &&
6155  	       !list_entry_is_head(next, delayed_del_list, log_list)) {
6156  		struct btrfs_key key;
6157  
6158  		btrfs_item_key_to_cpu(leaf, &key, slot);
6159  		if (key.objectid != ino ||
6160  		    key.type != BTRFS_DIR_INDEX_KEY ||
6161  		    key.offset != next->index)
6162  			break;
6163  
6164  		slot++;
6165  		*last_ret = next;
6166  		next = list_next_entry(next, log_list);
6167  	}
6168  
6169  	return btrfs_del_items(trans, inode->root->log_root, path,
6170  			       path->slots[0], slot - path->slots[0]);
6171  }
6172  
log_delayed_deletions_incremental(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,const struct list_head * delayed_del_list,struct btrfs_log_ctx * ctx)6173  static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6174  					     struct btrfs_inode *inode,
6175  					     struct btrfs_path *path,
6176  					     const struct list_head *delayed_del_list,
6177  					     struct btrfs_log_ctx *ctx)
6178  {
6179  	struct btrfs_root *log = inode->root->log_root;
6180  	const struct btrfs_delayed_item *curr;
6181  	u64 last_range_start = 0;
6182  	u64 last_range_end = 0;
6183  	struct btrfs_key key;
6184  
6185  	key.objectid = btrfs_ino(inode);
6186  	key.type = BTRFS_DIR_INDEX_KEY;
6187  	curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6188  				log_list);
6189  
6190  	while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6191  		const struct btrfs_delayed_item *last = curr;
6192  		u64 first_dir_index = curr->index;
6193  		u64 last_dir_index;
6194  		bool deleted_items = false;
6195  		int ret;
6196  
6197  		key.offset = curr->index;
6198  		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6199  		if (ret < 0) {
6200  			return ret;
6201  		} else if (ret == 0) {
6202  			ret = batch_delete_dir_index_items(trans, inode, path, ctx,
6203  							   delayed_del_list, curr,
6204  							   &last);
6205  			if (ret)
6206  				return ret;
6207  			deleted_items = true;
6208  		}
6209  
6210  		btrfs_release_path(path);
6211  
6212  		/*
6213  		 * If we deleted items from the leaf, it means we have a range
6214  		 * item logging their range, so no need to add one or update an
6215  		 * existing one. Otherwise we have to log a dir range item.
6216  		 */
6217  		if (deleted_items)
6218  			goto next_batch;
6219  
6220  		last_dir_index = last->index;
6221  		ASSERT(last_dir_index >= first_dir_index);
6222  		/*
6223  		 * If this range starts right after where the previous one ends,
6224  		 * then we want to reuse the previous range item and change its
6225  		 * end offset to the end of this range. This is just to minimize
6226  		 * leaf space usage, by avoiding adding a new range item.
6227  		 */
6228  		if (last_range_end != 0 && first_dir_index == last_range_end + 1)
6229  			first_dir_index = last_range_start;
6230  
6231  		ret = insert_dir_log_key(trans, log, path, key.objectid,
6232  					 first_dir_index, last_dir_index);
6233  		if (ret)
6234  			return ret;
6235  
6236  		last_range_start = first_dir_index;
6237  		last_range_end = last_dir_index;
6238  next_batch:
6239  		curr = list_next_entry(last, log_list);
6240  	}
6241  
6242  	return 0;
6243  }
6244  
log_delayed_deletion_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,const struct list_head * delayed_del_list,struct btrfs_log_ctx * ctx)6245  static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6246  				      struct btrfs_inode *inode,
6247  				      struct btrfs_path *path,
6248  				      const struct list_head *delayed_del_list,
6249  				      struct btrfs_log_ctx *ctx)
6250  {
6251  	/*
6252  	 * We are deleting dir index items from the log tree or adding range
6253  	 * items to it.
6254  	 */
6255  	lockdep_assert_held(&inode->log_mutex);
6256  
6257  	if (list_empty(delayed_del_list))
6258  		return 0;
6259  
6260  	if (ctx->logged_before)
6261  		return log_delayed_deletions_incremental(trans, inode, path,
6262  							 delayed_del_list, ctx);
6263  
6264  	return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
6265  					  ctx);
6266  }
6267  
6268  /*
6269   * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
6270   * items instead of the subvolume tree.
6271   */
log_new_delayed_dentries(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,const struct list_head * delayed_ins_list,struct btrfs_log_ctx * ctx)6272  static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6273  				    struct btrfs_inode *inode,
6274  				    const struct list_head *delayed_ins_list,
6275  				    struct btrfs_log_ctx *ctx)
6276  {
6277  	const bool orig_log_new_dentries = ctx->log_new_dentries;
6278  	struct btrfs_delayed_item *item;
6279  	int ret = 0;
6280  
6281  	/*
6282  	 * No need for the log mutex, plus to avoid potential deadlocks or
6283  	 * lockdep annotations due to nesting of delayed inode mutexes and log
6284  	 * mutexes.
6285  	 */
6286  	lockdep_assert_not_held(&inode->log_mutex);
6287  
6288  	ASSERT(!ctx->logging_new_delayed_dentries);
6289  	ctx->logging_new_delayed_dentries = true;
6290  
6291  	list_for_each_entry(item, delayed_ins_list, log_list) {
6292  		struct btrfs_dir_item *dir_item;
6293  		struct inode *di_inode;
6294  		struct btrfs_key key;
6295  		int log_mode = LOG_INODE_EXISTS;
6296  
6297  		dir_item = (struct btrfs_dir_item *)item->data;
6298  		btrfs_disk_key_to_cpu(&key, &dir_item->location);
6299  
6300  		if (key.type == BTRFS_ROOT_ITEM_KEY)
6301  			continue;
6302  
6303  		di_inode = btrfs_iget_logging(key.objectid, inode->root);
6304  		if (IS_ERR(di_inode)) {
6305  			ret = PTR_ERR(di_inode);
6306  			break;
6307  		}
6308  
6309  		if (!need_log_inode(trans, BTRFS_I(di_inode))) {
6310  			btrfs_add_delayed_iput(BTRFS_I(di_inode));
6311  			continue;
6312  		}
6313  
6314  		if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
6315  			log_mode = LOG_INODE_ALL;
6316  
6317  		ctx->log_new_dentries = false;
6318  		ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
6319  
6320  		if (!ret && ctx->log_new_dentries)
6321  			ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
6322  
6323  		btrfs_add_delayed_iput(BTRFS_I(di_inode));
6324  
6325  		if (ret)
6326  			break;
6327  	}
6328  
6329  	ctx->log_new_dentries = orig_log_new_dentries;
6330  	ctx->logging_new_delayed_dentries = false;
6331  
6332  	return ret;
6333  }
6334  
6335  /* log a single inode in the tree log.
6336   * At least one parent directory for this inode must exist in the tree
6337   * or be logged already.
6338   *
6339   * Any items from this inode changed by the current transaction are copied
6340   * to the log tree.  An extra reference is taken on any extents in this
6341   * file, allowing us to avoid a whole pile of corner cases around logging
6342   * blocks that have been removed from the tree.
6343   *
6344   * See LOG_INODE_ALL and related defines for a description of what inode_only
6345   * does.
6346   *
6347   * This handles both files and directories.
6348   */
btrfs_log_inode(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,int inode_only,struct btrfs_log_ctx * ctx)6349  static int btrfs_log_inode(struct btrfs_trans_handle *trans,
6350  			   struct btrfs_inode *inode,
6351  			   int inode_only,
6352  			   struct btrfs_log_ctx *ctx)
6353  {
6354  	struct btrfs_path *path;
6355  	struct btrfs_path *dst_path;
6356  	struct btrfs_key min_key;
6357  	struct btrfs_key max_key;
6358  	struct btrfs_root *log = inode->root->log_root;
6359  	int ret;
6360  	bool fast_search = false;
6361  	u64 ino = btrfs_ino(inode);
6362  	struct extent_map_tree *em_tree = &inode->extent_tree;
6363  	u64 logged_isize = 0;
6364  	bool need_log_inode_item = true;
6365  	bool xattrs_logged = false;
6366  	bool inode_item_dropped = true;
6367  	bool full_dir_logging = false;
6368  	LIST_HEAD(delayed_ins_list);
6369  	LIST_HEAD(delayed_del_list);
6370  
6371  	path = btrfs_alloc_path();
6372  	if (!path)
6373  		return -ENOMEM;
6374  	dst_path = btrfs_alloc_path();
6375  	if (!dst_path) {
6376  		btrfs_free_path(path);
6377  		return -ENOMEM;
6378  	}
6379  
6380  	min_key.objectid = ino;
6381  	min_key.type = BTRFS_INODE_ITEM_KEY;
6382  	min_key.offset = 0;
6383  
6384  	max_key.objectid = ino;
6385  
6386  
6387  	/* today the code can only do partial logging of directories */
6388  	if (S_ISDIR(inode->vfs_inode.i_mode) ||
6389  	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6390  		       &inode->runtime_flags) &&
6391  	     inode_only >= LOG_INODE_EXISTS))
6392  		max_key.type = BTRFS_XATTR_ITEM_KEY;
6393  	else
6394  		max_key.type = (u8)-1;
6395  	max_key.offset = (u64)-1;
6396  
6397  	if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6398  		full_dir_logging = true;
6399  
6400  	/*
6401  	 * If we are logging a directory while we are logging dentries of the
6402  	 * delayed items of some other inode, then we need to flush the delayed
6403  	 * items of this directory and not log the delayed items directly. This
6404  	 * is to prevent more than one level of recursion into btrfs_log_inode()
6405  	 * by having something like this:
6406  	 *
6407  	 *     $ mkdir -p a/b/c/d/e/f/g/h/...
6408  	 *     $ xfs_io -c "fsync" a
6409  	 *
6410  	 * Where all directories in the path did not exist before and are
6411  	 * created in the current transaction.
6412  	 * So in such a case we directly log the delayed items of the main
6413  	 * directory ("a") without flushing them first, while for each of its
6414  	 * subdirectories we flush their delayed items before logging them.
6415  	 * This prevents a potential unbounded recursion like this:
6416  	 *
6417  	 * btrfs_log_inode()
6418  	 *   log_new_delayed_dentries()
6419  	 *      btrfs_log_inode()
6420  	 *        log_new_delayed_dentries()
6421  	 *          btrfs_log_inode()
6422  	 *            log_new_delayed_dentries()
6423  	 *              (...)
6424  	 *
6425  	 * We have thresholds for the maximum number of delayed items to have in
6426  	 * memory, and once they are hit, the items are flushed asynchronously.
6427  	 * However the limit is quite high, so lets prevent deep levels of
6428  	 * recursion to happen by limiting the maximum depth to be 1.
6429  	 */
6430  	if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6431  		ret = btrfs_commit_inode_delayed_items(trans, inode);
6432  		if (ret)
6433  			goto out;
6434  	}
6435  
6436  	mutex_lock(&inode->log_mutex);
6437  
6438  	/*
6439  	 * For symlinks, we must always log their content, which is stored in an
6440  	 * inline extent, otherwise we could end up with an empty symlink after
6441  	 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6442  	 * one attempts to create an empty symlink).
6443  	 * We don't need to worry about flushing delalloc, because when we create
6444  	 * the inline extent when the symlink is created (we never have delalloc
6445  	 * for symlinks).
6446  	 */
6447  	if (S_ISLNK(inode->vfs_inode.i_mode))
6448  		inode_only = LOG_INODE_ALL;
6449  
6450  	/*
6451  	 * Before logging the inode item, cache the value returned by
6452  	 * inode_logged(), because after that we have the need to figure out if
6453  	 * the inode was previously logged in this transaction.
6454  	 */
6455  	ret = inode_logged(trans, inode, path);
6456  	if (ret < 0)
6457  		goto out_unlock;
6458  	ctx->logged_before = (ret == 1);
6459  	ret = 0;
6460  
6461  	/*
6462  	 * This is for cases where logging a directory could result in losing a
6463  	 * a file after replaying the log. For example, if we move a file from a
6464  	 * directory A to a directory B, then fsync directory A, we have no way
6465  	 * to known the file was moved from A to B, so logging just A would
6466  	 * result in losing the file after a log replay.
6467  	 */
6468  	if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6469  		ret = BTRFS_LOG_FORCE_COMMIT;
6470  		goto out_unlock;
6471  	}
6472  
6473  	/*
6474  	 * a brute force approach to making sure we get the most uptodate
6475  	 * copies of everything.
6476  	 */
6477  	if (S_ISDIR(inode->vfs_inode.i_mode)) {
6478  		clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
6479  		if (ctx->logged_before)
6480  			ret = drop_inode_items(trans, log, path, inode,
6481  					       BTRFS_XATTR_ITEM_KEY);
6482  	} else {
6483  		if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6484  			/*
6485  			 * Make sure the new inode item we write to the log has
6486  			 * the same isize as the current one (if it exists).
6487  			 * This is necessary to prevent data loss after log
6488  			 * replay, and also to prevent doing a wrong expanding
6489  			 * truncate - for e.g. create file, write 4K into offset
6490  			 * 0, fsync, write 4K into offset 4096, add hard link,
6491  			 * fsync some other file (to sync log), power fail - if
6492  			 * we use the inode's current i_size, after log replay
6493  			 * we get a 8Kb file, with the last 4Kb extent as a hole
6494  			 * (zeroes), as if an expanding truncate happened,
6495  			 * instead of getting a file of 4Kb only.
6496  			 */
6497  			ret = logged_inode_size(log, inode, path, &logged_isize);
6498  			if (ret)
6499  				goto out_unlock;
6500  		}
6501  		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6502  			     &inode->runtime_flags)) {
6503  			if (inode_only == LOG_INODE_EXISTS) {
6504  				max_key.type = BTRFS_XATTR_ITEM_KEY;
6505  				if (ctx->logged_before)
6506  					ret = drop_inode_items(trans, log, path,
6507  							       inode, max_key.type);
6508  			} else {
6509  				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6510  					  &inode->runtime_flags);
6511  				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6512  					  &inode->runtime_flags);
6513  				if (ctx->logged_before)
6514  					ret = truncate_inode_items(trans, log,
6515  								   inode, 0, 0);
6516  			}
6517  		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6518  					      &inode->runtime_flags) ||
6519  			   inode_only == LOG_INODE_EXISTS) {
6520  			if (inode_only == LOG_INODE_ALL)
6521  				fast_search = true;
6522  			max_key.type = BTRFS_XATTR_ITEM_KEY;
6523  			if (ctx->logged_before)
6524  				ret = drop_inode_items(trans, log, path, inode,
6525  						       max_key.type);
6526  		} else {
6527  			if (inode_only == LOG_INODE_ALL)
6528  				fast_search = true;
6529  			inode_item_dropped = false;
6530  			goto log_extents;
6531  		}
6532  
6533  	}
6534  	if (ret)
6535  		goto out_unlock;
6536  
6537  	/*
6538  	 * If we are logging a directory in full mode, collect the delayed items
6539  	 * before iterating the subvolume tree, so that we don't miss any new
6540  	 * dir index items in case they get flushed while or right after we are
6541  	 * iterating the subvolume tree.
6542  	 */
6543  	if (full_dir_logging && !ctx->logging_new_delayed_dentries)
6544  		btrfs_log_get_delayed_items(inode, &delayed_ins_list,
6545  					    &delayed_del_list);
6546  
6547  	ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
6548  				      path, dst_path, logged_isize,
6549  				      inode_only, ctx,
6550  				      &need_log_inode_item);
6551  	if (ret)
6552  		goto out_unlock;
6553  
6554  	btrfs_release_path(path);
6555  	btrfs_release_path(dst_path);
6556  	ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
6557  	if (ret)
6558  		goto out_unlock;
6559  	xattrs_logged = true;
6560  	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
6561  		btrfs_release_path(path);
6562  		btrfs_release_path(dst_path);
6563  		ret = btrfs_log_holes(trans, inode, path);
6564  		if (ret)
6565  			goto out_unlock;
6566  	}
6567  log_extents:
6568  	btrfs_release_path(path);
6569  	btrfs_release_path(dst_path);
6570  	if (need_log_inode_item) {
6571  		ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
6572  		if (ret)
6573  			goto out_unlock;
6574  		/*
6575  		 * If we are doing a fast fsync and the inode was logged before
6576  		 * in this transaction, we don't need to log the xattrs because
6577  		 * they were logged before. If xattrs were added, changed or
6578  		 * deleted since the last time we logged the inode, then we have
6579  		 * already logged them because the inode had the runtime flag
6580  		 * BTRFS_INODE_COPY_EVERYTHING set.
6581  		 */
6582  		if (!xattrs_logged && inode->logged_trans < trans->transid) {
6583  			ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
6584  			if (ret)
6585  				goto out_unlock;
6586  			btrfs_release_path(path);
6587  		}
6588  	}
6589  	if (fast_search) {
6590  		ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
6591  		if (ret)
6592  			goto out_unlock;
6593  	} else if (inode_only == LOG_INODE_ALL) {
6594  		struct extent_map *em, *n;
6595  
6596  		write_lock(&em_tree->lock);
6597  		list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
6598  			list_del_init(&em->list);
6599  		write_unlock(&em_tree->lock);
6600  	}
6601  
6602  	if (full_dir_logging) {
6603  		ret = log_directory_changes(trans, inode, path, dst_path, ctx);
6604  		if (ret)
6605  			goto out_unlock;
6606  		ret = log_delayed_insertion_items(trans, inode, path,
6607  						  &delayed_ins_list, ctx);
6608  		if (ret)
6609  			goto out_unlock;
6610  		ret = log_delayed_deletion_items(trans, inode, path,
6611  						 &delayed_del_list, ctx);
6612  		if (ret)
6613  			goto out_unlock;
6614  	}
6615  
6616  	spin_lock(&inode->lock);
6617  	inode->logged_trans = trans->transid;
6618  	/*
6619  	 * Don't update last_log_commit if we logged that an inode exists.
6620  	 * We do this for three reasons:
6621  	 *
6622  	 * 1) We might have had buffered writes to this inode that were
6623  	 *    flushed and had their ordered extents completed in this
6624  	 *    transaction, but we did not previously log the inode with
6625  	 *    LOG_INODE_ALL. Later the inode was evicted and after that
6626  	 *    it was loaded again and this LOG_INODE_EXISTS log operation
6627  	 *    happened. We must make sure that if an explicit fsync against
6628  	 *    the inode is performed later, it logs the new extents, an
6629  	 *    updated inode item, etc, and syncs the log. The same logic
6630  	 *    applies to direct IO writes instead of buffered writes.
6631  	 *
6632  	 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
6633  	 *    is logged with an i_size of 0 or whatever value was logged
6634  	 *    before. If later the i_size of the inode is increased by a
6635  	 *    truncate operation, the log is synced through an fsync of
6636  	 *    some other inode and then finally an explicit fsync against
6637  	 *    this inode is made, we must make sure this fsync logs the
6638  	 *    inode with the new i_size, the hole between old i_size and
6639  	 *    the new i_size, and syncs the log.
6640  	 *
6641  	 * 3) If we are logging that an ancestor inode exists as part of
6642  	 *    logging a new name from a link or rename operation, don't update
6643  	 *    its last_log_commit - otherwise if an explicit fsync is made
6644  	 *    against an ancestor, the fsync considers the inode in the log
6645  	 *    and doesn't sync the log, resulting in the ancestor missing after
6646  	 *    a power failure unless the log was synced as part of an fsync
6647  	 *    against any other unrelated inode.
6648  	 */
6649  	if (inode_only != LOG_INODE_EXISTS)
6650  		inode->last_log_commit = inode->last_sub_trans;
6651  	spin_unlock(&inode->lock);
6652  
6653  	/*
6654  	 * Reset the last_reflink_trans so that the next fsync does not need to
6655  	 * go through the slower path when logging extents and their checksums.
6656  	 */
6657  	if (inode_only == LOG_INODE_ALL)
6658  		inode->last_reflink_trans = 0;
6659  
6660  out_unlock:
6661  	mutex_unlock(&inode->log_mutex);
6662  out:
6663  	btrfs_free_path(path);
6664  	btrfs_free_path(dst_path);
6665  
6666  	if (ret)
6667  		free_conflicting_inodes(ctx);
6668  	else
6669  		ret = log_conflicting_inodes(trans, inode->root, ctx);
6670  
6671  	if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
6672  		if (!ret)
6673  			ret = log_new_delayed_dentries(trans, inode,
6674  						       &delayed_ins_list, ctx);
6675  
6676  		btrfs_log_put_delayed_items(inode, &delayed_ins_list,
6677  					    &delayed_del_list);
6678  	}
6679  
6680  	return ret;
6681  }
6682  
btrfs_log_all_parents(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_log_ctx * ctx)6683  static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
6684  				 struct btrfs_inode *inode,
6685  				 struct btrfs_log_ctx *ctx)
6686  {
6687  	int ret;
6688  	struct btrfs_path *path;
6689  	struct btrfs_key key;
6690  	struct btrfs_root *root = inode->root;
6691  	const u64 ino = btrfs_ino(inode);
6692  
6693  	path = btrfs_alloc_path();
6694  	if (!path)
6695  		return -ENOMEM;
6696  	path->skip_locking = 1;
6697  	path->search_commit_root = 1;
6698  
6699  	key.objectid = ino;
6700  	key.type = BTRFS_INODE_REF_KEY;
6701  	key.offset = 0;
6702  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6703  	if (ret < 0)
6704  		goto out;
6705  
6706  	while (true) {
6707  		struct extent_buffer *leaf = path->nodes[0];
6708  		int slot = path->slots[0];
6709  		u32 cur_offset = 0;
6710  		u32 item_size;
6711  		unsigned long ptr;
6712  
6713  		if (slot >= btrfs_header_nritems(leaf)) {
6714  			ret = btrfs_next_leaf(root, path);
6715  			if (ret < 0)
6716  				goto out;
6717  			else if (ret > 0)
6718  				break;
6719  			continue;
6720  		}
6721  
6722  		btrfs_item_key_to_cpu(leaf, &key, slot);
6723  		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
6724  		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
6725  			break;
6726  
6727  		item_size = btrfs_item_size(leaf, slot);
6728  		ptr = btrfs_item_ptr_offset(leaf, slot);
6729  		while (cur_offset < item_size) {
6730  			struct btrfs_key inode_key;
6731  			struct inode *dir_inode;
6732  
6733  			inode_key.type = BTRFS_INODE_ITEM_KEY;
6734  			inode_key.offset = 0;
6735  
6736  			if (key.type == BTRFS_INODE_EXTREF_KEY) {
6737  				struct btrfs_inode_extref *extref;
6738  
6739  				extref = (struct btrfs_inode_extref *)
6740  					(ptr + cur_offset);
6741  				inode_key.objectid = btrfs_inode_extref_parent(
6742  					leaf, extref);
6743  				cur_offset += sizeof(*extref);
6744  				cur_offset += btrfs_inode_extref_name_len(leaf,
6745  					extref);
6746  			} else {
6747  				inode_key.objectid = key.offset;
6748  				cur_offset = item_size;
6749  			}
6750  
6751  			dir_inode = btrfs_iget_logging(inode_key.objectid, root);
6752  			/*
6753  			 * If the parent inode was deleted, return an error to
6754  			 * fallback to a transaction commit. This is to prevent
6755  			 * getting an inode that was moved from one parent A to
6756  			 * a parent B, got its former parent A deleted and then
6757  			 * it got fsync'ed, from existing at both parents after
6758  			 * a log replay (and the old parent still existing).
6759  			 * Example:
6760  			 *
6761  			 * mkdir /mnt/A
6762  			 * mkdir /mnt/B
6763  			 * touch /mnt/B/bar
6764  			 * sync
6765  			 * mv /mnt/B/bar /mnt/A/bar
6766  			 * mv -T /mnt/A /mnt/B
6767  			 * fsync /mnt/B/bar
6768  			 * <power fail>
6769  			 *
6770  			 * If we ignore the old parent B which got deleted,
6771  			 * after a log replay we would have file bar linked
6772  			 * at both parents and the old parent B would still
6773  			 * exist.
6774  			 */
6775  			if (IS_ERR(dir_inode)) {
6776  				ret = PTR_ERR(dir_inode);
6777  				goto out;
6778  			}
6779  
6780  			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
6781  				btrfs_add_delayed_iput(BTRFS_I(dir_inode));
6782  				continue;
6783  			}
6784  
6785  			ctx->log_new_dentries = false;
6786  			ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
6787  					      LOG_INODE_ALL, ctx);
6788  			if (!ret && ctx->log_new_dentries)
6789  				ret = log_new_dir_dentries(trans,
6790  						   BTRFS_I(dir_inode), ctx);
6791  			btrfs_add_delayed_iput(BTRFS_I(dir_inode));
6792  			if (ret)
6793  				goto out;
6794  		}
6795  		path->slots[0]++;
6796  	}
6797  	ret = 0;
6798  out:
6799  	btrfs_free_path(path);
6800  	return ret;
6801  }
6802  
log_new_ancestors(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct btrfs_log_ctx * ctx)6803  static int log_new_ancestors(struct btrfs_trans_handle *trans,
6804  			     struct btrfs_root *root,
6805  			     struct btrfs_path *path,
6806  			     struct btrfs_log_ctx *ctx)
6807  {
6808  	struct btrfs_key found_key;
6809  
6810  	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
6811  
6812  	while (true) {
6813  		struct extent_buffer *leaf;
6814  		int slot;
6815  		struct btrfs_key search_key;
6816  		struct inode *inode;
6817  		u64 ino;
6818  		int ret = 0;
6819  
6820  		btrfs_release_path(path);
6821  
6822  		ino = found_key.offset;
6823  
6824  		search_key.objectid = found_key.offset;
6825  		search_key.type = BTRFS_INODE_ITEM_KEY;
6826  		search_key.offset = 0;
6827  		inode = btrfs_iget_logging(ino, root);
6828  		if (IS_ERR(inode))
6829  			return PTR_ERR(inode);
6830  
6831  		if (BTRFS_I(inode)->generation >= trans->transid &&
6832  		    need_log_inode(trans, BTRFS_I(inode)))
6833  			ret = btrfs_log_inode(trans, BTRFS_I(inode),
6834  					      LOG_INODE_EXISTS, ctx);
6835  		btrfs_add_delayed_iput(BTRFS_I(inode));
6836  		if (ret)
6837  			return ret;
6838  
6839  		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
6840  			break;
6841  
6842  		search_key.type = BTRFS_INODE_REF_KEY;
6843  		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6844  		if (ret < 0)
6845  			return ret;
6846  
6847  		leaf = path->nodes[0];
6848  		slot = path->slots[0];
6849  		if (slot >= btrfs_header_nritems(leaf)) {
6850  			ret = btrfs_next_leaf(root, path);
6851  			if (ret < 0)
6852  				return ret;
6853  			else if (ret > 0)
6854  				return -ENOENT;
6855  			leaf = path->nodes[0];
6856  			slot = path->slots[0];
6857  		}
6858  
6859  		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6860  		if (found_key.objectid != search_key.objectid ||
6861  		    found_key.type != BTRFS_INODE_REF_KEY)
6862  			return -ENOENT;
6863  	}
6864  	return 0;
6865  }
6866  
log_new_ancestors_fast(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct dentry * parent,struct btrfs_log_ctx * ctx)6867  static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6868  				  struct btrfs_inode *inode,
6869  				  struct dentry *parent,
6870  				  struct btrfs_log_ctx *ctx)
6871  {
6872  	struct btrfs_root *root = inode->root;
6873  	struct dentry *old_parent = NULL;
6874  	struct super_block *sb = inode->vfs_inode.i_sb;
6875  	int ret = 0;
6876  
6877  	while (true) {
6878  		if (!parent || d_really_is_negative(parent) ||
6879  		    sb != parent->d_sb)
6880  			break;
6881  
6882  		inode = BTRFS_I(d_inode(parent));
6883  		if (root != inode->root)
6884  			break;
6885  
6886  		if (inode->generation >= trans->transid &&
6887  		    need_log_inode(trans, inode)) {
6888  			ret = btrfs_log_inode(trans, inode,
6889  					      LOG_INODE_EXISTS, ctx);
6890  			if (ret)
6891  				break;
6892  		}
6893  		if (IS_ROOT(parent))
6894  			break;
6895  
6896  		parent = dget_parent(parent);
6897  		dput(old_parent);
6898  		old_parent = parent;
6899  	}
6900  	dput(old_parent);
6901  
6902  	return ret;
6903  }
6904  
log_all_new_ancestors(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct dentry * parent,struct btrfs_log_ctx * ctx)6905  static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6906  				 struct btrfs_inode *inode,
6907  				 struct dentry *parent,
6908  				 struct btrfs_log_ctx *ctx)
6909  {
6910  	struct btrfs_root *root = inode->root;
6911  	const u64 ino = btrfs_ino(inode);
6912  	struct btrfs_path *path;
6913  	struct btrfs_key search_key;
6914  	int ret;
6915  
6916  	/*
6917  	 * For a single hard link case, go through a fast path that does not
6918  	 * need to iterate the fs/subvolume tree.
6919  	 */
6920  	if (inode->vfs_inode.i_nlink < 2)
6921  		return log_new_ancestors_fast(trans, inode, parent, ctx);
6922  
6923  	path = btrfs_alloc_path();
6924  	if (!path)
6925  		return -ENOMEM;
6926  
6927  	search_key.objectid = ino;
6928  	search_key.type = BTRFS_INODE_REF_KEY;
6929  	search_key.offset = 0;
6930  again:
6931  	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6932  	if (ret < 0)
6933  		goto out;
6934  	if (ret == 0)
6935  		path->slots[0]++;
6936  
6937  	while (true) {
6938  		struct extent_buffer *leaf = path->nodes[0];
6939  		int slot = path->slots[0];
6940  		struct btrfs_key found_key;
6941  
6942  		if (slot >= btrfs_header_nritems(leaf)) {
6943  			ret = btrfs_next_leaf(root, path);
6944  			if (ret < 0)
6945  				goto out;
6946  			else if (ret > 0)
6947  				break;
6948  			continue;
6949  		}
6950  
6951  		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6952  		if (found_key.objectid != ino ||
6953  		    found_key.type > BTRFS_INODE_EXTREF_KEY)
6954  			break;
6955  
6956  		/*
6957  		 * Don't deal with extended references because they are rare
6958  		 * cases and too complex to deal with (we would need to keep
6959  		 * track of which subitem we are processing for each item in
6960  		 * this loop, etc). So just return some error to fallback to
6961  		 * a transaction commit.
6962  		 */
6963  		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6964  			ret = -EMLINK;
6965  			goto out;
6966  		}
6967  
6968  		/*
6969  		 * Logging ancestors needs to do more searches on the fs/subvol
6970  		 * tree, so it releases the path as needed to avoid deadlocks.
6971  		 * Keep track of the last inode ref key and resume from that key
6972  		 * after logging all new ancestors for the current hard link.
6973  		 */
6974  		memcpy(&search_key, &found_key, sizeof(search_key));
6975  
6976  		ret = log_new_ancestors(trans, root, path, ctx);
6977  		if (ret)
6978  			goto out;
6979  		btrfs_release_path(path);
6980  		goto again;
6981  	}
6982  	ret = 0;
6983  out:
6984  	btrfs_free_path(path);
6985  	return ret;
6986  }
6987  
6988  /*
6989   * helper function around btrfs_log_inode to make sure newly created
6990   * parent directories also end up in the log.  A minimal inode and backref
6991   * only logging is done of any parent directories that are older than
6992   * the last committed transaction
6993   */
btrfs_log_inode_parent(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct dentry * parent,int inode_only,struct btrfs_log_ctx * ctx)6994  static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6995  				  struct btrfs_inode *inode,
6996  				  struct dentry *parent,
6997  				  int inode_only,
6998  				  struct btrfs_log_ctx *ctx)
6999  {
7000  	struct btrfs_root *root = inode->root;
7001  	struct btrfs_fs_info *fs_info = root->fs_info;
7002  	int ret = 0;
7003  	bool log_dentries = false;
7004  
7005  	if (btrfs_test_opt(fs_info, NOTREELOG)) {
7006  		ret = BTRFS_LOG_FORCE_COMMIT;
7007  		goto end_no_trans;
7008  	}
7009  
7010  	if (btrfs_root_refs(&root->root_item) == 0) {
7011  		ret = BTRFS_LOG_FORCE_COMMIT;
7012  		goto end_no_trans;
7013  	}
7014  
7015  	/*
7016  	 * Skip already logged inodes or inodes corresponding to tmpfiles
7017  	 * (since logging them is pointless, a link count of 0 means they
7018  	 * will never be accessible).
7019  	 */
7020  	if ((btrfs_inode_in_log(inode, trans->transid) &&
7021  	     list_empty(&ctx->ordered_extents)) ||
7022  	    inode->vfs_inode.i_nlink == 0) {
7023  		ret = BTRFS_NO_LOG_SYNC;
7024  		goto end_no_trans;
7025  	}
7026  
7027  	ret = start_log_trans(trans, root, ctx);
7028  	if (ret)
7029  		goto end_no_trans;
7030  
7031  	ret = btrfs_log_inode(trans, inode, inode_only, ctx);
7032  	if (ret)
7033  		goto end_trans;
7034  
7035  	/*
7036  	 * for regular files, if its inode is already on disk, we don't
7037  	 * have to worry about the parents at all.  This is because
7038  	 * we can use the last_unlink_trans field to record renames
7039  	 * and other fun in this file.
7040  	 */
7041  	if (S_ISREG(inode->vfs_inode.i_mode) &&
7042  	    inode->generation < trans->transid &&
7043  	    inode->last_unlink_trans < trans->transid) {
7044  		ret = 0;
7045  		goto end_trans;
7046  	}
7047  
7048  	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
7049  		log_dentries = true;
7050  
7051  	/*
7052  	 * On unlink we must make sure all our current and old parent directory
7053  	 * inodes are fully logged. This is to prevent leaving dangling
7054  	 * directory index entries in directories that were our parents but are
7055  	 * not anymore. Not doing this results in old parent directory being
7056  	 * impossible to delete after log replay (rmdir will always fail with
7057  	 * error -ENOTEMPTY).
7058  	 *
7059  	 * Example 1:
7060  	 *
7061  	 * mkdir testdir
7062  	 * touch testdir/foo
7063  	 * ln testdir/foo testdir/bar
7064  	 * sync
7065  	 * unlink testdir/bar
7066  	 * xfs_io -c fsync testdir/foo
7067  	 * <power failure>
7068  	 * mount fs, triggers log replay
7069  	 *
7070  	 * If we don't log the parent directory (testdir), after log replay the
7071  	 * directory still has an entry pointing to the file inode using the bar
7072  	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
7073  	 * the file inode has a link count of 1.
7074  	 *
7075  	 * Example 2:
7076  	 *
7077  	 * mkdir testdir
7078  	 * touch foo
7079  	 * ln foo testdir/foo2
7080  	 * ln foo testdir/foo3
7081  	 * sync
7082  	 * unlink testdir/foo3
7083  	 * xfs_io -c fsync foo
7084  	 * <power failure>
7085  	 * mount fs, triggers log replay
7086  	 *
7087  	 * Similar as the first example, after log replay the parent directory
7088  	 * testdir still has an entry pointing to the inode file with name foo3
7089  	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
7090  	 * and has a link count of 2.
7091  	 */
7092  	if (inode->last_unlink_trans >= trans->transid) {
7093  		ret = btrfs_log_all_parents(trans, inode, ctx);
7094  		if (ret)
7095  			goto end_trans;
7096  	}
7097  
7098  	ret = log_all_new_ancestors(trans, inode, parent, ctx);
7099  	if (ret)
7100  		goto end_trans;
7101  
7102  	if (log_dentries)
7103  		ret = log_new_dir_dentries(trans, inode, ctx);
7104  	else
7105  		ret = 0;
7106  end_trans:
7107  	if (ret < 0) {
7108  		btrfs_set_log_full_commit(trans);
7109  		ret = BTRFS_LOG_FORCE_COMMIT;
7110  	}
7111  
7112  	if (ret)
7113  		btrfs_remove_log_ctx(root, ctx);
7114  	btrfs_end_log_trans(root);
7115  end_no_trans:
7116  	return ret;
7117  }
7118  
7119  /*
7120   * it is not safe to log dentry if the chunk root has added new
7121   * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
7122   * If this returns 1, you must commit the transaction to safely get your
7123   * data on disk.
7124   */
btrfs_log_dentry_safe(struct btrfs_trans_handle * trans,struct dentry * dentry,struct btrfs_log_ctx * ctx)7125  int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7126  			  struct dentry *dentry,
7127  			  struct btrfs_log_ctx *ctx)
7128  {
7129  	struct dentry *parent = dget_parent(dentry);
7130  	int ret;
7131  
7132  	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7133  				     LOG_INODE_ALL, ctx);
7134  	dput(parent);
7135  
7136  	return ret;
7137  }
7138  
7139  /*
7140   * should be called during mount to recover any replay any log trees
7141   * from the FS
7142   */
btrfs_recover_log_trees(struct btrfs_root * log_root_tree)7143  int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7144  {
7145  	int ret;
7146  	struct btrfs_path *path;
7147  	struct btrfs_trans_handle *trans;
7148  	struct btrfs_key key;
7149  	struct btrfs_key found_key;
7150  	struct btrfs_root *log;
7151  	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7152  	struct walk_control wc = {
7153  		.process_func = process_one_buffer,
7154  		.stage = LOG_WALK_PIN_ONLY,
7155  	};
7156  
7157  	path = btrfs_alloc_path();
7158  	if (!path)
7159  		return -ENOMEM;
7160  
7161  	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7162  
7163  	trans = btrfs_start_transaction(fs_info->tree_root, 0);
7164  	if (IS_ERR(trans)) {
7165  		ret = PTR_ERR(trans);
7166  		goto error;
7167  	}
7168  
7169  	wc.trans = trans;
7170  	wc.pin = 1;
7171  
7172  	ret = walk_log_tree(trans, log_root_tree, &wc);
7173  	if (ret) {
7174  		btrfs_abort_transaction(trans, ret);
7175  		goto error;
7176  	}
7177  
7178  again:
7179  	key.objectid = BTRFS_TREE_LOG_OBJECTID;
7180  	key.offset = (u64)-1;
7181  	key.type = BTRFS_ROOT_ITEM_KEY;
7182  
7183  	while (1) {
7184  		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
7185  
7186  		if (ret < 0) {
7187  			btrfs_abort_transaction(trans, ret);
7188  			goto error;
7189  		}
7190  		if (ret > 0) {
7191  			if (path->slots[0] == 0)
7192  				break;
7193  			path->slots[0]--;
7194  		}
7195  		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
7196  				      path->slots[0]);
7197  		btrfs_release_path(path);
7198  		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7199  			break;
7200  
7201  		log = btrfs_read_tree_root(log_root_tree, &found_key);
7202  		if (IS_ERR(log)) {
7203  			ret = PTR_ERR(log);
7204  			btrfs_abort_transaction(trans, ret);
7205  			goto error;
7206  		}
7207  
7208  		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
7209  						   true);
7210  		if (IS_ERR(wc.replay_dest)) {
7211  			ret = PTR_ERR(wc.replay_dest);
7212  
7213  			/*
7214  			 * We didn't find the subvol, likely because it was
7215  			 * deleted.  This is ok, simply skip this log and go to
7216  			 * the next one.
7217  			 *
7218  			 * We need to exclude the root because we can't have
7219  			 * other log replays overwriting this log as we'll read
7220  			 * it back in a few more times.  This will keep our
7221  			 * block from being modified, and we'll just bail for
7222  			 * each subsequent pass.
7223  			 */
7224  			if (ret == -ENOENT)
7225  				ret = btrfs_pin_extent_for_log_replay(trans,
7226  							log->node->start,
7227  							log->node->len);
7228  			btrfs_put_root(log);
7229  
7230  			if (!ret)
7231  				goto next;
7232  			btrfs_abort_transaction(trans, ret);
7233  			goto error;
7234  		}
7235  
7236  		wc.replay_dest->log_root = log;
7237  		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
7238  		if (ret)
7239  			/* The loop needs to continue due to the root refs */
7240  			btrfs_abort_transaction(trans, ret);
7241  		else
7242  			ret = walk_log_tree(trans, log, &wc);
7243  
7244  		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7245  			ret = fixup_inode_link_counts(trans, wc.replay_dest,
7246  						      path);
7247  			if (ret)
7248  				btrfs_abort_transaction(trans, ret);
7249  		}
7250  
7251  		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7252  			struct btrfs_root *root = wc.replay_dest;
7253  
7254  			btrfs_release_path(path);
7255  
7256  			/*
7257  			 * We have just replayed everything, and the highest
7258  			 * objectid of fs roots probably has changed in case
7259  			 * some inode_item's got replayed.
7260  			 *
7261  			 * root->objectid_mutex is not acquired as log replay
7262  			 * could only happen during mount.
7263  			 */
7264  			ret = btrfs_init_root_free_objectid(root);
7265  			if (ret)
7266  				btrfs_abort_transaction(trans, ret);
7267  		}
7268  
7269  		wc.replay_dest->log_root = NULL;
7270  		btrfs_put_root(wc.replay_dest);
7271  		btrfs_put_root(log);
7272  
7273  		if (ret)
7274  			goto error;
7275  next:
7276  		if (found_key.offset == 0)
7277  			break;
7278  		key.offset = found_key.offset - 1;
7279  	}
7280  	btrfs_release_path(path);
7281  
7282  	/* step one is to pin it all, step two is to replay just inodes */
7283  	if (wc.pin) {
7284  		wc.pin = 0;
7285  		wc.process_func = replay_one_buffer;
7286  		wc.stage = LOG_WALK_REPLAY_INODES;
7287  		goto again;
7288  	}
7289  	/* step three is to replay everything */
7290  	if (wc.stage < LOG_WALK_REPLAY_ALL) {
7291  		wc.stage++;
7292  		goto again;
7293  	}
7294  
7295  	btrfs_free_path(path);
7296  
7297  	/* step 4: commit the transaction, which also unpins the blocks */
7298  	ret = btrfs_commit_transaction(trans);
7299  	if (ret)
7300  		return ret;
7301  
7302  	log_root_tree->log_root = NULL;
7303  	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7304  	btrfs_put_root(log_root_tree);
7305  
7306  	return 0;
7307  error:
7308  	if (wc.trans)
7309  		btrfs_end_transaction(wc.trans);
7310  	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7311  	btrfs_free_path(path);
7312  	return ret;
7313  }
7314  
7315  /*
7316   * there are some corner cases where we want to force a full
7317   * commit instead of allowing a directory to be logged.
7318   *
7319   * They revolve around files there were unlinked from the directory, and
7320   * this function updates the parent directory so that a full commit is
7321   * properly done if it is fsync'd later after the unlinks are done.
7322   *
7323   * Must be called before the unlink operations (updates to the subvolume tree,
7324   * inodes, etc) are done.
7325   */
btrfs_record_unlink_dir(struct btrfs_trans_handle * trans,struct btrfs_inode * dir,struct btrfs_inode * inode,bool for_rename)7326  void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
7327  			     struct btrfs_inode *dir, struct btrfs_inode *inode,
7328  			     bool for_rename)
7329  {
7330  	/*
7331  	 * when we're logging a file, if it hasn't been renamed
7332  	 * or unlinked, and its inode is fully committed on disk,
7333  	 * we don't have to worry about walking up the directory chain
7334  	 * to log its parents.
7335  	 *
7336  	 * So, we use the last_unlink_trans field to put this transid
7337  	 * into the file.  When the file is logged we check it and
7338  	 * don't log the parents if the file is fully on disk.
7339  	 */
7340  	mutex_lock(&inode->log_mutex);
7341  	inode->last_unlink_trans = trans->transid;
7342  	mutex_unlock(&inode->log_mutex);
7343  
7344  	if (!for_rename)
7345  		return;
7346  
7347  	/*
7348  	 * If this directory was already logged, any new names will be logged
7349  	 * with btrfs_log_new_name() and old names will be deleted from the log
7350  	 * tree with btrfs_del_dir_entries_in_log() or with
7351  	 * btrfs_del_inode_ref_in_log().
7352  	 */
7353  	if (inode_logged(trans, dir, NULL) == 1)
7354  		return;
7355  
7356  	/*
7357  	 * If the inode we're about to unlink was logged before, the log will be
7358  	 * properly updated with the new name with btrfs_log_new_name() and the
7359  	 * old name removed with btrfs_del_dir_entries_in_log() or with
7360  	 * btrfs_del_inode_ref_in_log().
7361  	 */
7362  	if (inode_logged(trans, inode, NULL) == 1)
7363  		return;
7364  
7365  	/*
7366  	 * when renaming files across directories, if the directory
7367  	 * there we're unlinking from gets fsync'd later on, there's
7368  	 * no way to find the destination directory later and fsync it
7369  	 * properly.  So, we have to be conservative and force commits
7370  	 * so the new name gets discovered.
7371  	 */
7372  	mutex_lock(&dir->log_mutex);
7373  	dir->last_unlink_trans = trans->transid;
7374  	mutex_unlock(&dir->log_mutex);
7375  }
7376  
7377  /*
7378   * Make sure that if someone attempts to fsync the parent directory of a deleted
7379   * snapshot, it ends up triggering a transaction commit. This is to guarantee
7380   * that after replaying the log tree of the parent directory's root we will not
7381   * see the snapshot anymore and at log replay time we will not see any log tree
7382   * corresponding to the deleted snapshot's root, which could lead to replaying
7383   * it after replaying the log tree of the parent directory (which would replay
7384   * the snapshot delete operation).
7385   *
7386   * Must be called before the actual snapshot destroy operation (updates to the
7387   * parent root and tree of tree roots trees, etc) are done.
7388   */
btrfs_record_snapshot_destroy(struct btrfs_trans_handle * trans,struct btrfs_inode * dir)7389  void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
7390  				   struct btrfs_inode *dir)
7391  {
7392  	mutex_lock(&dir->log_mutex);
7393  	dir->last_unlink_trans = trans->transid;
7394  	mutex_unlock(&dir->log_mutex);
7395  }
7396  
7397  /*
7398   * Update the log after adding a new name for an inode.
7399   *
7400   * @trans:              Transaction handle.
7401   * @old_dentry:         The dentry associated with the old name and the old
7402   *                      parent directory.
7403   * @old_dir:            The inode of the previous parent directory for the case
7404   *                      of a rename. For a link operation, it must be NULL.
7405   * @old_dir_index:      The index number associated with the old name, meaningful
7406   *                      only for rename operations (when @old_dir is not NULL).
7407   *                      Ignored for link operations.
7408   * @parent:             The dentry associated with the directory under which the
7409   *                      new name is located.
7410   *
7411   * Call this after adding a new name for an inode, as a result of a link or
7412   * rename operation, and it will properly update the log to reflect the new name.
7413   */
btrfs_log_new_name(struct btrfs_trans_handle * trans,struct dentry * old_dentry,struct btrfs_inode * old_dir,u64 old_dir_index,struct dentry * parent)7414  void btrfs_log_new_name(struct btrfs_trans_handle *trans,
7415  			struct dentry *old_dentry, struct btrfs_inode *old_dir,
7416  			u64 old_dir_index, struct dentry *parent)
7417  {
7418  	struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
7419  	struct btrfs_root *root = inode->root;
7420  	struct btrfs_log_ctx ctx;
7421  	bool log_pinned = false;
7422  	int ret;
7423  
7424  	/*
7425  	 * this will force the logging code to walk the dentry chain
7426  	 * up for the file
7427  	 */
7428  	if (!S_ISDIR(inode->vfs_inode.i_mode))
7429  		inode->last_unlink_trans = trans->transid;
7430  
7431  	/*
7432  	 * if this inode hasn't been logged and directory we're renaming it
7433  	 * from hasn't been logged, we don't need to log it
7434  	 */
7435  	ret = inode_logged(trans, inode, NULL);
7436  	if (ret < 0) {
7437  		goto out;
7438  	} else if (ret == 0) {
7439  		if (!old_dir)
7440  			return;
7441  		/*
7442  		 * If the inode was not logged and we are doing a rename (old_dir is not
7443  		 * NULL), check if old_dir was logged - if it was not we can return and
7444  		 * do nothing.
7445  		 */
7446  		ret = inode_logged(trans, old_dir, NULL);
7447  		if (ret < 0)
7448  			goto out;
7449  		else if (ret == 0)
7450  			return;
7451  	}
7452  	ret = 0;
7453  
7454  	/*
7455  	 * If we are doing a rename (old_dir is not NULL) from a directory that
7456  	 * was previously logged, make sure that on log replay we get the old
7457  	 * dir entry deleted. This is needed because we will also log the new
7458  	 * name of the renamed inode, so we need to make sure that after log
7459  	 * replay we don't end up with both the new and old dir entries existing.
7460  	 */
7461  	if (old_dir && old_dir->logged_trans == trans->transid) {
7462  		struct btrfs_root *log = old_dir->root->log_root;
7463  		struct btrfs_path *path;
7464  		struct fscrypt_name fname;
7465  
7466  		ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
7467  
7468  		ret = fscrypt_setup_filename(&old_dir->vfs_inode,
7469  					     &old_dentry->d_name, 0, &fname);
7470  		if (ret)
7471  			goto out;
7472  		/*
7473  		 * We have two inodes to update in the log, the old directory and
7474  		 * the inode that got renamed, so we must pin the log to prevent
7475  		 * anyone from syncing the log until we have updated both inodes
7476  		 * in the log.
7477  		 */
7478  		ret = join_running_log_trans(root);
7479  		/*
7480  		 * At least one of the inodes was logged before, so this should
7481  		 * not fail, but if it does, it's not serious, just bail out and
7482  		 * mark the log for a full commit.
7483  		 */
7484  		if (WARN_ON_ONCE(ret < 0)) {
7485  			fscrypt_free_filename(&fname);
7486  			goto out;
7487  		}
7488  
7489  		log_pinned = true;
7490  
7491  		path = btrfs_alloc_path();
7492  		if (!path) {
7493  			ret = -ENOMEM;
7494  			fscrypt_free_filename(&fname);
7495  			goto out;
7496  		}
7497  
7498  		/*
7499  		 * Other concurrent task might be logging the old directory,
7500  		 * as it can be triggered when logging other inode that had or
7501  		 * still has a dentry in the old directory. We lock the old
7502  		 * directory's log_mutex to ensure the deletion of the old
7503  		 * name is persisted, because during directory logging we
7504  		 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
7505  		 * the old name's dir index item is in the delayed items, so
7506  		 * it could be missed by an in progress directory logging.
7507  		 */
7508  		mutex_lock(&old_dir->log_mutex);
7509  		ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
7510  					&fname.disk_name, old_dir_index);
7511  		if (ret > 0) {
7512  			/*
7513  			 * The dentry does not exist in the log, so record its
7514  			 * deletion.
7515  			 */
7516  			btrfs_release_path(path);
7517  			ret = insert_dir_log_key(trans, log, path,
7518  						 btrfs_ino(old_dir),
7519  						 old_dir_index, old_dir_index);
7520  		}
7521  		mutex_unlock(&old_dir->log_mutex);
7522  
7523  		btrfs_free_path(path);
7524  		fscrypt_free_filename(&fname);
7525  		if (ret < 0)
7526  			goto out;
7527  	}
7528  
7529  	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
7530  	ctx.logging_new_name = true;
7531  	/*
7532  	 * We don't care about the return value. If we fail to log the new name
7533  	 * then we know the next attempt to sync the log will fallback to a full
7534  	 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
7535  	 * we don't need to worry about getting a log committed that has an
7536  	 * inconsistent state after a rename operation.
7537  	 */
7538  	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
7539  	ASSERT(list_empty(&ctx.conflict_inodes));
7540  out:
7541  	/*
7542  	 * If an error happened mark the log for a full commit because it's not
7543  	 * consistent and up to date or we couldn't find out if one of the
7544  	 * inodes was logged before in this transaction. Do it before unpinning
7545  	 * the log, to avoid any races with someone else trying to commit it.
7546  	 */
7547  	if (ret < 0)
7548  		btrfs_set_log_full_commit(trans);
7549  	if (log_pinned)
7550  		btrfs_end_log_trans(root);
7551  }
7552  
7553