xref: /openbmc/linux/fs/ext4/fast_commit.c (revision 36de991e)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
41  *				  during recovery. Note that iblocks field is
42  *				  not replayed and instead derived during
43  *				  replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g extended
69  * attributes). Fast commit ineligibility is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73  *   back to full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79  *   make one more fast commit to fall back to full commit after stop call so
 *   that it is guaranteed that the fast commit ineligible operation contained
81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least 1 full commit.
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88  * tag contains CRC of the contents and TID of the transaction after which
89  * this fast commit should be applied. Recovery code replays fast commit
90  * logs only if there's at least 1 valid tail present. For every fast commit
91  * operation, there is 1 tail. This means, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * Fast Commit Replay Idempotence
107  * ------------------------------
108  *
109  * Fast commits tags are idempotent in nature provided the recovery code follows
110  * certain rules. The guiding principle that the commit path follows while
111  * committing is that it stores the result of a particular operation instead of
112  * storing the procedure.
113  *
114  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
115  * was associated with inode 10. During fast commit, instead of storing this
116  * operation as a procedure "rename a to b", we store the resulting file system
117  * state as a "series" of outcomes:
118  *
119  * - Link dirent b to inode 10
120  * - Unlink dirent a
121  * - Inode <10> with valid refcount
122  *
 * Now when recovery code runs, it needs to "enforce" this state on the file
124  * system. This is what guarantees idempotence of fast commit replay.
125  *
126  * Let's take an example of a procedure that is not idempotent and see how fast
127  * commits make it idempotent. Consider following sequence of operations:
128  *
129  *     rm A;    mv B A;    read A
130  *  (x)     (y)        (z)
131  *
132  * (x), (y) and (z) are the points at which we can crash. If we store this
133  * sequence of operations as is then the replay is not idempotent. Let's say
134  * while in replay, we crash at (z). During the second replay, file A (which was
135  * actually created as a result of "mv B A" operation) would get deleted. Thus,
136  * file named A would be absent when we try to read A. So, this sequence of
137  * operations is not idempotent. However, as mentioned above, instead of storing
138  * the procedure fast commits store the outcome of each procedure. Thus the fast
139  * commit log for above procedure would be as follows:
140  *
141  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
142  * inode 11 before the replay)
143  *
144  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
145  * (w)          (x)                    (y)          (z)
146  *
147  * If we crash at (z), we will have file A linked to inode 11. During the second
148  * replay, we will remove file A (inode 11). But we will create it back and make
149  * it point to inode 11. We won't find B, so we'll just skip that step. At this
150  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
151  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
152  * similarly. Thus, by converting a non-idempotent procedure into a series of
153  * idempotent outcomes, fast commits ensured idempotence during the replay.
154  *
155  * TODOs
156  * -----
157  *
158  * 0) Fast commit replay path hardening: Fast commit replay code should use
159  *    journal handles to make sure all the updates it does during the replay
160  *    path are atomic. With that if we crash during fast commit replay, after
161  *    trying to do recovery again, we will find a file system where fast commit
162  *    area is invalid (because new full commit would be found). In order to deal
163  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
164  *    superblock state is persisted before starting the replay, so that after
165  *    the crash, fast commit recovery code can look at that flag and perform
166  *    fast commit recovery even if that area is invalidated by later full
167  *    commits.
168  *
169  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
170  *    eligible update must be protected within ext4_fc_start_update() and
171  *    ext4_fc_stop_update(). These routines are called at much higher
172  *    routines. This can be made more fine grained by combining with
173  *    ext4_journal_start().
174  *
175  * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
176  *
177  * 3) Handle more ineligible cases.
178  */
179 
180 #include <trace/events/ext4.h>
181 static struct kmem_cache *ext4_fc_dentry_cachep;
182 
183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
184 {
185 	BUFFER_TRACE(bh, "");
186 	if (uptodate) {
187 		ext4_debug("%s: Block %lld up-to-date",
188 			   __func__, bh->b_blocknr);
189 		set_buffer_uptodate(bh);
190 	} else {
191 		ext4_debug("%s: Block %lld not up-to-date",
192 			   __func__, bh->b_blocknr);
193 		clear_buffer_uptodate(bh);
194 	}
195 
196 	unlock_buffer(bh);
197 }
198 
199 static inline void ext4_fc_reset_inode(struct inode *inode)
200 {
201 	struct ext4_inode_info *ei = EXT4_I(inode);
202 
203 	ei->i_fc_lblk_start = 0;
204 	ei->i_fc_lblk_len = 0;
205 }
206 
207 void ext4_fc_init_inode(struct inode *inode)
208 {
209 	struct ext4_inode_info *ei = EXT4_I(inode);
210 
211 	ext4_fc_reset_inode(inode);
212 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
213 	INIT_LIST_HEAD(&ei->i_fc_list);
214 	init_waitqueue_head(&ei->i_fc_wait);
215 	atomic_set(&ei->i_fc_updates, 0);
216 }
217 
/*
 * Sleep once on the bit waitqueue for EXT4_STATE_FC_COMMITTING of @inode.
 *
 * This function must be called with sbi->s_fc_lock held. The lock is released
 * before sleeping and is not re-taken here, so the caller has to re-acquire
 * it and re-check the inode state itself after this function returns.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

	/*
	 * The word the wait is keyed on depends on the word size:
	 * i_state_flags when BITS_PER_LONG < 64, i_flags otherwise.
	 */
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Queue ourselves before dropping the lock, then sleep. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
242 
243 /*
244  * Inform Ext4's fast about start of an inode update
245  *
246  * This function is called by the high level call VFS callbacks before
247  * performing any inode update. This function blocks if there's an ongoing
248  * fast commit on the inode in question.
249  */
250 void ext4_fc_start_update(struct inode *inode)
251 {
252 	struct ext4_inode_info *ei = EXT4_I(inode);
253 
254 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
255 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
256 		return;
257 
258 restart:
259 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
260 	if (list_empty(&ei->i_fc_list))
261 		goto out;
262 
263 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
264 		ext4_fc_wait_committing_inode(inode);
265 		goto restart;
266 	}
267 out:
268 	atomic_inc(&ei->i_fc_updates);
269 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
270 }
271 
272 /*
273  * Stop inode update and wake up waiting fast commits if any.
274  */
275 void ext4_fc_stop_update(struct inode *inode)
276 {
277 	struct ext4_inode_info *ei = EXT4_I(inode);
278 
279 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
280 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
281 		return;
282 
283 	if (atomic_dec_and_test(&ei->i_fc_updates))
284 		wake_up_all(&ei->i_fc_wait);
285 }
286 
287 /*
288  * Remove inode from fast commit list. If the inode is being committed
289  * we wait until inode commit is done.
290  */
291 void ext4_fc_del(struct inode *inode)
292 {
293 	struct ext4_inode_info *ei = EXT4_I(inode);
294 
295 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
296 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
297 		return;
298 
299 restart:
300 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
301 	if (list_empty(&ei->i_fc_list)) {
302 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
303 		return;
304 	}
305 
306 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
307 		ext4_fc_wait_committing_inode(inode);
308 		goto restart;
309 	}
310 	list_del_init(&ei->i_fc_list);
311 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
312 }
313 
314 /*
315  * Mark file system as fast commit ineligible. This means that next commit
316  * operation would result in a full jbd2 commit.
317  */
318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
319 {
320 	struct ext4_sb_info *sbi = EXT4_SB(sb);
321 
322 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
323 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
324 		return;
325 
326 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
327 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
328 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
329 }
330 
331 /*
332  * Start a fast commit ineligible update. Any commits that happen while
333  * such an operation is in progress fall back to full commits.
334  */
335 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
336 {
337 	struct ext4_sb_info *sbi = EXT4_SB(sb);
338 
339 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
340 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
341 		return;
342 
343 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
344 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
345 	atomic_inc(&sbi->s_fc_ineligible_updates);
346 }
347 
348 /*
349  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
350  * to ensure that after stopping the ineligible update, at least one full
351  * commit takes place.
352  */
353 void ext4_fc_stop_ineligible(struct super_block *sb)
354 {
355 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
356 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
357 		return;
358 
359 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
360 	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
361 }
362 
363 static inline int ext4_fc_is_ineligible(struct super_block *sb)
364 {
365 	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
366 		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
367 }
368 
369 /*
370  * Generic fast commit tracking function. If this is the first time this we are
371  * called after a full commit, we initialize fast commit fields and then call
372  * __fc_track_fn() with update = 0. If we have already been called after a full
373  * commit, we pass update = 1. Based on that, the track function can determine
374  * if it needs to track a field for the first time or if it needs to just
375  * update the previously tracked value.
376  *
377  * If enqueue is set, this function enqueues the inode in fast commit list.
378  */
379 static int ext4_fc_track_template(
380 	handle_t *handle, struct inode *inode,
381 	int (*__fc_track_fn)(struct inode *, void *, bool),
382 	void *args, int enqueue)
383 {
384 	bool update = false;
385 	struct ext4_inode_info *ei = EXT4_I(inode);
386 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
387 	tid_t tid = 0;
388 	int ret;
389 
390 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
391 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
392 		return -EOPNOTSUPP;
393 
394 	if (ext4_fc_is_ineligible(inode->i_sb))
395 		return -EINVAL;
396 
397 	tid = handle->h_transaction->t_tid;
398 	mutex_lock(&ei->i_fc_lock);
399 	if (tid == ei->i_sync_tid) {
400 		update = true;
401 	} else {
402 		ext4_fc_reset_inode(inode);
403 		ei->i_sync_tid = tid;
404 	}
405 	ret = __fc_track_fn(inode, args, update);
406 	mutex_unlock(&ei->i_fc_lock);
407 
408 	if (!enqueue)
409 		return ret;
410 
411 	spin_lock(&sbi->s_fc_lock);
412 	if (list_empty(&EXT4_I(inode)->i_fc_list))
413 		list_add_tail(&EXT4_I(inode)->i_fc_list,
414 				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
415 				&sbi->s_fc_q[FC_Q_STAGING] :
416 				&sbi->s_fc_q[FC_Q_MAIN]);
417 	spin_unlock(&sbi->s_fc_lock);
418 
419 	return ret;
420 }
421 
/* Arguments handed to __track_dentry_update() through the void * cookie. */
struct __track_dentry_update_args {
	/* Directory entry the operation applies to. */
	struct dentry *dentry;
	/* Fast commit tag of the operation (EXT4_FC_TAG_LINK/UNLINK/CREAT). */
	int op;
};
426 
/*
 * __track_fn for directory entry updates. Called with ei->i_fc_lock.
 *
 * Builds an ext4_fc_dentry_update node describing the operation in @arg
 * (a struct __track_dentry_update_args) and appends it to the dentry
 * staging queue if a fast commit is in progress, or the main queue
 * otherwise. ei->i_fc_lock is dropped while allocating and queueing and is
 * held again on every return path.
 *
 * Returns 0 on success. On allocation failure, marks the file system fast
 * commit ineligible and returns -ENOMEM.
 */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	/* Drop the mutex for the allocations below; re-taken before return. */
	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	/*
	 * Names longer than DNAME_INLINE_LEN get their own allocation;
	 * shorter ones are copied into the node's inline buffer.
	 */
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	/* Queue on staging while a commit is running, on main otherwise. */
	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
477 
478 void __ext4_fc_track_unlink(handle_t *handle,
479 		struct inode *inode, struct dentry *dentry)
480 {
481 	struct __track_dentry_update_args args;
482 	int ret;
483 
484 	args.dentry = dentry;
485 	args.op = EXT4_FC_TAG_UNLINK;
486 
487 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
488 					(void *)&args, 0);
489 	trace_ext4_fc_track_unlink(inode, dentry, ret);
490 }
491 
492 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
493 {
494 	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
495 }
496 
497 void __ext4_fc_track_link(handle_t *handle,
498 	struct inode *inode, struct dentry *dentry)
499 {
500 	struct __track_dentry_update_args args;
501 	int ret;
502 
503 	args.dentry = dentry;
504 	args.op = EXT4_FC_TAG_LINK;
505 
506 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
507 					(void *)&args, 0);
508 	trace_ext4_fc_track_link(inode, dentry, ret);
509 }
510 
511 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
512 {
513 	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
514 }
515 
516 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
517 			  struct dentry *dentry)
518 {
519 	struct __track_dentry_update_args args;
520 	int ret;
521 
522 	args.dentry = dentry;
523 	args.op = EXT4_FC_TAG_CREAT;
524 
525 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
526 					(void *)&args, 0);
527 	trace_ext4_fc_track_create(inode, dentry, ret);
528 }
529 
530 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
531 {
532 	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
533 }
534 
535 /* __track_fn for inode tracking */
536 static int __track_inode(struct inode *inode, void *arg, bool update)
537 {
538 	if (update)
539 		return -EEXIST;
540 
541 	EXT4_I(inode)->i_fc_lblk_len = 0;
542 
543 	return 0;
544 }
545 
546 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
547 {
548 	int ret;
549 
550 	if (S_ISDIR(inode->i_mode))
551 		return;
552 
553 	if (ext4_should_journal_data(inode)) {
554 		ext4_fc_mark_ineligible(inode->i_sb,
555 					EXT4_FC_REASON_INODE_JOURNAL_DATA);
556 		return;
557 	}
558 
559 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
560 	trace_ext4_fc_track_inode(inode, ret);
561 }
562 
563 struct __track_range_args {
564 	ext4_lblk_t start, end;
565 };
566 
567 /* __track_fn for tracking data updates */
568 static int __track_range(struct inode *inode, void *arg, bool update)
569 {
570 	struct ext4_inode_info *ei = EXT4_I(inode);
571 	ext4_lblk_t oldstart;
572 	struct __track_range_args *__arg =
573 		(struct __track_range_args *)arg;
574 
575 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
576 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
577 		return -ECANCELED;
578 	}
579 
580 	oldstart = ei->i_fc_lblk_start;
581 
582 	if (update && ei->i_fc_lblk_len > 0) {
583 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
584 		ei->i_fc_lblk_len =
585 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
586 				ei->i_fc_lblk_start + 1;
587 	} else {
588 		ei->i_fc_lblk_start = __arg->start;
589 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
590 	}
591 
592 	return 0;
593 }
594 
595 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
596 			 ext4_lblk_t end)
597 {
598 	struct __track_range_args args;
599 	int ret;
600 
601 	if (S_ISDIR(inode->i_mode))
602 		return;
603 
604 	args.start = start;
605 	args.end = end;
606 
607 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
608 
609 	trace_ext4_fc_track_range(inode, start, end, ret);
610 }
611 
612 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
613 {
614 	int write_flags = REQ_SYNC;
615 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
616 
617 	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
618 	if (test_opt(sb, BARRIER) && is_tail)
619 		write_flags |= REQ_FUA | REQ_PREFLUSH;
620 	lock_buffer(bh);
621 	set_buffer_dirty(bh);
622 	set_buffer_uptodate(bh);
623 	bh->b_end_io = ext4_end_buffer_io_sync;
624 	submit_bh(REQ_OP_WRITE, write_flags, bh);
625 	EXT4_SB(sb)->s_fc_bh = NULL;
626 }
627 
628 /* Ext4 commit path routines */
629 
630 /* memzero and update CRC */
631 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
632 				u32 *crc)
633 {
634 	void *ret;
635 
636 	ret = memset(dst, 0, len);
637 	if (crc)
638 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
639 	return ret;
640 }
641 
642 /*
643  * Allocate len bytes on a fast commit buffer.
644  *
645  * During the commit time this function is used to manage fast commit
646  * block space. We don't split a fast commit log onto different
647  * blocks. So this function makes sure that if there's not enough space
648  * on the current block, the remaining space in the current block is
649  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
650  * new block is from jbd2 and CRC is updated to reflect the padding
651  * we added.
652  */
653 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
654 {
655 	struct ext4_fc_tl *tl;
656 	struct ext4_sb_info *sbi = EXT4_SB(sb);
657 	struct buffer_head *bh;
658 	int bsize = sbi->s_journal->j_blocksize;
659 	int ret, off = sbi->s_fc_bytes % bsize;
660 	int pad_len;
661 
662 	/*
663 	 * After allocating len, we should have space at least for a 0 byte
664 	 * padding.
665 	 */
666 	if (len + sizeof(struct ext4_fc_tl) > bsize)
667 		return NULL;
668 
669 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
670 		/*
671 		 * Only allocate from current buffer if we have enough space for
672 		 * this request AND we have space to add a zero byte padding.
673 		 */
674 		if (!sbi->s_fc_bh) {
675 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
676 			if (ret)
677 				return NULL;
678 			sbi->s_fc_bh = bh;
679 		}
680 		sbi->s_fc_bytes += len;
681 		return sbi->s_fc_bh->b_data + off;
682 	}
683 	/* Need to add PAD tag */
684 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
685 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
686 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
687 	tl->fc_len = cpu_to_le16(pad_len);
688 	if (crc)
689 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
690 	if (pad_len > 0)
691 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
692 	ext4_fc_submit_bh(sb, false);
693 
694 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
695 	if (ret)
696 		return NULL;
697 	sbi->s_fc_bh = bh;
698 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
699 	return sbi->s_fc_bh->b_data;
700 }
701 
702 /* memcpy to fc reserved space and update CRC */
703 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
704 				int len, u32 *crc)
705 {
706 	if (crc)
707 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
708 	return memcpy(dst, src, len);
709 }
710 
711 /*
712  * Complete a fast commit by writing tail tag.
713  *
714  * Writing tail tag marks the end of a fast commit. In order to guarantee
715  * atomicity, after writing tail tag, even if there's space remaining
716  * in the block, next commit shouldn't use it. That's why tail tag
717  * has the length as that of the remaining space on the block.
718  */
719 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
720 {
721 	struct ext4_sb_info *sbi = EXT4_SB(sb);
722 	struct ext4_fc_tl tl;
723 	struct ext4_fc_tail tail;
724 	int off, bsize = sbi->s_journal->j_blocksize;
725 	u8 *dst;
726 
727 	/*
728 	 * ext4_fc_reserve_space takes care of allocating an extra block if
729 	 * there's no enough space on this block for accommodating this tail.
730 	 */
731 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
732 	if (!dst)
733 		return -ENOSPC;
734 
735 	off = sbi->s_fc_bytes % bsize;
736 
737 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
738 	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
739 	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
740 
741 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
742 	dst += sizeof(tl);
743 	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
744 	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
745 	dst += sizeof(tail.fc_tid);
746 	tail.fc_crc = cpu_to_le32(crc);
747 	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
748 
749 	ext4_fc_submit_bh(sb, true);
750 
751 	return 0;
752 }
753 
754 /*
755  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
756  * Returns false if there's not enough space.
757  */
758 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
759 			   u32 *crc)
760 {
761 	struct ext4_fc_tl tl;
762 	u8 *dst;
763 
764 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
765 	if (!dst)
766 		return false;
767 
768 	tl.fc_tag = cpu_to_le16(tag);
769 	tl.fc_len = cpu_to_le16(len);
770 
771 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
772 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
773 
774 	return true;
775 }
776 
777 /* Same as above, but adds dentry tlv. */
778 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
779 				   struct ext4_fc_dentry_update *fc_dentry)
780 {
781 	struct ext4_fc_dentry_info fcd;
782 	struct ext4_fc_tl tl;
783 	int dlen = fc_dentry->fcd_name.len;
784 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
785 					crc);
786 
787 	if (!dst)
788 		return false;
789 
790 	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
791 	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
792 	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
793 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
794 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
795 	dst += sizeof(tl);
796 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
797 	dst += sizeof(fcd);
798 	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
799 	dst += dlen;
800 
801 	return true;
802 }
803 
804 /*
805  * Writes inode in the fast commit space under TLV with tag @tag.
806  * Returns 0 on success, error on failure.
807  */
808 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
809 {
810 	struct ext4_inode_info *ei = EXT4_I(inode);
811 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
812 	int ret;
813 	struct ext4_iloc iloc;
814 	struct ext4_fc_inode fc_inode;
815 	struct ext4_fc_tl tl;
816 	u8 *dst;
817 
818 	ret = ext4_get_inode_loc(inode, &iloc);
819 	if (ret)
820 		return ret;
821 
822 	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
823 		inode_len = EXT4_INODE_SIZE(inode->i_sb);
824 	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
825 		inode_len += ei->i_extra_isize;
826 
827 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
828 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
829 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
830 
831 	dst = ext4_fc_reserve_space(inode->i_sb,
832 			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
833 	if (!dst)
834 		return -ECANCELED;
835 
836 	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
837 		return -ECANCELED;
838 	dst += sizeof(tl);
839 	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
840 		return -ECANCELED;
841 	dst += sizeof(fc_inode);
842 	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
843 					inode_len, crc))
844 		return -ECANCELED;
845 
846 	return 0;
847 }
848 
849 /*
850  * Writes updated data ranges for the inode in question. Updates CRC.
851  * Returns 0 on success, error otherwise.
852  */
853 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
854 {
855 	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
856 	struct ext4_inode_info *ei = EXT4_I(inode);
857 	struct ext4_map_blocks map;
858 	struct ext4_fc_add_range fc_ext;
859 	struct ext4_fc_del_range lrange;
860 	struct ext4_extent *ex;
861 	int ret;
862 
863 	mutex_lock(&ei->i_fc_lock);
864 	if (ei->i_fc_lblk_len == 0) {
865 		mutex_unlock(&ei->i_fc_lock);
866 		return 0;
867 	}
868 	old_blk_size = ei->i_fc_lblk_start;
869 	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
870 	ei->i_fc_lblk_len = 0;
871 	mutex_unlock(&ei->i_fc_lock);
872 
873 	cur_lblk_off = old_blk_size;
874 	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
875 		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
876 
877 	while (cur_lblk_off <= new_blk_size) {
878 		map.m_lblk = cur_lblk_off;
879 		map.m_len = new_blk_size - cur_lblk_off + 1;
880 		ret = ext4_map_blocks(NULL, inode, &map, 0);
881 		if (ret < 0)
882 			return -ECANCELED;
883 
884 		if (map.m_len == 0) {
885 			cur_lblk_off++;
886 			continue;
887 		}
888 
889 		if (ret == 0) {
890 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
891 			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
892 			lrange.fc_len = cpu_to_le32(map.m_len);
893 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
894 					    sizeof(lrange), (u8 *)&lrange, crc))
895 				return -ENOSPC;
896 		} else {
897 			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
898 				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
899 
900 			/* Limit the number of blocks in one extent */
901 			map.m_len = min(max, map.m_len);
902 
903 			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
904 			ex = (struct ext4_extent *)&fc_ext.fc_ex;
905 			ex->ee_block = cpu_to_le32(map.m_lblk);
906 			ex->ee_len = cpu_to_le16(map.m_len);
907 			ext4_ext_store_pblock(ex, map.m_pblk);
908 			if (map.m_flags & EXT4_MAP_UNWRITTEN)
909 				ext4_ext_mark_unwritten(ex);
910 			else
911 				ext4_ext_mark_initialized(ex);
912 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
913 					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
914 				return -ENOSPC;
915 		}
916 
917 		cur_lblk_off += map.m_len;
918 	}
919 
920 	return 0;
921 }
922 
923 
924 /* Submit data for all the fast commit inodes */
925 static int ext4_fc_submit_inode_data_all(journal_t *journal)
926 {
927 	struct super_block *sb = (struct super_block *)(journal->j_private);
928 	struct ext4_sb_info *sbi = EXT4_SB(sb);
929 	struct ext4_inode_info *ei;
930 	int ret = 0;
931 
932 	spin_lock(&sbi->s_fc_lock);
933 	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
934 	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
935 		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
936 		while (atomic_read(&ei->i_fc_updates)) {
937 			DEFINE_WAIT(wait);
938 
939 			prepare_to_wait(&ei->i_fc_wait, &wait,
940 						TASK_UNINTERRUPTIBLE);
941 			if (atomic_read(&ei->i_fc_updates)) {
942 				spin_unlock(&sbi->s_fc_lock);
943 				schedule();
944 				spin_lock(&sbi->s_fc_lock);
945 			}
946 			finish_wait(&ei->i_fc_wait, &wait);
947 		}
948 		spin_unlock(&sbi->s_fc_lock);
949 		ret = jbd2_submit_inode_data(ei->jinode);
950 		if (ret)
951 			return ret;
952 		spin_lock(&sbi->s_fc_lock);
953 	}
954 	spin_unlock(&sbi->s_fc_lock);
955 
956 	return ret;
957 }
958 
959 /* Wait for completion of data for all the fast commit inodes */
960 static int ext4_fc_wait_inode_data_all(journal_t *journal)
961 {
962 	struct super_block *sb = (struct super_block *)(journal->j_private);
963 	struct ext4_sb_info *sbi = EXT4_SB(sb);
964 	struct ext4_inode_info *pos, *n;
965 	int ret = 0;
966 
967 	spin_lock(&sbi->s_fc_lock);
968 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
969 		if (!ext4_test_inode_state(&pos->vfs_inode,
970 					   EXT4_STATE_FC_COMMITTING))
971 			continue;
972 		spin_unlock(&sbi->s_fc_lock);
973 
974 		ret = jbd2_wait_inode_data(journal, pos->jinode);
975 		if (ret)
976 			return ret;
977 		spin_lock(&sbi->s_fc_lock);
978 	}
979 	spin_unlock(&sbi->s_fc_lock);
980 
981 	return 0;
982 }
983 
984 /* Commit all the directory entry updates */
985 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
986 __acquires(&sbi->s_fc_lock)
987 __releases(&sbi->s_fc_lock)
988 {
989 	struct super_block *sb = (struct super_block *)(journal->j_private);
990 	struct ext4_sb_info *sbi = EXT4_SB(sb);
991 	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
992 	struct inode *inode;
993 	struct ext4_inode_info *ei, *ei_n;
994 	int ret;
995 
996 	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
997 		return 0;
998 	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
999 				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1000 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1001 			spin_unlock(&sbi->s_fc_lock);
1002 			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1003 				ret = -ENOSPC;
1004 				goto lock_and_exit;
1005 			}
1006 			spin_lock(&sbi->s_fc_lock);
1007 			continue;
1008 		}
1009 
1010 		inode = NULL;
1011 		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1012 					 i_fc_list) {
1013 			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1014 				inode = &ei->vfs_inode;
1015 				break;
1016 			}
1017 		}
1018 		/*
1019 		 * If we don't find inode in our list, then it was deleted,
1020 		 * in which case, we don't need to record it's create tag.
1021 		 */
1022 		if (!inode)
1023 			continue;
1024 		spin_unlock(&sbi->s_fc_lock);
1025 
1026 		/*
1027 		 * We first write the inode and then the create dirent. This
1028 		 * allows the recovery code to create an unnamed inode first
1029 		 * and then link it to a directory entry. This allows us
1030 		 * to use namei.c routines almost as is and simplifies
1031 		 * the recovery code.
1032 		 */
1033 		ret = ext4_fc_write_inode(inode, crc);
1034 		if (ret)
1035 			goto lock_and_exit;
1036 
1037 		ret = ext4_fc_write_inode_data(inode, crc);
1038 		if (ret)
1039 			goto lock_and_exit;
1040 
1041 		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1042 			ret = -ENOSPC;
1043 			goto lock_and_exit;
1044 		}
1045 
1046 		spin_lock(&sbi->s_fc_lock);
1047 	}
1048 	return 0;
1049 lock_and_exit:
1050 	spin_lock(&sbi->s_fc_lock);
1051 	return ret;
1052 }
1053 
1054 static int ext4_fc_perform_commit(journal_t *journal)
1055 {
1056 	struct super_block *sb = (struct super_block *)(journal->j_private);
1057 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1058 	struct ext4_inode_info *iter;
1059 	struct ext4_fc_head head;
1060 	struct inode *inode;
1061 	struct blk_plug plug;
1062 	int ret = 0;
1063 	u32 crc = 0;
1064 
1065 	ret = ext4_fc_submit_inode_data_all(journal);
1066 	if (ret)
1067 		return ret;
1068 
1069 	ret = ext4_fc_wait_inode_data_all(journal);
1070 	if (ret)
1071 		return ret;
1072 
1073 	/*
1074 	 * If file system device is different from journal device, issue a cache
1075 	 * flush before we start writing fast commit blocks.
1076 	 */
1077 	if (journal->j_fs_dev != journal->j_dev)
1078 		blkdev_issue_flush(journal->j_fs_dev);
1079 
1080 	blk_start_plug(&plug);
1081 	if (sbi->s_fc_bytes == 0) {
1082 		/*
1083 		 * Add a head tag only if this is the first fast commit
1084 		 * in this TID.
1085 		 */
1086 		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1087 		head.fc_tid = cpu_to_le32(
1088 			sbi->s_journal->j_running_transaction->t_tid);
1089 		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1090 			(u8 *)&head, &crc)) {
1091 			ret = -ENOSPC;
1092 			goto out;
1093 		}
1094 	}
1095 
1096 	spin_lock(&sbi->s_fc_lock);
1097 	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1098 	if (ret) {
1099 		spin_unlock(&sbi->s_fc_lock);
1100 		goto out;
1101 	}
1102 
1103 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1104 		inode = &iter->vfs_inode;
1105 		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1106 			continue;
1107 
1108 		spin_unlock(&sbi->s_fc_lock);
1109 		ret = ext4_fc_write_inode_data(inode, &crc);
1110 		if (ret)
1111 			goto out;
1112 		ret = ext4_fc_write_inode(inode, &crc);
1113 		if (ret)
1114 			goto out;
1115 		spin_lock(&sbi->s_fc_lock);
1116 	}
1117 	spin_unlock(&sbi->s_fc_lock);
1118 
1119 	ret = ext4_fc_write_tail(sb, crc);
1120 
1121 out:
1122 	blk_finish_plug(&plug);
1123 	return ret;
1124 }
1125 
1126 /*
1127  * The main commit entry point. Performs a fast commit for transaction
1128  * commit_tid if needed. If it's not possible to perform a fast commit
1129  * due to various reasons, we fall back to full commit. Returns 0
1130  * on success, error otherwise.
1131  */
1132 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1133 {
1134 	struct super_block *sb = (struct super_block *)(journal->j_private);
1135 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1136 	int nblks = 0, ret, bsize = journal->j_blocksize;
1137 	int subtid = atomic_read(&sbi->s_fc_subtid);
1138 	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1139 	ktime_t start_time, commit_time;
1140 
1141 	trace_ext4_fc_commit_start(sb);
1142 
1143 	start_time = ktime_get();
1144 
1145 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1146 		(ext4_fc_is_ineligible(sb))) {
1147 		reason = EXT4_FC_REASON_INELIGIBLE;
1148 		goto out;
1149 	}
1150 
1151 restart_fc:
1152 	ret = jbd2_fc_begin_commit(journal, commit_tid);
1153 	if (ret == -EALREADY) {
1154 		/* There was an ongoing commit, check if we need to restart */
1155 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1156 			commit_tid > journal->j_commit_sequence)
1157 			goto restart_fc;
1158 		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1159 		goto out;
1160 	} else if (ret) {
1161 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1162 		reason = EXT4_FC_REASON_FC_START_FAILED;
1163 		goto out;
1164 	}
1165 
1166 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1167 	ret = ext4_fc_perform_commit(journal);
1168 	if (ret < 0) {
1169 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1170 		reason = EXT4_FC_REASON_FC_FAILED;
1171 		goto out;
1172 	}
1173 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1174 	ret = jbd2_fc_wait_bufs(journal, nblks);
1175 	if (ret < 0) {
1176 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177 		reason = EXT4_FC_REASON_FC_FAILED;
1178 		goto out;
1179 	}
1180 	atomic_inc(&sbi->s_fc_subtid);
1181 	jbd2_fc_end_commit(journal);
1182 out:
1183 	/* Has any ineligible update happened since we started? */
1184 	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1185 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1186 		reason = EXT4_FC_REASON_INELIGIBLE;
1187 	}
1188 
1189 	spin_lock(&sbi->s_fc_lock);
1190 	if (reason != EXT4_FC_REASON_OK &&
1191 		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1192 		sbi->s_fc_stats.fc_ineligible_commits++;
1193 	} else {
1194 		sbi->s_fc_stats.fc_num_commits++;
1195 		sbi->s_fc_stats.fc_numblks += nblks;
1196 	}
1197 	spin_unlock(&sbi->s_fc_lock);
1198 	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1199 	trace_ext4_fc_commit_stop(sb, nblks, reason);
1200 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1201 	/*
1202 	 * weight the commit time higher than the average time so we don't
1203 	 * react too strongly to vast changes in the commit time
1204 	 */
1205 	if (likely(sbi->s_fc_avg_commit_time))
1206 		sbi->s_fc_avg_commit_time = (commit_time +
1207 				sbi->s_fc_avg_commit_time * 3) / 4;
1208 	else
1209 		sbi->s_fc_avg_commit_time = commit_time;
1210 	jbd_debug(1,
1211 		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
1212 		nblks, reason, subtid);
1213 	if (reason == EXT4_FC_REASON_FC_FAILED)
1214 		return jbd2_fc_end_commit_fallback(journal);
1215 	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1216 		reason == EXT4_FC_REASON_INELIGIBLE)
1217 		return jbd2_complete_transaction(journal, commit_tid);
1218 	return 0;
1219 }
1220 
1221 /*
1222  * Fast commit cleanup routine. This is called after every fast commit and
1223  * full commit. full is true if we are called after a full commit.
1224  */
1225 static void ext4_fc_cleanup(journal_t *journal, int full)
1226 {
1227 	struct super_block *sb = journal->j_private;
1228 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1229 	struct ext4_inode_info *iter, *iter_n;
1230 	struct ext4_fc_dentry_update *fc_dentry;
1231 
1232 	if (full && sbi->s_fc_bh)
1233 		sbi->s_fc_bh = NULL;
1234 
1235 	jbd2_fc_release_bufs(journal);
1236 
1237 	spin_lock(&sbi->s_fc_lock);
1238 	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1239 				 i_fc_list) {
1240 		list_del_init(&iter->i_fc_list);
1241 		ext4_clear_inode_state(&iter->vfs_inode,
1242 				       EXT4_STATE_FC_COMMITTING);
1243 		ext4_fc_reset_inode(&iter->vfs_inode);
1244 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1245 		smp_mb();
1246 #if (BITS_PER_LONG < 64)
1247 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1248 #else
1249 		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1250 #endif
1251 	}
1252 
1253 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1254 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1255 					     struct ext4_fc_dentry_update,
1256 					     fcd_list);
1257 		list_del_init(&fc_dentry->fcd_list);
1258 		spin_unlock(&sbi->s_fc_lock);
1259 
1260 		if (fc_dentry->fcd_name.name &&
1261 			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1262 			kfree(fc_dentry->fcd_name.name);
1263 		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1264 		spin_lock(&sbi->s_fc_lock);
1265 	}
1266 
1267 	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1268 				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1269 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1270 				&sbi->s_fc_q[FC_Q_MAIN]);
1271 
1272 	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1273 	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1274 
1275 	if (full)
1276 		sbi->s_fc_bytes = 0;
1277 	spin_unlock(&sbi->s_fc_lock);
1278 	trace_ext4_fc_stats(sb);
1279 }
1280 
1281 /* Ext4 Replay Path Routines */
1282 
/*
 * Decoded form of a dentry TLV, shared by the dentry replay routines.
 * tl_to_darg() fills in parent_ino, dname_len, ino and dname.
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of dname in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name, points into the TLV value */
};
1288 
1289 static inline void tl_to_darg(struct dentry_info_args *darg,
1290 			      struct  ext4_fc_tl *tl, u8 *val)
1291 {
1292 	struct ext4_fc_dentry_info fcd;
1293 
1294 	memcpy(&fcd, val, sizeof(fcd));
1295 
1296 	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1297 	darg->ino = le32_to_cpu(fcd.fc_ino);
1298 	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1299 	darg->dname_len = le16_to_cpu(tl->fc_len) -
1300 		sizeof(struct ext4_fc_dentry_info);
1301 }
1302 
1303 /* Unlink replay function */
1304 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1305 				 u8 *val)
1306 {
1307 	struct inode *inode, *old_parent;
1308 	struct qstr entry;
1309 	struct dentry_info_args darg;
1310 	int ret = 0;
1311 
1312 	tl_to_darg(&darg, tl, val);
1313 
1314 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1315 			darg.parent_ino, darg.dname_len);
1316 
1317 	entry.name = darg.dname;
1318 	entry.len = darg.dname_len;
1319 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1320 
1321 	if (IS_ERR(inode)) {
1322 		jbd_debug(1, "Inode %d not found", darg.ino);
1323 		return 0;
1324 	}
1325 
1326 	old_parent = ext4_iget(sb, darg.parent_ino,
1327 				EXT4_IGET_NORMAL);
1328 	if (IS_ERR(old_parent)) {
1329 		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1330 		iput(inode);
1331 		return 0;
1332 	}
1333 
1334 	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1335 	/* -ENOENT ok coz it might not exist anymore. */
1336 	if (ret == -ENOENT)
1337 		ret = 0;
1338 	iput(old_parent);
1339 	iput(inode);
1340 	return ret;
1341 }
1342 
1343 static int ext4_fc_replay_link_internal(struct super_block *sb,
1344 				struct dentry_info_args *darg,
1345 				struct inode *inode)
1346 {
1347 	struct inode *dir = NULL;
1348 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1349 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1350 	int ret = 0;
1351 
1352 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1353 	if (IS_ERR(dir)) {
1354 		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1355 		dir = NULL;
1356 		goto out;
1357 	}
1358 
1359 	dentry_dir = d_obtain_alias(dir);
1360 	if (IS_ERR(dentry_dir)) {
1361 		jbd_debug(1, "Failed to obtain dentry");
1362 		dentry_dir = NULL;
1363 		goto out;
1364 	}
1365 
1366 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1367 	if (!dentry_inode) {
1368 		jbd_debug(1, "Inode dentry not created.");
1369 		ret = -ENOMEM;
1370 		goto out;
1371 	}
1372 
1373 	ret = __ext4_link(dir, inode, dentry_inode);
1374 	/*
1375 	 * It's possible that link already existed since data blocks
1376 	 * for the dir in question got persisted before we crashed OR
1377 	 * we replayed this tag and crashed before the entire replay
1378 	 * could complete.
1379 	 */
1380 	if (ret && ret != -EEXIST) {
1381 		jbd_debug(1, "Failed to link\n");
1382 		goto out;
1383 	}
1384 
1385 	ret = 0;
1386 out:
1387 	if (dentry_dir) {
1388 		d_drop(dentry_dir);
1389 		dput(dentry_dir);
1390 	} else if (dir) {
1391 		iput(dir);
1392 	}
1393 	if (dentry_inode) {
1394 		d_drop(dentry_inode);
1395 		dput(dentry_inode);
1396 	}
1397 
1398 	return ret;
1399 }
1400 
1401 /* Link replay function */
1402 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1403 			       u8 *val)
1404 {
1405 	struct inode *inode;
1406 	struct dentry_info_args darg;
1407 	int ret = 0;
1408 
1409 	tl_to_darg(&darg, tl, val);
1410 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1411 			darg.parent_ino, darg.dname_len);
1412 
1413 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1414 	if (IS_ERR(inode)) {
1415 		jbd_debug(1, "Inode not found.");
1416 		return 0;
1417 	}
1418 
1419 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1420 	iput(inode);
1421 	return ret;
1422 }
1423 
1424 /*
1425  * Record all the modified inodes during replay. We use this later to setup
1426  * block bitmaps correctly.
1427  */
1428 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1429 {
1430 	struct ext4_fc_replay_state *state;
1431 	int i;
1432 
1433 	state = &EXT4_SB(sb)->s_fc_replay_state;
1434 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1435 		if (state->fc_modified_inodes[i] == ino)
1436 			return 0;
1437 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1438 		state->fc_modified_inodes_size +=
1439 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1440 		state->fc_modified_inodes = krealloc(
1441 					state->fc_modified_inodes, sizeof(int) *
1442 					state->fc_modified_inodes_size,
1443 					GFP_KERNEL);
1444 		if (!state->fc_modified_inodes)
1445 			return -ENOMEM;
1446 	}
1447 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1448 	return 0;
1449 }
1450 
1451 /*
1452  * Inode replay function
1453  */
1454 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1455 				u8 *val)
1456 {
1457 	struct ext4_fc_inode fc_inode;
1458 	struct ext4_inode *raw_inode;
1459 	struct ext4_inode *raw_fc_inode;
1460 	struct inode *inode = NULL;
1461 	struct ext4_iloc iloc;
1462 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1463 	struct ext4_extent_header *eh;
1464 
1465 	memcpy(&fc_inode, val, sizeof(fc_inode));
1466 
1467 	ino = le32_to_cpu(fc_inode.fc_ino);
1468 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1469 
1470 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1471 	if (!IS_ERR(inode)) {
1472 		ext4_ext_clear_bb(inode);
1473 		iput(inode);
1474 	}
1475 	inode = NULL;
1476 
1477 	ext4_fc_record_modified_inode(sb, ino);
1478 
1479 	raw_fc_inode = (struct ext4_inode *)
1480 		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1481 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1482 	if (ret)
1483 		goto out;
1484 
1485 	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1486 	raw_inode = ext4_raw_inode(&iloc);
1487 
1488 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1489 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1490 		inode_len - offsetof(struct ext4_inode, i_generation));
1491 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1492 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1493 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1494 			memset(eh, 0, sizeof(*eh));
1495 			eh->eh_magic = EXT4_EXT_MAGIC;
1496 			eh->eh_max = cpu_to_le16(
1497 				(sizeof(raw_inode->i_block) -
1498 				 sizeof(struct ext4_extent_header))
1499 				 / sizeof(struct ext4_extent));
1500 		}
1501 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1502 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1503 			sizeof(raw_inode->i_block));
1504 	}
1505 
1506 	/* Immediately update the inode on disk. */
1507 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1508 	if (ret)
1509 		goto out;
1510 	ret = sync_dirty_buffer(iloc.bh);
1511 	if (ret)
1512 		goto out;
1513 	ret = ext4_mark_inode_used(sb, ino);
1514 	if (ret)
1515 		goto out;
1516 
1517 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1518 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1519 	if (IS_ERR(inode)) {
1520 		jbd_debug(1, "Inode not found.");
1521 		return -EFSCORRUPTED;
1522 	}
1523 
1524 	/*
1525 	 * Our allocator could have made different decisions than before
1526 	 * crashing. This should be fixed but until then, we calculate
1527 	 * the number of blocks the inode.
1528 	 */
1529 	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1530 		ext4_ext_replay_set_iblocks(inode);
1531 
1532 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1533 	ext4_reset_inode_seed(inode);
1534 
1535 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1536 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1537 	sync_dirty_buffer(iloc.bh);
1538 	brelse(iloc.bh);
1539 out:
1540 	iput(inode);
1541 	if (!ret)
1542 		blkdev_issue_flush(sb->s_bdev);
1543 
1544 	return 0;
1545 }
1546 
1547 /*
1548  * Dentry create replay function.
1549  *
1550  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1551  * inode for which we are trying to create a dentry here, should already have
1552  * been replayed before we start here.
1553  */
1554 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1555 				 u8 *val)
1556 {
1557 	int ret = 0;
1558 	struct inode *inode = NULL;
1559 	struct inode *dir = NULL;
1560 	struct dentry_info_args darg;
1561 
1562 	tl_to_darg(&darg, tl, val);
1563 
1564 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1565 			darg.parent_ino, darg.dname_len);
1566 
1567 	/* This takes care of update group descriptor and other metadata */
1568 	ret = ext4_mark_inode_used(sb, darg.ino);
1569 	if (ret)
1570 		goto out;
1571 
1572 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1573 	if (IS_ERR(inode)) {
1574 		jbd_debug(1, "inode %d not found.", darg.ino);
1575 		inode = NULL;
1576 		ret = -EINVAL;
1577 		goto out;
1578 	}
1579 
1580 	if (S_ISDIR(inode->i_mode)) {
1581 		/*
1582 		 * If we are creating a directory, we need to make sure that the
1583 		 * dot and dot dot dirents are setup properly.
1584 		 */
1585 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1586 		if (IS_ERR(dir)) {
1587 			jbd_debug(1, "Dir %d not found.", darg.ino);
1588 			goto out;
1589 		}
1590 		ret = ext4_init_new_dir(NULL, dir, inode);
1591 		iput(dir);
1592 		if (ret) {
1593 			ret = 0;
1594 			goto out;
1595 		}
1596 	}
1597 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1598 	if (ret)
1599 		goto out;
1600 	set_nlink(inode, 1);
1601 	ext4_mark_inode_dirty(NULL, inode);
1602 out:
1603 	if (inode)
1604 		iput(inode);
1605 	return ret;
1606 }
1607 
1608 /*
1609  * Record physical disk regions which are in use as per fast commit area. Our
1610  * simple replay phase allocator excludes these regions from allocation.
1611  */
1612 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1613 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1614 {
1615 	struct ext4_fc_replay_state *state;
1616 	struct ext4_fc_alloc_region *region;
1617 
1618 	state = &EXT4_SB(sb)->s_fc_replay_state;
1619 	if (state->fc_regions_used == state->fc_regions_size) {
1620 		state->fc_regions_size +=
1621 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1622 		state->fc_regions = krealloc(
1623 					state->fc_regions,
1624 					state->fc_regions_size *
1625 					sizeof(struct ext4_fc_alloc_region),
1626 					GFP_KERNEL);
1627 		if (!state->fc_regions)
1628 			return -ENOMEM;
1629 	}
1630 	region = &state->fc_regions[state->fc_regions_used++];
1631 	region->ino = ino;
1632 	region->lblk = lblk;
1633 	region->pblk = pblk;
1634 	region->len = len;
1635 
1636 	return 0;
1637 }
1638 
1639 /* Replay add range tag */
1640 static int ext4_fc_replay_add_range(struct super_block *sb,
1641 				    struct ext4_fc_tl *tl, u8 *val)
1642 {
1643 	struct ext4_fc_add_range fc_add_ex;
1644 	struct ext4_extent newex, *ex;
1645 	struct inode *inode;
1646 	ext4_lblk_t start, cur;
1647 	int remaining, len;
1648 	ext4_fsblk_t start_pblk;
1649 	struct ext4_map_blocks map;
1650 	struct ext4_ext_path *path = NULL;
1651 	int ret;
1652 
1653 	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1654 	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1655 
1656 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1657 		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1658 		ext4_ext_get_actual_len(ex));
1659 
1660 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1661 	if (IS_ERR(inode)) {
1662 		jbd_debug(1, "Inode not found.");
1663 		return 0;
1664 	}
1665 
1666 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1667 
1668 	start = le32_to_cpu(ex->ee_block);
1669 	start_pblk = ext4_ext_pblock(ex);
1670 	len = ext4_ext_get_actual_len(ex);
1671 
1672 	cur = start;
1673 	remaining = len;
1674 	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1675 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1676 		  inode->i_ino);
1677 
1678 	while (remaining > 0) {
1679 		map.m_lblk = cur;
1680 		map.m_len = remaining;
1681 		map.m_pblk = 0;
1682 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1683 
1684 		if (ret < 0) {
1685 			iput(inode);
1686 			return 0;
1687 		}
1688 
1689 		if (ret == 0) {
1690 			/* Range is not mapped */
1691 			path = ext4_find_extent(inode, cur, NULL, 0);
1692 			if (IS_ERR(path)) {
1693 				iput(inode);
1694 				return 0;
1695 			}
1696 			memset(&newex, 0, sizeof(newex));
1697 			newex.ee_block = cpu_to_le32(cur);
1698 			ext4_ext_store_pblock(
1699 				&newex, start_pblk + cur - start);
1700 			newex.ee_len = cpu_to_le16(map.m_len);
1701 			if (ext4_ext_is_unwritten(ex))
1702 				ext4_ext_mark_unwritten(&newex);
1703 			down_write(&EXT4_I(inode)->i_data_sem);
1704 			ret = ext4_ext_insert_extent(
1705 				NULL, inode, &path, &newex, 0);
1706 			up_write((&EXT4_I(inode)->i_data_sem));
1707 			ext4_ext_drop_refs(path);
1708 			kfree(path);
1709 			if (ret) {
1710 				iput(inode);
1711 				return 0;
1712 			}
1713 			goto next;
1714 		}
1715 
1716 		if (start_pblk + cur - start != map.m_pblk) {
1717 			/*
1718 			 * Logical to physical mapping changed. This can happen
1719 			 * if this range was removed and then reallocated to
1720 			 * map to new physical blocks during a fast commit.
1721 			 */
1722 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1723 					ext4_ext_is_unwritten(ex),
1724 					start_pblk + cur - start);
1725 			if (ret) {
1726 				iput(inode);
1727 				return 0;
1728 			}
1729 			/*
1730 			 * Mark the old blocks as free since they aren't used
1731 			 * anymore. We maintain an array of all the modified
1732 			 * inodes. In case these blocks are still used at either
1733 			 * a different logical range in the same inode or in
1734 			 * some different inode, we will mark them as allocated
1735 			 * at the end of the FC replay using our array of
1736 			 * modified inodes.
1737 			 */
1738 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1739 			goto next;
1740 		}
1741 
1742 		/* Range is mapped and needs a state change */
1743 		jbd_debug(1, "Converting from %ld to %d %lld",
1744 				map.m_flags & EXT4_MAP_UNWRITTEN,
1745 			ext4_ext_is_unwritten(ex), map.m_pblk);
1746 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1747 					ext4_ext_is_unwritten(ex), map.m_pblk);
1748 		if (ret) {
1749 			iput(inode);
1750 			return 0;
1751 		}
1752 		/*
1753 		 * We may have split the extent tree while toggling the state.
1754 		 * Try to shrink the extent tree now.
1755 		 */
1756 		ext4_ext_replay_shrink_inode(inode, start + len);
1757 next:
1758 		cur += map.m_len;
1759 		remaining -= map.m_len;
1760 	}
1761 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1762 					sb->s_blocksize_bits);
1763 	iput(inode);
1764 	return 0;
1765 }
1766 
1767 /* Replay DEL_RANGE tag */
1768 static int
1769 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1770 			 u8 *val)
1771 {
1772 	struct inode *inode;
1773 	struct ext4_fc_del_range lrange;
1774 	struct ext4_map_blocks map;
1775 	ext4_lblk_t cur, remaining;
1776 	int ret;
1777 
1778 	memcpy(&lrange, val, sizeof(lrange));
1779 	cur = le32_to_cpu(lrange.fc_lblk);
1780 	remaining = le32_to_cpu(lrange.fc_len);
1781 
1782 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1783 		le32_to_cpu(lrange.fc_ino), cur, remaining);
1784 
1785 	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1786 	if (IS_ERR(inode)) {
1787 		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1788 		return 0;
1789 	}
1790 
1791 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1792 
1793 	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1794 			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1795 			le32_to_cpu(lrange.fc_len));
1796 	while (remaining > 0) {
1797 		map.m_lblk = cur;
1798 		map.m_len = remaining;
1799 
1800 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1801 		if (ret < 0) {
1802 			iput(inode);
1803 			return 0;
1804 		}
1805 		if (ret > 0) {
1806 			remaining -= ret;
1807 			cur += ret;
1808 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1809 		} else {
1810 			remaining -= map.m_len;
1811 			cur += map.m_len;
1812 		}
1813 	}
1814 
1815 	ret = ext4_punch_hole(inode,
1816 		le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1817 		le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
1818 	if (ret)
1819 		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1820 	ext4_ext_replay_shrink_inode(inode,
1821 		i_size_read(inode) >> sb->s_blocksize_bits);
1822 	ext4_mark_inode_dirty(NULL, inode);
1823 	iput(inode);
1824 
1825 	return 0;
1826 }
1827 
1828 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1829 {
1830 	struct ext4_fc_replay_state *state;
1831 	struct inode *inode;
1832 	struct ext4_ext_path *path = NULL;
1833 	struct ext4_map_blocks map;
1834 	int i, ret, j;
1835 	ext4_lblk_t cur, end;
1836 
1837 	state = &EXT4_SB(sb)->s_fc_replay_state;
1838 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1839 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1840 			EXT4_IGET_NORMAL);
1841 		if (IS_ERR(inode)) {
1842 			jbd_debug(1, "Inode %d not found.",
1843 				state->fc_modified_inodes[i]);
1844 			continue;
1845 		}
1846 		cur = 0;
1847 		end = EXT_MAX_BLOCKS;
1848 		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1849 			iput(inode);
1850 			continue;
1851 		}
1852 		while (cur < end) {
1853 			map.m_lblk = cur;
1854 			map.m_len = end - cur;
1855 
1856 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1857 			if (ret < 0)
1858 				break;
1859 
1860 			if (ret > 0) {
1861 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1862 				if (!IS_ERR(path)) {
1863 					for (j = 0; j < path->p_depth; j++)
1864 						ext4_mb_mark_bb(inode->i_sb,
1865 							path[j].p_block, 1, 1);
1866 					ext4_ext_drop_refs(path);
1867 					kfree(path);
1868 				}
1869 				cur += ret;
1870 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1871 							map.m_len, 1);
1872 			} else {
1873 				cur = cur + (map.m_len ? map.m_len : 1);
1874 			}
1875 		}
1876 		iput(inode);
1877 	}
1878 }
1879 
1880 /*
1881  * Check if block is in excluded regions for block allocation. The simple
1882  * allocator that runs during replay phase is calls this function to see
1883  * if it is okay to use a block.
1884  */
1885 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1886 {
1887 	int i;
1888 	struct ext4_fc_replay_state *state;
1889 
1890 	state = &EXT4_SB(sb)->s_fc_replay_state;
1891 	for (i = 0; i < state->fc_regions_valid; i++) {
1892 		if (state->fc_regions[i].ino == 0 ||
1893 			state->fc_regions[i].len == 0)
1894 			continue;
1895 		if (blk >= state->fc_regions[i].pblk &&
1896 		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1897 			return true;
1898 	}
1899 	return false;
1900 }
1901 
1902 /* Cleanup function called after replay */
1903 void ext4_fc_replay_cleanup(struct super_block *sb)
1904 {
1905 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1906 
1907 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1908 	kfree(sbi->s_fc_replay_state.fc_regions);
1909 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1910 }
1911 
1912 /*
1913  * Recovery Scan phase handler
1914  *
1915  * This function is called during the scan phase and is responsible
1916  * for doing following things:
1917  * - Make sure the fast commit area has valid tags for replay
1918  * - Count number of tags that need to be replayed by the replay handler
1919  * - Verify CRC
1920  * - Create a list of excluded blocks for allocation during replay phase
1921  *
1922  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1923  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1924  * to indicate that scan has finished and JBD2 can now start replay phase.
1925  * It returns a negative error to indicate that there was an error. At the end
1926  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1927  * to indicate the number of tags that need to replayed during the replay phase.
1928  */
1929 static int ext4_fc_replay_scan(journal_t *journal,
1930 				struct buffer_head *bh, int off,
1931 				tid_t expected_tid)
1932 {
1933 	struct super_block *sb = journal->j_private;
1934 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1935 	struct ext4_fc_replay_state *state;
1936 	int ret = JBD2_FC_REPLAY_CONTINUE;
1937 	struct ext4_fc_add_range ext;
1938 	struct ext4_fc_tl tl;
1939 	struct ext4_fc_tail tail;
1940 	__u8 *start, *end, *cur, *val;
1941 	struct ext4_fc_head head;
1942 	struct ext4_extent *ex;
1943 
1944 	state = &sbi->s_fc_replay_state;
1945 
1946 	start = (u8 *)bh->b_data;
1947 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1948 
1949 	if (state->fc_replay_expected_off == 0) {
1950 		state->fc_cur_tag = 0;
1951 		state->fc_replay_num_tags = 0;
1952 		state->fc_crc = 0;
1953 		state->fc_regions = NULL;
1954 		state->fc_regions_valid = state->fc_regions_used =
1955 			state->fc_regions_size = 0;
1956 		/* Check if we can stop early */
1957 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1958 			!= EXT4_FC_TAG_HEAD)
1959 			return 0;
1960 	}
1961 
1962 	if (off != state->fc_replay_expected_off) {
1963 		ret = -EFSCORRUPTED;
1964 		goto out_err;
1965 	}
1966 
1967 	state->fc_replay_expected_off++;
1968 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1969 		memcpy(&tl, cur, sizeof(tl));
1970 		val = cur + sizeof(tl);
1971 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1972 			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1973 		switch (le16_to_cpu(tl.fc_tag)) {
1974 		case EXT4_FC_TAG_ADD_RANGE:
1975 			memcpy(&ext, val, sizeof(ext));
1976 			ex = (struct ext4_extent *)&ext.fc_ex;
1977 			ret = ext4_fc_record_regions(sb,
1978 				le32_to_cpu(ext.fc_ino),
1979 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1980 				ext4_ext_get_actual_len(ex));
1981 			if (ret < 0)
1982 				break;
1983 			ret = JBD2_FC_REPLAY_CONTINUE;
1984 			fallthrough;
1985 		case EXT4_FC_TAG_DEL_RANGE:
1986 		case EXT4_FC_TAG_LINK:
1987 		case EXT4_FC_TAG_UNLINK:
1988 		case EXT4_FC_TAG_CREAT:
1989 		case EXT4_FC_TAG_INODE:
1990 		case EXT4_FC_TAG_PAD:
1991 			state->fc_cur_tag++;
1992 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1993 					sizeof(tl) + le16_to_cpu(tl.fc_len));
1994 			break;
1995 		case EXT4_FC_TAG_TAIL:
1996 			state->fc_cur_tag++;
1997 			memcpy(&tail, val, sizeof(tail));
1998 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1999 						sizeof(tl) +
2000 						offsetof(struct ext4_fc_tail,
2001 						fc_crc));
2002 			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2003 				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2004 				state->fc_replay_num_tags = state->fc_cur_tag;
2005 				state->fc_regions_valid =
2006 					state->fc_regions_used;
2007 			} else {
2008 				ret = state->fc_replay_num_tags ?
2009 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2010 			}
2011 			state->fc_crc = 0;
2012 			break;
2013 		case EXT4_FC_TAG_HEAD:
2014 			memcpy(&head, val, sizeof(head));
2015 			if (le32_to_cpu(head.fc_features) &
2016 				~EXT4_FC_SUPPORTED_FEATURES) {
2017 				ret = -EOPNOTSUPP;
2018 				break;
2019 			}
2020 			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2021 				ret = JBD2_FC_REPLAY_STOP;
2022 				break;
2023 			}
2024 			state->fc_cur_tag++;
2025 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2026 					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2027 			break;
2028 		default:
2029 			ret = state->fc_replay_num_tags ?
2030 				JBD2_FC_REPLAY_STOP : -ECANCELED;
2031 		}
2032 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2033 			break;
2034 	}
2035 
2036 out_err:
2037 	trace_ext4_fc_replay_scan(sb, ret, off);
2038 	return ret;
2039 }
2040 
2041 /*
2042  * Main recovery path entry point.
2043  * The meaning of return codes is similar as above.
2044  */
2045 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2046 				enum passtype pass, int off, tid_t expected_tid)
2047 {
2048 	struct super_block *sb = journal->j_private;
2049 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2050 	struct ext4_fc_tl tl;
2051 	__u8 *start, *end, *cur, *val;
2052 	int ret = JBD2_FC_REPLAY_CONTINUE;
2053 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2054 	struct ext4_fc_tail tail;
2055 
2056 	if (pass == PASS_SCAN) {
2057 		state->fc_current_pass = PASS_SCAN;
2058 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2059 	}
2060 
2061 	if (state->fc_current_pass != pass) {
2062 		state->fc_current_pass = pass;
2063 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2064 	}
2065 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2066 		jbd_debug(1, "Replay stops\n");
2067 		ext4_fc_set_bitmaps_and_counters(sb);
2068 		return 0;
2069 	}
2070 
2071 #ifdef CONFIG_EXT4_DEBUG
2072 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2073 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2074 		return JBD2_FC_REPLAY_STOP;
2075 	}
2076 #endif
2077 
2078 	start = (u8 *)bh->b_data;
2079 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2080 
2081 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2082 		memcpy(&tl, cur, sizeof(tl));
2083 		val = cur + sizeof(tl);
2084 
2085 		if (state->fc_replay_num_tags == 0) {
2086 			ret = JBD2_FC_REPLAY_STOP;
2087 			ext4_fc_set_bitmaps_and_counters(sb);
2088 			break;
2089 		}
2090 		jbd_debug(3, "Replay phase, tag:%s\n",
2091 				tag2str(le16_to_cpu(tl.fc_tag)));
2092 		state->fc_replay_num_tags--;
2093 		switch (le16_to_cpu(tl.fc_tag)) {
2094 		case EXT4_FC_TAG_LINK:
2095 			ret = ext4_fc_replay_link(sb, &tl, val);
2096 			break;
2097 		case EXT4_FC_TAG_UNLINK:
2098 			ret = ext4_fc_replay_unlink(sb, &tl, val);
2099 			break;
2100 		case EXT4_FC_TAG_ADD_RANGE:
2101 			ret = ext4_fc_replay_add_range(sb, &tl, val);
2102 			break;
2103 		case EXT4_FC_TAG_CREAT:
2104 			ret = ext4_fc_replay_create(sb, &tl, val);
2105 			break;
2106 		case EXT4_FC_TAG_DEL_RANGE:
2107 			ret = ext4_fc_replay_del_range(sb, &tl, val);
2108 			break;
2109 		case EXT4_FC_TAG_INODE:
2110 			ret = ext4_fc_replay_inode(sb, &tl, val);
2111 			break;
2112 		case EXT4_FC_TAG_PAD:
2113 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2114 					     le16_to_cpu(tl.fc_len), 0);
2115 			break;
2116 		case EXT4_FC_TAG_TAIL:
2117 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2118 					     le16_to_cpu(tl.fc_len), 0);
2119 			memcpy(&tail, val, sizeof(tail));
2120 			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2121 			break;
2122 		case EXT4_FC_TAG_HEAD:
2123 			break;
2124 		default:
2125 			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2126 					     le16_to_cpu(tl.fc_len), 0);
2127 			ret = -ECANCELED;
2128 			break;
2129 		}
2130 		if (ret < 0)
2131 			break;
2132 		ret = JBD2_FC_REPLAY_CONTINUE;
2133 	}
2134 	return ret;
2135 }
2136 
2137 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2138 {
2139 	/*
2140 	 * We set replay callback even if fast commit disabled because we may
2141 	 * could still have fast commit blocks that need to be replayed even if
2142 	 * fast commit has now been turned off.
2143 	 */
2144 	journal->j_fc_replay_callback = ext4_fc_replay;
2145 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2146 		return;
2147 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2148 }
2149 
/*
 * Human readable labels printed by ext4_fc_info_show(), indexed by the
 * ineligibility reason. Both the strings and the table of pointers are
 * read-only.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2162 
2163 int ext4_fc_info_show(struct seq_file *seq, void *v)
2164 {
2165 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2166 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2167 	int i;
2168 
2169 	if (v != SEQ_START_TOKEN)
2170 		return 0;
2171 
2172 	seq_printf(seq,
2173 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2174 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2175 		   stats->fc_numblks,
2176 		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2177 	seq_puts(seq, "Ineligible reasons:\n");
2178 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2179 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2180 			stats->fc_ineligible_reason_count[i]);
2181 
2182 	return 0;
2183 }
2184 
2185 int __init ext4_fc_init_dentry_cache(void)
2186 {
2187 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2188 					   SLAB_RECLAIM_ACCOUNT);
2189 
2190 	if (ext4_fc_dentry_cachep == NULL)
2191 		return -ENOMEM;
2192 
2193 	return 0;
2194 }
2195