xref: /openbmc/linux/fs/ext4/fast_commit.c (revision 50dc9a85)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
41  *				  during recovery. Note that iblocks field is
42  *				  not replayed and instead derived during
43  *				  replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
 * Not all operations are supported by fast commits today (e.g., extended
69  * attributes). Fast commit ineligibility is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73  *   back to full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79  *   make one more fast commit to fall back to full commit after stop call so
 *   that it is guaranteed that the fast commit ineligible operation contained
81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least 1 full commit.
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88  * tag contains CRC of the contents and TID of the transaction after which
89  * this fast commit should be applied. Recovery code replays fast commit
90  * logs only if there's at least 1 valid tail present. For every fast commit
91  * operation, there is 1 tail. This means, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * Fast Commit Replay Idempotence
107  * ------------------------------
108  *
 * Fast commit tags are idempotent in nature provided the recovery code follows
110  * certain rules. The guiding principle that the commit path follows while
111  * committing is that it stores the result of a particular operation instead of
112  * storing the procedure.
113  *
114  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
115  * was associated with inode 10. During fast commit, instead of storing this
116  * operation as a procedure "rename a to b", we store the resulting file system
117  * state as a "series" of outcomes:
118  *
119  * - Link dirent b to inode 10
120  * - Unlink dirent a
121  * - Inode <10> with valid refcount
122  *
 * Now when recovery code runs, it needs to "enforce" this state on the file
124  * system. This is what guarantees idempotence of fast commit replay.
125  *
126  * Let's take an example of a procedure that is not idempotent and see how fast
127  * commits make it idempotent. Consider following sequence of operations:
128  *
129  *     rm A;    mv B A;    read A
130  *  (x)     (y)        (z)
131  *
132  * (x), (y) and (z) are the points at which we can crash. If we store this
133  * sequence of operations as is then the replay is not idempotent. Let's say
134  * while in replay, we crash at (z). During the second replay, file A (which was
135  * actually created as a result of "mv B A" operation) would get deleted. Thus,
136  * file named A would be absent when we try to read A. So, this sequence of
137  * operations is not idempotent. However, as mentioned above, instead of storing
138  * the procedure fast commits store the outcome of each procedure. Thus the fast
139  * commit log for above procedure would be as follows:
140  *
141  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
142  * inode 11 before the replay)
143  *
144  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
145  * (w)          (x)                    (y)          (z)
146  *
147  * If we crash at (z), we will have file A linked to inode 11. During the second
148  * replay, we will remove file A (inode 11). But we will create it back and make
149  * it point to inode 11. We won't find B, so we'll just skip that step. At this
150  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
151  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
152  * similarly. Thus, by converting a non-idempotent procedure into a series of
153  * idempotent outcomes, fast commits ensured idempotence during the replay.
154  *
155  * TODOs
156  * -----
157  *
158  * 0) Fast commit replay path hardening: Fast commit replay code should use
159  *    journal handles to make sure all the updates it does during the replay
160  *    path are atomic. With that if we crash during fast commit replay, after
161  *    trying to do recovery again, we will find a file system where fast commit
162  *    area is invalid (because new full commit would be found). In order to deal
163  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
164  *    superblock state is persisted before starting the replay, so that after
165  *    the crash, fast commit recovery code can look at that flag and perform
166  *    fast commit recovery even if that area is invalidated by later full
167  *    commits.
168  *
169  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
170  *    eligible update must be protected within ext4_fc_start_update() and
171  *    ext4_fc_stop_update(). These routines are called at much higher
172  *    routines. This can be made more fine grained by combining with
173  *    ext4_journal_start().
174  *
175  * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
176  *
177  * 3) Handle more ineligible cases.
178  */
179 
180 #include <trace/events/ext4.h>
/* Slab cache from which struct ext4_fc_dentry_update nodes are allocated. */
static struct kmem_cache *ext4_fc_dentry_cachep;
182 
183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
184 {
185 	BUFFER_TRACE(bh, "");
186 	if (uptodate) {
187 		ext4_debug("%s: Block %lld up-to-date",
188 			   __func__, bh->b_blocknr);
189 		set_buffer_uptodate(bh);
190 	} else {
191 		ext4_debug("%s: Block %lld not up-to-date",
192 			   __func__, bh->b_blocknr);
193 		clear_buffer_uptodate(bh);
194 	}
195 
196 	unlock_buffer(bh);
197 }
198 
199 static inline void ext4_fc_reset_inode(struct inode *inode)
200 {
201 	struct ext4_inode_info *ei = EXT4_I(inode);
202 
203 	ei->i_fc_lblk_start = 0;
204 	ei->i_fc_lblk_len = 0;
205 }
206 
207 void ext4_fc_init_inode(struct inode *inode)
208 {
209 	struct ext4_inode_info *ei = EXT4_I(inode);
210 
211 	ext4_fc_reset_inode(inode);
212 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
213 	INIT_LIST_HEAD(&ei->i_fc_list);
214 	init_waitqueue_head(&ei->i_fc_wait);
215 	atomic_set(&ei->i_fc_updates, 0);
216 }
217 
/*
 * Sleep on the bit waitqueue associated with @inode's
 * EXT4_STATE_FC_COMMITTING bit until woken.
 *
 * This function must be called with sbi->s_fc_lock held. The lock is dropped
 * before sleeping and is NOT re-acquired before returning, so callers have to
 * take it again and re-check the inode state themselves.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* The word holding the state bit depends on the size of long. */
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Queue ourselves before dropping the lock, then sleep. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
242 
243 /*
244  * Inform Ext4's fast about start of an inode update
245  *
246  * This function is called by the high level call VFS callbacks before
247  * performing any inode update. This function blocks if there's an ongoing
248  * fast commit on the inode in question.
249  */
250 void ext4_fc_start_update(struct inode *inode)
251 {
252 	struct ext4_inode_info *ei = EXT4_I(inode);
253 
254 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
255 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
256 		return;
257 
258 restart:
259 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
260 	if (list_empty(&ei->i_fc_list))
261 		goto out;
262 
263 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
264 		ext4_fc_wait_committing_inode(inode);
265 		goto restart;
266 	}
267 out:
268 	atomic_inc(&ei->i_fc_updates);
269 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
270 }
271 
272 /*
273  * Stop inode update and wake up waiting fast commits if any.
274  */
275 void ext4_fc_stop_update(struct inode *inode)
276 {
277 	struct ext4_inode_info *ei = EXT4_I(inode);
278 
279 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
280 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
281 		return;
282 
283 	if (atomic_dec_and_test(&ei->i_fc_updates))
284 		wake_up_all(&ei->i_fc_wait);
285 }
286 
287 /*
288  * Remove inode from fast commit list. If the inode is being committed
289  * we wait until inode commit is done.
290  */
291 void ext4_fc_del(struct inode *inode)
292 {
293 	struct ext4_inode_info *ei = EXT4_I(inode);
294 
295 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
296 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
297 		return;
298 
299 restart:
300 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
301 	if (list_empty(&ei->i_fc_list)) {
302 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
303 		return;
304 	}
305 
306 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
307 		ext4_fc_wait_committing_inode(inode);
308 		goto restart;
309 	}
310 	list_del_init(&ei->i_fc_list);
311 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
312 }
313 
314 /*
315  * Mark file system as fast commit ineligible. This means that next commit
316  * operation would result in a full jbd2 commit.
317  */
318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
319 {
320 	struct ext4_sb_info *sbi = EXT4_SB(sb);
321 
322 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
323 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
324 		return;
325 
326 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
327 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
328 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
329 }
330 
331 /*
332  * Start a fast commit ineligible update. Any commits that happen while
333  * such an operation is in progress fall back to full commits.
334  */
335 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
336 {
337 	struct ext4_sb_info *sbi = EXT4_SB(sb);
338 
339 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
340 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
341 		return;
342 
343 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
344 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
345 	atomic_inc(&sbi->s_fc_ineligible_updates);
346 }
347 
348 /*
349  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
350  * to ensure that after stopping the ineligible update, at least one full
351  * commit takes place.
352  */
353 void ext4_fc_stop_ineligible(struct super_block *sb)
354 {
355 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
356 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
357 		return;
358 
359 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
360 	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
361 }
362 
363 static inline int ext4_fc_is_ineligible(struct super_block *sb)
364 {
365 	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
366 		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
367 }
368 
369 /*
370  * Generic fast commit tracking function. If this is the first time this we are
371  * called after a full commit, we initialize fast commit fields and then call
372  * __fc_track_fn() with update = 0. If we have already been called after a full
373  * commit, we pass update = 1. Based on that, the track function can determine
374  * if it needs to track a field for the first time or if it needs to just
375  * update the previously tracked value.
376  *
377  * If enqueue is set, this function enqueues the inode in fast commit list.
378  */
379 static int ext4_fc_track_template(
380 	handle_t *handle, struct inode *inode,
381 	int (*__fc_track_fn)(struct inode *, void *, bool),
382 	void *args, int enqueue)
383 {
384 	bool update = false;
385 	struct ext4_inode_info *ei = EXT4_I(inode);
386 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
387 	tid_t tid = 0;
388 	int ret;
389 
390 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
391 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
392 		return -EOPNOTSUPP;
393 
394 	if (ext4_fc_is_ineligible(inode->i_sb))
395 		return -EINVAL;
396 
397 	tid = handle->h_transaction->t_tid;
398 	mutex_lock(&ei->i_fc_lock);
399 	if (tid == ei->i_sync_tid) {
400 		update = true;
401 	} else {
402 		ext4_fc_reset_inode(inode);
403 		ei->i_sync_tid = tid;
404 	}
405 	ret = __fc_track_fn(inode, args, update);
406 	mutex_unlock(&ei->i_fc_lock);
407 
408 	if (!enqueue)
409 		return ret;
410 
411 	spin_lock(&sbi->s_fc_lock);
412 	if (list_empty(&EXT4_I(inode)->i_fc_list))
413 		list_add_tail(&EXT4_I(inode)->i_fc_list,
414 				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
415 				&sbi->s_fc_q[FC_Q_STAGING] :
416 				&sbi->s_fc_q[FC_Q_MAIN]);
417 	spin_unlock(&sbi->s_fc_lock);
418 
419 	return ret;
420 }
421 
/* Argument bundle handed to __track_dentry_update() through a void pointer. */
struct __track_dentry_update_args {
	/* Directory entry the operation applies to. */
	struct dentry *dentry;
	/* Fast commit tag describing the operation (EXT4_FC_TAG_*). */
	int op;
};
426 
427 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
428 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
429 {
430 	struct ext4_fc_dentry_update *node;
431 	struct ext4_inode_info *ei = EXT4_I(inode);
432 	struct __track_dentry_update_args *dentry_update =
433 		(struct __track_dentry_update_args *)arg;
434 	struct dentry *dentry = dentry_update->dentry;
435 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
436 
437 	mutex_unlock(&ei->i_fc_lock);
438 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
439 	if (!node) {
440 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
441 		mutex_lock(&ei->i_fc_lock);
442 		return -ENOMEM;
443 	}
444 
445 	node->fcd_op = dentry_update->op;
446 	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
447 	node->fcd_ino = inode->i_ino;
448 	if (dentry->d_name.len > DNAME_INLINE_LEN) {
449 		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
450 		if (!node->fcd_name.name) {
451 			kmem_cache_free(ext4_fc_dentry_cachep, node);
452 			ext4_fc_mark_ineligible(inode->i_sb,
453 				EXT4_FC_REASON_NOMEM);
454 			mutex_lock(&ei->i_fc_lock);
455 			return -ENOMEM;
456 		}
457 		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
458 			dentry->d_name.len);
459 	} else {
460 		memcpy(node->fcd_iname, dentry->d_name.name,
461 			dentry->d_name.len);
462 		node->fcd_name.name = node->fcd_iname;
463 	}
464 	node->fcd_name.len = dentry->d_name.len;
465 
466 	spin_lock(&sbi->s_fc_lock);
467 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
468 		list_add_tail(&node->fcd_list,
469 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
470 	else
471 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
472 	spin_unlock(&sbi->s_fc_lock);
473 	mutex_lock(&ei->i_fc_lock);
474 
475 	return 0;
476 }
477 
478 void __ext4_fc_track_unlink(handle_t *handle,
479 		struct inode *inode, struct dentry *dentry)
480 {
481 	struct __track_dentry_update_args args;
482 	int ret;
483 
484 	args.dentry = dentry;
485 	args.op = EXT4_FC_TAG_UNLINK;
486 
487 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
488 					(void *)&args, 0);
489 	trace_ext4_fc_track_unlink(inode, dentry, ret);
490 }
491 
492 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
493 {
494 	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
495 }
496 
497 void __ext4_fc_track_link(handle_t *handle,
498 	struct inode *inode, struct dentry *dentry)
499 {
500 	struct __track_dentry_update_args args;
501 	int ret;
502 
503 	args.dentry = dentry;
504 	args.op = EXT4_FC_TAG_LINK;
505 
506 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
507 					(void *)&args, 0);
508 	trace_ext4_fc_track_link(inode, dentry, ret);
509 }
510 
511 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
512 {
513 	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
514 }
515 
516 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
517 			  struct dentry *dentry)
518 {
519 	struct __track_dentry_update_args args;
520 	int ret;
521 
522 	args.dentry = dentry;
523 	args.op = EXT4_FC_TAG_CREAT;
524 
525 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
526 					(void *)&args, 0);
527 	trace_ext4_fc_track_create(inode, dentry, ret);
528 }
529 
530 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
531 {
532 	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
533 }
534 
535 /* __track_fn for inode tracking */
536 static int __track_inode(struct inode *inode, void *arg, bool update)
537 {
538 	if (update)
539 		return -EEXIST;
540 
541 	EXT4_I(inode)->i_fc_lblk_len = 0;
542 
543 	return 0;
544 }
545 
546 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
547 {
548 	int ret;
549 
550 	if (S_ISDIR(inode->i_mode))
551 		return;
552 
553 	if (ext4_should_journal_data(inode)) {
554 		ext4_fc_mark_ineligible(inode->i_sb,
555 					EXT4_FC_REASON_INODE_JOURNAL_DATA);
556 		return;
557 	}
558 
559 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
560 	trace_ext4_fc_track_inode(inode, ret);
561 }
562 
/*
 * Argument bundle handed to __track_range() through a void pointer: the
 * first and last logical block (both inclusive) of the range to track.
 */
struct __track_range_args {
	ext4_lblk_t start, end;
};
566 
567 /* __track_fn for tracking data updates */
568 static int __track_range(struct inode *inode, void *arg, bool update)
569 {
570 	struct ext4_inode_info *ei = EXT4_I(inode);
571 	ext4_lblk_t oldstart;
572 	struct __track_range_args *__arg =
573 		(struct __track_range_args *)arg;
574 
575 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
576 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
577 		return -ECANCELED;
578 	}
579 
580 	oldstart = ei->i_fc_lblk_start;
581 
582 	if (update && ei->i_fc_lblk_len > 0) {
583 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
584 		ei->i_fc_lblk_len =
585 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
586 				ei->i_fc_lblk_start + 1;
587 	} else {
588 		ei->i_fc_lblk_start = __arg->start;
589 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
590 	}
591 
592 	return 0;
593 }
594 
595 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
596 			 ext4_lblk_t end)
597 {
598 	struct __track_range_args args;
599 	int ret;
600 
601 	if (S_ISDIR(inode->i_mode))
602 		return;
603 
604 	args.start = start;
605 	args.end = end;
606 
607 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
608 
609 	trace_ext4_fc_track_range(inode, start, end, ret);
610 }
611 
612 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
613 {
614 	int write_flags = REQ_SYNC;
615 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
616 
617 	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
618 	if (test_opt(sb, BARRIER) && is_tail)
619 		write_flags |= REQ_FUA | REQ_PREFLUSH;
620 	lock_buffer(bh);
621 	set_buffer_dirty(bh);
622 	set_buffer_uptodate(bh);
623 	bh->b_end_io = ext4_end_buffer_io_sync;
624 	submit_bh(REQ_OP_WRITE, write_flags, bh);
625 	EXT4_SB(sb)->s_fc_bh = NULL;
626 }
627 
628 /* Ext4 commit path routines */
629 
630 /* memzero and update CRC */
631 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
632 				u32 *crc)
633 {
634 	void *ret;
635 
636 	ret = memset(dst, 0, len);
637 	if (crc)
638 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
639 	return ret;
640 }
641 
642 /*
643  * Allocate len bytes on a fast commit buffer.
644  *
645  * During the commit time this function is used to manage fast commit
646  * block space. We don't split a fast commit log onto different
647  * blocks. So this function makes sure that if there's not enough space
648  * on the current block, the remaining space in the current block is
649  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
650  * new block is from jbd2 and CRC is updated to reflect the padding
651  * we added.
652  */
653 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
654 {
655 	struct ext4_fc_tl *tl;
656 	struct ext4_sb_info *sbi = EXT4_SB(sb);
657 	struct buffer_head *bh;
658 	int bsize = sbi->s_journal->j_blocksize;
659 	int ret, off = sbi->s_fc_bytes % bsize;
660 	int pad_len;
661 
662 	/*
663 	 * After allocating len, we should have space at least for a 0 byte
664 	 * padding.
665 	 */
666 	if (len + sizeof(struct ext4_fc_tl) > bsize)
667 		return NULL;
668 
669 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
670 		/*
671 		 * Only allocate from current buffer if we have enough space for
672 		 * this request AND we have space to add a zero byte padding.
673 		 */
674 		if (!sbi->s_fc_bh) {
675 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
676 			if (ret)
677 				return NULL;
678 			sbi->s_fc_bh = bh;
679 		}
680 		sbi->s_fc_bytes += len;
681 		return sbi->s_fc_bh->b_data + off;
682 	}
683 	/* Need to add PAD tag */
684 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
685 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
686 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
687 	tl->fc_len = cpu_to_le16(pad_len);
688 	if (crc)
689 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
690 	if (pad_len > 0)
691 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
692 	ext4_fc_submit_bh(sb, false);
693 
694 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
695 	if (ret)
696 		return NULL;
697 	sbi->s_fc_bh = bh;
698 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
699 	return sbi->s_fc_bh->b_data;
700 }
701 
702 /* memcpy to fc reserved space and update CRC */
703 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
704 				int len, u32 *crc)
705 {
706 	if (crc)
707 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
708 	return memcpy(dst, src, len);
709 }
710 
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 *
 * @sb:  super block of the file system
 * @crc: checksum accumulated over the fast commit so far
 *
 * Returns 0 on success, -ENOSPC if no space could be reserved for the tail.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's no enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	/* Offset within the block just past the space reserved for the tail. */
	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	/* The next fast commit starts on a fresh block. */
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	/* The stored CRC covers everything up to and including fc_tid. */
	tail.fc_crc = cpu_to_le32(crc);
	/* NULL crc: the checksum field itself is not checksummed. */
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	/* Submit as the tail block. */
	ext4_fc_submit_bh(sb, true);

	return 0;
}
753 
754 /*
755  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
756  * Returns false if there's not enough space.
757  */
758 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
759 			   u32 *crc)
760 {
761 	struct ext4_fc_tl tl;
762 	u8 *dst;
763 
764 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
765 	if (!dst)
766 		return false;
767 
768 	tl.fc_tag = cpu_to_le16(tag);
769 	tl.fc_len = cpu_to_le16(len);
770 
771 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
772 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
773 
774 	return true;
775 }
776 
777 /* Same as above, but adds dentry tlv. */
778 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
779 				   struct ext4_fc_dentry_update *fc_dentry)
780 {
781 	struct ext4_fc_dentry_info fcd;
782 	struct ext4_fc_tl tl;
783 	int dlen = fc_dentry->fcd_name.len;
784 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
785 					crc);
786 
787 	if (!dst)
788 		return false;
789 
790 	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
791 	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
792 	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
793 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
794 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
795 	dst += sizeof(tl);
796 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
797 	dst += sizeof(fcd);
798 	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
799 	dst += dlen;
800 
801 	return true;
802 }
803 
804 /*
805  * Writes inode in the fast commit space under TLV with tag @tag.
806  * Returns 0 on success, error on failure.
807  */
808 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
809 {
810 	struct ext4_inode_info *ei = EXT4_I(inode);
811 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
812 	int ret;
813 	struct ext4_iloc iloc;
814 	struct ext4_fc_inode fc_inode;
815 	struct ext4_fc_tl tl;
816 	u8 *dst;
817 
818 	ret = ext4_get_inode_loc(inode, &iloc);
819 	if (ret)
820 		return ret;
821 
822 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
823 		inode_len += ei->i_extra_isize;
824 
825 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
826 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
827 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
828 
829 	dst = ext4_fc_reserve_space(inode->i_sb,
830 			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
831 	if (!dst)
832 		return -ECANCELED;
833 
834 	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
835 		return -ECANCELED;
836 	dst += sizeof(tl);
837 	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
838 		return -ECANCELED;
839 	dst += sizeof(fc_inode);
840 	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
841 					inode_len, crc))
842 		return -ECANCELED;
843 
844 	return 0;
845 }
846 
847 /*
848  * Writes updated data ranges for the inode in question. Updates CRC.
849  * Returns 0 on success, error otherwise.
850  */
851 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
852 {
853 	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
854 	struct ext4_inode_info *ei = EXT4_I(inode);
855 	struct ext4_map_blocks map;
856 	struct ext4_fc_add_range fc_ext;
857 	struct ext4_fc_del_range lrange;
858 	struct ext4_extent *ex;
859 	int ret;
860 
861 	mutex_lock(&ei->i_fc_lock);
862 	if (ei->i_fc_lblk_len == 0) {
863 		mutex_unlock(&ei->i_fc_lock);
864 		return 0;
865 	}
866 	old_blk_size = ei->i_fc_lblk_start;
867 	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
868 	ei->i_fc_lblk_len = 0;
869 	mutex_unlock(&ei->i_fc_lock);
870 
871 	cur_lblk_off = old_blk_size;
872 	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
873 		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
874 
875 	while (cur_lblk_off <= new_blk_size) {
876 		map.m_lblk = cur_lblk_off;
877 		map.m_len = new_blk_size - cur_lblk_off + 1;
878 		ret = ext4_map_blocks(NULL, inode, &map, 0);
879 		if (ret < 0)
880 			return -ECANCELED;
881 
882 		if (map.m_len == 0) {
883 			cur_lblk_off++;
884 			continue;
885 		}
886 
887 		if (ret == 0) {
888 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
889 			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
890 			lrange.fc_len = cpu_to_le32(map.m_len);
891 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
892 					    sizeof(lrange), (u8 *)&lrange, crc))
893 				return -ENOSPC;
894 		} else {
895 			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
896 				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
897 
898 			/* Limit the number of blocks in one extent */
899 			map.m_len = min(max, map.m_len);
900 
901 			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
902 			ex = (struct ext4_extent *)&fc_ext.fc_ex;
903 			ex->ee_block = cpu_to_le32(map.m_lblk);
904 			ex->ee_len = cpu_to_le16(map.m_len);
905 			ext4_ext_store_pblock(ex, map.m_pblk);
906 			if (map.m_flags & EXT4_MAP_UNWRITTEN)
907 				ext4_ext_mark_unwritten(ex);
908 			else
909 				ext4_ext_mark_initialized(ex);
910 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
911 					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
912 				return -ENOSPC;
913 		}
914 
915 		cur_lblk_off += map.m_len;
916 	}
917 
918 	return 0;
919 }
920 
921 
922 /* Submit data for all the fast commit inodes */
923 static int ext4_fc_submit_inode_data_all(journal_t *journal)
924 {
925 	struct super_block *sb = (struct super_block *)(journal->j_private);
926 	struct ext4_sb_info *sbi = EXT4_SB(sb);
927 	struct ext4_inode_info *ei;
928 	int ret = 0;
929 
930 	spin_lock(&sbi->s_fc_lock);
931 	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
932 	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
933 		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
934 		while (atomic_read(&ei->i_fc_updates)) {
935 			DEFINE_WAIT(wait);
936 
937 			prepare_to_wait(&ei->i_fc_wait, &wait,
938 						TASK_UNINTERRUPTIBLE);
939 			if (atomic_read(&ei->i_fc_updates)) {
940 				spin_unlock(&sbi->s_fc_lock);
941 				schedule();
942 				spin_lock(&sbi->s_fc_lock);
943 			}
944 			finish_wait(&ei->i_fc_wait, &wait);
945 		}
946 		spin_unlock(&sbi->s_fc_lock);
947 		ret = jbd2_submit_inode_data(ei->jinode);
948 		if (ret)
949 			return ret;
950 		spin_lock(&sbi->s_fc_lock);
951 	}
952 	spin_unlock(&sbi->s_fc_lock);
953 
954 	return ret;
955 }
956 
957 /* Wait for completion of data for all the fast commit inodes */
958 static int ext4_fc_wait_inode_data_all(journal_t *journal)
959 {
960 	struct super_block *sb = (struct super_block *)(journal->j_private);
961 	struct ext4_sb_info *sbi = EXT4_SB(sb);
962 	struct ext4_inode_info *pos, *n;
963 	int ret = 0;
964 
965 	spin_lock(&sbi->s_fc_lock);
966 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
967 		if (!ext4_test_inode_state(&pos->vfs_inode,
968 					   EXT4_STATE_FC_COMMITTING))
969 			continue;
970 		spin_unlock(&sbi->s_fc_lock);
971 
972 		ret = jbd2_wait_inode_data(journal, pos->jinode);
973 		if (ret)
974 			return ret;
975 		spin_lock(&sbi->s_fc_lock);
976 	}
977 	spin_unlock(&sbi->s_fc_lock);
978 
979 	return 0;
980 }
981 
/*
 * Commit all the directory entry updates.
 *
 * Walks sbi->s_fc_dentry_q[FC_Q_MAIN] and emits one dentry TLV per queued
 * update through ext4_fc_add_dentry_tlv(). For EXT4_FC_TAG_CREAT updates the
 * inode and its data ranges are written first, followed by the dentry TLV.
 *
 * The caller in this file takes sbi->s_fc_lock right before calling; the lock
 * is dropped around every TLV write and is held again on every return path.
 *
 * Returns 0 on success, -ENOSPC if a dentry TLV could not be added, or the
 * non-zero value returned by ext4_fc_write_inode() or
 * ext4_fc_write_inode_data().
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei, *ei_n;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		/* Anything but a create only needs its dentry TLV. */
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* Look up the created inode on the main inode queue. */
		inode = NULL;
		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
					 i_fc_list) {
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Error paths arrive here unlocked; re-take the lock for the caller. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
1051 
1052 static int ext4_fc_perform_commit(journal_t *journal)
1053 {
1054 	struct super_block *sb = (struct super_block *)(journal->j_private);
1055 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1056 	struct ext4_inode_info *iter;
1057 	struct ext4_fc_head head;
1058 	struct inode *inode;
1059 	struct blk_plug plug;
1060 	int ret = 0;
1061 	u32 crc = 0;
1062 
1063 	ret = ext4_fc_submit_inode_data_all(journal);
1064 	if (ret)
1065 		return ret;
1066 
1067 	ret = ext4_fc_wait_inode_data_all(journal);
1068 	if (ret)
1069 		return ret;
1070 
1071 	/*
1072 	 * If file system device is different from journal device, issue a cache
1073 	 * flush before we start writing fast commit blocks.
1074 	 */
1075 	if (journal->j_fs_dev != journal->j_dev)
1076 		blkdev_issue_flush(journal->j_fs_dev);
1077 
1078 	blk_start_plug(&plug);
1079 	if (sbi->s_fc_bytes == 0) {
1080 		/*
1081 		 * Add a head tag only if this is the first fast commit
1082 		 * in this TID.
1083 		 */
1084 		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1085 		head.fc_tid = cpu_to_le32(
1086 			sbi->s_journal->j_running_transaction->t_tid);
1087 		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1088 			(u8 *)&head, &crc)) {
1089 			ret = -ENOSPC;
1090 			goto out;
1091 		}
1092 	}
1093 
1094 	spin_lock(&sbi->s_fc_lock);
1095 	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1096 	if (ret) {
1097 		spin_unlock(&sbi->s_fc_lock);
1098 		goto out;
1099 	}
1100 
1101 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1102 		inode = &iter->vfs_inode;
1103 		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1104 			continue;
1105 
1106 		spin_unlock(&sbi->s_fc_lock);
1107 		ret = ext4_fc_write_inode_data(inode, &crc);
1108 		if (ret)
1109 			goto out;
1110 		ret = ext4_fc_write_inode(inode, &crc);
1111 		if (ret)
1112 			goto out;
1113 		spin_lock(&sbi->s_fc_lock);
1114 	}
1115 	spin_unlock(&sbi->s_fc_lock);
1116 
1117 	ret = ext4_fc_write_tail(sb, crc);
1118 
1119 out:
1120 	blk_finish_plug(&plug);
1121 	return ret;
1122 }
1123 
/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit.
 *
 * Return value:
 * - the result of jbd2_fc_end_commit_fallback() when the fast commit was
 *   started but ext4_fc_perform_commit() or jbd2_fc_wait_bufs() failed;
 * - the result of jbd2_complete_transaction() when the fast commit could not
 *   be started or the file system was marked ineligible;
 * - 0 otherwise (fast commit done, or it had already been committed).
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	/* Snapshot of the sub-transaction counter, compared on -EALREADY. */
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	/* Number of fast commit blocks in use before this commit (rounded up). */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	/* Blocks added by this commit: new rounded-up total minus the old one. */
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	/* Commit counters are updated under s_fc_lock. */
	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	/* Only a fully successful fast commit reports a block count. */
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * Keep a running average that gives the previous average three times
	 * the weight of the newest commit time, so we don't react too strongly
	 * to vast changes in the commit time.
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	/* Pick the fallback that matches how far the fast commit got. */
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}
1218 
1219 /*
1220  * Fast commit cleanup routine. This is called after every fast commit and
1221  * full commit. full is true if we are called after a full commit.
1222  */
1223 static void ext4_fc_cleanup(journal_t *journal, int full)
1224 {
1225 	struct super_block *sb = journal->j_private;
1226 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1227 	struct ext4_inode_info *iter, *iter_n;
1228 	struct ext4_fc_dentry_update *fc_dentry;
1229 
1230 	if (full && sbi->s_fc_bh)
1231 		sbi->s_fc_bh = NULL;
1232 
1233 	jbd2_fc_release_bufs(journal);
1234 
1235 	spin_lock(&sbi->s_fc_lock);
1236 	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1237 				 i_fc_list) {
1238 		list_del_init(&iter->i_fc_list);
1239 		ext4_clear_inode_state(&iter->vfs_inode,
1240 				       EXT4_STATE_FC_COMMITTING);
1241 		ext4_fc_reset_inode(&iter->vfs_inode);
1242 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1243 		smp_mb();
1244 #if (BITS_PER_LONG < 64)
1245 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1246 #else
1247 		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1248 #endif
1249 	}
1250 
1251 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1252 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1253 					     struct ext4_fc_dentry_update,
1254 					     fcd_list);
1255 		list_del_init(&fc_dentry->fcd_list);
1256 		spin_unlock(&sbi->s_fc_lock);
1257 
1258 		if (fc_dentry->fcd_name.name &&
1259 			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1260 			kfree(fc_dentry->fcd_name.name);
1261 		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1262 		spin_lock(&sbi->s_fc_lock);
1263 	}
1264 
1265 	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1266 				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1267 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1268 				&sbi->s_fc_q[FC_Q_MAIN]);
1269 
1270 	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1271 	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1272 
1273 	if (full)
1274 		sbi->s_fc_bytes = 0;
1275 	spin_unlock(&sbi->s_fc_lock);
1276 	trace_ext4_fc_stats(sb);
1277 }
1278 
1279 /* Ext4 Replay Path Routines */
1280 
/* Decoded dentry TLV fields handed around by the dentry replay routines */
struct dentry_info_args {
	int parent_ino;	/* inode number of the parent directory */
	int dname_len;	/* TLV length minus the fixed dentry info header */
	int ino;	/* inode number the directory entry refers to */
	int inode_len;
	char *dname;	/* entry name, pointing into the TLV value buffer */
};
1286 
1287 static inline void tl_to_darg(struct dentry_info_args *darg,
1288 			      struct  ext4_fc_tl *tl, u8 *val)
1289 {
1290 	struct ext4_fc_dentry_info fcd;
1291 
1292 	memcpy(&fcd, val, sizeof(fcd));
1293 
1294 	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1295 	darg->ino = le32_to_cpu(fcd.fc_ino);
1296 	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1297 	darg->dname_len = le16_to_cpu(tl->fc_len) -
1298 		sizeof(struct ext4_fc_dentry_info);
1299 }
1300 
1301 /* Unlink replay function */
1302 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1303 				 u8 *val)
1304 {
1305 	struct inode *inode, *old_parent;
1306 	struct qstr entry;
1307 	struct dentry_info_args darg;
1308 	int ret = 0;
1309 
1310 	tl_to_darg(&darg, tl, val);
1311 
1312 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1313 			darg.parent_ino, darg.dname_len);
1314 
1315 	entry.name = darg.dname;
1316 	entry.len = darg.dname_len;
1317 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1318 
1319 	if (IS_ERR(inode)) {
1320 		jbd_debug(1, "Inode %d not found", darg.ino);
1321 		return 0;
1322 	}
1323 
1324 	old_parent = ext4_iget(sb, darg.parent_ino,
1325 				EXT4_IGET_NORMAL);
1326 	if (IS_ERR(old_parent)) {
1327 		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1328 		iput(inode);
1329 		return 0;
1330 	}
1331 
1332 	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1333 	/* -ENOENT ok coz it might not exist anymore. */
1334 	if (ret == -ENOENT)
1335 		ret = 0;
1336 	iput(old_parent);
1337 	iput(inode);
1338 	return ret;
1339 }
1340 
1341 static int ext4_fc_replay_link_internal(struct super_block *sb,
1342 				struct dentry_info_args *darg,
1343 				struct inode *inode)
1344 {
1345 	struct inode *dir = NULL;
1346 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1347 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1348 	int ret = 0;
1349 
1350 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1351 	if (IS_ERR(dir)) {
1352 		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1353 		dir = NULL;
1354 		goto out;
1355 	}
1356 
1357 	dentry_dir = d_obtain_alias(dir);
1358 	if (IS_ERR(dentry_dir)) {
1359 		jbd_debug(1, "Failed to obtain dentry");
1360 		dentry_dir = NULL;
1361 		goto out;
1362 	}
1363 
1364 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1365 	if (!dentry_inode) {
1366 		jbd_debug(1, "Inode dentry not created.");
1367 		ret = -ENOMEM;
1368 		goto out;
1369 	}
1370 
1371 	ret = __ext4_link(dir, inode, dentry_inode);
1372 	/*
1373 	 * It's possible that link already existed since data blocks
1374 	 * for the dir in question got persisted before we crashed OR
1375 	 * we replayed this tag and crashed before the entire replay
1376 	 * could complete.
1377 	 */
1378 	if (ret && ret != -EEXIST) {
1379 		jbd_debug(1, "Failed to link\n");
1380 		goto out;
1381 	}
1382 
1383 	ret = 0;
1384 out:
1385 	if (dentry_dir) {
1386 		d_drop(dentry_dir);
1387 		dput(dentry_dir);
1388 	} else if (dir) {
1389 		iput(dir);
1390 	}
1391 	if (dentry_inode) {
1392 		d_drop(dentry_inode);
1393 		dput(dentry_inode);
1394 	}
1395 
1396 	return ret;
1397 }
1398 
1399 /* Link replay function */
1400 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1401 			       u8 *val)
1402 {
1403 	struct inode *inode;
1404 	struct dentry_info_args darg;
1405 	int ret = 0;
1406 
1407 	tl_to_darg(&darg, tl, val);
1408 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1409 			darg.parent_ino, darg.dname_len);
1410 
1411 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1412 	if (IS_ERR(inode)) {
1413 		jbd_debug(1, "Inode not found.");
1414 		return 0;
1415 	}
1416 
1417 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1418 	iput(inode);
1419 	return ret;
1420 }
1421 
1422 /*
1423  * Record all the modified inodes during replay. We use this later to setup
1424  * block bitmaps correctly.
1425  */
1426 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1427 {
1428 	struct ext4_fc_replay_state *state;
1429 	int i;
1430 
1431 	state = &EXT4_SB(sb)->s_fc_replay_state;
1432 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1433 		if (state->fc_modified_inodes[i] == ino)
1434 			return 0;
1435 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1436 		state->fc_modified_inodes_size +=
1437 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1438 		state->fc_modified_inodes = krealloc(
1439 					state->fc_modified_inodes, sizeof(int) *
1440 					state->fc_modified_inodes_size,
1441 					GFP_KERNEL);
1442 		if (!state->fc_modified_inodes)
1443 			return -ENOMEM;
1444 	}
1445 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1446 	return 0;
1447 }
1448 
1449 /*
1450  * Inode replay function
1451  */
1452 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1453 				u8 *val)
1454 {
1455 	struct ext4_fc_inode fc_inode;
1456 	struct ext4_inode *raw_inode;
1457 	struct ext4_inode *raw_fc_inode;
1458 	struct inode *inode = NULL;
1459 	struct ext4_iloc iloc;
1460 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1461 	struct ext4_extent_header *eh;
1462 
1463 	memcpy(&fc_inode, val, sizeof(fc_inode));
1464 
1465 	ino = le32_to_cpu(fc_inode.fc_ino);
1466 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1467 
1468 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1469 	if (!IS_ERR(inode)) {
1470 		ext4_ext_clear_bb(inode);
1471 		iput(inode);
1472 	}
1473 	inode = NULL;
1474 
1475 	ext4_fc_record_modified_inode(sb, ino);
1476 
1477 	raw_fc_inode = (struct ext4_inode *)
1478 		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1479 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1480 	if (ret)
1481 		goto out;
1482 
1483 	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1484 	raw_inode = ext4_raw_inode(&iloc);
1485 
1486 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1487 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1488 		inode_len - offsetof(struct ext4_inode, i_generation));
1489 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1490 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1491 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1492 			memset(eh, 0, sizeof(*eh));
1493 			eh->eh_magic = EXT4_EXT_MAGIC;
1494 			eh->eh_max = cpu_to_le16(
1495 				(sizeof(raw_inode->i_block) -
1496 				 sizeof(struct ext4_extent_header))
1497 				 / sizeof(struct ext4_extent));
1498 		}
1499 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1500 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1501 			sizeof(raw_inode->i_block));
1502 	}
1503 
1504 	/* Immediately update the inode on disk. */
1505 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1506 	if (ret)
1507 		goto out;
1508 	ret = sync_dirty_buffer(iloc.bh);
1509 	if (ret)
1510 		goto out;
1511 	ret = ext4_mark_inode_used(sb, ino);
1512 	if (ret)
1513 		goto out;
1514 
1515 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1516 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1517 	if (IS_ERR(inode)) {
1518 		jbd_debug(1, "Inode not found.");
1519 		return -EFSCORRUPTED;
1520 	}
1521 
1522 	/*
1523 	 * Our allocator could have made different decisions than before
1524 	 * crashing. This should be fixed but until then, we calculate
1525 	 * the number of blocks the inode.
1526 	 */
1527 	ext4_ext_replay_set_iblocks(inode);
1528 
1529 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1530 	ext4_reset_inode_seed(inode);
1531 
1532 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1533 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1534 	sync_dirty_buffer(iloc.bh);
1535 	brelse(iloc.bh);
1536 out:
1537 	iput(inode);
1538 	if (!ret)
1539 		blkdev_issue_flush(sb->s_bdev);
1540 
1541 	return 0;
1542 }
1543 
1544 /*
1545  * Dentry create replay function.
1546  *
1547  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1548  * inode for which we are trying to create a dentry here, should already have
1549  * been replayed before we start here.
1550  */
1551 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1552 				 u8 *val)
1553 {
1554 	int ret = 0;
1555 	struct inode *inode = NULL;
1556 	struct inode *dir = NULL;
1557 	struct dentry_info_args darg;
1558 
1559 	tl_to_darg(&darg, tl, val);
1560 
1561 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1562 			darg.parent_ino, darg.dname_len);
1563 
1564 	/* This takes care of update group descriptor and other metadata */
1565 	ret = ext4_mark_inode_used(sb, darg.ino);
1566 	if (ret)
1567 		goto out;
1568 
1569 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1570 	if (IS_ERR(inode)) {
1571 		jbd_debug(1, "inode %d not found.", darg.ino);
1572 		inode = NULL;
1573 		ret = -EINVAL;
1574 		goto out;
1575 	}
1576 
1577 	if (S_ISDIR(inode->i_mode)) {
1578 		/*
1579 		 * If we are creating a directory, we need to make sure that the
1580 		 * dot and dot dot dirents are setup properly.
1581 		 */
1582 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1583 		if (IS_ERR(dir)) {
1584 			jbd_debug(1, "Dir %d not found.", darg.ino);
1585 			goto out;
1586 		}
1587 		ret = ext4_init_new_dir(NULL, dir, inode);
1588 		iput(dir);
1589 		if (ret) {
1590 			ret = 0;
1591 			goto out;
1592 		}
1593 	}
1594 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1595 	if (ret)
1596 		goto out;
1597 	set_nlink(inode, 1);
1598 	ext4_mark_inode_dirty(NULL, inode);
1599 out:
1600 	if (inode)
1601 		iput(inode);
1602 	return ret;
1603 }
1604 
1605 /*
1606  * Record physical disk regions which are in use as per fast commit area. Our
1607  * simple replay phase allocator excludes these regions from allocation.
1608  */
1609 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1610 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1611 {
1612 	struct ext4_fc_replay_state *state;
1613 	struct ext4_fc_alloc_region *region;
1614 
1615 	state = &EXT4_SB(sb)->s_fc_replay_state;
1616 	if (state->fc_regions_used == state->fc_regions_size) {
1617 		state->fc_regions_size +=
1618 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1619 		state->fc_regions = krealloc(
1620 					state->fc_regions,
1621 					state->fc_regions_size *
1622 					sizeof(struct ext4_fc_alloc_region),
1623 					GFP_KERNEL);
1624 		if (!state->fc_regions)
1625 			return -ENOMEM;
1626 	}
1627 	region = &state->fc_regions[state->fc_regions_used++];
1628 	region->ino = ino;
1629 	region->lblk = lblk;
1630 	region->pblk = pblk;
1631 	region->len = len;
1632 
1633 	return 0;
1634 }
1635 
1636 /* Replay add range tag */
1637 static int ext4_fc_replay_add_range(struct super_block *sb,
1638 				    struct ext4_fc_tl *tl, u8 *val)
1639 {
1640 	struct ext4_fc_add_range fc_add_ex;
1641 	struct ext4_extent newex, *ex;
1642 	struct inode *inode;
1643 	ext4_lblk_t start, cur;
1644 	int remaining, len;
1645 	ext4_fsblk_t start_pblk;
1646 	struct ext4_map_blocks map;
1647 	struct ext4_ext_path *path = NULL;
1648 	int ret;
1649 
1650 	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1651 	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1652 
1653 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1654 		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1655 		ext4_ext_get_actual_len(ex));
1656 
1657 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1658 	if (IS_ERR(inode)) {
1659 		jbd_debug(1, "Inode not found.");
1660 		return 0;
1661 	}
1662 
1663 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1664 
1665 	start = le32_to_cpu(ex->ee_block);
1666 	start_pblk = ext4_ext_pblock(ex);
1667 	len = ext4_ext_get_actual_len(ex);
1668 
1669 	cur = start;
1670 	remaining = len;
1671 	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1672 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1673 		  inode->i_ino);
1674 
1675 	while (remaining > 0) {
1676 		map.m_lblk = cur;
1677 		map.m_len = remaining;
1678 		map.m_pblk = 0;
1679 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1680 
1681 		if (ret < 0) {
1682 			iput(inode);
1683 			return 0;
1684 		}
1685 
1686 		if (ret == 0) {
1687 			/* Range is not mapped */
1688 			path = ext4_find_extent(inode, cur, NULL, 0);
1689 			if (IS_ERR(path)) {
1690 				iput(inode);
1691 				return 0;
1692 			}
1693 			memset(&newex, 0, sizeof(newex));
1694 			newex.ee_block = cpu_to_le32(cur);
1695 			ext4_ext_store_pblock(
1696 				&newex, start_pblk + cur - start);
1697 			newex.ee_len = cpu_to_le16(map.m_len);
1698 			if (ext4_ext_is_unwritten(ex))
1699 				ext4_ext_mark_unwritten(&newex);
1700 			down_write(&EXT4_I(inode)->i_data_sem);
1701 			ret = ext4_ext_insert_extent(
1702 				NULL, inode, &path, &newex, 0);
1703 			up_write((&EXT4_I(inode)->i_data_sem));
1704 			ext4_ext_drop_refs(path);
1705 			kfree(path);
1706 			if (ret) {
1707 				iput(inode);
1708 				return 0;
1709 			}
1710 			goto next;
1711 		}
1712 
1713 		if (start_pblk + cur - start != map.m_pblk) {
1714 			/*
1715 			 * Logical to physical mapping changed. This can happen
1716 			 * if this range was removed and then reallocated to
1717 			 * map to new physical blocks during a fast commit.
1718 			 */
1719 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1720 					ext4_ext_is_unwritten(ex),
1721 					start_pblk + cur - start);
1722 			if (ret) {
1723 				iput(inode);
1724 				return 0;
1725 			}
1726 			/*
1727 			 * Mark the old blocks as free since they aren't used
1728 			 * anymore. We maintain an array of all the modified
1729 			 * inodes. In case these blocks are still used at either
1730 			 * a different logical range in the same inode or in
1731 			 * some different inode, we will mark them as allocated
1732 			 * at the end of the FC replay using our array of
1733 			 * modified inodes.
1734 			 */
1735 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1736 			goto next;
1737 		}
1738 
1739 		/* Range is mapped and needs a state change */
1740 		jbd_debug(1, "Converting from %ld to %d %lld",
1741 				map.m_flags & EXT4_MAP_UNWRITTEN,
1742 			ext4_ext_is_unwritten(ex), map.m_pblk);
1743 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1744 					ext4_ext_is_unwritten(ex), map.m_pblk);
1745 		if (ret) {
1746 			iput(inode);
1747 			return 0;
1748 		}
1749 		/*
1750 		 * We may have split the extent tree while toggling the state.
1751 		 * Try to shrink the extent tree now.
1752 		 */
1753 		ext4_ext_replay_shrink_inode(inode, start + len);
1754 next:
1755 		cur += map.m_len;
1756 		remaining -= map.m_len;
1757 	}
1758 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1759 					sb->s_blocksize_bits);
1760 	iput(inode);
1761 	return 0;
1762 }
1763 
1764 /* Replay DEL_RANGE tag */
1765 static int
1766 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1767 			 u8 *val)
1768 {
1769 	struct inode *inode;
1770 	struct ext4_fc_del_range lrange;
1771 	struct ext4_map_blocks map;
1772 	ext4_lblk_t cur, remaining;
1773 	int ret;
1774 
1775 	memcpy(&lrange, val, sizeof(lrange));
1776 	cur = le32_to_cpu(lrange.fc_lblk);
1777 	remaining = le32_to_cpu(lrange.fc_len);
1778 
1779 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1780 		le32_to_cpu(lrange.fc_ino), cur, remaining);
1781 
1782 	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1783 	if (IS_ERR(inode)) {
1784 		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1785 		return 0;
1786 	}
1787 
1788 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1789 
1790 	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1791 			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1792 			le32_to_cpu(lrange.fc_len));
1793 	while (remaining > 0) {
1794 		map.m_lblk = cur;
1795 		map.m_len = remaining;
1796 
1797 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1798 		if (ret < 0) {
1799 			iput(inode);
1800 			return 0;
1801 		}
1802 		if (ret > 0) {
1803 			remaining -= ret;
1804 			cur += ret;
1805 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1806 		} else {
1807 			remaining -= map.m_len;
1808 			cur += map.m_len;
1809 		}
1810 	}
1811 
1812 	ret = ext4_punch_hole(inode,
1813 		le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1814 		le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
1815 	if (ret)
1816 		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1817 	ext4_ext_replay_shrink_inode(inode,
1818 		i_size_read(inode) >> sb->s_blocksize_bits);
1819 	ext4_mark_inode_dirty(NULL, inode);
1820 	iput(inode);
1821 
1822 	return 0;
1823 }
1824 
1825 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1826 {
1827 	struct ext4_fc_replay_state *state;
1828 	struct inode *inode;
1829 	struct ext4_ext_path *path = NULL;
1830 	struct ext4_map_blocks map;
1831 	int i, ret, j;
1832 	ext4_lblk_t cur, end;
1833 
1834 	state = &EXT4_SB(sb)->s_fc_replay_state;
1835 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1836 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1837 			EXT4_IGET_NORMAL);
1838 		if (IS_ERR(inode)) {
1839 			jbd_debug(1, "Inode %d not found.",
1840 				state->fc_modified_inodes[i]);
1841 			continue;
1842 		}
1843 		cur = 0;
1844 		end = EXT_MAX_BLOCKS;
1845 		while (cur < end) {
1846 			map.m_lblk = cur;
1847 			map.m_len = end - cur;
1848 
1849 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1850 			if (ret < 0)
1851 				break;
1852 
1853 			if (ret > 0) {
1854 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1855 				if (!IS_ERR(path)) {
1856 					for (j = 0; j < path->p_depth; j++)
1857 						ext4_mb_mark_bb(inode->i_sb,
1858 							path[j].p_block, 1, 1);
1859 					ext4_ext_drop_refs(path);
1860 					kfree(path);
1861 				}
1862 				cur += ret;
1863 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1864 							map.m_len, 1);
1865 			} else {
1866 				cur = cur + (map.m_len ? map.m_len : 1);
1867 			}
1868 		}
1869 		iput(inode);
1870 	}
1871 }
1872 
1873 /*
1874  * Check if block is in excluded regions for block allocation. The simple
1875  * allocator that runs during replay phase is calls this function to see
1876  * if it is okay to use a block.
1877  */
1878 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1879 {
1880 	int i;
1881 	struct ext4_fc_replay_state *state;
1882 
1883 	state = &EXT4_SB(sb)->s_fc_replay_state;
1884 	for (i = 0; i < state->fc_regions_valid; i++) {
1885 		if (state->fc_regions[i].ino == 0 ||
1886 			state->fc_regions[i].len == 0)
1887 			continue;
1888 		if (blk >= state->fc_regions[i].pblk &&
1889 		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1890 			return true;
1891 	}
1892 	return false;
1893 }
1894 
1895 /* Cleanup function called after replay */
1896 void ext4_fc_replay_cleanup(struct super_block *sb)
1897 {
1898 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1899 
1900 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1901 	kfree(sbi->s_fc_replay_state.fc_regions);
1902 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1903 }
1904 
1905 /*
1906  * Recovery Scan phase handler
1907  *
1908  * This function is called during the scan phase and is responsible
1909  * for doing following things:
1910  * - Make sure the fast commit area has valid tags for replay
1911  * - Count number of tags that need to be replayed by the replay handler
1912  * - Verify CRC
1913  * - Create a list of excluded blocks for allocation during replay phase
1914  *
1915  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1916  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1917  * to indicate that scan has finished and JBD2 can now start replay phase.
1918  * It returns a negative error to indicate that there was an error. At the end
1919  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1920  * to indicate the number of tags that need to replayed during the replay phase.
1921  */
1922 static int ext4_fc_replay_scan(journal_t *journal,
1923 				struct buffer_head *bh, int off,
1924 				tid_t expected_tid)
1925 {
1926 	struct super_block *sb = journal->j_private;
1927 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1928 	struct ext4_fc_replay_state *state;
1929 	int ret = JBD2_FC_REPLAY_CONTINUE;
1930 	struct ext4_fc_add_range ext;
1931 	struct ext4_fc_tl tl;
1932 	struct ext4_fc_tail tail;
1933 	__u8 *start, *end, *cur, *val;
1934 	struct ext4_fc_head head;
1935 	struct ext4_extent *ex;
1936 
1937 	state = &sbi->s_fc_replay_state;
1938 
1939 	start = (u8 *)bh->b_data;
1940 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1941 
1942 	if (state->fc_replay_expected_off == 0) {
1943 		state->fc_cur_tag = 0;
1944 		state->fc_replay_num_tags = 0;
1945 		state->fc_crc = 0;
1946 		state->fc_regions = NULL;
1947 		state->fc_regions_valid = state->fc_regions_used =
1948 			state->fc_regions_size = 0;
1949 		/* Check if we can stop early */
1950 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1951 			!= EXT4_FC_TAG_HEAD)
1952 			return 0;
1953 	}
1954 
1955 	if (off != state->fc_replay_expected_off) {
1956 		ret = -EFSCORRUPTED;
1957 		goto out_err;
1958 	}
1959 
1960 	state->fc_replay_expected_off++;
1961 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1962 		memcpy(&tl, cur, sizeof(tl));
1963 		val = cur + sizeof(tl);
1964 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1965 			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1966 		switch (le16_to_cpu(tl.fc_tag)) {
1967 		case EXT4_FC_TAG_ADD_RANGE:
1968 			memcpy(&ext, val, sizeof(ext));
1969 			ex = (struct ext4_extent *)&ext.fc_ex;
1970 			ret = ext4_fc_record_regions(sb,
1971 				le32_to_cpu(ext.fc_ino),
1972 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1973 				ext4_ext_get_actual_len(ex));
1974 			if (ret < 0)
1975 				break;
1976 			ret = JBD2_FC_REPLAY_CONTINUE;
1977 			fallthrough;
1978 		case EXT4_FC_TAG_DEL_RANGE:
1979 		case EXT4_FC_TAG_LINK:
1980 		case EXT4_FC_TAG_UNLINK:
1981 		case EXT4_FC_TAG_CREAT:
1982 		case EXT4_FC_TAG_INODE:
1983 		case EXT4_FC_TAG_PAD:
1984 			state->fc_cur_tag++;
1985 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1986 					sizeof(tl) + le16_to_cpu(tl.fc_len));
1987 			break;
1988 		case EXT4_FC_TAG_TAIL:
1989 			state->fc_cur_tag++;
1990 			memcpy(&tail, val, sizeof(tail));
1991 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1992 						sizeof(tl) +
1993 						offsetof(struct ext4_fc_tail,
1994 						fc_crc));
1995 			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1996 				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
1997 				state->fc_replay_num_tags = state->fc_cur_tag;
1998 				state->fc_regions_valid =
1999 					state->fc_regions_used;
2000 			} else {
2001 				ret = state->fc_replay_num_tags ?
2002 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2003 			}
2004 			state->fc_crc = 0;
2005 			break;
2006 		case EXT4_FC_TAG_HEAD:
2007 			memcpy(&head, val, sizeof(head));
2008 			if (le32_to_cpu(head.fc_features) &
2009 				~EXT4_FC_SUPPORTED_FEATURES) {
2010 				ret = -EOPNOTSUPP;
2011 				break;
2012 			}
2013 			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2014 				ret = JBD2_FC_REPLAY_STOP;
2015 				break;
2016 			}
2017 			state->fc_cur_tag++;
2018 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2019 					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2020 			break;
2021 		default:
2022 			ret = state->fc_replay_num_tags ?
2023 				JBD2_FC_REPLAY_STOP : -ECANCELED;
2024 		}
2025 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2026 			break;
2027 	}
2028 
2029 out_err:
2030 	trace_ext4_fc_replay_scan(sb, ret, off);
2031 	return ret;
2032 }
2033 
2034 /*
2035  * Main recovery path entry point.
2036  * The meaning of return codes is similar as above.
2037  */
2038 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2039 				enum passtype pass, int off, tid_t expected_tid)
2040 {
2041 	struct super_block *sb = journal->j_private;
2042 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2043 	struct ext4_fc_tl tl;
2044 	__u8 *start, *end, *cur, *val;
2045 	int ret = JBD2_FC_REPLAY_CONTINUE;
2046 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2047 	struct ext4_fc_tail tail;
2048 
2049 	if (pass == PASS_SCAN) {
2050 		state->fc_current_pass = PASS_SCAN;
2051 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2052 	}
2053 
2054 	if (state->fc_current_pass != pass) {
2055 		state->fc_current_pass = pass;
2056 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2057 	}
2058 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2059 		jbd_debug(1, "Replay stops\n");
2060 		ext4_fc_set_bitmaps_and_counters(sb);
2061 		return 0;
2062 	}
2063 
2064 #ifdef CONFIG_EXT4_DEBUG
2065 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2066 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2067 		return JBD2_FC_REPLAY_STOP;
2068 	}
2069 #endif
2070 
2071 	start = (u8 *)bh->b_data;
2072 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2073 
2074 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2075 		memcpy(&tl, cur, sizeof(tl));
2076 		val = cur + sizeof(tl);
2077 
2078 		if (state->fc_replay_num_tags == 0) {
2079 			ret = JBD2_FC_REPLAY_STOP;
2080 			ext4_fc_set_bitmaps_and_counters(sb);
2081 			break;
2082 		}
2083 		jbd_debug(3, "Replay phase, tag:%s\n",
2084 				tag2str(le16_to_cpu(tl.fc_tag)));
2085 		state->fc_replay_num_tags--;
2086 		switch (le16_to_cpu(tl.fc_tag)) {
2087 		case EXT4_FC_TAG_LINK:
2088 			ret = ext4_fc_replay_link(sb, &tl, val);
2089 			break;
2090 		case EXT4_FC_TAG_UNLINK:
2091 			ret = ext4_fc_replay_unlink(sb, &tl, val);
2092 			break;
2093 		case EXT4_FC_TAG_ADD_RANGE:
2094 			ret = ext4_fc_replay_add_range(sb, &tl, val);
2095 			break;
2096 		case EXT4_FC_TAG_CREAT:
2097 			ret = ext4_fc_replay_create(sb, &tl, val);
2098 			break;
2099 		case EXT4_FC_TAG_DEL_RANGE:
2100 			ret = ext4_fc_replay_del_range(sb, &tl, val);
2101 			break;
2102 		case EXT4_FC_TAG_INODE:
2103 			ret = ext4_fc_replay_inode(sb, &tl, val);
2104 			break;
2105 		case EXT4_FC_TAG_PAD:
2106 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2107 					     le16_to_cpu(tl.fc_len), 0);
2108 			break;
2109 		case EXT4_FC_TAG_TAIL:
2110 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2111 					     le16_to_cpu(tl.fc_len), 0);
2112 			memcpy(&tail, val, sizeof(tail));
2113 			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2114 			break;
2115 		case EXT4_FC_TAG_HEAD:
2116 			break;
2117 		default:
2118 			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2119 					     le16_to_cpu(tl.fc_len), 0);
2120 			ret = -ECANCELED;
2121 			break;
2122 		}
2123 		if (ret < 0)
2124 			break;
2125 		ret = JBD2_FC_REPLAY_CONTINUE;
2126 	}
2127 	return ret;
2128 }
2129 
2130 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2131 {
2132 	/*
2133 	 * We set replay callback even if fast commit disabled because we may
2134 	 * could still have fast commit blocks that need to be replayed even if
2135 	 * fast commit has now been turned off.
2136 	 */
2137 	journal->j_fc_replay_callback = ext4_fc_replay;
2138 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2139 		return;
2140 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2141 }
2142 
/*
 * Human-readable descriptions of the reasons for which a fast commit is
 * ineligible, as printed by ext4_fc_info_show(). The table is indexed by
 * reason number, so the order of the entries must match that numbering.
 *
 * Neither the strings nor the table itself are ever modified, so both are
 * declared const.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2155 
2156 int ext4_fc_info_show(struct seq_file *seq, void *v)
2157 {
2158 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2159 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2160 	int i;
2161 
2162 	if (v != SEQ_START_TOKEN)
2163 		return 0;
2164 
2165 	seq_printf(seq,
2166 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2167 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2168 		   stats->fc_numblks,
2169 		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2170 	seq_puts(seq, "Ineligible reasons:\n");
2171 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2172 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2173 			stats->fc_ineligible_reason_count[i]);
2174 
2175 	return 0;
2176 }
2177 
2178 int __init ext4_fc_init_dentry_cache(void)
2179 {
2180 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2181 					   SLAB_RECLAIM_ACCOUNT);
2182 
2183 	if (ext4_fc_dentry_cachep == NULL)
2184 		return -ENOMEM;
2185 
2186 	return 0;
2187 }
2188