xref: /openbmc/linux/fs/ext4/fast_commit.c (revision 37b9345c)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
41  *				  during recovery. Note that iblocks field is
42  *				  not replayed and instead derived during
43  *				  replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  *
69  * Not all operations are supported by fast commits today (e.g extended
70  * attributes). Fast commit ineligibility is marked by calling
71  * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back
72  * to full commit.
73  *
74  * Atomicity of commits
75  * --------------------
76  * In order to guarantee atomicity during the commit operation, fast commit
77  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
78  * tag contains CRC of the contents and TID of the transaction after which
79  * this fast commit should be applied. Recovery code replays fast commit
80  * logs only if there's at least 1 valid tail present. For every fast commit
81  * operation, there is 1 tail. This means, we may end up with multiple tails
82  * in the fast commit space. Here's an example:
83  *
84  * - Create a new file A and remove existing file B
85  * - fsync()
86  * - Append contents to file A
87  * - Truncate file A
88  * - fsync()
89  *
90  * The fast commit space at the end of above operations would look like this:
91  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
92  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
93  *
94  * Replay code should thus check for all the valid tails in the FC area.
95  *
96  * Fast Commit Replay Idempotence
97  * ------------------------------
98  *
99  * Fast commits tags are idempotent in nature provided the recovery code follows
100  * certain rules. The guiding principle that the commit path follows while
101  * committing is that it stores the result of a particular operation instead of
102  * storing the procedure.
103  *
104  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
105  * was associated with inode 10. During fast commit, instead of storing this
106  * operation as a procedure "rename a to b", we store the resulting file system
107  * state as a "series" of outcomes:
108  *
109  * - Link dirent b to inode 10
110  * - Unlink dirent a
111  * - Inode <10> with valid refcount
112  *
113  * Now when recovery code runs, it needs "enforce" this state on the file
114  * system. This is what guarantees idempotence of fast commit replay.
115  *
116  * Let's take an example of a procedure that is not idempotent and see how fast
117  * commits make it idempotent. Consider following sequence of operations:
118  *
119  *     rm A;    mv B A;    read A
120  *  (x)     (y)        (z)
121  *
122  * (x), (y) and (z) are the points at which we can crash. If we store this
123  * sequence of operations as is then the replay is not idempotent. Let's say
124  * while in replay, we crash at (z). During the second replay, file A (which was
125  * actually created as a result of "mv B A" operation) would get deleted. Thus,
126  * file named A would be absent when we try to read A. So, this sequence of
127  * operations is not idempotent. However, as mentioned above, instead of storing
128  * the procedure fast commits store the outcome of each procedure. Thus the fast
129  * commit log for above procedure would be as follows:
130  *
131  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
132  * inode 11 before the replay)
133  *
134  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
135  * (w)          (x)                    (y)          (z)
136  *
137  * If we crash at (z), we will have file A linked to inode 11. During the second
138  * replay, we will remove file A (inode 11). But we will create it back and make
139  * it point to inode 11. We won't find B, so we'll just skip that step. At this
140  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
141  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
142  * similarly. Thus, by converting a non-idempotent procedure into a series of
143  * idempotent outcomes, fast commits ensured idempotence during the replay.
144  *
145  * TODOs
146  * -----
147  *
148  * 0) Fast commit replay path hardening: Fast commit replay code should use
149  *    journal handles to make sure all the updates it does during the replay
150  *    path are atomic. With that if we crash during fast commit replay, after
151  *    trying to do recovery again, we will find a file system where fast commit
152  *    area is invalid (because new full commit would be found). In order to deal
153  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
154  *    superblock state is persisted before starting the replay, so that after
155  *    the crash, fast commit recovery code can look at that flag and perform
156  *    fast commit recovery even if that area is invalidated by later full
157  *    commits.
158  *
159  * 1) Fast commit's commit path locks the entire file system during fast
160  *    commit. This has significant performance penalty. Instead of that, we
161  *    should use ext4_fc_start/stop_update functions to start inode level
162  *    updates from ext4_journal_start/stop. Once we do that we can drop file
163  *    system locking during commit path.
164  *
165  * 2) Handle more ineligible cases.
166  */
167 
168 #include <trace/events/ext4.h>
169 static struct kmem_cache *ext4_fc_dentry_cachep;
170 
171 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
172 {
173 	BUFFER_TRACE(bh, "");
174 	if (uptodate) {
175 		ext4_debug("%s: Block %lld up-to-date",
176 			   __func__, bh->b_blocknr);
177 		set_buffer_uptodate(bh);
178 	} else {
179 		ext4_debug("%s: Block %lld not up-to-date",
180 			   __func__, bh->b_blocknr);
181 		clear_buffer_uptodate(bh);
182 	}
183 
184 	unlock_buffer(bh);
185 }
186 
187 static inline void ext4_fc_reset_inode(struct inode *inode)
188 {
189 	struct ext4_inode_info *ei = EXT4_I(inode);
190 
191 	ei->i_fc_lblk_start = 0;
192 	ei->i_fc_lblk_len = 0;
193 }
194 
195 void ext4_fc_init_inode(struct inode *inode)
196 {
197 	struct ext4_inode_info *ei = EXT4_I(inode);
198 
199 	ext4_fc_reset_inode(inode);
200 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
201 	INIT_LIST_HEAD(&ei->i_fc_list);
202 	INIT_LIST_HEAD(&ei->i_fc_dilist);
203 	init_waitqueue_head(&ei->i_fc_wait);
204 	atomic_set(&ei->i_fc_updates, 0);
205 }
206 
207 /* This function must be called with sbi->s_fc_lock held. */
208 static void ext4_fc_wait_committing_inode(struct inode *inode)
209 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
210 {
211 	wait_queue_head_t *wq;
212 	struct ext4_inode_info *ei = EXT4_I(inode);
213 
214 #if (BITS_PER_LONG < 64)
215 	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
216 			EXT4_STATE_FC_COMMITTING);
217 	wq = bit_waitqueue(&ei->i_state_flags,
218 				EXT4_STATE_FC_COMMITTING);
219 #else
220 	DEFINE_WAIT_BIT(wait, &ei->i_flags,
221 			EXT4_STATE_FC_COMMITTING);
222 	wq = bit_waitqueue(&ei->i_flags,
223 				EXT4_STATE_FC_COMMITTING);
224 #endif
225 	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
226 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
227 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
228 	schedule();
229 	finish_wait(wq, &wait.wq_entry);
230 }
231 
232 /*
233  * Inform Ext4's fast about start of an inode update
234  *
235  * This function is called by the high level call VFS callbacks before
236  * performing any inode update. This function blocks if there's an ongoing
237  * fast commit on the inode in question.
238  */
239 void ext4_fc_start_update(struct inode *inode)
240 {
241 	struct ext4_inode_info *ei = EXT4_I(inode);
242 
243 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
244 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
245 		return;
246 
247 restart:
248 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
249 	if (list_empty(&ei->i_fc_list))
250 		goto out;
251 
252 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
253 		ext4_fc_wait_committing_inode(inode);
254 		goto restart;
255 	}
256 out:
257 	atomic_inc(&ei->i_fc_updates);
258 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
259 }
260 
261 /*
262  * Stop inode update and wake up waiting fast commits if any.
263  */
264 void ext4_fc_stop_update(struct inode *inode)
265 {
266 	struct ext4_inode_info *ei = EXT4_I(inode);
267 
268 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
269 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
270 		return;
271 
272 	if (atomic_dec_and_test(&ei->i_fc_updates))
273 		wake_up_all(&ei->i_fc_wait);
274 }
275 
276 /*
277  * Remove inode from fast commit list. If the inode is being committed
278  * we wait until inode commit is done.
279  */
280 void ext4_fc_del(struct inode *inode)
281 {
282 	struct ext4_inode_info *ei = EXT4_I(inode);
283 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
284 	struct ext4_fc_dentry_update *fc_dentry;
285 
286 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
287 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
288 		return;
289 
290 restart:
291 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
292 	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
293 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
294 		return;
295 	}
296 
297 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
298 		ext4_fc_wait_committing_inode(inode);
299 		goto restart;
300 	}
301 
302 	if (!list_empty(&ei->i_fc_list))
303 		list_del_init(&ei->i_fc_list);
304 
305 	/*
306 	 * Since this inode is getting removed, let's also remove all FC
307 	 * dentry create references, since it is not needed to log it anyways.
308 	 */
309 	if (list_empty(&ei->i_fc_dilist)) {
310 		spin_unlock(&sbi->s_fc_lock);
311 		return;
312 	}
313 
314 	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
315 	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
316 	list_del_init(&fc_dentry->fcd_list);
317 	list_del_init(&fc_dentry->fcd_dilist);
318 
319 	WARN_ON(!list_empty(&ei->i_fc_dilist));
320 	spin_unlock(&sbi->s_fc_lock);
321 
322 	if (fc_dentry->fcd_name.name &&
323 		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
324 		kfree(fc_dentry->fcd_name.name);
325 	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
326 
327 	return;
328 }
329 
330 /*
331  * Mark file system as fast commit ineligible, and record latest
332  * ineligible transaction tid. This means until the recorded
333  * transaction, commit operation would result in a full jbd2 commit.
334  */
335 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
336 {
337 	struct ext4_sb_info *sbi = EXT4_SB(sb);
338 	tid_t tid;
339 
340 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
341 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
342 		return;
343 
344 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
345 	if (handle && !IS_ERR(handle))
346 		tid = handle->h_transaction->t_tid;
347 	else {
348 		read_lock(&sbi->s_journal->j_state_lock);
349 		tid = sbi->s_journal->j_running_transaction ?
350 				sbi->s_journal->j_running_transaction->t_tid : 0;
351 		read_unlock(&sbi->s_journal->j_state_lock);
352 	}
353 	spin_lock(&sbi->s_fc_lock);
354 	if (sbi->s_fc_ineligible_tid < tid)
355 		sbi->s_fc_ineligible_tid = tid;
356 	spin_unlock(&sbi->s_fc_lock);
357 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
358 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
359 }
360 
361 /*
362  * Generic fast commit tracking function. If this is the first time this we are
363  * called after a full commit, we initialize fast commit fields and then call
364  * __fc_track_fn() with update = 0. If we have already been called after a full
365  * commit, we pass update = 1. Based on that, the track function can determine
366  * if it needs to track a field for the first time or if it needs to just
367  * update the previously tracked value.
368  *
369  * If enqueue is set, this function enqueues the inode in fast commit list.
370  */
371 static int ext4_fc_track_template(
372 	handle_t *handle, struct inode *inode,
373 	int (*__fc_track_fn)(struct inode *, void *, bool),
374 	void *args, int enqueue)
375 {
376 	bool update = false;
377 	struct ext4_inode_info *ei = EXT4_I(inode);
378 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
379 	tid_t tid = 0;
380 	int ret;
381 
382 	tid = handle->h_transaction->t_tid;
383 	mutex_lock(&ei->i_fc_lock);
384 	if (tid == ei->i_sync_tid) {
385 		update = true;
386 	} else {
387 		ext4_fc_reset_inode(inode);
388 		ei->i_sync_tid = tid;
389 	}
390 	ret = __fc_track_fn(inode, args, update);
391 	mutex_unlock(&ei->i_fc_lock);
392 
393 	if (!enqueue)
394 		return ret;
395 
396 	spin_lock(&sbi->s_fc_lock);
397 	if (list_empty(&EXT4_I(inode)->i_fc_list))
398 		list_add_tail(&EXT4_I(inode)->i_fc_list,
399 				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
400 				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
401 				&sbi->s_fc_q[FC_Q_STAGING] :
402 				&sbi->s_fc_q[FC_Q_MAIN]);
403 	spin_unlock(&sbi->s_fc_lock);
404 
405 	return ret;
406 }
407 
408 struct __track_dentry_update_args {
409 	struct dentry *dentry;
410 	int op;
411 };
412 
413 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
414 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
415 {
416 	struct ext4_fc_dentry_update *node;
417 	struct ext4_inode_info *ei = EXT4_I(inode);
418 	struct __track_dentry_update_args *dentry_update =
419 		(struct __track_dentry_update_args *)arg;
420 	struct dentry *dentry = dentry_update->dentry;
421 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
422 
423 	mutex_unlock(&ei->i_fc_lock);
424 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
425 	if (!node) {
426 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
427 		mutex_lock(&ei->i_fc_lock);
428 		return -ENOMEM;
429 	}
430 
431 	node->fcd_op = dentry_update->op;
432 	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
433 	node->fcd_ino = inode->i_ino;
434 	if (dentry->d_name.len > DNAME_INLINE_LEN) {
435 		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
436 		if (!node->fcd_name.name) {
437 			kmem_cache_free(ext4_fc_dentry_cachep, node);
438 			ext4_fc_mark_ineligible(inode->i_sb,
439 				EXT4_FC_REASON_NOMEM, NULL);
440 			mutex_lock(&ei->i_fc_lock);
441 			return -ENOMEM;
442 		}
443 		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
444 			dentry->d_name.len);
445 	} else {
446 		memcpy(node->fcd_iname, dentry->d_name.name,
447 			dentry->d_name.len);
448 		node->fcd_name.name = node->fcd_iname;
449 	}
450 	node->fcd_name.len = dentry->d_name.len;
451 	INIT_LIST_HEAD(&node->fcd_dilist);
452 	spin_lock(&sbi->s_fc_lock);
453 	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
454 		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
455 		list_add_tail(&node->fcd_list,
456 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
457 	else
458 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
459 
460 	/*
461 	 * This helps us keep a track of all fc_dentry updates which is part of
462 	 * this ext4 inode. So in case the inode is getting unlinked, before
463 	 * even we get a chance to fsync, we could remove all fc_dentry
464 	 * references while evicting the inode in ext4_fc_del().
465 	 * Also with this, we don't need to loop over all the inodes in
466 	 * sbi->s_fc_q to get the corresponding inode in
467 	 * ext4_fc_commit_dentry_updates().
468 	 */
469 	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
470 		WARN_ON(!list_empty(&ei->i_fc_dilist));
471 		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
472 	}
473 	spin_unlock(&sbi->s_fc_lock);
474 	mutex_lock(&ei->i_fc_lock);
475 
476 	return 0;
477 }
478 
479 void __ext4_fc_track_unlink(handle_t *handle,
480 		struct inode *inode, struct dentry *dentry)
481 {
482 	struct __track_dentry_update_args args;
483 	int ret;
484 
485 	args.dentry = dentry;
486 	args.op = EXT4_FC_TAG_UNLINK;
487 
488 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
489 					(void *)&args, 0);
490 	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
491 }
492 
493 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
494 {
495 	struct inode *inode = d_inode(dentry);
496 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
497 
498 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
499 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
500 		return;
501 
502 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
503 		return;
504 
505 	__ext4_fc_track_unlink(handle, inode, dentry);
506 }
507 
508 void __ext4_fc_track_link(handle_t *handle,
509 	struct inode *inode, struct dentry *dentry)
510 {
511 	struct __track_dentry_update_args args;
512 	int ret;
513 
514 	args.dentry = dentry;
515 	args.op = EXT4_FC_TAG_LINK;
516 
517 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
518 					(void *)&args, 0);
519 	trace_ext4_fc_track_link(handle, inode, dentry, ret);
520 }
521 
522 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
523 {
524 	struct inode *inode = d_inode(dentry);
525 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
526 
527 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
528 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
529 		return;
530 
531 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
532 		return;
533 
534 	__ext4_fc_track_link(handle, inode, dentry);
535 }
536 
537 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
538 			  struct dentry *dentry)
539 {
540 	struct __track_dentry_update_args args;
541 	int ret;
542 
543 	args.dentry = dentry;
544 	args.op = EXT4_FC_TAG_CREAT;
545 
546 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
547 					(void *)&args, 0);
548 	trace_ext4_fc_track_create(handle, inode, dentry, ret);
549 }
550 
551 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
552 {
553 	struct inode *inode = d_inode(dentry);
554 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
555 
556 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
557 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
558 		return;
559 
560 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
561 		return;
562 
563 	__ext4_fc_track_create(handle, inode, dentry);
564 }
565 
566 /* __track_fn for inode tracking */
567 static int __track_inode(struct inode *inode, void *arg, bool update)
568 {
569 	if (update)
570 		return -EEXIST;
571 
572 	EXT4_I(inode)->i_fc_lblk_len = 0;
573 
574 	return 0;
575 }
576 
577 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
578 {
579 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
580 	int ret;
581 
582 	if (S_ISDIR(inode->i_mode))
583 		return;
584 
585 	if (ext4_should_journal_data(inode)) {
586 		ext4_fc_mark_ineligible(inode->i_sb,
587 					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
588 		return;
589 	}
590 
591 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
592 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
593 		return;
594 
595 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
596 		return;
597 
598 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
599 	trace_ext4_fc_track_inode(handle, inode, ret);
600 }
601 
602 struct __track_range_args {
603 	ext4_lblk_t start, end;
604 };
605 
606 /* __track_fn for tracking data updates */
607 static int __track_range(struct inode *inode, void *arg, bool update)
608 {
609 	struct ext4_inode_info *ei = EXT4_I(inode);
610 	ext4_lblk_t oldstart;
611 	struct __track_range_args *__arg =
612 		(struct __track_range_args *)arg;
613 
614 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
615 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
616 		return -ECANCELED;
617 	}
618 
619 	oldstart = ei->i_fc_lblk_start;
620 
621 	if (update && ei->i_fc_lblk_len > 0) {
622 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
623 		ei->i_fc_lblk_len =
624 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
625 				ei->i_fc_lblk_start + 1;
626 	} else {
627 		ei->i_fc_lblk_start = __arg->start;
628 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
629 	}
630 
631 	return 0;
632 }
633 
634 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
635 			 ext4_lblk_t end)
636 {
637 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
638 	struct __track_range_args args;
639 	int ret;
640 
641 	if (S_ISDIR(inode->i_mode))
642 		return;
643 
644 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
645 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
646 		return;
647 
648 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
649 		return;
650 
651 	args.start = start;
652 	args.end = end;
653 
654 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
655 
656 	trace_ext4_fc_track_range(handle, inode, start, end, ret);
657 }
658 
659 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
660 {
661 	blk_opf_t write_flags = REQ_SYNC;
662 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
663 
664 	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
665 	if (test_opt(sb, BARRIER) && is_tail)
666 		write_flags |= REQ_FUA | REQ_PREFLUSH;
667 	lock_buffer(bh);
668 	set_buffer_dirty(bh);
669 	set_buffer_uptodate(bh);
670 	bh->b_end_io = ext4_end_buffer_io_sync;
671 	submit_bh(REQ_OP_WRITE | write_flags, bh);
672 	EXT4_SB(sb)->s_fc_bh = NULL;
673 }
674 
675 /* Ext4 commit path routines */
676 
677 /* memzero and update CRC */
678 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
679 				u32 *crc)
680 {
681 	void *ret;
682 
683 	ret = memset(dst, 0, len);
684 	if (crc)
685 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
686 	return ret;
687 }
688 
689 /*
690  * Allocate len bytes on a fast commit buffer.
691  *
692  * During the commit time this function is used to manage fast commit
693  * block space. We don't split a fast commit log onto different
694  * blocks. So this function makes sure that if there's not enough space
695  * on the current block, the remaining space in the current block is
696  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
697  * new block is from jbd2 and CRC is updated to reflect the padding
698  * we added.
699  */
700 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
701 {
702 	struct ext4_fc_tl *tl;
703 	struct ext4_sb_info *sbi = EXT4_SB(sb);
704 	struct buffer_head *bh;
705 	int bsize = sbi->s_journal->j_blocksize;
706 	int ret, off = sbi->s_fc_bytes % bsize;
707 	int pad_len;
708 
709 	/*
710 	 * After allocating len, we should have space at least for a 0 byte
711 	 * padding.
712 	 */
713 	if (len + sizeof(struct ext4_fc_tl) > bsize)
714 		return NULL;
715 
716 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
717 		/*
718 		 * Only allocate from current buffer if we have enough space for
719 		 * this request AND we have space to add a zero byte padding.
720 		 */
721 		if (!sbi->s_fc_bh) {
722 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
723 			if (ret)
724 				return NULL;
725 			sbi->s_fc_bh = bh;
726 		}
727 		sbi->s_fc_bytes += len;
728 		return sbi->s_fc_bh->b_data + off;
729 	}
730 	/* Need to add PAD tag */
731 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
732 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
733 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
734 	tl->fc_len = cpu_to_le16(pad_len);
735 	if (crc)
736 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
737 	if (pad_len > 0)
738 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
739 	ext4_fc_submit_bh(sb, false);
740 
741 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
742 	if (ret)
743 		return NULL;
744 	sbi->s_fc_bh = bh;
745 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
746 	return sbi->s_fc_bh->b_data;
747 }
748 
749 /* memcpy to fc reserved space and update CRC */
750 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
751 				int len, u32 *crc)
752 {
753 	if (crc)
754 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
755 	return memcpy(dst, src, len);
756 }
757 
758 /*
759  * Complete a fast commit by writing tail tag.
760  *
761  * Writing tail tag marks the end of a fast commit. In order to guarantee
762  * atomicity, after writing tail tag, even if there's space remaining
763  * in the block, next commit shouldn't use it. That's why tail tag
764  * has the length as that of the remaining space on the block.
765  */
766 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
767 {
768 	struct ext4_sb_info *sbi = EXT4_SB(sb);
769 	struct ext4_fc_tl tl;
770 	struct ext4_fc_tail tail;
771 	int off, bsize = sbi->s_journal->j_blocksize;
772 	u8 *dst;
773 
774 	/*
775 	 * ext4_fc_reserve_space takes care of allocating an extra block if
776 	 * there's no enough space on this block for accommodating this tail.
777 	 */
778 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
779 	if (!dst)
780 		return -ENOSPC;
781 
782 	off = sbi->s_fc_bytes % bsize;
783 
784 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
785 	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
786 	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
787 
788 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
789 	dst += sizeof(tl);
790 	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
791 	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
792 	dst += sizeof(tail.fc_tid);
793 	tail.fc_crc = cpu_to_le32(crc);
794 	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
795 
796 	ext4_fc_submit_bh(sb, true);
797 
798 	return 0;
799 }
800 
801 /*
802  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
803  * Returns false if there's not enough space.
804  */
805 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
806 			   u32 *crc)
807 {
808 	struct ext4_fc_tl tl;
809 	u8 *dst;
810 
811 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
812 	if (!dst)
813 		return false;
814 
815 	tl.fc_tag = cpu_to_le16(tag);
816 	tl.fc_len = cpu_to_le16(len);
817 
818 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
819 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
820 
821 	return true;
822 }
823 
824 /* Same as above, but adds dentry tlv. */
825 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
826 				   struct ext4_fc_dentry_update *fc_dentry)
827 {
828 	struct ext4_fc_dentry_info fcd;
829 	struct ext4_fc_tl tl;
830 	int dlen = fc_dentry->fcd_name.len;
831 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
832 					crc);
833 
834 	if (!dst)
835 		return false;
836 
837 	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
838 	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
839 	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
840 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
841 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
842 	dst += sizeof(tl);
843 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
844 	dst += sizeof(fcd);
845 	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
846 
847 	return true;
848 }
849 
850 /*
851  * Writes inode in the fast commit space under TLV with tag @tag.
852  * Returns 0 on success, error on failure.
853  */
854 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
855 {
856 	struct ext4_inode_info *ei = EXT4_I(inode);
857 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
858 	int ret;
859 	struct ext4_iloc iloc;
860 	struct ext4_fc_inode fc_inode;
861 	struct ext4_fc_tl tl;
862 	u8 *dst;
863 
864 	ret = ext4_get_inode_loc(inode, &iloc);
865 	if (ret)
866 		return ret;
867 
868 	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
869 		inode_len = EXT4_INODE_SIZE(inode->i_sb);
870 	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
871 		inode_len += ei->i_extra_isize;
872 
873 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
874 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
875 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
876 
877 	dst = ext4_fc_reserve_space(inode->i_sb,
878 			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
879 	if (!dst)
880 		return -ECANCELED;
881 
882 	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
883 		return -ECANCELED;
884 	dst += sizeof(tl);
885 	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
886 		return -ECANCELED;
887 	dst += sizeof(fc_inode);
888 	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
889 					inode_len, crc))
890 		return -ECANCELED;
891 
892 	return 0;
893 }
894 
895 /*
896  * Writes updated data ranges for the inode in question. Updates CRC.
897  * Returns 0 on success, error otherwise.
898  */
899 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
900 {
901 	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
902 	struct ext4_inode_info *ei = EXT4_I(inode);
903 	struct ext4_map_blocks map;
904 	struct ext4_fc_add_range fc_ext;
905 	struct ext4_fc_del_range lrange;
906 	struct ext4_extent *ex;
907 	int ret;
908 
909 	mutex_lock(&ei->i_fc_lock);
910 	if (ei->i_fc_lblk_len == 0) {
911 		mutex_unlock(&ei->i_fc_lock);
912 		return 0;
913 	}
914 	old_blk_size = ei->i_fc_lblk_start;
915 	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
916 	ei->i_fc_lblk_len = 0;
917 	mutex_unlock(&ei->i_fc_lock);
918 
919 	cur_lblk_off = old_blk_size;
920 	ext4_debug("will try writing %d to %d for inode %ld\n",
921 		   cur_lblk_off, new_blk_size, inode->i_ino);
922 
923 	while (cur_lblk_off <= new_blk_size) {
924 		map.m_lblk = cur_lblk_off;
925 		map.m_len = new_blk_size - cur_lblk_off + 1;
926 		ret = ext4_map_blocks(NULL, inode, &map, 0);
927 		if (ret < 0)
928 			return -ECANCELED;
929 
930 		if (map.m_len == 0) {
931 			cur_lblk_off++;
932 			continue;
933 		}
934 
935 		if (ret == 0) {
936 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
937 			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
938 			lrange.fc_len = cpu_to_le32(map.m_len);
939 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
940 					    sizeof(lrange), (u8 *)&lrange, crc))
941 				return -ENOSPC;
942 		} else {
943 			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
944 				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
945 
946 			/* Limit the number of blocks in one extent */
947 			map.m_len = min(max, map.m_len);
948 
949 			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
950 			ex = (struct ext4_extent *)&fc_ext.fc_ex;
951 			ex->ee_block = cpu_to_le32(map.m_lblk);
952 			ex->ee_len = cpu_to_le16(map.m_len);
953 			ext4_ext_store_pblock(ex, map.m_pblk);
954 			if (map.m_flags & EXT4_MAP_UNWRITTEN)
955 				ext4_ext_mark_unwritten(ex);
956 			else
957 				ext4_ext_mark_initialized(ex);
958 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
959 					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
960 				return -ENOSPC;
961 		}
962 
963 		cur_lblk_off += map.m_len;
964 	}
965 
966 	return 0;
967 }
968 
969 
970 /* Submit data for all the fast commit inodes */
971 static int ext4_fc_submit_inode_data_all(journal_t *journal)
972 {
973 	struct super_block *sb = journal->j_private;
974 	struct ext4_sb_info *sbi = EXT4_SB(sb);
975 	struct ext4_inode_info *ei;
976 	int ret = 0;
977 
978 	spin_lock(&sbi->s_fc_lock);
979 	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
980 		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
981 		while (atomic_read(&ei->i_fc_updates)) {
982 			DEFINE_WAIT(wait);
983 
984 			prepare_to_wait(&ei->i_fc_wait, &wait,
985 						TASK_UNINTERRUPTIBLE);
986 			if (atomic_read(&ei->i_fc_updates)) {
987 				spin_unlock(&sbi->s_fc_lock);
988 				schedule();
989 				spin_lock(&sbi->s_fc_lock);
990 			}
991 			finish_wait(&ei->i_fc_wait, &wait);
992 		}
993 		spin_unlock(&sbi->s_fc_lock);
994 		ret = jbd2_submit_inode_data(ei->jinode);
995 		if (ret)
996 			return ret;
997 		spin_lock(&sbi->s_fc_lock);
998 	}
999 	spin_unlock(&sbi->s_fc_lock);
1000 
1001 	return ret;
1002 }
1003 
1004 /* Wait for completion of data for all the fast commit inodes */
1005 static int ext4_fc_wait_inode_data_all(journal_t *journal)
1006 {
1007 	struct super_block *sb = journal->j_private;
1008 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1009 	struct ext4_inode_info *pos, *n;
1010 	int ret = 0;
1011 
1012 	spin_lock(&sbi->s_fc_lock);
1013 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1014 		if (!ext4_test_inode_state(&pos->vfs_inode,
1015 					   EXT4_STATE_FC_COMMITTING))
1016 			continue;
1017 		spin_unlock(&sbi->s_fc_lock);
1018 
1019 		ret = jbd2_wait_inode_data(journal, pos->jinode);
1020 		if (ret)
1021 			return ret;
1022 		spin_lock(&sbi->s_fc_lock);
1023 	}
1024 	spin_unlock(&sbi->s_fc_lock);
1025 
1026 	return 0;
1027 }
1028 
1029 /* Commit all the directory entry updates */
1030 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
1031 __acquires(&sbi->s_fc_lock)
1032 __releases(&sbi->s_fc_lock)
1033 {
1034 	struct super_block *sb = journal->j_private;
1035 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1036 	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
1037 	struct inode *inode;
1038 	struct ext4_inode_info *ei;
1039 	int ret;
1040 
1041 	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1042 		return 0;
1043 	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1044 				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1045 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1046 			spin_unlock(&sbi->s_fc_lock);
1047 			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1048 				ret = -ENOSPC;
1049 				goto lock_and_exit;
1050 			}
1051 			spin_lock(&sbi->s_fc_lock);
1052 			continue;
1053 		}
1054 		/*
1055 		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1056 		 * corresponding inode pointer
1057 		 */
1058 		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
1059 		ei = list_first_entry(&fc_dentry->fcd_dilist,
1060 				struct ext4_inode_info, i_fc_dilist);
1061 		inode = &ei->vfs_inode;
1062 		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1063 
1064 		spin_unlock(&sbi->s_fc_lock);
1065 
1066 		/*
1067 		 * We first write the inode and then the create dirent. This
1068 		 * allows the recovery code to create an unnamed inode first
1069 		 * and then link it to a directory entry. This allows us
1070 		 * to use namei.c routines almost as is and simplifies
1071 		 * the recovery code.
1072 		 */
1073 		ret = ext4_fc_write_inode(inode, crc);
1074 		if (ret)
1075 			goto lock_and_exit;
1076 
1077 		ret = ext4_fc_write_inode_data(inode, crc);
1078 		if (ret)
1079 			goto lock_and_exit;
1080 
1081 		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1082 			ret = -ENOSPC;
1083 			goto lock_and_exit;
1084 		}
1085 
1086 		spin_lock(&sbi->s_fc_lock);
1087 	}
1088 	return 0;
1089 lock_and_exit:
1090 	spin_lock(&sbi->s_fc_lock);
1091 	return ret;
1092 }
1093 
1094 static int ext4_fc_perform_commit(journal_t *journal)
1095 {
1096 	struct super_block *sb = journal->j_private;
1097 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1098 	struct ext4_inode_info *iter;
1099 	struct ext4_fc_head head;
1100 	struct inode *inode;
1101 	struct blk_plug plug;
1102 	int ret = 0;
1103 	u32 crc = 0;
1104 
1105 	ret = ext4_fc_submit_inode_data_all(journal);
1106 	if (ret)
1107 		return ret;
1108 
1109 	ret = ext4_fc_wait_inode_data_all(journal);
1110 	if (ret)
1111 		return ret;
1112 
1113 	/*
1114 	 * If file system device is different from journal device, issue a cache
1115 	 * flush before we start writing fast commit blocks.
1116 	 */
1117 	if (journal->j_fs_dev != journal->j_dev)
1118 		blkdev_issue_flush(journal->j_fs_dev);
1119 
1120 	blk_start_plug(&plug);
1121 	if (sbi->s_fc_bytes == 0) {
1122 		/*
1123 		 * Add a head tag only if this is the first fast commit
1124 		 * in this TID.
1125 		 */
1126 		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1127 		head.fc_tid = cpu_to_le32(
1128 			sbi->s_journal->j_running_transaction->t_tid);
1129 		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1130 			(u8 *)&head, &crc)) {
1131 			ret = -ENOSPC;
1132 			goto out;
1133 		}
1134 	}
1135 
1136 	spin_lock(&sbi->s_fc_lock);
1137 	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1138 	if (ret) {
1139 		spin_unlock(&sbi->s_fc_lock);
1140 		goto out;
1141 	}
1142 
1143 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1144 		inode = &iter->vfs_inode;
1145 		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1146 			continue;
1147 
1148 		spin_unlock(&sbi->s_fc_lock);
1149 		ret = ext4_fc_write_inode_data(inode, &crc);
1150 		if (ret)
1151 			goto out;
1152 		ret = ext4_fc_write_inode(inode, &crc);
1153 		if (ret)
1154 			goto out;
1155 		spin_lock(&sbi->s_fc_lock);
1156 	}
1157 	spin_unlock(&sbi->s_fc_lock);
1158 
1159 	ret = ext4_fc_write_tail(sb, crc);
1160 
1161 out:
1162 	blk_finish_plug(&plug);
1163 	return ret;
1164 }
1165 
1166 static void ext4_fc_update_stats(struct super_block *sb, int status,
1167 				 u64 commit_time, int nblks, tid_t commit_tid)
1168 {
1169 	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1170 
1171 	ext4_debug("Fast commit ended with status = %d for tid %u",
1172 			status, commit_tid);
1173 	if (status == EXT4_FC_STATUS_OK) {
1174 		stats->fc_num_commits++;
1175 		stats->fc_numblks += nblks;
1176 		if (likely(stats->s_fc_avg_commit_time))
1177 			stats->s_fc_avg_commit_time =
1178 				(commit_time +
1179 				 stats->s_fc_avg_commit_time * 3) / 4;
1180 		else
1181 			stats->s_fc_avg_commit_time = commit_time;
1182 	} else if (status == EXT4_FC_STATUS_FAILED ||
1183 		   status == EXT4_FC_STATUS_INELIGIBLE) {
1184 		if (status == EXT4_FC_STATUS_FAILED)
1185 			stats->fc_failed_commits++;
1186 		stats->fc_ineligible_commits++;
1187 	} else {
1188 		stats->fc_skipped_commits++;
1189 	}
1190 	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1191 }
1192 
1193 /*
1194  * The main commit entry point. Performs a fast commit for transaction
1195  * commit_tid if needed. If it's not possible to perform a fast commit
1196  * due to various reasons, we fall back to full commit. Returns 0
1197  * on success, error otherwise.
1198  */
1199 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1200 {
1201 	struct super_block *sb = journal->j_private;
1202 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1203 	int nblks = 0, ret, bsize = journal->j_blocksize;
1204 	int subtid = atomic_read(&sbi->s_fc_subtid);
1205 	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1206 	ktime_t start_time, commit_time;
1207 
1208 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1209 		return jbd2_complete_transaction(journal, commit_tid);
1210 
1211 	trace_ext4_fc_commit_start(sb, commit_tid);
1212 
1213 	start_time = ktime_get();
1214 
1215 restart_fc:
1216 	ret = jbd2_fc_begin_commit(journal, commit_tid);
1217 	if (ret == -EALREADY) {
1218 		/* There was an ongoing commit, check if we need to restart */
1219 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1220 			commit_tid > journal->j_commit_sequence)
1221 			goto restart_fc;
1222 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1223 				commit_tid);
1224 		return 0;
1225 	} else if (ret) {
1226 		/*
1227 		 * Commit couldn't start. Just update stats and perform a
1228 		 * full commit.
1229 		 */
1230 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1231 				commit_tid);
1232 		return jbd2_complete_transaction(journal, commit_tid);
1233 	}
1234 
1235 	/*
1236 	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
1237 	 * if we are fast commit ineligible.
1238 	 */
1239 	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1240 		status = EXT4_FC_STATUS_INELIGIBLE;
1241 		goto fallback;
1242 	}
1243 
1244 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1245 	ret = ext4_fc_perform_commit(journal);
1246 	if (ret < 0) {
1247 		status = EXT4_FC_STATUS_FAILED;
1248 		goto fallback;
1249 	}
1250 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1251 	ret = jbd2_fc_wait_bufs(journal, nblks);
1252 	if (ret < 0) {
1253 		status = EXT4_FC_STATUS_FAILED;
1254 		goto fallback;
1255 	}
1256 	atomic_inc(&sbi->s_fc_subtid);
1257 	ret = jbd2_fc_end_commit(journal);
1258 	/*
1259 	 * weight the commit time higher than the average time so we
1260 	 * don't react too strongly to vast changes in the commit time
1261 	 */
1262 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1263 	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1264 	return ret;
1265 
1266 fallback:
1267 	ret = jbd2_fc_end_commit_fallback(journal);
1268 	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1269 	return ret;
1270 }
1271 
1272 /*
1273  * Fast commit cleanup routine. This is called after every fast commit and
1274  * full commit. full is true if we are called after a full commit.
1275  */
1276 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1277 {
1278 	struct super_block *sb = journal->j_private;
1279 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1280 	struct ext4_inode_info *iter, *iter_n;
1281 	struct ext4_fc_dentry_update *fc_dentry;
1282 
1283 	if (full && sbi->s_fc_bh)
1284 		sbi->s_fc_bh = NULL;
1285 
1286 	trace_ext4_fc_cleanup(journal, full, tid);
1287 	jbd2_fc_release_bufs(journal);
1288 
1289 	spin_lock(&sbi->s_fc_lock);
1290 	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1291 				 i_fc_list) {
1292 		list_del_init(&iter->i_fc_list);
1293 		ext4_clear_inode_state(&iter->vfs_inode,
1294 				       EXT4_STATE_FC_COMMITTING);
1295 		if (iter->i_sync_tid <= tid)
1296 			ext4_fc_reset_inode(&iter->vfs_inode);
1297 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1298 		smp_mb();
1299 #if (BITS_PER_LONG < 64)
1300 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1301 #else
1302 		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1303 #endif
1304 	}
1305 
1306 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1307 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1308 					     struct ext4_fc_dentry_update,
1309 					     fcd_list);
1310 		list_del_init(&fc_dentry->fcd_list);
1311 		list_del_init(&fc_dentry->fcd_dilist);
1312 		spin_unlock(&sbi->s_fc_lock);
1313 
1314 		if (fc_dentry->fcd_name.name &&
1315 			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1316 			kfree(fc_dentry->fcd_name.name);
1317 		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1318 		spin_lock(&sbi->s_fc_lock);
1319 	}
1320 
1321 	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1322 				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1323 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1324 				&sbi->s_fc_q[FC_Q_MAIN]);
1325 
1326 	if (tid >= sbi->s_fc_ineligible_tid) {
1327 		sbi->s_fc_ineligible_tid = 0;
1328 		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1329 	}
1330 
1331 	if (full)
1332 		sbi->s_fc_bytes = 0;
1333 	spin_unlock(&sbi->s_fc_lock);
1334 	trace_ext4_fc_stats(sb);
1335 }
1336 
1337 /* Ext4 Replay Path Routines */
1338 
1339 /* Helper struct for dentry replay routines */
1340 struct dentry_info_args {
1341 	int parent_ino, dname_len, ino, inode_len;
1342 	char *dname;
1343 };
1344 
1345 static inline void tl_to_darg(struct dentry_info_args *darg,
1346 			      struct  ext4_fc_tl *tl, u8 *val)
1347 {
1348 	struct ext4_fc_dentry_info fcd;
1349 
1350 	memcpy(&fcd, val, sizeof(fcd));
1351 
1352 	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1353 	darg->ino = le32_to_cpu(fcd.fc_ino);
1354 	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1355 	darg->dname_len = le16_to_cpu(tl->fc_len) -
1356 		sizeof(struct ext4_fc_dentry_info);
1357 }
1358 
1359 /* Unlink replay function */
1360 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1361 				 u8 *val)
1362 {
1363 	struct inode *inode, *old_parent;
1364 	struct qstr entry;
1365 	struct dentry_info_args darg;
1366 	int ret = 0;
1367 
1368 	tl_to_darg(&darg, tl, val);
1369 
1370 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1371 			darg.parent_ino, darg.dname_len);
1372 
1373 	entry.name = darg.dname;
1374 	entry.len = darg.dname_len;
1375 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1376 
1377 	if (IS_ERR(inode)) {
1378 		ext4_debug("Inode %d not found", darg.ino);
1379 		return 0;
1380 	}
1381 
1382 	old_parent = ext4_iget(sb, darg.parent_ino,
1383 				EXT4_IGET_NORMAL);
1384 	if (IS_ERR(old_parent)) {
1385 		ext4_debug("Dir with inode %d not found", darg.parent_ino);
1386 		iput(inode);
1387 		return 0;
1388 	}
1389 
1390 	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1391 	/* -ENOENT ok coz it might not exist anymore. */
1392 	if (ret == -ENOENT)
1393 		ret = 0;
1394 	iput(old_parent);
1395 	iput(inode);
1396 	return ret;
1397 }
1398 
1399 static int ext4_fc_replay_link_internal(struct super_block *sb,
1400 				struct dentry_info_args *darg,
1401 				struct inode *inode)
1402 {
1403 	struct inode *dir = NULL;
1404 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1405 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1406 	int ret = 0;
1407 
1408 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1409 	if (IS_ERR(dir)) {
1410 		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1411 		dir = NULL;
1412 		goto out;
1413 	}
1414 
1415 	dentry_dir = d_obtain_alias(dir);
1416 	if (IS_ERR(dentry_dir)) {
1417 		ext4_debug("Failed to obtain dentry");
1418 		dentry_dir = NULL;
1419 		goto out;
1420 	}
1421 
1422 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1423 	if (!dentry_inode) {
1424 		ext4_debug("Inode dentry not created.");
1425 		ret = -ENOMEM;
1426 		goto out;
1427 	}
1428 
1429 	ret = __ext4_link(dir, inode, dentry_inode);
1430 	/*
1431 	 * It's possible that link already existed since data blocks
1432 	 * for the dir in question got persisted before we crashed OR
1433 	 * we replayed this tag and crashed before the entire replay
1434 	 * could complete.
1435 	 */
1436 	if (ret && ret != -EEXIST) {
1437 		ext4_debug("Failed to link\n");
1438 		goto out;
1439 	}
1440 
1441 	ret = 0;
1442 out:
1443 	if (dentry_dir) {
1444 		d_drop(dentry_dir);
1445 		dput(dentry_dir);
1446 	} else if (dir) {
1447 		iput(dir);
1448 	}
1449 	if (dentry_inode) {
1450 		d_drop(dentry_inode);
1451 		dput(dentry_inode);
1452 	}
1453 
1454 	return ret;
1455 }
1456 
1457 /* Link replay function */
1458 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1459 			       u8 *val)
1460 {
1461 	struct inode *inode;
1462 	struct dentry_info_args darg;
1463 	int ret = 0;
1464 
1465 	tl_to_darg(&darg, tl, val);
1466 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1467 			darg.parent_ino, darg.dname_len);
1468 
1469 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1470 	if (IS_ERR(inode)) {
1471 		ext4_debug("Inode not found.");
1472 		return 0;
1473 	}
1474 
1475 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1476 	iput(inode);
1477 	return ret;
1478 }
1479 
1480 /*
1481  * Record all the modified inodes during replay. We use this later to setup
1482  * block bitmaps correctly.
1483  */
1484 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1485 {
1486 	struct ext4_fc_replay_state *state;
1487 	int i;
1488 
1489 	state = &EXT4_SB(sb)->s_fc_replay_state;
1490 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1491 		if (state->fc_modified_inodes[i] == ino)
1492 			return 0;
1493 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1494 		state->fc_modified_inodes = krealloc(
1495 				state->fc_modified_inodes,
1496 				sizeof(int) * (state->fc_modified_inodes_size +
1497 				EXT4_FC_REPLAY_REALLOC_INCREMENT),
1498 				GFP_KERNEL);
1499 		if (!state->fc_modified_inodes)
1500 			return -ENOMEM;
1501 		state->fc_modified_inodes_size +=
1502 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1503 	}
1504 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1505 	return 0;
1506 }
1507 
1508 /*
1509  * Inode replay function
1510  */
1511 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1512 				u8 *val)
1513 {
1514 	struct ext4_fc_inode fc_inode;
1515 	struct ext4_inode *raw_inode;
1516 	struct ext4_inode *raw_fc_inode;
1517 	struct inode *inode = NULL;
1518 	struct ext4_iloc iloc;
1519 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1520 	struct ext4_extent_header *eh;
1521 
1522 	memcpy(&fc_inode, val, sizeof(fc_inode));
1523 
1524 	ino = le32_to_cpu(fc_inode.fc_ino);
1525 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1526 
1527 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1528 	if (!IS_ERR(inode)) {
1529 		ext4_ext_clear_bb(inode);
1530 		iput(inode);
1531 	}
1532 	inode = NULL;
1533 
1534 	ret = ext4_fc_record_modified_inode(sb, ino);
1535 	if (ret)
1536 		goto out;
1537 
1538 	raw_fc_inode = (struct ext4_inode *)
1539 		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1540 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1541 	if (ret)
1542 		goto out;
1543 
1544 	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1545 	raw_inode = ext4_raw_inode(&iloc);
1546 
1547 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1548 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1549 		inode_len - offsetof(struct ext4_inode, i_generation));
1550 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1551 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1552 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1553 			memset(eh, 0, sizeof(*eh));
1554 			eh->eh_magic = EXT4_EXT_MAGIC;
1555 			eh->eh_max = cpu_to_le16(
1556 				(sizeof(raw_inode->i_block) -
1557 				 sizeof(struct ext4_extent_header))
1558 				 / sizeof(struct ext4_extent));
1559 		}
1560 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1561 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1562 			sizeof(raw_inode->i_block));
1563 	}
1564 
1565 	/* Immediately update the inode on disk. */
1566 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1567 	if (ret)
1568 		goto out;
1569 	ret = sync_dirty_buffer(iloc.bh);
1570 	if (ret)
1571 		goto out;
1572 	ret = ext4_mark_inode_used(sb, ino);
1573 	if (ret)
1574 		goto out;
1575 
1576 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1577 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1578 	if (IS_ERR(inode)) {
1579 		ext4_debug("Inode not found.");
1580 		return -EFSCORRUPTED;
1581 	}
1582 
1583 	/*
1584 	 * Our allocator could have made different decisions than before
1585 	 * crashing. This should be fixed but until then, we calculate
1586 	 * the number of blocks the inode.
1587 	 */
1588 	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1589 		ext4_ext_replay_set_iblocks(inode);
1590 
1591 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1592 	ext4_reset_inode_seed(inode);
1593 
1594 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1595 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1596 	sync_dirty_buffer(iloc.bh);
1597 	brelse(iloc.bh);
1598 out:
1599 	iput(inode);
1600 	if (!ret)
1601 		blkdev_issue_flush(sb->s_bdev);
1602 
1603 	return 0;
1604 }
1605 
1606 /*
1607  * Dentry create replay function.
1608  *
1609  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1610  * inode for which we are trying to create a dentry here, should already have
1611  * been replayed before we start here.
1612  */
1613 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1614 				 u8 *val)
1615 {
1616 	int ret = 0;
1617 	struct inode *inode = NULL;
1618 	struct inode *dir = NULL;
1619 	struct dentry_info_args darg;
1620 
1621 	tl_to_darg(&darg, tl, val);
1622 
1623 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1624 			darg.parent_ino, darg.dname_len);
1625 
1626 	/* This takes care of update group descriptor and other metadata */
1627 	ret = ext4_mark_inode_used(sb, darg.ino);
1628 	if (ret)
1629 		goto out;
1630 
1631 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1632 	if (IS_ERR(inode)) {
1633 		ext4_debug("inode %d not found.", darg.ino);
1634 		inode = NULL;
1635 		ret = -EINVAL;
1636 		goto out;
1637 	}
1638 
1639 	if (S_ISDIR(inode->i_mode)) {
1640 		/*
1641 		 * If we are creating a directory, we need to make sure that the
1642 		 * dot and dot dot dirents are setup properly.
1643 		 */
1644 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1645 		if (IS_ERR(dir)) {
1646 			ext4_debug("Dir %d not found.", darg.ino);
1647 			goto out;
1648 		}
1649 		ret = ext4_init_new_dir(NULL, dir, inode);
1650 		iput(dir);
1651 		if (ret) {
1652 			ret = 0;
1653 			goto out;
1654 		}
1655 	}
1656 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1657 	if (ret)
1658 		goto out;
1659 	set_nlink(inode, 1);
1660 	ext4_mark_inode_dirty(NULL, inode);
1661 out:
1662 	iput(inode);
1663 	return ret;
1664 }
1665 
1666 /*
1667  * Record physical disk regions which are in use as per fast commit area,
1668  * and used by inodes during replay phase. Our simple replay phase
1669  * allocator excludes these regions from allocation.
1670  */
1671 int ext4_fc_record_regions(struct super_block *sb, int ino,
1672 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1673 {
1674 	struct ext4_fc_replay_state *state;
1675 	struct ext4_fc_alloc_region *region;
1676 
1677 	state = &EXT4_SB(sb)->s_fc_replay_state;
1678 	/*
1679 	 * during replay phase, the fc_regions_valid may not same as
1680 	 * fc_regions_used, update it when do new additions.
1681 	 */
1682 	if (replay && state->fc_regions_used != state->fc_regions_valid)
1683 		state->fc_regions_used = state->fc_regions_valid;
1684 	if (state->fc_regions_used == state->fc_regions_size) {
1685 		state->fc_regions_size +=
1686 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1687 		state->fc_regions = krealloc(
1688 					state->fc_regions,
1689 					state->fc_regions_size *
1690 					sizeof(struct ext4_fc_alloc_region),
1691 					GFP_KERNEL);
1692 		if (!state->fc_regions)
1693 			return -ENOMEM;
1694 	}
1695 	region = &state->fc_regions[state->fc_regions_used++];
1696 	region->ino = ino;
1697 	region->lblk = lblk;
1698 	region->pblk = pblk;
1699 	region->len = len;
1700 
1701 	if (replay)
1702 		state->fc_regions_valid++;
1703 
1704 	return 0;
1705 }
1706 
1707 /* Replay add range tag */
1708 static int ext4_fc_replay_add_range(struct super_block *sb,
1709 				    struct ext4_fc_tl *tl, u8 *val)
1710 {
1711 	struct ext4_fc_add_range fc_add_ex;
1712 	struct ext4_extent newex, *ex;
1713 	struct inode *inode;
1714 	ext4_lblk_t start, cur;
1715 	int remaining, len;
1716 	ext4_fsblk_t start_pblk;
1717 	struct ext4_map_blocks map;
1718 	struct ext4_ext_path *path = NULL;
1719 	int ret;
1720 
1721 	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1722 	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1723 
1724 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1725 		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1726 		ext4_ext_get_actual_len(ex));
1727 
1728 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1729 	if (IS_ERR(inode)) {
1730 		ext4_debug("Inode not found.");
1731 		return 0;
1732 	}
1733 
1734 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1735 	if (ret)
1736 		goto out;
1737 
1738 	start = le32_to_cpu(ex->ee_block);
1739 	start_pblk = ext4_ext_pblock(ex);
1740 	len = ext4_ext_get_actual_len(ex);
1741 
1742 	cur = start;
1743 	remaining = len;
1744 	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1745 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1746 		  inode->i_ino);
1747 
1748 	while (remaining > 0) {
1749 		map.m_lblk = cur;
1750 		map.m_len = remaining;
1751 		map.m_pblk = 0;
1752 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1753 
1754 		if (ret < 0)
1755 			goto out;
1756 
1757 		if (ret == 0) {
1758 			/* Range is not mapped */
1759 			path = ext4_find_extent(inode, cur, NULL, 0);
1760 			if (IS_ERR(path))
1761 				goto out;
1762 			memset(&newex, 0, sizeof(newex));
1763 			newex.ee_block = cpu_to_le32(cur);
1764 			ext4_ext_store_pblock(
1765 				&newex, start_pblk + cur - start);
1766 			newex.ee_len = cpu_to_le16(map.m_len);
1767 			if (ext4_ext_is_unwritten(ex))
1768 				ext4_ext_mark_unwritten(&newex);
1769 			down_write(&EXT4_I(inode)->i_data_sem);
1770 			ret = ext4_ext_insert_extent(
1771 				NULL, inode, &path, &newex, 0);
1772 			up_write((&EXT4_I(inode)->i_data_sem));
1773 			ext4_ext_drop_refs(path);
1774 			kfree(path);
1775 			if (ret)
1776 				goto out;
1777 			goto next;
1778 		}
1779 
1780 		if (start_pblk + cur - start != map.m_pblk) {
1781 			/*
1782 			 * Logical to physical mapping changed. This can happen
1783 			 * if this range was removed and then reallocated to
1784 			 * map to new physical blocks during a fast commit.
1785 			 */
1786 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1787 					ext4_ext_is_unwritten(ex),
1788 					start_pblk + cur - start);
1789 			if (ret)
1790 				goto out;
1791 			/*
1792 			 * Mark the old blocks as free since they aren't used
1793 			 * anymore. We maintain an array of all the modified
1794 			 * inodes. In case these blocks are still used at either
1795 			 * a different logical range in the same inode or in
1796 			 * some different inode, we will mark them as allocated
1797 			 * at the end of the FC replay using our array of
1798 			 * modified inodes.
1799 			 */
1800 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1801 			goto next;
1802 		}
1803 
1804 		/* Range is mapped and needs a state change */
1805 		ext4_debug("Converting from %ld to %d %lld",
1806 				map.m_flags & EXT4_MAP_UNWRITTEN,
1807 			ext4_ext_is_unwritten(ex), map.m_pblk);
1808 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1809 					ext4_ext_is_unwritten(ex), map.m_pblk);
1810 		if (ret)
1811 			goto out;
1812 		/*
1813 		 * We may have split the extent tree while toggling the state.
1814 		 * Try to shrink the extent tree now.
1815 		 */
1816 		ext4_ext_replay_shrink_inode(inode, start + len);
1817 next:
1818 		cur += map.m_len;
1819 		remaining -= map.m_len;
1820 	}
1821 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1822 					sb->s_blocksize_bits);
1823 out:
1824 	iput(inode);
1825 	return 0;
1826 }
1827 
1828 /* Replay DEL_RANGE tag */
1829 static int
1830 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1831 			 u8 *val)
1832 {
1833 	struct inode *inode;
1834 	struct ext4_fc_del_range lrange;
1835 	struct ext4_map_blocks map;
1836 	ext4_lblk_t cur, remaining;
1837 	int ret;
1838 
1839 	memcpy(&lrange, val, sizeof(lrange));
1840 	cur = le32_to_cpu(lrange.fc_lblk);
1841 	remaining = le32_to_cpu(lrange.fc_len);
1842 
1843 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1844 		le32_to_cpu(lrange.fc_ino), cur, remaining);
1845 
1846 	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1847 	if (IS_ERR(inode)) {
1848 		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1849 		return 0;
1850 	}
1851 
1852 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1853 	if (ret)
1854 		goto out;
1855 
1856 	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1857 			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1858 			le32_to_cpu(lrange.fc_len));
1859 	while (remaining > 0) {
1860 		map.m_lblk = cur;
1861 		map.m_len = remaining;
1862 
1863 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1864 		if (ret < 0)
1865 			goto out;
1866 		if (ret > 0) {
1867 			remaining -= ret;
1868 			cur += ret;
1869 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1870 		} else {
1871 			remaining -= map.m_len;
1872 			cur += map.m_len;
1873 		}
1874 	}
1875 
1876 	down_write(&EXT4_I(inode)->i_data_sem);
1877 	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1878 				le32_to_cpu(lrange.fc_lblk) +
1879 				le32_to_cpu(lrange.fc_len) - 1);
1880 	up_write(&EXT4_I(inode)->i_data_sem);
1881 	if (ret)
1882 		goto out;
1883 	ext4_ext_replay_shrink_inode(inode,
1884 		i_size_read(inode) >> sb->s_blocksize_bits);
1885 	ext4_mark_inode_dirty(NULL, inode);
1886 out:
1887 	iput(inode);
1888 	return 0;
1889 }
1890 
1891 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1892 {
1893 	struct ext4_fc_replay_state *state;
1894 	struct inode *inode;
1895 	struct ext4_ext_path *path = NULL;
1896 	struct ext4_map_blocks map;
1897 	int i, ret, j;
1898 	ext4_lblk_t cur, end;
1899 
1900 	state = &EXT4_SB(sb)->s_fc_replay_state;
1901 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1902 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1903 			EXT4_IGET_NORMAL);
1904 		if (IS_ERR(inode)) {
1905 			ext4_debug("Inode %d not found.",
1906 				state->fc_modified_inodes[i]);
1907 			continue;
1908 		}
1909 		cur = 0;
1910 		end = EXT_MAX_BLOCKS;
1911 		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1912 			iput(inode);
1913 			continue;
1914 		}
1915 		while (cur < end) {
1916 			map.m_lblk = cur;
1917 			map.m_len = end - cur;
1918 
1919 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1920 			if (ret < 0)
1921 				break;
1922 
1923 			if (ret > 0) {
1924 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1925 				if (!IS_ERR(path)) {
1926 					for (j = 0; j < path->p_depth; j++)
1927 						ext4_mb_mark_bb(inode->i_sb,
1928 							path[j].p_block, 1, 1);
1929 					ext4_ext_drop_refs(path);
1930 					kfree(path);
1931 				}
1932 				cur += ret;
1933 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1934 							map.m_len, 1);
1935 			} else {
1936 				cur = cur + (map.m_len ? map.m_len : 1);
1937 			}
1938 		}
1939 		iput(inode);
1940 	}
1941 }
1942 
1943 /*
1944  * Check if block is in excluded regions for block allocation. The simple
1945  * allocator that runs during replay phase is calls this function to see
1946  * if it is okay to use a block.
1947  */
1948 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1949 {
1950 	int i;
1951 	struct ext4_fc_replay_state *state;
1952 
1953 	state = &EXT4_SB(sb)->s_fc_replay_state;
1954 	for (i = 0; i < state->fc_regions_valid; i++) {
1955 		if (state->fc_regions[i].ino == 0 ||
1956 			state->fc_regions[i].len == 0)
1957 			continue;
1958 		if (in_range(blk, state->fc_regions[i].pblk,
1959 					state->fc_regions[i].len))
1960 			return true;
1961 	}
1962 	return false;
1963 }
1964 
1965 /* Cleanup function called after replay */
1966 void ext4_fc_replay_cleanup(struct super_block *sb)
1967 {
1968 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1969 
1970 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1971 	kfree(sbi->s_fc_replay_state.fc_regions);
1972 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1973 }
1974 
1975 /*
1976  * Recovery Scan phase handler
1977  *
1978  * This function is called during the scan phase and is responsible
1979  * for doing following things:
1980  * - Make sure the fast commit area has valid tags for replay
1981  * - Count number of tags that need to be replayed by the replay handler
1982  * - Verify CRC
1983  * - Create a list of excluded blocks for allocation during replay phase
1984  *
1985  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1986  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1987  * to indicate that scan has finished and JBD2 can now start replay phase.
1988  * It returns a negative error to indicate that there was an error. At the end
1989  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1990  * to indicate the number of tags that need to replayed during the replay phase.
1991  */
1992 static int ext4_fc_replay_scan(journal_t *journal,
1993 				struct buffer_head *bh, int off,
1994 				tid_t expected_tid)
1995 {
1996 	struct super_block *sb = journal->j_private;
1997 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1998 	struct ext4_fc_replay_state *state;
1999 	int ret = JBD2_FC_REPLAY_CONTINUE;
2000 	struct ext4_fc_add_range ext;
2001 	struct ext4_fc_tl tl;
2002 	struct ext4_fc_tail tail;
2003 	__u8 *start, *end, *cur, *val;
2004 	struct ext4_fc_head head;
2005 	struct ext4_extent *ex;
2006 
2007 	state = &sbi->s_fc_replay_state;
2008 
2009 	start = (u8 *)bh->b_data;
2010 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2011 
2012 	if (state->fc_replay_expected_off == 0) {
2013 		state->fc_cur_tag = 0;
2014 		state->fc_replay_num_tags = 0;
2015 		state->fc_crc = 0;
2016 		state->fc_regions = NULL;
2017 		state->fc_regions_valid = state->fc_regions_used =
2018 			state->fc_regions_size = 0;
2019 		/* Check if we can stop early */
2020 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2021 			!= EXT4_FC_TAG_HEAD)
2022 			return 0;
2023 	}
2024 
2025 	if (off != state->fc_replay_expected_off) {
2026 		ret = -EFSCORRUPTED;
2027 		goto out_err;
2028 	}
2029 
2030 	state->fc_replay_expected_off++;
2031 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2032 		memcpy(&tl, cur, sizeof(tl));
2033 		val = cur + sizeof(tl);
2034 		ext4_debug("Scan phase, tag:%s, blk %lld\n",
2035 			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
2036 		switch (le16_to_cpu(tl.fc_tag)) {
2037 		case EXT4_FC_TAG_ADD_RANGE:
2038 			memcpy(&ext, val, sizeof(ext));
2039 			ex = (struct ext4_extent *)&ext.fc_ex;
2040 			ret = ext4_fc_record_regions(sb,
2041 				le32_to_cpu(ext.fc_ino),
2042 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2043 				ext4_ext_get_actual_len(ex), 0);
2044 			if (ret < 0)
2045 				break;
2046 			ret = JBD2_FC_REPLAY_CONTINUE;
2047 			fallthrough;
2048 		case EXT4_FC_TAG_DEL_RANGE:
2049 		case EXT4_FC_TAG_LINK:
2050 		case EXT4_FC_TAG_UNLINK:
2051 		case EXT4_FC_TAG_CREAT:
2052 		case EXT4_FC_TAG_INODE:
2053 		case EXT4_FC_TAG_PAD:
2054 			state->fc_cur_tag++;
2055 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2056 					sizeof(tl) + le16_to_cpu(tl.fc_len));
2057 			break;
2058 		case EXT4_FC_TAG_TAIL:
2059 			state->fc_cur_tag++;
2060 			memcpy(&tail, val, sizeof(tail));
2061 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2062 						sizeof(tl) +
2063 						offsetof(struct ext4_fc_tail,
2064 						fc_crc));
2065 			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2066 				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2067 				state->fc_replay_num_tags = state->fc_cur_tag;
2068 				state->fc_regions_valid =
2069 					state->fc_regions_used;
2070 			} else {
2071 				ret = state->fc_replay_num_tags ?
2072 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2073 			}
2074 			state->fc_crc = 0;
2075 			break;
2076 		case EXT4_FC_TAG_HEAD:
2077 			memcpy(&head, val, sizeof(head));
2078 			if (le32_to_cpu(head.fc_features) &
2079 				~EXT4_FC_SUPPORTED_FEATURES) {
2080 				ret = -EOPNOTSUPP;
2081 				break;
2082 			}
2083 			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2084 				ret = JBD2_FC_REPLAY_STOP;
2085 				break;
2086 			}
2087 			state->fc_cur_tag++;
2088 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2089 					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2090 			break;
2091 		default:
2092 			ret = state->fc_replay_num_tags ?
2093 				JBD2_FC_REPLAY_STOP : -ECANCELED;
2094 		}
2095 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2096 			break;
2097 	}
2098 
2099 out_err:
2100 	trace_ext4_fc_replay_scan(sb, ret, off);
2101 	return ret;
2102 }
2103 
2104 /*
2105  * Main recovery path entry point.
2106  * The meaning of return codes is similar as above.
2107  */
2108 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2109 				enum passtype pass, int off, tid_t expected_tid)
2110 {
2111 	struct super_block *sb = journal->j_private;
2112 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2113 	struct ext4_fc_tl tl;
2114 	__u8 *start, *end, *cur, *val;
2115 	int ret = JBD2_FC_REPLAY_CONTINUE;
2116 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2117 	struct ext4_fc_tail tail;
2118 
2119 	if (pass == PASS_SCAN) {
2120 		state->fc_current_pass = PASS_SCAN;
2121 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2122 	}
2123 
2124 	if (state->fc_current_pass != pass) {
2125 		state->fc_current_pass = pass;
2126 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2127 	}
2128 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2129 		ext4_debug("Replay stops\n");
2130 		ext4_fc_set_bitmaps_and_counters(sb);
2131 		return 0;
2132 	}
2133 
2134 #ifdef CONFIG_EXT4_DEBUG
2135 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2136 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2137 		return JBD2_FC_REPLAY_STOP;
2138 	}
2139 #endif
2140 
2141 	start = (u8 *)bh->b_data;
2142 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2143 
2144 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2145 		memcpy(&tl, cur, sizeof(tl));
2146 		val = cur + sizeof(tl);
2147 
2148 		if (state->fc_replay_num_tags == 0) {
2149 			ret = JBD2_FC_REPLAY_STOP;
2150 			ext4_fc_set_bitmaps_and_counters(sb);
2151 			break;
2152 		}
2153 		ext4_debug("Replay phase, tag:%s\n",
2154 				tag2str(le16_to_cpu(tl.fc_tag)));
2155 		state->fc_replay_num_tags--;
2156 		switch (le16_to_cpu(tl.fc_tag)) {
2157 		case EXT4_FC_TAG_LINK:
2158 			ret = ext4_fc_replay_link(sb, &tl, val);
2159 			break;
2160 		case EXT4_FC_TAG_UNLINK:
2161 			ret = ext4_fc_replay_unlink(sb, &tl, val);
2162 			break;
2163 		case EXT4_FC_TAG_ADD_RANGE:
2164 			ret = ext4_fc_replay_add_range(sb, &tl, val);
2165 			break;
2166 		case EXT4_FC_TAG_CREAT:
2167 			ret = ext4_fc_replay_create(sb, &tl, val);
2168 			break;
2169 		case EXT4_FC_TAG_DEL_RANGE:
2170 			ret = ext4_fc_replay_del_range(sb, &tl, val);
2171 			break;
2172 		case EXT4_FC_TAG_INODE:
2173 			ret = ext4_fc_replay_inode(sb, &tl, val);
2174 			break;
2175 		case EXT4_FC_TAG_PAD:
2176 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2177 					     le16_to_cpu(tl.fc_len), 0);
2178 			break;
2179 		case EXT4_FC_TAG_TAIL:
2180 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2181 					     le16_to_cpu(tl.fc_len), 0);
2182 			memcpy(&tail, val, sizeof(tail));
2183 			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2184 			break;
2185 		case EXT4_FC_TAG_HEAD:
2186 			break;
2187 		default:
2188 			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2189 					     le16_to_cpu(tl.fc_len), 0);
2190 			ret = -ECANCELED;
2191 			break;
2192 		}
2193 		if (ret < 0)
2194 			break;
2195 		ret = JBD2_FC_REPLAY_CONTINUE;
2196 	}
2197 	return ret;
2198 }
2199 
2200 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2201 {
2202 	/*
2203 	 * We set replay callback even if fast commit disabled because we may
2204 	 * could still have fast commit blocks that need to be replayed even if
2205 	 * fast commit has now been turned off.
2206 	 */
2207 	journal->j_fc_replay_callback = ext4_fc_replay;
2208 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2209 		return;
2210 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2211 }
2212 
2213 static const char *fc_ineligible_reasons[] = {
2214 	"Extended attributes changed",
2215 	"Cross rename",
2216 	"Journal flag changed",
2217 	"Insufficient memory",
2218 	"Swap boot",
2219 	"Resize",
2220 	"Dir renamed",
2221 	"Falloc range op",
2222 	"Data journalling",
2223 	"FC Commit Failed"
2224 };
2225 
2226 int ext4_fc_info_show(struct seq_file *seq, void *v)
2227 {
2228 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2229 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2230 	int i;
2231 
2232 	if (v != SEQ_START_TOKEN)
2233 		return 0;
2234 
2235 	seq_printf(seq,
2236 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2237 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2238 		   stats->fc_numblks,
2239 		   div_u64(stats->s_fc_avg_commit_time, 1000));
2240 	seq_puts(seq, "Ineligible reasons:\n");
2241 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2242 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2243 			stats->fc_ineligible_reason_count[i]);
2244 
2245 	return 0;
2246 }
2247 
2248 int __init ext4_fc_init_dentry_cache(void)
2249 {
2250 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2251 					   SLAB_RECLAIM_ACCOUNT);
2252 
2253 	if (ext4_fc_dentry_cachep == NULL)
2254 		return -ENOMEM;
2255 
2256 	return 0;
2257 }
2258 
2259 void ext4_fc_destroy_dentry_cache(void)
2260 {
2261 	kmem_cache_destroy(ext4_fc_dentry_cachep);
2262 }
2263