xref: /openbmc/linux/fs/ext4/fast_commit.c (revision 0a94608f)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
41  *				  during recovery. Note that iblocks field is
42  *				  not replayed and instead derived during
43  *				  replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  *
69  * Not all operations are supported by fast commits today (e.g. extended
70  * attributes). Fast commit ineligibility is marked by calling
71  * ext4_fc_mark_ineligible(). This makes the next fast commit operation fall
72  * back to a full commit.
73  *
74  * Atomicity of commits
75  * --------------------
76  * In order to guarantee atomicity during the commit operation, fast commit
77  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
78  * tag contains CRC of the contents and TID of the transaction after which
79  * this fast commit should be applied. Recovery code replays fast commit
80  * logs only if there's at least 1 valid tail present. For every fast commit
81  * operation, there is 1 tail. This means, we may end up with multiple tails
82  * in the fast commit space. Here's an example:
83  *
84  * - Create a new file A and remove existing file B
85  * - fsync()
86  * - Append contents to file A
87  * - Truncate file A
88  * - fsync()
89  *
90  * The fast commit space at the end of above operations would look like this:
91  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
92  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
93  *
94  * Replay code should thus check for all the valid tails in the FC area.
95  *
96  * Fast Commit Replay Idempotence
97  * ------------------------------
98  *
99  * Fast commits tags are idempotent in nature provided the recovery code follows
100  * certain rules. The guiding principle that the commit path follows while
101  * committing is that it stores the result of a particular operation instead of
102  * storing the procedure.
103  *
104  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
105  * was associated with inode 10. During fast commit, instead of storing this
106  * operation as a procedure "rename a to b", we store the resulting file system
107  * state as a "series" of outcomes:
108  *
109  * - Link dirent b to inode 10
110  * - Unlink dirent a
111  * - Inode <10> with valid refcount
112  *
113  * Now when recovery code runs, it needs to "enforce" this state on the file
114  * system. This is what guarantees idempotence of fast commit replay.
115  *
116  * Let's take an example of a procedure that is not idempotent and see how fast
117  * commits make it idempotent. Consider following sequence of operations:
118  *
119  *     rm A;    mv B A;    read A
120  *  (x)     (y)        (z)
121  *
122  * (x), (y) and (z) are the points at which we can crash. If we store this
123  * sequence of operations as is then the replay is not idempotent. Let's say
124  * while in replay, we crash at (z). During the second replay, file A (which was
125  * actually created as a result of "mv B A" operation) would get deleted. Thus,
126  * file named A would be absent when we try to read A. So, this sequence of
127  * operations is not idempotent. However, as mentioned above, instead of storing
128  * the procedure fast commits store the outcome of each procedure. Thus the fast
129  * commit log for above procedure would be as follows:
130  *
131  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
132  * inode 11 before the replay)
133  *
134  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
135  * (w)          (x)                    (y)          (z)
136  *
137  * If we crash at (z), we will have file A linked to inode 11. During the second
138  * replay, we will remove file A (inode 11). But we will create it back and make
139  * it point to inode 11. We won't find B, so we'll just skip that step. At this
140  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
141  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
142  * similarly. Thus, by converting a non-idempotent procedure into a series of
143  * idempotent outcomes, fast commits ensured idempotence during the replay.
144  *
145  * TODOs
146  * -----
147  *
148  * 0) Fast commit replay path hardening: Fast commit replay code should use
149  *    journal handles to make sure all the updates it does during the replay
150  *    path are atomic. With that if we crash during fast commit replay, after
151  *    trying to do recovery again, we will find a file system where fast commit
152  *    area is invalid (because new full commit would be found). In order to deal
153  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
154  *    superblock state is persisted before starting the replay, so that after
155  *    the crash, fast commit recovery code can look at that flag and perform
156  *    fast commit recovery even if that area is invalidated by later full
157  *    commits.
158  *
159  * 1) Fast commit's commit path locks the entire file system during fast
160  *    commit. This has significant performance penalty. Instead of that, we
161  *    should use ext4_fc_start/stop_update functions to start inode level
162  *    updates from ext4_journal_start/stop. Once we do that we can drop file
163  *    system locking during commit path.
164  *
165  * 2) Handle more ineligible cases.
166  */
167 
168 #include <trace/events/ext4.h>
169 static struct kmem_cache *ext4_fc_dentry_cachep;
170 
171 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
172 {
173 	BUFFER_TRACE(bh, "");
174 	if (uptodate) {
175 		ext4_debug("%s: Block %lld up-to-date",
176 			   __func__, bh->b_blocknr);
177 		set_buffer_uptodate(bh);
178 	} else {
179 		ext4_debug("%s: Block %lld not up-to-date",
180 			   __func__, bh->b_blocknr);
181 		clear_buffer_uptodate(bh);
182 	}
183 
184 	unlock_buffer(bh);
185 }
186 
187 static inline void ext4_fc_reset_inode(struct inode *inode)
188 {
189 	struct ext4_inode_info *ei = EXT4_I(inode);
190 
191 	ei->i_fc_lblk_start = 0;
192 	ei->i_fc_lblk_len = 0;
193 }
194 
195 void ext4_fc_init_inode(struct inode *inode)
196 {
197 	struct ext4_inode_info *ei = EXT4_I(inode);
198 
199 	ext4_fc_reset_inode(inode);
200 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
201 	INIT_LIST_HEAD(&ei->i_fc_list);
202 	INIT_LIST_HEAD(&ei->i_fc_dilist);
203 	init_waitqueue_head(&ei->i_fc_wait);
204 	atomic_set(&ei->i_fc_updates, 0);
205 }
206 
207 /* This function must be called with sbi->s_fc_lock held. */
208 static void ext4_fc_wait_committing_inode(struct inode *inode)
209 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
210 {
211 	wait_queue_head_t *wq;
212 	struct ext4_inode_info *ei = EXT4_I(inode);
213 
214 #if (BITS_PER_LONG < 64)
215 	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
216 			EXT4_STATE_FC_COMMITTING);
217 	wq = bit_waitqueue(&ei->i_state_flags,
218 				EXT4_STATE_FC_COMMITTING);
219 #else
220 	DEFINE_WAIT_BIT(wait, &ei->i_flags,
221 			EXT4_STATE_FC_COMMITTING);
222 	wq = bit_waitqueue(&ei->i_flags,
223 				EXT4_STATE_FC_COMMITTING);
224 #endif
225 	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
226 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
227 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
228 	schedule();
229 	finish_wait(wq, &wait.wq_entry);
230 }
231 
232 /*
233  * Inform Ext4's fast about start of an inode update
234  *
235  * This function is called by the high level call VFS callbacks before
236  * performing any inode update. This function blocks if there's an ongoing
237  * fast commit on the inode in question.
238  */
239 void ext4_fc_start_update(struct inode *inode)
240 {
241 	struct ext4_inode_info *ei = EXT4_I(inode);
242 
243 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
244 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
245 		return;
246 
247 restart:
248 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
249 	if (list_empty(&ei->i_fc_list))
250 		goto out;
251 
252 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
253 		ext4_fc_wait_committing_inode(inode);
254 		goto restart;
255 	}
256 out:
257 	atomic_inc(&ei->i_fc_updates);
258 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
259 }
260 
261 /*
262  * Stop inode update and wake up waiting fast commits if any.
263  */
264 void ext4_fc_stop_update(struct inode *inode)
265 {
266 	struct ext4_inode_info *ei = EXT4_I(inode);
267 
268 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
269 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
270 		return;
271 
272 	if (atomic_dec_and_test(&ei->i_fc_updates))
273 		wake_up_all(&ei->i_fc_wait);
274 }
275 
276 /*
277  * Remove inode from fast commit list. If the inode is being committed
278  * we wait until inode commit is done.
279  */
280 void ext4_fc_del(struct inode *inode)
281 {
282 	struct ext4_inode_info *ei = EXT4_I(inode);
283 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
284 	struct ext4_fc_dentry_update *fc_dentry;
285 
286 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
287 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
288 		return;
289 
290 restart:
291 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
292 	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
293 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
294 		return;
295 	}
296 
297 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
298 		ext4_fc_wait_committing_inode(inode);
299 		goto restart;
300 	}
301 
302 	if (!list_empty(&ei->i_fc_list))
303 		list_del_init(&ei->i_fc_list);
304 
305 	/*
306 	 * Since this inode is getting removed, let's also remove all FC
307 	 * dentry create references, since it is not needed to log it anyways.
308 	 */
309 	if (list_empty(&ei->i_fc_dilist)) {
310 		spin_unlock(&sbi->s_fc_lock);
311 		return;
312 	}
313 
314 	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
315 	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
316 	list_del_init(&fc_dentry->fcd_list);
317 	list_del_init(&fc_dentry->fcd_dilist);
318 
319 	WARN_ON(!list_empty(&ei->i_fc_dilist));
320 	spin_unlock(&sbi->s_fc_lock);
321 
322 	if (fc_dentry->fcd_name.name &&
323 		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
324 		kfree(fc_dentry->fcd_name.name);
325 	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
326 
327 	return;
328 }
329 
330 /*
331  * Mark file system as fast commit ineligible, and record latest
332  * ineligible transaction tid. This means until the recorded
333  * transaction, commit operation would result in a full jbd2 commit.
334  */
335 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
336 {
337 	struct ext4_sb_info *sbi = EXT4_SB(sb);
338 	tid_t tid;
339 
340 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
341 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
342 		return;
343 
344 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
345 	if (handle && !IS_ERR(handle))
346 		tid = handle->h_transaction->t_tid;
347 	else {
348 		read_lock(&sbi->s_journal->j_state_lock);
349 		tid = sbi->s_journal->j_running_transaction ?
350 				sbi->s_journal->j_running_transaction->t_tid : 0;
351 		read_unlock(&sbi->s_journal->j_state_lock);
352 	}
353 	spin_lock(&sbi->s_fc_lock);
354 	if (sbi->s_fc_ineligible_tid < tid)
355 		sbi->s_fc_ineligible_tid = tid;
356 	spin_unlock(&sbi->s_fc_lock);
357 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
358 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
359 }
360 
361 /*
362  * Generic fast commit tracking function. If this is the first time this we are
363  * called after a full commit, we initialize fast commit fields and then call
364  * __fc_track_fn() with update = 0. If we have already been called after a full
365  * commit, we pass update = 1. Based on that, the track function can determine
366  * if it needs to track a field for the first time or if it needs to just
367  * update the previously tracked value.
368  *
369  * If enqueue is set, this function enqueues the inode in fast commit list.
370  */
371 static int ext4_fc_track_template(
372 	handle_t *handle, struct inode *inode,
373 	int (*__fc_track_fn)(struct inode *, void *, bool),
374 	void *args, int enqueue)
375 {
376 	bool update = false;
377 	struct ext4_inode_info *ei = EXT4_I(inode);
378 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
379 	tid_t tid = 0;
380 	int ret;
381 
382 	tid = handle->h_transaction->t_tid;
383 	mutex_lock(&ei->i_fc_lock);
384 	if (tid == ei->i_sync_tid) {
385 		update = true;
386 	} else {
387 		ext4_fc_reset_inode(inode);
388 		ei->i_sync_tid = tid;
389 	}
390 	ret = __fc_track_fn(inode, args, update);
391 	mutex_unlock(&ei->i_fc_lock);
392 
393 	if (!enqueue)
394 		return ret;
395 
396 	spin_lock(&sbi->s_fc_lock);
397 	if (list_empty(&EXT4_I(inode)->i_fc_list))
398 		list_add_tail(&EXT4_I(inode)->i_fc_list,
399 				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
400 				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
401 				&sbi->s_fc_q[FC_Q_STAGING] :
402 				&sbi->s_fc_q[FC_Q_MAIN]);
403 	spin_unlock(&sbi->s_fc_lock);
404 
405 	return ret;
406 }
407 
/* Arguments handed to __track_dentry_update() via ext4_fc_track_template(). */
struct __track_dentry_update_args {
	/* Directory entry the operation applies to. */
	struct dentry *dentry;
	/* Fast commit tag: EXT4_FC_TAG_UNLINK, _LINK or _CREAT. */
	int op;
};
412 
413 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
414 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
415 {
416 	struct ext4_fc_dentry_update *node;
417 	struct ext4_inode_info *ei = EXT4_I(inode);
418 	struct __track_dentry_update_args *dentry_update =
419 		(struct __track_dentry_update_args *)arg;
420 	struct dentry *dentry = dentry_update->dentry;
421 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
422 
423 	mutex_unlock(&ei->i_fc_lock);
424 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
425 	if (!node) {
426 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
427 		mutex_lock(&ei->i_fc_lock);
428 		return -ENOMEM;
429 	}
430 
431 	node->fcd_op = dentry_update->op;
432 	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
433 	node->fcd_ino = inode->i_ino;
434 	if (dentry->d_name.len > DNAME_INLINE_LEN) {
435 		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
436 		if (!node->fcd_name.name) {
437 			kmem_cache_free(ext4_fc_dentry_cachep, node);
438 			ext4_fc_mark_ineligible(inode->i_sb,
439 				EXT4_FC_REASON_NOMEM, NULL);
440 			mutex_lock(&ei->i_fc_lock);
441 			return -ENOMEM;
442 		}
443 		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
444 			dentry->d_name.len);
445 	} else {
446 		memcpy(node->fcd_iname, dentry->d_name.name,
447 			dentry->d_name.len);
448 		node->fcd_name.name = node->fcd_iname;
449 	}
450 	node->fcd_name.len = dentry->d_name.len;
451 	INIT_LIST_HEAD(&node->fcd_dilist);
452 	spin_lock(&sbi->s_fc_lock);
453 	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
454 		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
455 		list_add_tail(&node->fcd_list,
456 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
457 	else
458 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
459 
460 	/*
461 	 * This helps us keep a track of all fc_dentry updates which is part of
462 	 * this ext4 inode. So in case the inode is getting unlinked, before
463 	 * even we get a chance to fsync, we could remove all fc_dentry
464 	 * references while evicting the inode in ext4_fc_del().
465 	 * Also with this, we don't need to loop over all the inodes in
466 	 * sbi->s_fc_q to get the corresponding inode in
467 	 * ext4_fc_commit_dentry_updates().
468 	 */
469 	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
470 		WARN_ON(!list_empty(&ei->i_fc_dilist));
471 		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
472 	}
473 	spin_unlock(&sbi->s_fc_lock);
474 	mutex_lock(&ei->i_fc_lock);
475 
476 	return 0;
477 }
478 
479 void __ext4_fc_track_unlink(handle_t *handle,
480 		struct inode *inode, struct dentry *dentry)
481 {
482 	struct __track_dentry_update_args args;
483 	int ret;
484 
485 	args.dentry = dentry;
486 	args.op = EXT4_FC_TAG_UNLINK;
487 
488 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
489 					(void *)&args, 0);
490 	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
491 }
492 
493 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
494 {
495 	struct inode *inode = d_inode(dentry);
496 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
497 
498 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
499 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
500 		return;
501 
502 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
503 		return;
504 
505 	__ext4_fc_track_unlink(handle, inode, dentry);
506 }
507 
508 void __ext4_fc_track_link(handle_t *handle,
509 	struct inode *inode, struct dentry *dentry)
510 {
511 	struct __track_dentry_update_args args;
512 	int ret;
513 
514 	args.dentry = dentry;
515 	args.op = EXT4_FC_TAG_LINK;
516 
517 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
518 					(void *)&args, 0);
519 	trace_ext4_fc_track_link(handle, inode, dentry, ret);
520 }
521 
522 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
523 {
524 	struct inode *inode = d_inode(dentry);
525 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
526 
527 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
528 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
529 		return;
530 
531 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
532 		return;
533 
534 	__ext4_fc_track_link(handle, inode, dentry);
535 }
536 
537 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
538 			  struct dentry *dentry)
539 {
540 	struct __track_dentry_update_args args;
541 	int ret;
542 
543 	args.dentry = dentry;
544 	args.op = EXT4_FC_TAG_CREAT;
545 
546 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
547 					(void *)&args, 0);
548 	trace_ext4_fc_track_create(handle, inode, dentry, ret);
549 }
550 
551 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
552 {
553 	struct inode *inode = d_inode(dentry);
554 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
555 
556 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
557 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
558 		return;
559 
560 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
561 		return;
562 
563 	__ext4_fc_track_create(handle, inode, dentry);
564 }
565 
566 /* __track_fn for inode tracking */
567 static int __track_inode(struct inode *inode, void *arg, bool update)
568 {
569 	if (update)
570 		return -EEXIST;
571 
572 	EXT4_I(inode)->i_fc_lblk_len = 0;
573 
574 	return 0;
575 }
576 
577 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
578 {
579 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
580 	int ret;
581 
582 	if (S_ISDIR(inode->i_mode))
583 		return;
584 
585 	if (ext4_should_journal_data(inode)) {
586 		ext4_fc_mark_ineligible(inode->i_sb,
587 					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
588 		return;
589 	}
590 
591 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
592 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
593 		return;
594 
595 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
596 		return;
597 
598 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
599 	trace_ext4_fc_track_inode(handle, inode, ret);
600 }
601 
602 struct __track_range_args {
603 	ext4_lblk_t start, end;
604 };
605 
606 /* __track_fn for tracking data updates */
607 static int __track_range(struct inode *inode, void *arg, bool update)
608 {
609 	struct ext4_inode_info *ei = EXT4_I(inode);
610 	ext4_lblk_t oldstart;
611 	struct __track_range_args *__arg =
612 		(struct __track_range_args *)arg;
613 
614 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
615 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
616 		return -ECANCELED;
617 	}
618 
619 	oldstart = ei->i_fc_lblk_start;
620 
621 	if (update && ei->i_fc_lblk_len > 0) {
622 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
623 		ei->i_fc_lblk_len =
624 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
625 				ei->i_fc_lblk_start + 1;
626 	} else {
627 		ei->i_fc_lblk_start = __arg->start;
628 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
629 	}
630 
631 	return 0;
632 }
633 
634 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
635 			 ext4_lblk_t end)
636 {
637 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
638 	struct __track_range_args args;
639 	int ret;
640 
641 	if (S_ISDIR(inode->i_mode))
642 		return;
643 
644 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
645 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
646 		return;
647 
648 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
649 		return;
650 
651 	args.start = start;
652 	args.end = end;
653 
654 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
655 
656 	trace_ext4_fc_track_range(handle, inode, start, end, ret);
657 }
658 
659 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
660 {
661 	int write_flags = REQ_SYNC;
662 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
663 
664 	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
665 	if (test_opt(sb, BARRIER) && is_tail)
666 		write_flags |= REQ_FUA | REQ_PREFLUSH;
667 	lock_buffer(bh);
668 	set_buffer_dirty(bh);
669 	set_buffer_uptodate(bh);
670 	bh->b_end_io = ext4_end_buffer_io_sync;
671 	submit_bh(REQ_OP_WRITE, write_flags, bh);
672 	EXT4_SB(sb)->s_fc_bh = NULL;
673 }
674 
675 /* Ext4 commit path routines */
676 
677 /* memzero and update CRC */
678 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
679 				u32 *crc)
680 {
681 	void *ret;
682 
683 	ret = memset(dst, 0, len);
684 	if (crc)
685 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
686 	return ret;
687 }
688 
689 /*
690  * Allocate len bytes on a fast commit buffer.
691  *
692  * During the commit time this function is used to manage fast commit
693  * block space. We don't split a fast commit log onto different
694  * blocks. So this function makes sure that if there's not enough space
695  * on the current block, the remaining space in the current block is
696  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
697  * new block is from jbd2 and CRC is updated to reflect the padding
698  * we added.
699  */
700 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
701 {
702 	struct ext4_fc_tl *tl;
703 	struct ext4_sb_info *sbi = EXT4_SB(sb);
704 	struct buffer_head *bh;
705 	int bsize = sbi->s_journal->j_blocksize;
706 	int ret, off = sbi->s_fc_bytes % bsize;
707 	int pad_len;
708 
709 	/*
710 	 * After allocating len, we should have space at least for a 0 byte
711 	 * padding.
712 	 */
713 	if (len + sizeof(struct ext4_fc_tl) > bsize)
714 		return NULL;
715 
716 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
717 		/*
718 		 * Only allocate from current buffer if we have enough space for
719 		 * this request AND we have space to add a zero byte padding.
720 		 */
721 		if (!sbi->s_fc_bh) {
722 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
723 			if (ret)
724 				return NULL;
725 			sbi->s_fc_bh = bh;
726 		}
727 		sbi->s_fc_bytes += len;
728 		return sbi->s_fc_bh->b_data + off;
729 	}
730 	/* Need to add PAD tag */
731 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
732 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
733 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
734 	tl->fc_len = cpu_to_le16(pad_len);
735 	if (crc)
736 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
737 	if (pad_len > 0)
738 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
739 	ext4_fc_submit_bh(sb, false);
740 
741 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
742 	if (ret)
743 		return NULL;
744 	sbi->s_fc_bh = bh;
745 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
746 	return sbi->s_fc_bh->b_data;
747 }
748 
749 /* memcpy to fc reserved space and update CRC */
750 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
751 				int len, u32 *crc)
752 {
753 	if (crc)
754 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
755 	return memcpy(dst, src, len);
756 }
757 
758 /*
759  * Complete a fast commit by writing tail tag.
760  *
761  * Writing tail tag marks the end of a fast commit. In order to guarantee
762  * atomicity, after writing tail tag, even if there's space remaining
763  * in the block, next commit shouldn't use it. That's why tail tag
764  * has the length as that of the remaining space on the block.
765  */
766 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
767 {
768 	struct ext4_sb_info *sbi = EXT4_SB(sb);
769 	struct ext4_fc_tl tl;
770 	struct ext4_fc_tail tail;
771 	int off, bsize = sbi->s_journal->j_blocksize;
772 	u8 *dst;
773 
774 	/*
775 	 * ext4_fc_reserve_space takes care of allocating an extra block if
776 	 * there's no enough space on this block for accommodating this tail.
777 	 */
778 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
779 	if (!dst)
780 		return -ENOSPC;
781 
782 	off = sbi->s_fc_bytes % bsize;
783 
784 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
785 	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
786 	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
787 
788 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
789 	dst += sizeof(tl);
790 	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
791 	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
792 	dst += sizeof(tail.fc_tid);
793 	tail.fc_crc = cpu_to_le32(crc);
794 	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
795 
796 	ext4_fc_submit_bh(sb, true);
797 
798 	return 0;
799 }
800 
801 /*
802  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
803  * Returns false if there's not enough space.
804  */
805 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
806 			   u32 *crc)
807 {
808 	struct ext4_fc_tl tl;
809 	u8 *dst;
810 
811 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
812 	if (!dst)
813 		return false;
814 
815 	tl.fc_tag = cpu_to_le16(tag);
816 	tl.fc_len = cpu_to_le16(len);
817 
818 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
819 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
820 
821 	return true;
822 }
823 
824 /* Same as above, but adds dentry tlv. */
825 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
826 				   struct ext4_fc_dentry_update *fc_dentry)
827 {
828 	struct ext4_fc_dentry_info fcd;
829 	struct ext4_fc_tl tl;
830 	int dlen = fc_dentry->fcd_name.len;
831 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
832 					crc);
833 
834 	if (!dst)
835 		return false;
836 
837 	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
838 	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
839 	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
840 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
841 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
842 	dst += sizeof(tl);
843 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
844 	dst += sizeof(fcd);
845 	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
846 
847 	return true;
848 }
849 
850 /*
851  * Writes inode in the fast commit space under TLV with tag @tag.
852  * Returns 0 on success, error on failure.
853  */
854 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
855 {
856 	struct ext4_inode_info *ei = EXT4_I(inode);
857 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
858 	int ret;
859 	struct ext4_iloc iloc;
860 	struct ext4_fc_inode fc_inode;
861 	struct ext4_fc_tl tl;
862 	u8 *dst;
863 
864 	ret = ext4_get_inode_loc(inode, &iloc);
865 	if (ret)
866 		return ret;
867 
868 	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
869 		inode_len = EXT4_INODE_SIZE(inode->i_sb);
870 	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
871 		inode_len += ei->i_extra_isize;
872 
873 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
874 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
875 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
876 
877 	dst = ext4_fc_reserve_space(inode->i_sb,
878 			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
879 	if (!dst)
880 		return -ECANCELED;
881 
882 	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
883 		return -ECANCELED;
884 	dst += sizeof(tl);
885 	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
886 		return -ECANCELED;
887 	dst += sizeof(fc_inode);
888 	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
889 					inode_len, crc))
890 		return -ECANCELED;
891 
892 	return 0;
893 }
894 
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 *
 * The pending logical range (i_fc_lblk_start, i_fc_lblk_len) is snapshotted
 * and reset under i_fc_lock, then walked with ext4_map_blocks(): stretches
 * that are not mapped are logged as EXT4_FC_TAG_DEL_RANGE and mapped
 * stretches as EXT4_FC_TAG_ADD_RANGE. A mapping error yields -ECANCELED and
 * a failure to append a TLV yields -ENOSPC.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	/*
	 * Despite their names, old_blk_size and new_blk_size hold the first
	 * and the last logical block of the range to log.
	 */
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		/* Nothing was tracked for this inode. */
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	/* Consume the tracked range so it is not logged again. */
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		/* Lookup only: no handle and no allocation flags are passed. */
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			/* Guarantee forward progress on an empty mapping. */
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			/* Not mapped: record the range as deleted. */
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			/* Mapped: record the range as an on-disk extent. */
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
968 
969 
/*
 * Submit data for all the fast commit inodes.
 *
 * Every inode on the FC_Q_MAIN queue is flagged EXT4_STATE_FC_COMMITTING,
 * then we wait until its i_fc_updates count reads zero and hand its jinode
 * to jbd2_submit_inode_data(). Returns 0, or the first non-zero value
 * returned by jbd2_submit_inode_data().
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			/*
			 * Re-check after queueing on i_fc_wait; s_fc_lock is
			 * dropped across schedule().
			 */
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		/* The submission itself runs without s_fc_lock held. */
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
1003 
/*
 * Wait for completion of data for all the fast commit inodes.
 *
 * Only inodes on FC_Q_MAIN that carry EXT4_STATE_FC_COMMITTING are waited
 * for. Returns 0, or the first non-zero value returned by
 * jbd2_wait_inode_data().
 */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/* The wait runs without s_fc_lock held. */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
1028 
/*
 * Commit all the directory entry updates.
 *
 * Must be entered with sbi->s_fc_lock held and always returns with it held;
 * the lock is dropped while TLVs are being written. Returns 0 on success,
 * -ENOSPC if a dentry TLV could not be added, or the error from
 * ext4_fc_write_inode() / ext4_fc_write_inode_data().
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			/* Non-create updates only need the dentry TLV itself. */
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}
		/*
		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
		 * corresponding inode pointer
		 */
		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
		ei = list_first_entry(&fc_dentry->fcd_dilist,
				struct ext4_inode_info, i_fc_dilist);
		inode = &ei->vfs_inode;
		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Error paths arrive unlocked; retake the lock for the caller. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
1093 
/*
 * Write out one fast commit: submit and wait for the data of all queued
 * inodes, then emit (in this order) an optional head tag, the directory
 * entry updates, per-inode data ranges and inodes, and finally the tail.
 * Returns 0 on success or a negative error from the first step that failed.
 */
static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	/* Returns with s_fc_lock held on both success and failure. */
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		/* TLVs are written without s_fc_lock held. */
		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}
1165 
1166 static void ext4_fc_update_stats(struct super_block *sb, int status,
1167 				 u64 commit_time, int nblks, tid_t commit_tid)
1168 {
1169 	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1170 
1171 	jbd_debug(1, "Fast commit ended with status = %d for tid %u",
1172 			status, commit_tid);
1173 	if (status == EXT4_FC_STATUS_OK) {
1174 		stats->fc_num_commits++;
1175 		stats->fc_numblks += nblks;
1176 		if (likely(stats->s_fc_avg_commit_time))
1177 			stats->s_fc_avg_commit_time =
1178 				(commit_time +
1179 				 stats->s_fc_avg_commit_time * 3) / 4;
1180 		else
1181 			stats->s_fc_avg_commit_time = commit_time;
1182 	} else if (status == EXT4_FC_STATUS_FAILED ||
1183 		   status == EXT4_FC_STATUS_INELIGIBLE) {
1184 		if (status == EXT4_FC_STATUS_FAILED)
1185 			stats->fc_failed_commits++;
1186 		stats->fc_ineligible_commits++;
1187 	} else {
1188 		stats->fc_skipped_commits++;
1189 	}
1190 	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1191 }
1192 
/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	/* Snapshot used below to tell whether another fast commit completed. */
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	/* Fast commit not enabled: just complete the transaction. */
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

	trace_ext4_fc_commit_start(sb, commit_tid);

	start_time = ktime_get();

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
				commit_tid);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
				commit_tid);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
	 * if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	/* Number of fast commit blocks in use before this commit (rounded up). */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	/* Blocks added by this commit; these are the buffers to wait on. */
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	/*
	 * weight the commit time higher than the average time so we
	 * don't react too strongly to vast changes in the commit time
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
	return ret;

fallback:
	/* End the fast commit via the fallback path and record why. */
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
	return ret;
}
1271 
/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 *
 * Empties the FC_Q_MAIN inode and dentry queues, wakes waiters on
 * EXT4_STATE_FC_COMMITTING, moves the FC_Q_STAGING queues over to
 * FC_Q_MAIN and, once @tid has reached s_fc_ineligible_tid, clears the
 * ineligible mount flag.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	trace_ext4_fc_cleanup(journal, full, tid);
	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		/*
		 * NOTE(review): plain "<=" on tid_t is not wraparound-aware;
		 * confirm whether a tid_geq()-style comparison is wanted here.
		 */
		if (iter->i_sync_tid <= tid)
			ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
		/* The state bit is kept in a different word on 32-bit builds. */
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		list_del_init(&fc_dentry->fcd_dilist);
		/* Free the entry without s_fc_lock held. */
		spin_unlock(&sbi->s_fc_lock);

		/* Only names longer than DNAME_INLINE_LEN are kfree()d here. */
		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	/* Updates queued on the staging lists become the next main queues. */
	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	/*
	 * NOTE(review): same wraparound caveat as above applies to this
	 * ">=" comparison of tids.
	 */
	if (tid >= sbi->s_fc_ineligible_tid) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	/* A full commit resets the fast commit byte count. */
	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}
1336 
1337 /* Ext4 Replay Path Routines */
1338 
/* Decoded view of a dentry TLV, shared by the dentry replay routines */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of the name at dname, in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;		/* not filled in by tl_to_darg() */
	char *dname;		/* entry name, located inside the TLV value */
};
1344 
1345 static inline void tl_to_darg(struct dentry_info_args *darg,
1346 			      struct  ext4_fc_tl *tl, u8 *val)
1347 {
1348 	struct ext4_fc_dentry_info fcd;
1349 
1350 	memcpy(&fcd, val, sizeof(fcd));
1351 
1352 	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1353 	darg->ino = le32_to_cpu(fcd.fc_ino);
1354 	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1355 	darg->dname_len = le16_to_cpu(tl->fc_len) -
1356 		sizeof(struct ext4_fc_dentry_info);
1357 }
1358 
1359 /* Unlink replay function */
1360 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1361 				 u8 *val)
1362 {
1363 	struct inode *inode, *old_parent;
1364 	struct qstr entry;
1365 	struct dentry_info_args darg;
1366 	int ret = 0;
1367 
1368 	tl_to_darg(&darg, tl, val);
1369 
1370 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1371 			darg.parent_ino, darg.dname_len);
1372 
1373 	entry.name = darg.dname;
1374 	entry.len = darg.dname_len;
1375 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1376 
1377 	if (IS_ERR(inode)) {
1378 		jbd_debug(1, "Inode %d not found", darg.ino);
1379 		return 0;
1380 	}
1381 
1382 	old_parent = ext4_iget(sb, darg.parent_ino,
1383 				EXT4_IGET_NORMAL);
1384 	if (IS_ERR(old_parent)) {
1385 		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1386 		iput(inode);
1387 		return 0;
1388 	}
1389 
1390 	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1391 	/* -ENOENT ok coz it might not exist anymore. */
1392 	if (ret == -ENOENT)
1393 		ret = 0;
1394 	iput(old_parent);
1395 	iput(inode);
1396 	return ret;
1397 }
1398 
1399 static int ext4_fc_replay_link_internal(struct super_block *sb,
1400 				struct dentry_info_args *darg,
1401 				struct inode *inode)
1402 {
1403 	struct inode *dir = NULL;
1404 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1405 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1406 	int ret = 0;
1407 
1408 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1409 	if (IS_ERR(dir)) {
1410 		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1411 		dir = NULL;
1412 		goto out;
1413 	}
1414 
1415 	dentry_dir = d_obtain_alias(dir);
1416 	if (IS_ERR(dentry_dir)) {
1417 		jbd_debug(1, "Failed to obtain dentry");
1418 		dentry_dir = NULL;
1419 		goto out;
1420 	}
1421 
1422 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1423 	if (!dentry_inode) {
1424 		jbd_debug(1, "Inode dentry not created.");
1425 		ret = -ENOMEM;
1426 		goto out;
1427 	}
1428 
1429 	ret = __ext4_link(dir, inode, dentry_inode);
1430 	/*
1431 	 * It's possible that link already existed since data blocks
1432 	 * for the dir in question got persisted before we crashed OR
1433 	 * we replayed this tag and crashed before the entire replay
1434 	 * could complete.
1435 	 */
1436 	if (ret && ret != -EEXIST) {
1437 		jbd_debug(1, "Failed to link\n");
1438 		goto out;
1439 	}
1440 
1441 	ret = 0;
1442 out:
1443 	if (dentry_dir) {
1444 		d_drop(dentry_dir);
1445 		dput(dentry_dir);
1446 	} else if (dir) {
1447 		iput(dir);
1448 	}
1449 	if (dentry_inode) {
1450 		d_drop(dentry_inode);
1451 		dput(dentry_inode);
1452 	}
1453 
1454 	return ret;
1455 }
1456 
1457 /* Link replay function */
1458 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1459 			       u8 *val)
1460 {
1461 	struct inode *inode;
1462 	struct dentry_info_args darg;
1463 	int ret = 0;
1464 
1465 	tl_to_darg(&darg, tl, val);
1466 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1467 			darg.parent_ino, darg.dname_len);
1468 
1469 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1470 	if (IS_ERR(inode)) {
1471 		jbd_debug(1, "Inode not found.");
1472 		return 0;
1473 	}
1474 
1475 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1476 	iput(inode);
1477 	return ret;
1478 }
1479 
1480 /*
1481  * Record all the modified inodes during replay. We use this later to setup
1482  * block bitmaps correctly.
1483  */
1484 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1485 {
1486 	struct ext4_fc_replay_state *state;
1487 	int i;
1488 
1489 	state = &EXT4_SB(sb)->s_fc_replay_state;
1490 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1491 		if (state->fc_modified_inodes[i] == ino)
1492 			return 0;
1493 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1494 		state->fc_modified_inodes = krealloc(
1495 				state->fc_modified_inodes,
1496 				sizeof(int) * (state->fc_modified_inodes_size +
1497 				EXT4_FC_REPLAY_REALLOC_INCREMENT),
1498 				GFP_KERNEL);
1499 		if (!state->fc_modified_inodes)
1500 			return -ENOMEM;
1501 		state->fc_modified_inodes_size +=
1502 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1503 	}
1504 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1505 	return 0;
1506 }
1507 
1508 /*
1509  * Inode replay function
1510  */
1511 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1512 				u8 *val)
1513 {
1514 	struct ext4_fc_inode fc_inode;
1515 	struct ext4_inode *raw_inode;
1516 	struct ext4_inode *raw_fc_inode;
1517 	struct inode *inode = NULL;
1518 	struct ext4_iloc iloc;
1519 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1520 	struct ext4_extent_header *eh;
1521 
1522 	memcpy(&fc_inode, val, sizeof(fc_inode));
1523 
1524 	ino = le32_to_cpu(fc_inode.fc_ino);
1525 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1526 
1527 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1528 	if (!IS_ERR(inode)) {
1529 		ext4_ext_clear_bb(inode);
1530 		iput(inode);
1531 	}
1532 	inode = NULL;
1533 
1534 	ret = ext4_fc_record_modified_inode(sb, ino);
1535 	if (ret)
1536 		goto out;
1537 
1538 	raw_fc_inode = (struct ext4_inode *)
1539 		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1540 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1541 	if (ret)
1542 		goto out;
1543 
1544 	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1545 	raw_inode = ext4_raw_inode(&iloc);
1546 
1547 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1548 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1549 		inode_len - offsetof(struct ext4_inode, i_generation));
1550 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1551 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1552 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1553 			memset(eh, 0, sizeof(*eh));
1554 			eh->eh_magic = EXT4_EXT_MAGIC;
1555 			eh->eh_max = cpu_to_le16(
1556 				(sizeof(raw_inode->i_block) -
1557 				 sizeof(struct ext4_extent_header))
1558 				 / sizeof(struct ext4_extent));
1559 		}
1560 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1561 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1562 			sizeof(raw_inode->i_block));
1563 	}
1564 
1565 	/* Immediately update the inode on disk. */
1566 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1567 	if (ret)
1568 		goto out;
1569 	ret = sync_dirty_buffer(iloc.bh);
1570 	if (ret)
1571 		goto out;
1572 	ret = ext4_mark_inode_used(sb, ino);
1573 	if (ret)
1574 		goto out;
1575 
1576 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1577 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1578 	if (IS_ERR(inode)) {
1579 		jbd_debug(1, "Inode not found.");
1580 		return -EFSCORRUPTED;
1581 	}
1582 
1583 	/*
1584 	 * Our allocator could have made different decisions than before
1585 	 * crashing. This should be fixed but until then, we calculate
1586 	 * the number of blocks the inode.
1587 	 */
1588 	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1589 		ext4_ext_replay_set_iblocks(inode);
1590 
1591 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1592 	ext4_reset_inode_seed(inode);
1593 
1594 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1595 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1596 	sync_dirty_buffer(iloc.bh);
1597 	brelse(iloc.bh);
1598 out:
1599 	iput(inode);
1600 	if (!ret)
1601 		blkdev_issue_flush(sb->s_bdev);
1602 
1603 	return 0;
1604 }
1605 
/*
 * Dentry create replay function.
 *
 * The commit path writes the inode (EXT4_FC_TAG_INODE) before the
 * EXT4_FC_TAG_CREAT dentry tag. Which means, the inode for which we are
 * trying to create a dentry here, should already have been replayed before
 * we start here.
 *
 * Returns 0 on success, -EINVAL if the inode cannot be read, or the error
 * from ext4_mark_inode_used() / ext4_fc_replay_link_internal().
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of update group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			/*
			 * NOTE(review): the lookup was for darg.parent_ino but
			 * the message prints darg.ino. ret is still 0 here, so
			 * a missing parent is not reported as an error.
			 */
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			/* Failure to initialise the directory is ignored. */
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
1666 
1667 /*
1668  * Record physical disk regions which are in use as per fast commit area,
1669  * and used by inodes during replay phase. Our simple replay phase
1670  * allocator excludes these regions from allocation.
1671  */
1672 int ext4_fc_record_regions(struct super_block *sb, int ino,
1673 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1674 {
1675 	struct ext4_fc_replay_state *state;
1676 	struct ext4_fc_alloc_region *region;
1677 
1678 	state = &EXT4_SB(sb)->s_fc_replay_state;
1679 	/*
1680 	 * during replay phase, the fc_regions_valid may not same as
1681 	 * fc_regions_used, update it when do new additions.
1682 	 */
1683 	if (replay && state->fc_regions_used != state->fc_regions_valid)
1684 		state->fc_regions_used = state->fc_regions_valid;
1685 	if (state->fc_regions_used == state->fc_regions_size) {
1686 		state->fc_regions_size +=
1687 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1688 		state->fc_regions = krealloc(
1689 					state->fc_regions,
1690 					state->fc_regions_size *
1691 					sizeof(struct ext4_fc_alloc_region),
1692 					GFP_KERNEL);
1693 		if (!state->fc_regions)
1694 			return -ENOMEM;
1695 	}
1696 	region = &state->fc_regions[state->fc_regions_used++];
1697 	region->ino = ino;
1698 	region->lblk = lblk;
1699 	region->pblk = pblk;
1700 	region->len = len;
1701 
1702 	if (replay)
1703 		state->fc_regions_valid++;
1704 
1705 	return 0;
1706 }
1707 
/*
 * Replay add range tag.
 *
 * Makes the inode map the logical range of the logged extent to the logged
 * physical blocks: unmapped pieces get a new extent inserted, pieces mapped
 * elsewhere are re-pointed (and the old blocks marked free), and pieces
 * already mapped to the right blocks only get their written/unwritten state
 * updated. Always returns 0; internal failures just end the replay of this
 * tag early.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			/* Physical block that corresponds to logical block cur. */
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret)
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %ld to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		/* Advance past the piece that was just handled. */
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
out:
	iput(inode);
	return 0;
}
1828 
/*
 * Replay DEL_RANGE tag.
 *
 * Marks every currently mapped block in the logged logical range as free in
 * the block bitmaps, then removes the range from the inode's extent tree.
 * Always returns 0; internal failures just end the replay of this tag early.
 */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			/* Mapped piece: free its physical blocks. */
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			/* Not mapped: nothing to free, just skip over it. */
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	/* Drop the whole logged range (inclusive end) from the extent tree. */
	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return 0;
}
1891 
/*
 * For every inode recorded by ext4_fc_record_modified_inode(), walk all of
 * its mapped ranges and mark both the data blocks and the extent tree blocks
 * on the lookup path as in use via ext4_mb_mark_bb(). Inline-data inodes and
 * inodes that can no longer be read are skipped.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					/* Mark the tree blocks leading to this extent. */
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				/* Mark the data blocks of the mapped range. */
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
							map.m_len, 1);
			} else {
				/* Not mapped: skip it, advancing by at least one block. */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1943 
1944 /*
1945  * Check if block is in excluded regions for block allocation. The simple
1946  * allocator that runs during replay phase is calls this function to see
1947  * if it is okay to use a block.
1948  */
1949 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1950 {
1951 	int i;
1952 	struct ext4_fc_replay_state *state;
1953 
1954 	state = &EXT4_SB(sb)->s_fc_replay_state;
1955 	for (i = 0; i < state->fc_regions_valid; i++) {
1956 		if (state->fc_regions[i].ino == 0 ||
1957 			state->fc_regions[i].len == 0)
1958 			continue;
1959 		if (in_range(blk, state->fc_regions[i].pblk,
1960 					state->fc_regions[i].len))
1961 			return true;
1962 	}
1963 	return false;
1964 }
1965 
1966 /* Cleanup function called after replay */
1967 void ext4_fc_replay_cleanup(struct super_block *sb)
1968 {
1969 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1970 
1971 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1972 	kfree(sbi->s_fc_replay_state.fc_regions);
1973 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1974 }
1975 
1976 /*
1977  * Recovery Scan phase handler
1978  *
1979  * This function is called during the scan phase and is responsible
1980  * for doing following things:
1981  * - Make sure the fast commit area has valid tags for replay
1982  * - Count number of tags that need to be replayed by the replay handler
1983  * - Verify CRC
1984  * - Create a list of excluded blocks for allocation during replay phase
1985  *
1986  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1987  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1988  * to indicate that scan has finished and JBD2 can now start replay phase.
1989  * It returns a negative error to indicate that there was an error. At the end
1990  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1991  * to indicate the number of tags that need to replayed during the replay phase.
1992  */
1993 static int ext4_fc_replay_scan(journal_t *journal,
1994 				struct buffer_head *bh, int off,
1995 				tid_t expected_tid)
1996 {
1997 	struct super_block *sb = journal->j_private;
1998 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1999 	struct ext4_fc_replay_state *state;
2000 	int ret = JBD2_FC_REPLAY_CONTINUE;
2001 	struct ext4_fc_add_range ext;
2002 	struct ext4_fc_tl tl;
2003 	struct ext4_fc_tail tail;
2004 	__u8 *start, *end, *cur, *val;
2005 	struct ext4_fc_head head;
2006 	struct ext4_extent *ex;
2007 
2008 	state = &sbi->s_fc_replay_state;
2009 
2010 	start = (u8 *)bh->b_data;
2011 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2012 
2013 	if (state->fc_replay_expected_off == 0) {
2014 		state->fc_cur_tag = 0;
2015 		state->fc_replay_num_tags = 0;
2016 		state->fc_crc = 0;
2017 		state->fc_regions = NULL;
2018 		state->fc_regions_valid = state->fc_regions_used =
2019 			state->fc_regions_size = 0;
2020 		/* Check if we can stop early */
2021 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2022 			!= EXT4_FC_TAG_HEAD)
2023 			return 0;
2024 	}
2025 
2026 	if (off != state->fc_replay_expected_off) {
2027 		ret = -EFSCORRUPTED;
2028 		goto out_err;
2029 	}
2030 
2031 	state->fc_replay_expected_off++;
2032 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2033 		memcpy(&tl, cur, sizeof(tl));
2034 		val = cur + sizeof(tl);
2035 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
2036 			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
2037 		switch (le16_to_cpu(tl.fc_tag)) {
2038 		case EXT4_FC_TAG_ADD_RANGE:
2039 			memcpy(&ext, val, sizeof(ext));
2040 			ex = (struct ext4_extent *)&ext.fc_ex;
2041 			ret = ext4_fc_record_regions(sb,
2042 				le32_to_cpu(ext.fc_ino),
2043 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2044 				ext4_ext_get_actual_len(ex), 0);
2045 			if (ret < 0)
2046 				break;
2047 			ret = JBD2_FC_REPLAY_CONTINUE;
2048 			fallthrough;
2049 		case EXT4_FC_TAG_DEL_RANGE:
2050 		case EXT4_FC_TAG_LINK:
2051 		case EXT4_FC_TAG_UNLINK:
2052 		case EXT4_FC_TAG_CREAT:
2053 		case EXT4_FC_TAG_INODE:
2054 		case EXT4_FC_TAG_PAD:
2055 			state->fc_cur_tag++;
2056 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2057 					sizeof(tl) + le16_to_cpu(tl.fc_len));
2058 			break;
2059 		case EXT4_FC_TAG_TAIL:
2060 			state->fc_cur_tag++;
2061 			memcpy(&tail, val, sizeof(tail));
2062 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2063 						sizeof(tl) +
2064 						offsetof(struct ext4_fc_tail,
2065 						fc_crc));
2066 			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2067 				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2068 				state->fc_replay_num_tags = state->fc_cur_tag;
2069 				state->fc_regions_valid =
2070 					state->fc_regions_used;
2071 			} else {
2072 				ret = state->fc_replay_num_tags ?
2073 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2074 			}
2075 			state->fc_crc = 0;
2076 			break;
2077 		case EXT4_FC_TAG_HEAD:
2078 			memcpy(&head, val, sizeof(head));
2079 			if (le32_to_cpu(head.fc_features) &
2080 				~EXT4_FC_SUPPORTED_FEATURES) {
2081 				ret = -EOPNOTSUPP;
2082 				break;
2083 			}
2084 			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2085 				ret = JBD2_FC_REPLAY_STOP;
2086 				break;
2087 			}
2088 			state->fc_cur_tag++;
2089 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2090 					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2091 			break;
2092 		default:
2093 			ret = state->fc_replay_num_tags ?
2094 				JBD2_FC_REPLAY_STOP : -ECANCELED;
2095 		}
2096 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2097 			break;
2098 	}
2099 
2100 out_err:
2101 	trace_ext4_fc_replay_scan(sb, ret, off);
2102 	return ret;
2103 }
2104 
2105 /*
2106  * Main recovery path entry point.
2107  * The meaning of return codes is similar as above.
2108  */
2109 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2110 				enum passtype pass, int off, tid_t expected_tid)
2111 {
2112 	struct super_block *sb = journal->j_private;
2113 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2114 	struct ext4_fc_tl tl;
2115 	__u8 *start, *end, *cur, *val;
2116 	int ret = JBD2_FC_REPLAY_CONTINUE;
2117 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2118 	struct ext4_fc_tail tail;
2119 
2120 	if (pass == PASS_SCAN) {
2121 		state->fc_current_pass = PASS_SCAN;
2122 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2123 	}
2124 
2125 	if (state->fc_current_pass != pass) {
2126 		state->fc_current_pass = pass;
2127 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2128 	}
2129 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2130 		jbd_debug(1, "Replay stops\n");
2131 		ext4_fc_set_bitmaps_and_counters(sb);
2132 		return 0;
2133 	}
2134 
2135 #ifdef CONFIG_EXT4_DEBUG
2136 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2137 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2138 		return JBD2_FC_REPLAY_STOP;
2139 	}
2140 #endif
2141 
2142 	start = (u8 *)bh->b_data;
2143 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2144 
2145 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2146 		memcpy(&tl, cur, sizeof(tl));
2147 		val = cur + sizeof(tl);
2148 
2149 		if (state->fc_replay_num_tags == 0) {
2150 			ret = JBD2_FC_REPLAY_STOP;
2151 			ext4_fc_set_bitmaps_and_counters(sb);
2152 			break;
2153 		}
2154 		jbd_debug(3, "Replay phase, tag:%s\n",
2155 				tag2str(le16_to_cpu(tl.fc_tag)));
2156 		state->fc_replay_num_tags--;
2157 		switch (le16_to_cpu(tl.fc_tag)) {
2158 		case EXT4_FC_TAG_LINK:
2159 			ret = ext4_fc_replay_link(sb, &tl, val);
2160 			break;
2161 		case EXT4_FC_TAG_UNLINK:
2162 			ret = ext4_fc_replay_unlink(sb, &tl, val);
2163 			break;
2164 		case EXT4_FC_TAG_ADD_RANGE:
2165 			ret = ext4_fc_replay_add_range(sb, &tl, val);
2166 			break;
2167 		case EXT4_FC_TAG_CREAT:
2168 			ret = ext4_fc_replay_create(sb, &tl, val);
2169 			break;
2170 		case EXT4_FC_TAG_DEL_RANGE:
2171 			ret = ext4_fc_replay_del_range(sb, &tl, val);
2172 			break;
2173 		case EXT4_FC_TAG_INODE:
2174 			ret = ext4_fc_replay_inode(sb, &tl, val);
2175 			break;
2176 		case EXT4_FC_TAG_PAD:
2177 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2178 					     le16_to_cpu(tl.fc_len), 0);
2179 			break;
2180 		case EXT4_FC_TAG_TAIL:
2181 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2182 					     le16_to_cpu(tl.fc_len), 0);
2183 			memcpy(&tail, val, sizeof(tail));
2184 			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2185 			break;
2186 		case EXT4_FC_TAG_HEAD:
2187 			break;
2188 		default:
2189 			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2190 					     le16_to_cpu(tl.fc_len), 0);
2191 			ret = -ECANCELED;
2192 			break;
2193 		}
2194 		if (ret < 0)
2195 			break;
2196 		ret = JBD2_FC_REPLAY_CONTINUE;
2197 	}
2198 	return ret;
2199 }
2200 
2201 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2202 {
2203 	/*
2204 	 * We set replay callback even if fast commit disabled because we may
2205 	 * could still have fast commit blocks that need to be replayed even if
2206 	 * fast commit has now been turned off.
2207 	 */
2208 	journal->j_fc_replay_callback = ext4_fc_replay;
2209 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2210 		return;
2211 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2212 }
2213 
/*
 * Human-readable labels for the fast commit ineligibility reasons, printed by
 * ext4_fc_info_show(). The table is positional: entry i labels
 * fc_ineligible_reason_count[i], so the order here has to follow the order of
 * the reason codes (presumably the EXT4_FC_REASON_* enum - keep in sync).
 *
 * Both the strings and the pointer table itself are read-only.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2226 
2227 int ext4_fc_info_show(struct seq_file *seq, void *v)
2228 {
2229 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2230 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2231 	int i;
2232 
2233 	if (v != SEQ_START_TOKEN)
2234 		return 0;
2235 
2236 	seq_printf(seq,
2237 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2238 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2239 		   stats->fc_numblks,
2240 		   div_u64(stats->s_fc_avg_commit_time, 1000));
2241 	seq_puts(seq, "Ineligible reasons:\n");
2242 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2243 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2244 			stats->fc_ineligible_reason_count[i]);
2245 
2246 	return 0;
2247 }
2248 
2249 int __init ext4_fc_init_dentry_cache(void)
2250 {
2251 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2252 					   SLAB_RECLAIM_ACCOUNT);
2253 
2254 	if (ext4_fc_dentry_cachep == NULL)
2255 		return -ENOMEM;
2256 
2257 	return 0;
2258 }
2259 
2260 void ext4_fc_destroy_dentry_cache(void)
2261 {
2262 	kmem_cache_destroy(ext4_fc_dentry_cachep);
2263 }
2264