xref: /openbmc/linux/fs/ext4/fast_commit.c (revision 08f4c42abad1b93914a93f9042e2443593bd3137)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine-grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs (see
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For scenarios that we currently don't have
24  * replay code for, fast commits fall back to full commits.
25  * Fast commits record deltas in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
41  *				  during recovery. Note that the iblocks field is
42  *				  not replayed and is instead derived during replay.
43  *
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in-memory
50  * queue. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  *
69  * Not all operations are supported by fast commits today (e.g. extended
70  * attributes). Fast commit ineligibility is marked by calling
71  * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
72  * back to a full commit.
73  *
74  * Atomicity of commits
75  * --------------------
76  * In order to guarantee atomicity during the commit operation, fast commit
77  * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
78  * tail tag contains the CRC of the contents and the TID of the transaction
79  * after which this fast commit should be applied. Recovery code replays fast
80  * commit logs only if there's at least 1 valid tail present. For every fast
81  * commit operation, there is 1 tail. This means we may end up with multiple
82  * tails in the fast commit space. Here's an example:
83  *
84  * - Create a new file A and remove existing file B
85  * - fsync()
86  * - Append contents to file A
87  * - Truncate file A
88  * - fsync()
89  *
90  * After the above operations, the fast commit space would look like this:
91  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
92  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
93  *
94  * Replay code should thus check for all the valid tails in the FC area.
95  *
96  * Fast Commit Replay Idempotence
97  * ------------------------------
98  *
99  * Fast commit tags are idempotent in nature provided the recovery code follows
100  * certain rules. The guiding principle that the commit path follows while
101  * committing is that it stores the result of a particular operation instead of
102  * storing the procedure.
103  *
104  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
105  * was associated with inode 10. During fast commit, instead of storing this
106  * operation as a procedure "rename a to b", we store the resulting file system
107  * state as a "series" of outcomes:
108  *
109  * - Link dirent b to inode 10
110  * - Unlink dirent a
111  * - Inode <10> with valid refcount
112  *
113  * Now when the recovery code runs, it needs to "enforce" this state on the file
114  * system. This is what guarantees idempotence of fast commit replay.
115  *
116  * Let's take an example of a procedure that is not idempotent and see how fast
117  * commits make it idempotent. Consider the following sequence of operations:
118  *
119  *     rm A;    mv B A;    read A
120  *  (x)     (y)        (z)
121  *
122  * (x), (y) and (z) are the points at which we can crash. If we store this
123  * sequence of operations as is then the replay is not idempotent. Let's say
124  * while in replay, we crash at (z). During the second replay, file A (which was
125  * actually created as a result of "mv B A" operation) would get deleted. Thus,
126  * file named A would be absent when we try to read A. So, this sequence of
127  * operations is not idempotent. However, as mentioned above, instead of storing
128  * the procedure fast commits store the outcome of each procedure. Thus the fast
129  * commit log for the above procedure would be as follows:
130  *
131  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
132  * inode 11 before the replay)
133  *
134  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
135  * (w)          (x)                    (y)          (z)
136  *
137  * If we crash at (z), we will have file A linked to inode 11. During the second
138  * replay, we will remove file A (inode 11). But we will create it back and make
139  * it point to inode 11. We won't find B, so we'll just skip that step. At this
140  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
141  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
142  * similarly. Thus, by converting a non-idempotent procedure into a series of
143  * idempotent outcomes, fast commits ensure idempotence during replay.
144  *
145  * TODOs
146  * -----
147  *
148  * 0) Fast commit replay path hardening: Fast commit replay code should use
149  *    journal handles to make sure all the updates it does during the replay
150  *    path are atomic. That way, if we crash during fast commit replay, a
151  *    subsequent recovery attempt will find a file system where the fast commit
152  *    area is invalid (because a new full commit would be found). In order to deal
153  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
154  *    superblock state is persisted before starting the replay, so that after
155  *    the crash, fast commit recovery code can look at that flag and perform
156  *    fast commit recovery even if that area is invalidated by later full
157  *    commits.
158  *
159  * 1) Fast commit's commit path locks the entire file system during fast
160  *    commit. This has a significant performance penalty. Instead of that, we
161  *    should use ext4_fc_start/stop_update functions to start inode level
162  *    updates from ext4_journal_start/stop. Once we do that we can drop file
163  *    system locking during commit path.
164  *
165  * 2) Handle more ineligible cases.
166  */
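
/*
 * Illustrative sketch (documentation only, compiled out): one way the TLV
 * stream described above could be walked within a single fast commit block.
 * The tags and struct ext4_fc_tl fields are the ones used later in this
 * file; the helper itself is hypothetical and is not part of the replay code.
 */
#if 0
static void ext4_fc_example_walk_block(u8 *start, u8 *end)
{
	struct ext4_fc_tl tl;
	u8 *cur = start;

	while (cur + sizeof(tl) <= end) {
		memcpy(&tl, cur, sizeof(tl));
		/* The value bytes immediately follow the tag + length pair. */
		cur += sizeof(tl) + le16_to_cpu(tl.fc_len);
		if (le16_to_cpu(tl.fc_tag) == EXT4_FC_TAG_TAIL)
			break;	/* one fast commit ends at its tail tag */
	}
}
#endif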
167 
168 #include <trace/events/ext4.h>
169 static struct kmem_cache *ext4_fc_dentry_cachep;
170 
171 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
172 {
173 	BUFFER_TRACE(bh, "");
174 	if (uptodate) {
175 		ext4_debug("%s: Block %lld up-to-date",
176 			   __func__, bh->b_blocknr);
177 		set_buffer_uptodate(bh);
178 	} else {
179 		ext4_debug("%s: Block %lld not up-to-date",
180 			   __func__, bh->b_blocknr);
181 		clear_buffer_uptodate(bh);
182 	}
183 
184 	unlock_buffer(bh);
185 }
186 
187 static inline void ext4_fc_reset_inode(struct inode *inode)
188 {
189 	struct ext4_inode_info *ei = EXT4_I(inode);
190 
191 	ei->i_fc_lblk_start = 0;
192 	ei->i_fc_lblk_len = 0;
193 }
194 
195 void ext4_fc_init_inode(struct inode *inode)
196 {
197 	struct ext4_inode_info *ei = EXT4_I(inode);
198 
199 	ext4_fc_reset_inode(inode);
200 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
201 	INIT_LIST_HEAD(&ei->i_fc_list);
202 	INIT_LIST_HEAD(&ei->i_fc_dilist);
203 	init_waitqueue_head(&ei->i_fc_wait);
204 	atomic_set(&ei->i_fc_updates, 0);
205 }
206 
207 /* This function must be called with sbi->s_fc_lock held. */
208 static void ext4_fc_wait_committing_inode(struct inode *inode)
209 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
210 {
211 	wait_queue_head_t *wq;
212 	struct ext4_inode_info *ei = EXT4_I(inode);
213 
214 #if (BITS_PER_LONG < 64)
215 	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
216 			EXT4_STATE_FC_COMMITTING);
217 	wq = bit_waitqueue(&ei->i_state_flags,
218 				EXT4_STATE_FC_COMMITTING);
219 #else
220 	DEFINE_WAIT_BIT(wait, &ei->i_flags,
221 			EXT4_STATE_FC_COMMITTING);
222 	wq = bit_waitqueue(&ei->i_flags,
223 				EXT4_STATE_FC_COMMITTING);
224 #endif
225 	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
226 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
227 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
228 	schedule();
229 	finish_wait(wq, &wait.wq_entry);
230 }
231 
232 /*
233  * Inform Ext4's fast commit machinery about the start of an inode update.
234  *
235  * This function is called by the high level VFS callbacks before
236  * performing any inode update. This function blocks if there's an ongoing
237  * fast commit on the inode in question.
238  */
239 void ext4_fc_start_update(struct inode *inode)
240 {
241 	struct ext4_inode_info *ei = EXT4_I(inode);
242 
243 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
244 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
245 		return;
246 
247 restart:
248 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
249 	if (list_empty(&ei->i_fc_list))
250 		goto out;
251 
252 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
253 		ext4_fc_wait_committing_inode(inode);
254 		goto restart;
255 	}
256 out:
257 	atomic_inc(&ei->i_fc_updates);
258 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
259 }
260 
261 /*
262  * Stop inode update and wake up waiting fast commits if any.
263  */
264 void ext4_fc_stop_update(struct inode *inode)
265 {
266 	struct ext4_inode_info *ei = EXT4_I(inode);
267 
268 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
269 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
270 		return;
271 
272 	if (atomic_dec_and_test(&ei->i_fc_updates))
273 		wake_up_all(&ei->i_fc_wait);
274 }
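
/*
 * Illustrative usage sketch (hypothetical caller, documentation only): inode
 * updates are bracketed by ext4_fc_start_update()/ext4_fc_stop_update() so
 * that a fast commit currently committing this inode is waited for before the
 * update starts, and waiting fast commits are woken up afterwards.
 */
#if 0
static int example_update_inode(struct inode *inode)
{
	int err;

	ext4_fc_start_update(inode);	/* blocks while the inode is COMMITTING */
	err = do_some_inode_update(inode);	/* hypothetical update helper */
	ext4_fc_stop_update(inode);	/* wakes up a waiting fast commit */

	return err;
}
#endif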
275 
276 /*
277  * Remove inode from fast commit list. If the inode is being committed
278  * we wait until inode commit is done.
279  */
280 void ext4_fc_del(struct inode *inode)
281 {
282 	struct ext4_inode_info *ei = EXT4_I(inode);
283 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
284 	struct ext4_fc_dentry_update *fc_dentry;
285 
286 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
287 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
288 		return;
289 
290 restart:
291 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
292 	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
293 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
294 		return;
295 	}
296 
297 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
298 		ext4_fc_wait_committing_inode(inode);
299 		goto restart;
300 	}
301 
302 	if (!list_empty(&ei->i_fc_list))
303 		list_del_init(&ei->i_fc_list);
304 
305 	/*
306 	 * Since this inode is getting removed, let's also remove all FC
307 	 * dentry create references, since there is no need to log them anyway.
308 	 */
309 	if (list_empty(&ei->i_fc_dilist)) {
310 		spin_unlock(&sbi->s_fc_lock);
311 		return;
312 	}
313 
314 	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
315 	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
316 	list_del_init(&fc_dentry->fcd_list);
317 	list_del_init(&fc_dentry->fcd_dilist);
318 
319 	WARN_ON(!list_empty(&ei->i_fc_dilist));
320 	spin_unlock(&sbi->s_fc_lock);
321 
322 	if (fc_dentry->fcd_name.name &&
323 		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
324 		kfree(fc_dentry->fcd_name.name);
325 	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
326 
327 	return;
328 }
329 
330 /*
331  * Mark file system as fast commit ineligible, and record latest
332  * ineligible transaction tid. This means that until the recorded
333  * transaction commits, any commit operation will result in a full jbd2 commit.
334  */
335 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
336 {
337 	struct ext4_sb_info *sbi = EXT4_SB(sb);
338 	tid_t tid;
339 
340 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
341 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
342 		return;
343 
344 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
345 	if (handle && !IS_ERR(handle))
346 		tid = handle->h_transaction->t_tid;
347 	else {
348 		read_lock(&sbi->s_journal->j_state_lock);
349 		tid = sbi->s_journal->j_running_transaction ?
350 				sbi->s_journal->j_running_transaction->t_tid : 0;
351 		read_unlock(&sbi->s_journal->j_state_lock);
352 	}
353 	spin_lock(&sbi->s_fc_lock);
354 	if (sbi->s_fc_ineligible_tid < tid)
355 		sbi->s_fc_ineligible_tid = tid;
356 	spin_unlock(&sbi->s_fc_lock);
357 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
358 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
359 }
360 
361 /*
362  * Generic fast commit tracking function. If this is the first time we are
363  * called after a full commit, we initialize fast commit fields and then call
364  * __fc_track_fn() with update = 0. If we have already been called after a full
365  * commit, we pass update = 1. Based on that, the track function can determine
366  * if it needs to track a field for the first time or if it needs to just
367  * update the previously tracked value.
368  *
369  * If enqueue is set, this function enqueues the inode in fast commit list.
370  */
371 static int ext4_fc_track_template(
372 	handle_t *handle, struct inode *inode,
373 	int (*__fc_track_fn)(struct inode *, void *, bool),
374 	void *args, int enqueue)
375 {
376 	bool update = false;
377 	struct ext4_inode_info *ei = EXT4_I(inode);
378 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
379 	tid_t tid = 0;
380 	int ret;
381 
382 	tid = handle->h_transaction->t_tid;
383 	mutex_lock(&ei->i_fc_lock);
384 	if (tid == ei->i_sync_tid) {
385 		update = true;
386 	} else {
387 		ext4_fc_reset_inode(inode);
388 		ei->i_sync_tid = tid;
389 	}
390 	ret = __fc_track_fn(inode, args, update);
391 	mutex_unlock(&ei->i_fc_lock);
392 
393 	if (!enqueue)
394 		return ret;
395 
396 	spin_lock(&sbi->s_fc_lock);
397 	if (list_empty(&EXT4_I(inode)->i_fc_list))
398 		list_add_tail(&EXT4_I(inode)->i_fc_list,
399 				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
400 				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
401 				&sbi->s_fc_q[FC_Q_STAGING] :
402 				&sbi->s_fc_q[FC_Q_MAIN]);
403 	spin_unlock(&sbi->s_fc_lock);
404 
405 	return ret;
406 }
407 
408 struct __track_dentry_update_args {
409 	struct dentry *dentry;
410 	int op;
411 };
412 
413 /* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
414 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
415 {
416 	struct ext4_fc_dentry_update *node;
417 	struct ext4_inode_info *ei = EXT4_I(inode);
418 	struct __track_dentry_update_args *dentry_update =
419 		(struct __track_dentry_update_args *)arg;
420 	struct dentry *dentry = dentry_update->dentry;
421 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
422 
423 	mutex_unlock(&ei->i_fc_lock);
424 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
425 	if (!node) {
426 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
427 		mutex_lock(&ei->i_fc_lock);
428 		return -ENOMEM;
429 	}
430 
431 	node->fcd_op = dentry_update->op;
432 	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
433 	node->fcd_ino = inode->i_ino;
434 	if (dentry->d_name.len > DNAME_INLINE_LEN) {
435 		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
436 		if (!node->fcd_name.name) {
437 			kmem_cache_free(ext4_fc_dentry_cachep, node);
438 			ext4_fc_mark_ineligible(inode->i_sb,
439 				EXT4_FC_REASON_NOMEM, NULL);
440 			mutex_lock(&ei->i_fc_lock);
441 			return -ENOMEM;
442 		}
443 		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
444 			dentry->d_name.len);
445 	} else {
446 		memcpy(node->fcd_iname, dentry->d_name.name,
447 			dentry->d_name.len);
448 		node->fcd_name.name = node->fcd_iname;
449 	}
450 	node->fcd_name.len = dentry->d_name.len;
451 	INIT_LIST_HEAD(&node->fcd_dilist);
452 	spin_lock(&sbi->s_fc_lock);
453 	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
454 		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
455 		list_add_tail(&node->fcd_list,
456 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
457 	else
458 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
459 
460 	/*
461 	 * This helps us keep track of all the fc_dentry updates that are part of
462 	 * this ext4 inode. So in case the inode is unlinked before we even get a
463 	 * chance to fsync, we can still remove all its fc_dentry references while
464 	 * evicting the inode in ext4_fc_del().
465 	 * Also with this, we don't need to loop over all the inodes in
466 	 * sbi->s_fc_q to get the corresponding inode in
467 	 * ext4_fc_commit_dentry_updates().
468 	 */
469 	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
470 		WARN_ON(!list_empty(&ei->i_fc_dilist));
471 		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
472 	}
473 	spin_unlock(&sbi->s_fc_lock);
474 	mutex_lock(&ei->i_fc_lock);
475 
476 	return 0;
477 }
478 
479 void __ext4_fc_track_unlink(handle_t *handle,
480 		struct inode *inode, struct dentry *dentry)
481 {
482 	struct __track_dentry_update_args args;
483 	int ret;
484 
485 	args.dentry = dentry;
486 	args.op = EXT4_FC_TAG_UNLINK;
487 
488 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
489 					(void *)&args, 0);
490 	trace_ext4_fc_track_unlink(inode, dentry, ret);
491 }
492 
493 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
494 {
495 	struct inode *inode = d_inode(dentry);
496 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
497 
498 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
499 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
500 		return;
501 
502 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
503 		return;
504 
505 	__ext4_fc_track_unlink(handle, inode, dentry);
506 }
507 
508 void __ext4_fc_track_link(handle_t *handle,
509 	struct inode *inode, struct dentry *dentry)
510 {
511 	struct __track_dentry_update_args args;
512 	int ret;
513 
514 	args.dentry = dentry;
515 	args.op = EXT4_FC_TAG_LINK;
516 
517 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
518 					(void *)&args, 0);
519 	trace_ext4_fc_track_link(inode, dentry, ret);
520 }
521 
522 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
523 {
524 	struct inode *inode = d_inode(dentry);
525 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
526 
527 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
528 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
529 		return;
530 
531 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
532 		return;
533 
534 	__ext4_fc_track_link(handle, inode, dentry);
535 }
536 
537 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
538 			  struct dentry *dentry)
539 {
540 	struct __track_dentry_update_args args;
541 	int ret;
542 
543 	args.dentry = dentry;
544 	args.op = EXT4_FC_TAG_CREAT;
545 
546 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
547 					(void *)&args, 0);
548 	trace_ext4_fc_track_create(inode, dentry, ret);
549 }
550 
551 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
552 {
553 	struct inode *inode = d_inode(dentry);
554 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
555 
556 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
557 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
558 		return;
559 
560 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
561 		return;
562 
563 	__ext4_fc_track_create(handle, inode, dentry);
564 }
565 
566 /* __track_fn for inode tracking */
567 static int __track_inode(struct inode *inode, void *arg, bool update)
568 {
569 	if (update)
570 		return -EEXIST;
571 
572 	EXT4_I(inode)->i_fc_lblk_len = 0;
573 
574 	return 0;
575 }
576 
577 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
578 {
579 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
580 	int ret;
581 
582 	if (S_ISDIR(inode->i_mode))
583 		return;
584 
585 	if (ext4_should_journal_data(inode)) {
586 		ext4_fc_mark_ineligible(inode->i_sb,
587 					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
588 		return;
589 	}
590 
591 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
592 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
593 		return;
594 
595 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
596 		return;
597 
598 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
599 	trace_ext4_fc_track_inode(inode, ret);
600 }
601 
602 struct __track_range_args {
603 	ext4_lblk_t start, end;
604 };
605 
606 /* __track_fn for tracking data updates */
607 static int __track_range(struct inode *inode, void *arg, bool update)
608 {
609 	struct ext4_inode_info *ei = EXT4_I(inode);
610 	ext4_lblk_t oldstart;
611 	struct __track_range_args *__arg =
612 		(struct __track_range_args *)arg;
613 
614 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
615 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
616 		return -ECANCELED;
617 	}
618 
619 	oldstart = ei->i_fc_lblk_start;
620 
621 	if (update && ei->i_fc_lblk_len > 0) {
622 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
623 		ei->i_fc_lblk_len =
624 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
625 				ei->i_fc_lblk_start + 1;
626 	} else {
627 		ei->i_fc_lblk_start = __arg->start;
628 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
629 	}
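	/*
	 * Worked example (illustrative): if the first tracked range covered
	 * blocks 10-20 (start 10, len 11) and a later update covers blocks
	 * 5-12, the merge above yields i_fc_lblk_start = 5 and
	 * i_fc_lblk_len = max(10 + 11 - 1, 12) - 5 + 1 = 16, i.e. blocks 5-20.
	 */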
630 
631 	return 0;
632 }
633 
634 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
635 			 ext4_lblk_t end)
636 {
637 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
638 	struct __track_range_args args;
639 	int ret;
640 
641 	if (S_ISDIR(inode->i_mode))
642 		return;
643 
644 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
645 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
646 		return;
647 
648 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
649 		return;
650 
651 	args.start = start;
652 	args.end = end;
653 
654 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
655 
656 	trace_ext4_fc_track_range(inode, start, end, ret);
657 }
658 
659 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
660 {
661 	int write_flags = REQ_SYNC;
662 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
663 
664 	/* Add REQ_FUA | REQ_PREFLUSH only if this is the tail */
665 	if (test_opt(sb, BARRIER) && is_tail)
666 		write_flags |= REQ_FUA | REQ_PREFLUSH;
667 	lock_buffer(bh);
668 	set_buffer_dirty(bh);
669 	set_buffer_uptodate(bh);
670 	bh->b_end_io = ext4_end_buffer_io_sync;
671 	submit_bh(REQ_OP_WRITE, write_flags, bh);
672 	EXT4_SB(sb)->s_fc_bh = NULL;
673 }
674 
675 /* Ext4 commit path routines */
676 
677 /* memzero and update CRC */
678 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
679 				u32 *crc)
680 {
681 	void *ret;
682 
683 	ret = memset(dst, 0, len);
684 	if (crc)
685 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
686 	return ret;
687 }
688 
689 /*
690  * Allocate len bytes on a fast commit buffer.
691  *
692  * During commit time this function is used to manage fast commit
693  * block space. We don't split a fast commit log entry across
694  * blocks. So this function makes sure that if there's not enough space
695  * left on the current block, the remaining space in the current block is
696  * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
697  * new block is requested from jbd2 and the CRC is updated to reflect the
698  * padding we added.
699  */
700 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
701 {
702 	struct ext4_fc_tl *tl;
703 	struct ext4_sb_info *sbi = EXT4_SB(sb);
704 	struct buffer_head *bh;
705 	int bsize = sbi->s_journal->j_blocksize;
706 	int ret, off = sbi->s_fc_bytes % bsize;
707 	int pad_len;
708 
709 	/*
710 	 * After allocating len, we should have space at least for a 0 byte
711 	 * padding.
712 	 */
713 	if (len + sizeof(struct ext4_fc_tl) > bsize)
714 		return NULL;
715 
716 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
717 		/*
718 		 * Only allocate from current buffer if we have enough space for
719 		 * this request AND we have space to add a zero byte padding.
720 		 */
721 		if (!sbi->s_fc_bh) {
722 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
723 			if (ret)
724 				return NULL;
725 			sbi->s_fc_bh = bh;
726 		}
727 		sbi->s_fc_bytes += len;
728 		return sbi->s_fc_bh->b_data + off;
729 	}
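	/*
	 * Illustrative example: with a 4096 byte block and off = 4090 there is
	 * not enough room left for the requested bytes plus a trailing padding
	 * TLV, so the rest of this block is consumed by an EXT4_FC_TAG_PAD tag
	 * below and the allocation restarts in a fresh jbd2 fast commit buffer.
	 */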
730 	/* Need to add PAD tag */
731 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
732 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
733 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
734 	tl->fc_len = cpu_to_le16(pad_len);
735 	if (crc)
736 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
737 	if (pad_len > 0)
738 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
739 	ext4_fc_submit_bh(sb, false);
740 
741 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
742 	if (ret)
743 		return NULL;
744 	sbi->s_fc_bh = bh;
745 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
746 	return sbi->s_fc_bh->b_data;
747 }
748 
749 /* memcpy to fc reserved space and update CRC */
750 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
751 				int len, u32 *crc)
752 {
753 	if (crc)
754 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
755 	return memcpy(dst, src, len);
756 }
757 
758 /*
759  * Complete a fast commit by writing tail tag.
760  *
761  * Writing tail tag marks the end of a fast commit. In order to guarantee
762  * atomicity, after writing tail tag, even if there's space remaining
763  * in the block, next commit shouldn't use it. That's why tail tag
764  * has the length as that of the remaining space on the block.
765  */
766 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
767 {
768 	struct ext4_sb_info *sbi = EXT4_SB(sb);
769 	struct ext4_fc_tl tl;
770 	struct ext4_fc_tail tail;
771 	int off, bsize = sbi->s_journal->j_blocksize;
772 	u8 *dst;
773 
774 	/*
775 	 * ext4_fc_reserve_space takes care of allocating an extra block if
776 	 * there's not enough space on this block to accommodate this tail.
777 	 */
778 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
779 	if (!dst)
780 		return -ENOSPC;
781 
782 	off = sbi->s_fc_bytes % bsize;
783 
784 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
785 	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
786 	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
787 
788 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
789 	dst += sizeof(tl);
790 	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
791 	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
792 	dst += sizeof(tail.fc_tid);
793 	tail.fc_crc = cpu_to_le32(crc);
794 	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
795 
796 	ext4_fc_submit_bh(sb, true);
797 
798 	return 0;
799 }
800 
801 /*
802  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
803  * Returns false if there's not enough space.
804  */
805 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
806 			   u32 *crc)
807 {
808 	struct ext4_fc_tl tl;
809 	u8 *dst;
810 
811 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
812 	if (!dst)
813 		return false;
814 
815 	tl.fc_tag = cpu_to_le16(tag);
816 	tl.fc_len = cpu_to_le16(len);
817 
818 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
819 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
820 
821 	return true;
822 }
823 
824 /* Same as above, but adds dentry tlv. */
825 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
826 				   struct ext4_fc_dentry_update *fc_dentry)
827 {
828 	struct ext4_fc_dentry_info fcd;
829 	struct ext4_fc_tl tl;
830 	int dlen = fc_dentry->fcd_name.len;
831 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
832 					crc);
833 
834 	if (!dst)
835 		return false;
836 
837 	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
838 	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
839 	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
840 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
841 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
842 	dst += sizeof(tl);
843 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
844 	dst += sizeof(fcd);
845 	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
846 
847 	return true;
848 }
849 
850 /*
851  * Writes the inode to the fast commit space under an EXT4_FC_TAG_INODE TLV.
852  * Returns 0 on success, error on failure.
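 *
 * The TLV value written below is laid out as (an illustrative summary of the
 * code that follows):
 *   [fc_ino (le32)][raw struct ext4_inode, inode_len bytes]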
853  */
854 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
855 {
856 	struct ext4_inode_info *ei = EXT4_I(inode);
857 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
858 	int ret;
859 	struct ext4_iloc iloc;
860 	struct ext4_fc_inode fc_inode;
861 	struct ext4_fc_tl tl;
862 	u8 *dst;
863 
864 	ret = ext4_get_inode_loc(inode, &iloc);
865 	if (ret)
866 		return ret;
867 
868 	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
869 		inode_len = EXT4_INODE_SIZE(inode->i_sb);
870 	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
871 		inode_len += ei->i_extra_isize;
872 
873 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
874 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
875 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
876 
877 	dst = ext4_fc_reserve_space(inode->i_sb,
878 			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
879 	if (!dst)
880 		return -ECANCELED;
881 
882 	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
883 		return -ECANCELED;
884 	dst += sizeof(tl);
885 	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
886 		return -ECANCELED;
887 	dst += sizeof(fc_inode);
888 	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
889 					inode_len, crc))
890 		return -ECANCELED;
891 
892 	return 0;
893 }
894 
895 /*
896  * Writes updated data ranges for the inode in question. Updates CRC.
897  * Returns 0 on success, error otherwise.
898  */
899 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
900 {
901 	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
902 	struct ext4_inode_info *ei = EXT4_I(inode);
903 	struct ext4_map_blocks map;
904 	struct ext4_fc_add_range fc_ext;
905 	struct ext4_fc_del_range lrange;
906 	struct ext4_extent *ex;
907 	int ret;
908 
909 	mutex_lock(&ei->i_fc_lock);
910 	if (ei->i_fc_lblk_len == 0) {
911 		mutex_unlock(&ei->i_fc_lock);
912 		return 0;
913 	}
914 	old_blk_size = ei->i_fc_lblk_start;
915 	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
916 	ei->i_fc_lblk_len = 0;
917 	mutex_unlock(&ei->i_fc_lock);
918 
919 	cur_lblk_off = old_blk_size;
920 	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
921 		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
922 
923 	while (cur_lblk_off <= new_blk_size) {
924 		map.m_lblk = cur_lblk_off;
925 		map.m_len = new_blk_size - cur_lblk_off + 1;
926 		ret = ext4_map_blocks(NULL, inode, &map, 0);
927 		if (ret < 0)
928 			return -ECANCELED;
929 
930 		if (map.m_len == 0) {
931 			cur_lblk_off++;
932 			continue;
933 		}
934 
935 		if (ret == 0) {
936 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
937 			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
938 			lrange.fc_len = cpu_to_le32(map.m_len);
939 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
940 					    sizeof(lrange), (u8 *)&lrange, crc))
941 				return -ENOSPC;
942 		} else {
943 			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
944 				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
945 
946 			/* Limit the number of blocks in one extent */
947 			map.m_len = min(max, map.m_len);
948 
949 			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
950 			ex = (struct ext4_extent *)&fc_ext.fc_ex;
951 			ex->ee_block = cpu_to_le32(map.m_lblk);
952 			ex->ee_len = cpu_to_le16(map.m_len);
953 			ext4_ext_store_pblock(ex, map.m_pblk);
954 			if (map.m_flags & EXT4_MAP_UNWRITTEN)
955 				ext4_ext_mark_unwritten(ex);
956 			else
957 				ext4_ext_mark_initialized(ex);
958 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
959 					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
960 				return -ENOSPC;
961 		}
962 
963 		cur_lblk_off += map.m_len;
964 	}
965 
966 	return 0;
967 }
968 
969 
970 /* Submit data for all the fast commit inodes */
971 static int ext4_fc_submit_inode_data_all(journal_t *journal)
972 {
973 	struct super_block *sb = (struct super_block *)(journal->j_private);
974 	struct ext4_sb_info *sbi = EXT4_SB(sb);
975 	struct ext4_inode_info *ei;
976 	int ret = 0;
977 
978 	spin_lock(&sbi->s_fc_lock);
979 	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
980 		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
981 		while (atomic_read(&ei->i_fc_updates)) {
982 			DEFINE_WAIT(wait);
983 
984 			prepare_to_wait(&ei->i_fc_wait, &wait,
985 						TASK_UNINTERRUPTIBLE);
986 			if (atomic_read(&ei->i_fc_updates)) {
987 				spin_unlock(&sbi->s_fc_lock);
988 				schedule();
989 				spin_lock(&sbi->s_fc_lock);
990 			}
991 			finish_wait(&ei->i_fc_wait, &wait);
992 		}
993 		spin_unlock(&sbi->s_fc_lock);
994 		ret = jbd2_submit_inode_data(ei->jinode);
995 		if (ret)
996 			return ret;
997 		spin_lock(&sbi->s_fc_lock);
998 	}
999 	spin_unlock(&sbi->s_fc_lock);
1000 
1001 	return ret;
1002 }
1003 
1004 /* Wait for completion of data for all the fast commit inodes */
1005 static int ext4_fc_wait_inode_data_all(journal_t *journal)
1006 {
1007 	struct super_block *sb = (struct super_block *)(journal->j_private);
1008 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1009 	struct ext4_inode_info *pos, *n;
1010 	int ret = 0;
1011 
1012 	spin_lock(&sbi->s_fc_lock);
1013 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1014 		if (!ext4_test_inode_state(&pos->vfs_inode,
1015 					   EXT4_STATE_FC_COMMITTING))
1016 			continue;
1017 		spin_unlock(&sbi->s_fc_lock);
1018 
1019 		ret = jbd2_wait_inode_data(journal, pos->jinode);
1020 		if (ret)
1021 			return ret;
1022 		spin_lock(&sbi->s_fc_lock);
1023 	}
1024 	spin_unlock(&sbi->s_fc_lock);
1025 
1026 	return 0;
1027 }
1028 
1029 /* Commit all the directory entry updates */
1030 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
1031 __acquires(&sbi->s_fc_lock)
1032 __releases(&sbi->s_fc_lock)
1033 {
1034 	struct super_block *sb = (struct super_block *)(journal->j_private);
1035 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1036 	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
1037 	struct inode *inode;
1038 	struct ext4_inode_info *ei;
1039 	int ret;
1040 
1041 	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1042 		return 0;
1043 	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1044 				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1045 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1046 			spin_unlock(&sbi->s_fc_lock);
1047 			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1048 				ret = -ENOSPC;
1049 				goto lock_and_exit;
1050 			}
1051 			spin_lock(&sbi->s_fc_lock);
1052 			continue;
1053 		}
1054 		/*
1055 		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1056 		 * corresponding inode pointer
1057 		 */
1058 		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
1059 		ei = list_first_entry(&fc_dentry->fcd_dilist,
1060 				struct ext4_inode_info, i_fc_dilist);
1061 		inode = &ei->vfs_inode;
1062 		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1063 
1064 		spin_unlock(&sbi->s_fc_lock);
1065 
1066 		/*
1067 		 * We first write the inode and then the create dirent. This
1068 		 * allows the recovery code to create an unnamed inode first
1069 		 * and then link it to a directory entry. This allows us
1070 		 * to use namei.c routines almost as is and simplifies
1071 		 * the recovery code.
1072 		 */
1073 		ret = ext4_fc_write_inode(inode, crc);
1074 		if (ret)
1075 			goto lock_and_exit;
1076 
1077 		ret = ext4_fc_write_inode_data(inode, crc);
1078 		if (ret)
1079 			goto lock_and_exit;
1080 
1081 		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1082 			ret = -ENOSPC;
1083 			goto lock_and_exit;
1084 		}
1085 
1086 		spin_lock(&sbi->s_fc_lock);
1087 	}
1088 	return 0;
1089 lock_and_exit:
1090 	spin_lock(&sbi->s_fc_lock);
1091 	return ret;
1092 }
1093 
1094 static int ext4_fc_perform_commit(journal_t *journal)
1095 {
1096 	struct super_block *sb = (struct super_block *)(journal->j_private);
1097 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1098 	struct ext4_inode_info *iter;
1099 	struct ext4_fc_head head;
1100 	struct inode *inode;
1101 	struct blk_plug plug;
1102 	int ret = 0;
1103 	u32 crc = 0;
1104 
1105 	ret = ext4_fc_submit_inode_data_all(journal);
1106 	if (ret)
1107 		return ret;
1108 
1109 	ret = ext4_fc_wait_inode_data_all(journal);
1110 	if (ret)
1111 		return ret;
1112 
1113 	/*
1114 	 * If file system device is different from journal device, issue a cache
1115 	 * flush before we start writing fast commit blocks.
1116 	 */
1117 	if (journal->j_fs_dev != journal->j_dev)
1118 		blkdev_issue_flush(journal->j_fs_dev);
1119 
1120 	blk_start_plug(&plug);
1121 	if (sbi->s_fc_bytes == 0) {
1122 		/*
1123 		 * Add a head tag only if this is the first fast commit
1124 		 * in this TID.
1125 		 */
1126 		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1127 		head.fc_tid = cpu_to_le32(
1128 			sbi->s_journal->j_running_transaction->t_tid);
1129 		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1130 			(u8 *)&head, &crc)) {
1131 			ret = -ENOSPC;
1132 			goto out;
1133 		}
1134 	}
1135 
1136 	spin_lock(&sbi->s_fc_lock);
1137 	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1138 	if (ret) {
1139 		spin_unlock(&sbi->s_fc_lock);
1140 		goto out;
1141 	}
1142 
1143 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1144 		inode = &iter->vfs_inode;
1145 		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1146 			continue;
1147 
1148 		spin_unlock(&sbi->s_fc_lock);
1149 		ret = ext4_fc_write_inode_data(inode, &crc);
1150 		if (ret)
1151 			goto out;
1152 		ret = ext4_fc_write_inode(inode, &crc);
1153 		if (ret)
1154 			goto out;
1155 		spin_lock(&sbi->s_fc_lock);
1156 	}
1157 	spin_unlock(&sbi->s_fc_lock);
1158 
1159 	ret = ext4_fc_write_tail(sb, crc);
1160 
1161 out:
1162 	blk_finish_plug(&plug);
1163 	return ret;
1164 }
1165 
1166 static void ext4_fc_update_stats(struct super_block *sb, int status,
1167 				 u64 commit_time, int nblks)
1168 {
1169 	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1170 
1171 	jbd_debug(1, "Fast commit ended with status = %d", status);
1172 	if (status == EXT4_FC_STATUS_OK) {
1173 		stats->fc_num_commits++;
1174 		stats->fc_numblks += nblks;
1175 		if (likely(stats->s_fc_avg_commit_time))
1176 			stats->s_fc_avg_commit_time =
1177 				(commit_time +
1178 				 stats->s_fc_avg_commit_time * 3) / 4;
1179 		else
1180 			stats->s_fc_avg_commit_time = commit_time;
1181 	} else if (status == EXT4_FC_STATUS_FAILED ||
1182 		   status == EXT4_FC_STATUS_INELIGIBLE) {
1183 		if (status == EXT4_FC_STATUS_FAILED)
1184 			stats->fc_failed_commits++;
1185 		stats->fc_ineligible_commits++;
1186 	} else {
1187 		stats->fc_skipped_commits++;
1188 	}
1189 	trace_ext4_fc_commit_stop(sb, nblks, status);
1190 }
1191 
1192 /*
1193  * The main commit entry point. Performs a fast commit for transaction
1194  * commit_tid if needed. If it's not possible to perform a fast commit
1195  * due to various reasons, we fall back to full commit. Returns 0
1196  * on success, error otherwise.
1197  */
1198 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1199 {
1200 	struct super_block *sb = (struct super_block *)(journal->j_private);
1201 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1202 	int nblks = 0, ret, bsize = journal->j_blocksize;
1203 	int subtid = atomic_read(&sbi->s_fc_subtid);
1204 	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1205 	ktime_t start_time, commit_time;
1206 
1207 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1208 		return jbd2_complete_transaction(journal, commit_tid);
1209 
1210 	trace_ext4_fc_commit_start(sb);
1211 
1212 	start_time = ktime_get();
1213 
1214 restart_fc:
1215 	ret = jbd2_fc_begin_commit(journal, commit_tid);
1216 	if (ret == -EALREADY) {
1217 		/* There was an ongoing commit, check if we need to restart */
1218 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1219 			commit_tid > journal->j_commit_sequence)
1220 			goto restart_fc;
1221 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0);
1222 		return 0;
1223 	} else if (ret) {
1224 		/*
1225 		 * Commit couldn't start. Just update stats and perform a
1226 		 * full commit.
1227 		 */
1228 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0);
1229 		return jbd2_complete_transaction(journal, commit_tid);
1230 	}
1231 
1232 	/*
1233 	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
1234 	 * if we are fast commit ineligible.
1235 	 */
1236 	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1237 		status = EXT4_FC_STATUS_INELIGIBLE;
1238 		goto fallback;
1239 	}
1240 
1241 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1242 	ret = ext4_fc_perform_commit(journal);
1243 	if (ret < 0) {
1244 		status = EXT4_FC_STATUS_FAILED;
1245 		goto fallback;
1246 	}
1247 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1248 	ret = jbd2_fc_wait_bufs(journal, nblks);
1249 	if (ret < 0) {
1250 		status = EXT4_FC_STATUS_FAILED;
1251 		goto fallback;
1252 	}
1253 	atomic_inc(&sbi->s_fc_subtid);
1254 	ret = jbd2_fc_end_commit(journal);
1255 	/*
1256 	 * weight the historical average higher than the latest commit time so
1257 	 * we don't react too strongly to vast changes in the commit time
1258 	 */
1259 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1260 	ext4_fc_update_stats(sb, status, commit_time, nblks);
1261 	return ret;
1262 
1263 fallback:
1264 	ret = jbd2_fc_end_commit_fallback(journal);
1265 	ext4_fc_update_stats(sb, status, 0, 0);
1266 	return ret;
1267 }
1268 
1269 /*
1270  * Fast commit cleanup routine. This is called after every fast commit and
1271  * full commit. full is true if we are called after a full commit.
1272  */
1273 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1274 {
1275 	struct super_block *sb = journal->j_private;
1276 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1277 	struct ext4_inode_info *iter, *iter_n;
1278 	struct ext4_fc_dentry_update *fc_dentry;
1279 
1280 	if (full && sbi->s_fc_bh)
1281 		sbi->s_fc_bh = NULL;
1282 
1283 	trace_ext4_fc_cleanup(journal, full, tid);
1284 	jbd2_fc_release_bufs(journal);
1285 
1286 	spin_lock(&sbi->s_fc_lock);
1287 	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1288 				 i_fc_list) {
1289 		list_del_init(&iter->i_fc_list);
1290 		ext4_clear_inode_state(&iter->vfs_inode,
1291 				       EXT4_STATE_FC_COMMITTING);
1292 		if (iter->i_sync_tid <= tid)
1293 			ext4_fc_reset_inode(&iter->vfs_inode);
1294 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1295 		smp_mb();
1296 #if (BITS_PER_LONG < 64)
1297 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1298 #else
1299 		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1300 #endif
1301 	}
1302 
1303 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1304 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1305 					     struct ext4_fc_dentry_update,
1306 					     fcd_list);
1307 		list_del_init(&fc_dentry->fcd_list);
1308 		list_del_init(&fc_dentry->fcd_dilist);
1309 		spin_unlock(&sbi->s_fc_lock);
1310 
1311 		if (fc_dentry->fcd_name.name &&
1312 			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1313 			kfree(fc_dentry->fcd_name.name);
1314 		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1315 		spin_lock(&sbi->s_fc_lock);
1316 	}
1317 
1318 	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1319 				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1320 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1321 				&sbi->s_fc_q[FC_Q_MAIN]);
1322 
1323 	if (tid >= sbi->s_fc_ineligible_tid) {
1324 		sbi->s_fc_ineligible_tid = 0;
1325 		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1326 	}
1327 
1328 	if (full)
1329 		sbi->s_fc_bytes = 0;
1330 	spin_unlock(&sbi->s_fc_lock);
1331 	trace_ext4_fc_stats(sb);
1332 }
1333 
1334 /* Ext4 Replay Path Routines */
1335 
1336 /* Helper struct for dentry replay routines */
1337 struct dentry_info_args {
1338 	int parent_ino, dname_len, ino, inode_len;
1339 	char *dname;
1340 };
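
/*
 * Illustrative note: the value part of a dentry TLV, as produced by
 * ext4_fc_add_dentry_tlv(), is laid out as
 *
 *   [fc_parent_ino (le32)][fc_ino (le32)][dname bytes]
 *
 * where the name length is tl->fc_len minus sizeof(struct ext4_fc_dentry_info).
 * tl_to_darg() below unpacks this layout into struct dentry_info_args.
 */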
1341 
1342 static inline void tl_to_darg(struct dentry_info_args *darg,
1343 			      struct  ext4_fc_tl *tl, u8 *val)
1344 {
1345 	struct ext4_fc_dentry_info fcd;
1346 
1347 	memcpy(&fcd, val, sizeof(fcd));
1348 
1349 	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1350 	darg->ino = le32_to_cpu(fcd.fc_ino);
1351 	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1352 	darg->dname_len = le16_to_cpu(tl->fc_len) -
1353 		sizeof(struct ext4_fc_dentry_info);
1354 }
1355 
1356 /* Unlink replay function */
1357 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1358 				 u8 *val)
1359 {
1360 	struct inode *inode, *old_parent;
1361 	struct qstr entry;
1362 	struct dentry_info_args darg;
1363 	int ret = 0;
1364 
1365 	tl_to_darg(&darg, tl, val);
1366 
1367 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1368 			darg.parent_ino, darg.dname_len);
1369 
1370 	entry.name = darg.dname;
1371 	entry.len = darg.dname_len;
1372 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1373 
1374 	if (IS_ERR(inode)) {
1375 		jbd_debug(1, "Inode %d not found", darg.ino);
1376 		return 0;
1377 	}
1378 
1379 	old_parent = ext4_iget(sb, darg.parent_ino,
1380 				EXT4_IGET_NORMAL);
1381 	if (IS_ERR(old_parent)) {
1382 		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1383 		iput(inode);
1384 		return 0;
1385 	}
1386 
1387 	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1388 	/* -ENOENT is ok because the entry might not exist anymore. */
1389 	if (ret == -ENOENT)
1390 		ret = 0;
1391 	iput(old_parent);
1392 	iput(inode);
1393 	return ret;
1394 }
1395 
1396 static int ext4_fc_replay_link_internal(struct super_block *sb,
1397 				struct dentry_info_args *darg,
1398 				struct inode *inode)
1399 {
1400 	struct inode *dir = NULL;
1401 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1402 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1403 	int ret = 0;
1404 
1405 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1406 	if (IS_ERR(dir)) {
1407 		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1408 		dir = NULL;
1409 		goto out;
1410 	}
1411 
1412 	dentry_dir = d_obtain_alias(dir);
1413 	if (IS_ERR(dentry_dir)) {
1414 		jbd_debug(1, "Failed to obtain dentry");
1415 		dentry_dir = NULL;
1416 		goto out;
1417 	}
1418 
1419 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1420 	if (!dentry_inode) {
1421 		jbd_debug(1, "Inode dentry not created.");
1422 		ret = -ENOMEM;
1423 		goto out;
1424 	}
1425 
1426 	ret = __ext4_link(dir, inode, dentry_inode);
1427 	/*
1428 	 * It's possible that link already existed since data blocks
1429 	 * for the dir in question got persisted before we crashed OR
1430 	 * we replayed this tag and crashed before the entire replay
1431 	 * could complete.
1432 	 */
1433 	if (ret && ret != -EEXIST) {
1434 		jbd_debug(1, "Failed to link\n");
1435 		goto out;
1436 	}
1437 
1438 	ret = 0;
1439 out:
1440 	if (dentry_dir) {
1441 		d_drop(dentry_dir);
1442 		dput(dentry_dir);
1443 	} else if (dir) {
1444 		iput(dir);
1445 	}
1446 	if (dentry_inode) {
1447 		d_drop(dentry_inode);
1448 		dput(dentry_inode);
1449 	}
1450 
1451 	return ret;
1452 }
1453 
1454 /* Link replay function */
1455 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1456 			       u8 *val)
1457 {
1458 	struct inode *inode;
1459 	struct dentry_info_args darg;
1460 	int ret = 0;
1461 
1462 	tl_to_darg(&darg, tl, val);
1463 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1464 			darg.parent_ino, darg.dname_len);
1465 
1466 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1467 	if (IS_ERR(inode)) {
1468 		jbd_debug(1, "Inode not found.");
1469 		return 0;
1470 	}
1471 
1472 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1473 	iput(inode);
1474 	return ret;
1475 }
1476 
1477 /*
1478  * Record all the modified inodes during replay. We use this later to setup
1479  * block bitmaps correctly.
1480  */
1481 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1482 {
1483 	struct ext4_fc_replay_state *state;
1484 	int i;
1485 
1486 	state = &EXT4_SB(sb)->s_fc_replay_state;
1487 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1488 		if (state->fc_modified_inodes[i] == ino)
1489 			return 0;
1490 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1491 		state->fc_modified_inodes = krealloc(
1492 				state->fc_modified_inodes,
1493 				sizeof(int) * (state->fc_modified_inodes_size +
1494 				EXT4_FC_REPLAY_REALLOC_INCREMENT),
1495 				GFP_KERNEL);
1496 		if (!state->fc_modified_inodes)
1497 			return -ENOMEM;
1498 		state->fc_modified_inodes_size +=
1499 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1500 	}
1501 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1502 	return 0;
1503 }
1504 
1505 /*
1506  * Inode replay function
1507  */
1508 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1509 				u8 *val)
1510 {
1511 	struct ext4_fc_inode fc_inode;
1512 	struct ext4_inode *raw_inode;
1513 	struct ext4_inode *raw_fc_inode;
1514 	struct inode *inode = NULL;
1515 	struct ext4_iloc iloc;
1516 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1517 	struct ext4_extent_header *eh;
1518 
1519 	memcpy(&fc_inode, val, sizeof(fc_inode));
1520 
1521 	ino = le32_to_cpu(fc_inode.fc_ino);
1522 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1523 
1524 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1525 	if (!IS_ERR(inode)) {
1526 		ext4_ext_clear_bb(inode);
1527 		iput(inode);
1528 	}
1529 	inode = NULL;
1530 
1531 	ret = ext4_fc_record_modified_inode(sb, ino);
1532 	if (ret)
1533 		goto out;
1534 
1535 	raw_fc_inode = (struct ext4_inode *)
1536 		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1537 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1538 	if (ret)
1539 		goto out;
1540 
1541 	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1542 	raw_inode = ext4_raw_inode(&iloc);
1543 
1544 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1545 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1546 		inode_len - offsetof(struct ext4_inode, i_generation));
1547 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1548 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1549 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1550 			memset(eh, 0, sizeof(*eh));
1551 			eh->eh_magic = EXT4_EXT_MAGIC;
1552 			eh->eh_max = cpu_to_le16(
1553 				(sizeof(raw_inode->i_block) -
1554 				 sizeof(struct ext4_extent_header))
1555 				 / sizeof(struct ext4_extent));
1556 		}
1557 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1558 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1559 			sizeof(raw_inode->i_block));
1560 	}
1561 
1562 	/* Immediately update the inode on disk. */
1563 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1564 	if (ret)
1565 		goto out;
1566 	ret = sync_dirty_buffer(iloc.bh);
1567 	if (ret)
1568 		goto out;
1569 	ret = ext4_mark_inode_used(sb, ino);
1570 	if (ret)
1571 		goto out;
1572 
1573 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1574 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1575 	if (IS_ERR(inode)) {
1576 		jbd_debug(1, "Inode not found.");
1577 		return -EFSCORRUPTED;
1578 	}
1579 
1580 	/*
1581 	 * Our allocator could have made different decisions than before
1582 	 * crashing. This should be fixed but until then, we calculate
1583 	 * the number of blocks used by the inode.
1584 	 */
1585 	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1586 		ext4_ext_replay_set_iblocks(inode);
1587 
1588 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1589 	ext4_reset_inode_seed(inode);
1590 
1591 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1592 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1593 	sync_dirty_buffer(iloc.bh);
1594 	brelse(iloc.bh);
1595 out:
1596 	iput(inode);
1597 	if (!ret)
1598 		blkdev_issue_flush(sb->s_bdev);
1599 
1600 	return 0;
1601 }
1602 
1603 /*
1604  * Dentry create replay function.
1605  *
1606  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the
1607  * inode for which we are trying to create a dentry here should already have
1608  * been replayed before we get here.
1609  */
1610 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1611 				 u8 *val)
1612 {
1613 	int ret = 0;
1614 	struct inode *inode = NULL;
1615 	struct inode *dir = NULL;
1616 	struct dentry_info_args darg;
1617 
1618 	tl_to_darg(&darg, tl, val);
1619 
1620 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1621 			darg.parent_ino, darg.dname_len);
1622 
1623 	/* This takes care of updating the group descriptor and other metadata */
1624 	ret = ext4_mark_inode_used(sb, darg.ino);
1625 	if (ret)
1626 		goto out;
1627 
1628 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1629 	if (IS_ERR(inode)) {
1630 		jbd_debug(1, "inode %d not found.", darg.ino);
1631 		inode = NULL;
1632 		ret = -EINVAL;
1633 		goto out;
1634 	}
1635 
1636 	if (S_ISDIR(inode->i_mode)) {
1637 		/*
1638 		 * If we are creating a directory, we need to make sure that the
1639 		 * dot and dot dot dirents are set up properly.
1640 		 */
1641 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1642 		if (IS_ERR(dir)) {
1643 			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
1644 			goto out;
1645 		}
1646 		ret = ext4_init_new_dir(NULL, dir, inode);
1647 		iput(dir);
1648 		if (ret) {
1649 			ret = 0;
1650 			goto out;
1651 		}
1652 	}
1653 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1654 	if (ret)
1655 		goto out;
1656 	set_nlink(inode, 1);
1657 	ext4_mark_inode_dirty(NULL, inode);
1658 out:
1659 	if (inode)
1660 		iput(inode);
1661 	return ret;
1662 }
1663 
1664 /*
1665  * Record the physical disk regions that are in use according to the fast
1666  * commit area and that are used by inodes during the replay phase. Our
1667  * simple replay-phase allocator excludes these regions from allocation.
1668  */
1669 int ext4_fc_record_regions(struct super_block *sb, int ino,
1670 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1671 {
1672 	struct ext4_fc_replay_state *state;
1673 	struct ext4_fc_alloc_region *region;
1674 
1675 	state = &EXT4_SB(sb)->s_fc_replay_state;
1676 	/*
1677 	 * During the replay phase, fc_regions_valid may not be the same as
1678 	 * fc_regions_used; update it when making new additions.
1679 	 */
1680 	if (replay && state->fc_regions_used != state->fc_regions_valid)
1681 		state->fc_regions_used = state->fc_regions_valid;
1682 	if (state->fc_regions_used == state->fc_regions_size) {
1683 		state->fc_regions_size +=
1684 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1685 		state->fc_regions = krealloc(
1686 					state->fc_regions,
1687 					state->fc_regions_size *
1688 					sizeof(struct ext4_fc_alloc_region),
1689 					GFP_KERNEL);
1690 		if (!state->fc_regions)
1691 			return -ENOMEM;
1692 	}
1693 	region = &state->fc_regions[state->fc_regions_used++];
1694 	region->ino = ino;
1695 	region->lblk = lblk;
1696 	region->pblk = pblk;
1697 	region->len = len;
1698 
1699 	if (replay)
1700 		state->fc_regions_valid++;
1701 
1702 	return 0;
1703 }
1704 
1705 /* Replay add range tag */
1706 static int ext4_fc_replay_add_range(struct super_block *sb,
1707 				    struct ext4_fc_tl *tl, u8 *val)
1708 {
1709 	struct ext4_fc_add_range fc_add_ex;
1710 	struct ext4_extent newex, *ex;
1711 	struct inode *inode;
1712 	ext4_lblk_t start, cur;
1713 	int remaining, len;
1714 	ext4_fsblk_t start_pblk;
1715 	struct ext4_map_blocks map;
1716 	struct ext4_ext_path *path = NULL;
1717 	int ret;
1718 
1719 	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1720 	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1721 
1722 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1723 		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1724 		ext4_ext_get_actual_len(ex));
1725 
1726 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1727 	if (IS_ERR(inode)) {
1728 		jbd_debug(1, "Inode not found.");
1729 		return 0;
1730 	}
1731 
1732 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1733 	if (ret)
1734 		goto out;
1735 
1736 	start = le32_to_cpu(ex->ee_block);
1737 	start_pblk = ext4_ext_pblock(ex);
1738 	len = ext4_ext_get_actual_len(ex);
1739 
1740 	cur = start;
1741 	remaining = len;
1742 	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1743 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1744 		  inode->i_ino);
1745 
1746 	while (remaining > 0) {
1747 		map.m_lblk = cur;
1748 		map.m_len = remaining;
1749 		map.m_pblk = 0;
1750 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1751 
1752 		if (ret < 0)
1753 			goto out;
1754 
1755 		if (ret == 0) {
1756 			/* Range is not mapped */
1757 			path = ext4_find_extent(inode, cur, NULL, 0);
1758 			if (IS_ERR(path))
1759 				goto out;
1760 			memset(&newex, 0, sizeof(newex));
1761 			newex.ee_block = cpu_to_le32(cur);
1762 			ext4_ext_store_pblock(
1763 				&newex, start_pblk + cur - start);
1764 			newex.ee_len = cpu_to_le16(map.m_len);
1765 			if (ext4_ext_is_unwritten(ex))
1766 				ext4_ext_mark_unwritten(&newex);
1767 			down_write(&EXT4_I(inode)->i_data_sem);
1768 			ret = ext4_ext_insert_extent(
1769 				NULL, inode, &path, &newex, 0);
1770 			up_write((&EXT4_I(inode)->i_data_sem));
1771 			ext4_ext_drop_refs(path);
1772 			kfree(path);
1773 			if (ret)
1774 				goto out;
1775 			goto next;
1776 		}
1777 
1778 		if (start_pblk + cur - start != map.m_pblk) {
1779 			/*
1780 			 * Logical to physical mapping changed. This can happen
1781 			 * if this range was removed and then reallocated to
1782 			 * map to new physical blocks during a fast commit.
1783 			 */
1784 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1785 					ext4_ext_is_unwritten(ex),
1786 					start_pblk + cur - start);
1787 			if (ret)
1788 				goto out;
1789 			/*
1790 			 * Mark the old blocks as free since they aren't used
1791 			 * anymore. We maintain an array of all the modified
1792 			 * inodes. In case these blocks are still used at either
1793 			 * a different logical range in the same inode or in
1794 			 * some different inode, we will mark them as allocated
1795 			 * at the end of the FC replay using our array of
1796 			 * modified inodes.
1797 			 */
1798 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1799 			goto next;
1800 		}
1801 
1802 		/* Range is mapped and needs a state change */
1803 		jbd_debug(1, "Converting from %ld to %d %lld",
1804 				map.m_flags & EXT4_MAP_UNWRITTEN,
1805 			ext4_ext_is_unwritten(ex), map.m_pblk);
1806 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1807 					ext4_ext_is_unwritten(ex), map.m_pblk);
1808 		if (ret)
1809 			goto out;
1810 		/*
1811 		 * We may have split the extent tree while toggling the state.
1812 		 * Try to shrink the extent tree now.
1813 		 */
1814 		ext4_ext_replay_shrink_inode(inode, start + len);
1815 next:
1816 		cur += map.m_len;
1817 		remaining -= map.m_len;
1818 	}
1819 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1820 					sb->s_blocksize_bits);
1821 out:
1822 	iput(inode);
1823 	return 0;
1824 }
1825 
1826 /* Replay DEL_RANGE tag */
1827 static int
1828 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1829 			 u8 *val)
1830 {
1831 	struct inode *inode;
1832 	struct ext4_fc_del_range lrange;
1833 	struct ext4_map_blocks map;
1834 	ext4_lblk_t cur, remaining;
1835 	int ret;
1836 
1837 	memcpy(&lrange, val, sizeof(lrange));
1838 	cur = le32_to_cpu(lrange.fc_lblk);
1839 	remaining = le32_to_cpu(lrange.fc_len);
1840 
1841 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1842 		le32_to_cpu(lrange.fc_ino), cur, remaining);
1843 
1844 	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1845 	if (IS_ERR(inode)) {
1846 		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1847 		return 0;
1848 	}
1849 
1850 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1851 	if (ret)
1852 		goto out;
1853 
1854 	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1855 			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1856 			le32_to_cpu(lrange.fc_len));
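	/*
	 * First release, in the block bitmaps, whatever is currently mapped in
	 * this range; the extent tree itself is trimmed further below with
	 * ext4_ext_remove_space().
	 */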
1857 	while (remaining > 0) {
1858 		map.m_lblk = cur;
1859 		map.m_len = remaining;
1860 
1861 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1862 		if (ret < 0)
1863 			goto out;
1864 		if (ret > 0) {
1865 			remaining -= ret;
1866 			cur += ret;
1867 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1868 		} else {
1869 			remaining -= map.m_len;
1870 			cur += map.m_len;
1871 		}
1872 	}
1873 
1874 	down_write(&EXT4_I(inode)->i_data_sem);
1875 	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1876 				le32_to_cpu(lrange.fc_lblk) +
1877 				le32_to_cpu(lrange.fc_len) - 1);
1878 	up_write(&EXT4_I(inode)->i_data_sem);
1879 	if (ret)
1880 		goto out;
1881 	ext4_ext_replay_shrink_inode(inode,
1882 		i_size_read(inode) >> sb->s_blocksize_bits);
1883 	ext4_mark_inode_dirty(NULL, inode);
1884 out:
1885 	iput(inode);
1886 	return 0;
1887 }
1888 
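/*
 * After all tags have been replayed, walk every inode recorded in
 * fc_modified_inodes and mark the blocks referenced by its extent tree (the
 * tree's own index blocks as well as the data blocks they map) as in-use in
 * the block bitmaps, so that the on-disk allocation bitmaps and group
 * counters match the replayed extent trees.
 */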
1889 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1890 {
1891 	struct ext4_fc_replay_state *state;
1892 	struct inode *inode;
1893 	struct ext4_ext_path *path = NULL;
1894 	struct ext4_map_blocks map;
1895 	int i, ret, j;
1896 	ext4_lblk_t cur, end;
1897 
1898 	state = &EXT4_SB(sb)->s_fc_replay_state;
1899 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1900 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1901 			EXT4_IGET_NORMAL);
1902 		if (IS_ERR(inode)) {
1903 			jbd_debug(1, "Inode %d not found.",
1904 				state->fc_modified_inodes[i]);
1905 			continue;
1906 		}
1907 		cur = 0;
1908 		end = EXT_MAX_BLOCKS;
1909 		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1910 			iput(inode);
1911 			continue;
1912 		}
1913 		while (cur < end) {
1914 			map.m_lblk = cur;
1915 			map.m_len = end - cur;
1916 
1917 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1918 			if (ret < 0)
1919 				break;
1920 
1921 			if (ret > 0) {
1922 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1923 				if (!IS_ERR(path)) {
1924 					for (j = 0; j < path->p_depth; j++)
1925 						ext4_mb_mark_bb(inode->i_sb,
1926 							path[j].p_block, 1, 1);
1927 					ext4_ext_drop_refs(path);
1928 					kfree(path);
1929 				}
1930 				cur += ret;
1931 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1932 							map.m_len, 1);
1933 			} else {
1934 				cur = cur + (map.m_len ? map.m_len : 1);
1935 			}
1936 		}
1937 		iput(inode);
1938 	}
1939 }
1940 
1941 /*
1942  * Check if block is in excluded regions for block allocation. The simple
1943  * allocator that runs during the replay phase calls this function to see
1944  * if it is okay to use a block.
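 *
 * A minimal usage sketch (hypothetical caller; the actual consumer is the
 * simple replay-time allocator in mballoc.c), skipping any candidate block
 * that falls inside a recorded region:
 *
 *	while (blk < last_blk && ext4_fc_replay_check_excluded(sb, blk))
 *		blk++;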
1945  */
1946 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1947 {
1948 	int i;
1949 	struct ext4_fc_replay_state *state;
1950 
1951 	state = &EXT4_SB(sb)->s_fc_replay_state;
1952 	for (i = 0; i < state->fc_regions_valid; i++) {
1953 		if (state->fc_regions[i].ino == 0 ||
1954 			state->fc_regions[i].len == 0)
1955 			continue;
1956 		if (in_range(blk, state->fc_regions[i].pblk,
1957 					state->fc_regions[i].len))
1958 			return true;
1959 	}
1960 	return false;
1961 }
1962 
1963 /* Cleanup function called after replay */
1964 void ext4_fc_replay_cleanup(struct super_block *sb)
1965 {
1966 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1967 
1968 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1969 	kfree(sbi->s_fc_replay_state.fc_regions);
1970 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1971 }
1972 
1973 /*
1974  * Recovery Scan phase handler
1975  *
1976  * This function is called during the scan phase and is responsible
1977  * for doing the following things:
1978  * - Make sure the fast commit area has valid tags for replay
1979  * - Count the number of tags that need to be replayed by the replay handler
1980  * - Verify CRC
1981  * - Create a list of excluded blocks for allocation during replay phase
1982  *
1983  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1984  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1985  * to indicate that scan has finished and JBD2 can now start replay phase.
1986  * It returns a negative error code to indicate that an error occurred. At the
1987  * end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is
1988  * set to the number of tags that need to be replayed during the replay phase.
1989  */
1990 static int ext4_fc_replay_scan(journal_t *journal,
1991 				struct buffer_head *bh, int off,
1992 				tid_t expected_tid)
1993 {
1994 	struct super_block *sb = journal->j_private;
1995 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1996 	struct ext4_fc_replay_state *state;
1997 	int ret = JBD2_FC_REPLAY_CONTINUE;
1998 	struct ext4_fc_add_range ext;
1999 	struct ext4_fc_tl tl;
2000 	struct ext4_fc_tail tail;
2001 	__u8 *start, *end, *cur, *val;
2002 	struct ext4_fc_head head;
2003 	struct ext4_extent *ex;
2004 
2005 	state = &sbi->s_fc_replay_state;
2006 
2007 	start = (u8 *)bh->b_data;
2008 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2009 
2010 	if (state->fc_replay_expected_off == 0) {
2011 		state->fc_cur_tag = 0;
2012 		state->fc_replay_num_tags = 0;
2013 		state->fc_crc = 0;
2014 		state->fc_regions = NULL;
2015 		state->fc_regions_valid = state->fc_regions_used =
2016 			state->fc_regions_size = 0;
2017 		/* Check if we can stop early */
2018 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2019 			!= EXT4_FC_TAG_HEAD)
2020 			return 0;
2021 	}
2022 
2023 	if (off != state->fc_replay_expected_off) {
2024 		ret = -EFSCORRUPTED;
2025 		goto out_err;
2026 	}
2027 
2028 	state->fc_replay_expected_off++;
2029 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2030 		memcpy(&tl, cur, sizeof(tl));
2031 		val = cur + sizeof(tl);
2032 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
2033 			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
2034 		switch (le16_to_cpu(tl.fc_tag)) {
2035 		case EXT4_FC_TAG_ADD_RANGE:
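			/*
			 * Remember the physical range referenced by this tag
			 * so that the replay-phase allocator will not hand
			 * these blocks out again.
			 */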
2036 			memcpy(&ext, val, sizeof(ext));
2037 			ex = (struct ext4_extent *)&ext.fc_ex;
2038 			ret = ext4_fc_record_regions(sb,
2039 				le32_to_cpu(ext.fc_ino),
2040 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2041 				ext4_ext_get_actual_len(ex), 0);
2042 			if (ret < 0)
2043 				break;
2044 			ret = JBD2_FC_REPLAY_CONTINUE;
2045 			fallthrough;
2046 		case EXT4_FC_TAG_DEL_RANGE:
2047 		case EXT4_FC_TAG_LINK:
2048 		case EXT4_FC_TAG_UNLINK:
2049 		case EXT4_FC_TAG_CREAT:
2050 		case EXT4_FC_TAG_INODE:
2051 		case EXT4_FC_TAG_PAD:
2052 			state->fc_cur_tag++;
2053 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2054 					sizeof(tl) + le16_to_cpu(tl.fc_len));
2055 			break;
2056 		case EXT4_FC_TAG_TAIL:
2057 			state->fc_cur_tag++;
2058 			memcpy(&tail, val, sizeof(tail));
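			/*
			 * The running checksum for a tail tag covers the TL
			 * header plus the tail fields up to, but not
			 * including, fc_crc itself, so that the result can be
			 * compared against the CRC stored in the tag.
			 */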
2059 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2060 						sizeof(tl) +
2061 						offsetof(struct ext4_fc_tail,
2062 						fc_crc));
2063 			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2064 				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2065 				state->fc_replay_num_tags = state->fc_cur_tag;
2066 				state->fc_regions_valid =
2067 					state->fc_regions_used;
2068 			} else {
2069 				ret = state->fc_replay_num_tags ?
2070 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2071 			}
2072 			state->fc_crc = 0;
2073 			break;
2074 		case EXT4_FC_TAG_HEAD:
2075 			memcpy(&head, val, sizeof(head));
2076 			if (le32_to_cpu(head.fc_features) &
2077 				~EXT4_FC_SUPPORTED_FEATURES) {
2078 				ret = -EOPNOTSUPP;
2079 				break;
2080 			}
2081 			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2082 				ret = JBD2_FC_REPLAY_STOP;
2083 				break;
2084 			}
2085 			state->fc_cur_tag++;
2086 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2087 					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2088 			break;
2089 		default:
2090 			ret = state->fc_replay_num_tags ?
2091 				JBD2_FC_REPLAY_STOP : -ECANCELED;
2092 		}
2093 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2094 			break;
2095 	}
2096 
2097 out_err:
2098 	trace_ext4_fc_replay_scan(sb, ret, off);
2099 	return ret;
2100 }
2101 
2102 /*
2103  * Main recovery path entry point.
2104  * The meaning of the return codes is the same as for the scan handler above.
2105  */
2106 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2107 				enum passtype pass, int off, tid_t expected_tid)
2108 {
2109 	struct super_block *sb = journal->j_private;
2110 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2111 	struct ext4_fc_tl tl;
2112 	__u8 *start, *end, *cur, *val;
2113 	int ret = JBD2_FC_REPLAY_CONTINUE;
2114 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2115 	struct ext4_fc_tail tail;
2116 
2117 	if (pass == PASS_SCAN) {
2118 		state->fc_current_pass = PASS_SCAN;
2119 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2120 	}
2121 
2122 	if (state->fc_current_pass != pass) {
2123 		state->fc_current_pass = pass;
2124 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2125 	}
2126 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2127 		jbd_debug(1, "Replay stops\n");
2128 		ext4_fc_set_bitmaps_and_counters(sb);
2129 		return 0;
2130 	}
2131 
2132 #ifdef CONFIG_EXT4_DEBUG
2133 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2134 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2135 		return JBD2_FC_REPLAY_STOP;
2136 	}
2137 #endif
2138 
2139 	start = (u8 *)bh->b_data;
2140 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2141 
2142 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2143 		memcpy(&tl, cur, sizeof(tl));
2144 		val = cur + sizeof(tl);
2145 
2146 		if (state->fc_replay_num_tags == 0) {
2147 			ret = JBD2_FC_REPLAY_STOP;
2148 			ext4_fc_set_bitmaps_and_counters(sb);
2149 			break;
2150 		}
2151 		jbd_debug(3, "Replay phase, tag:%s\n",
2152 				tag2str(le16_to_cpu(tl.fc_tag)));
2153 		state->fc_replay_num_tags--;
2154 		switch (le16_to_cpu(tl.fc_tag)) {
2155 		case EXT4_FC_TAG_LINK:
2156 			ret = ext4_fc_replay_link(sb, &tl, val);
2157 			break;
2158 		case EXT4_FC_TAG_UNLINK:
2159 			ret = ext4_fc_replay_unlink(sb, &tl, val);
2160 			break;
2161 		case EXT4_FC_TAG_ADD_RANGE:
2162 			ret = ext4_fc_replay_add_range(sb, &tl, val);
2163 			break;
2164 		case EXT4_FC_TAG_CREAT:
2165 			ret = ext4_fc_replay_create(sb, &tl, val);
2166 			break;
2167 		case EXT4_FC_TAG_DEL_RANGE:
2168 			ret = ext4_fc_replay_del_range(sb, &tl, val);
2169 			break;
2170 		case EXT4_FC_TAG_INODE:
2171 			ret = ext4_fc_replay_inode(sb, &tl, val);
2172 			break;
2173 		case EXT4_FC_TAG_PAD:
2174 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2175 					     le16_to_cpu(tl.fc_len), 0);
2176 			break;
2177 		case EXT4_FC_TAG_TAIL:
2178 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2179 					     le16_to_cpu(tl.fc_len), 0);
2180 			memcpy(&tail, val, sizeof(tail));
2181 			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2182 			break;
2183 		case EXT4_FC_TAG_HEAD:
2184 			break;
2185 		default:
2186 			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2187 					     le16_to_cpu(tl.fc_len), 0);
2188 			ret = -ECANCELED;
2189 			break;
2190 		}
2191 		if (ret < 0)
2192 			break;
2193 		ret = JBD2_FC_REPLAY_CONTINUE;
2194 	}
2195 	return ret;
2196 }
2197 
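/*
 * Rough call sequence, sketched here for context (jbd2 recovery drives this;
 * see fs/jbd2/recovery.c for the authoritative flow): every block in the
 * journal's fast commit area is handed to the replay callback registered
 * below, first with PASS_SCAN and then, if the scan found at least one valid
 * tail, again with PASS_REPLAY:
 *
 *	journal->j_fc_replay_callback(journal, bh, PASS_SCAN, off, tid);
 *	journal->j_fc_replay_callback(journal, bh, PASS_REPLAY, off, tid);
 */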
2198 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2199 {
2200 	/*
2201 	 * We set the replay callback even if fast commit is disabled, because
2202 	 * there could still be fast commit blocks that need to be replayed if
2203 	 * fast commit was enabled before and has since been turned off.
2204 	 */
2205 	journal->j_fc_replay_callback = ext4_fc_replay;
2206 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2207 		return;
2208 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2209 }
2210 
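/*
 * Human readable names for the ineligibility reasons, indexed by the
 * EXT4_FC_REASON_* values; the order here must match that enum (see the loop
 * in ext4_fc_info_show() below).
 */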
2211 static const char *fc_ineligible_reasons[] = {
2212 	"Extended attributes changed",
2213 	"Cross rename",
2214 	"Journal flag changed",
2215 	"Insufficient memory",
2216 	"Swap boot",
2217 	"Resize",
2218 	"Dir renamed",
2219 	"Falloc range op",
2220 	"Data journalling",
2221 	"FC Commit Failed"
2222 };
2223 
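/*
 * Example output, with illustrative values (the seq_file is typically exposed
 * as /proc/fs/ext4/<disk>/fc_info):
 *
 *	fc stats:
 *	123 commits
 *	4 ineligible
 *	67 numblks
 *	250us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	2
 *	...
 */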
2224 int ext4_fc_info_show(struct seq_file *seq, void *v)
2225 {
2226 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2227 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2228 	int i;
2229 
2230 	if (v != SEQ_START_TOKEN)
2231 		return 0;
2232 
2233 	seq_printf(seq,
2234 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2235 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2236 		   stats->fc_numblks,
2237 		   div_u64(stats->s_fc_avg_commit_time, 1000));
2238 	seq_puts(seq, "Ineligible reasons:\n");
2239 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2240 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2241 			stats->fc_ineligible_reason_count[i]);
2242 
2243 	return 0;
2244 }
2245 
2246 int __init ext4_fc_init_dentry_cache(void)
2247 {
2248 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2249 					   SLAB_RECLAIM_ACCOUNT);
2250 
2251 	if (ext4_fc_dentry_cachep == NULL)
2252 		return -ENOMEM;
2253 
2254 	return 0;
2255 }
2256 
2257 void ext4_fc_destroy_dentry_cache(void)
2258 {
2259 	kmem_cache_destroy(ext4_fc_dentry_cachep);
2260 }
2261