xref: /openbmc/linux/fs/ext4/fast_commit.c (revision 27cd49780381c6ccbf248798e5e8fd076200ffba)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
41  *				  during recovery. Note that iblocks field is
42  *				  not replayed and instead derived during
43  *				  replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  *
69  * Not all operations are supported by fast commits today (e.g. extended
70  * attributes). Fast commit ineligibility is marked by calling
71  * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back
72  * to full commit.
73  *
74  * Atomicity of commits
75  * --------------------
76  * In order to guarantee atomicity during the commit operation, fast commit
77  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
78  * tag contains CRC of the contents and TID of the transaction after which
79  * this fast commit should be applied. Recovery code replays fast commit
80  * logs only if there's at least 1 valid tail present. For every fast commit
81  * operation, there is 1 tail. This means, we may end up with multiple tails
82  * in the fast commit space. Here's an example:
83  *
84  * - Create a new file A and remove existing file B
85  * - fsync()
86  * - Append contents to file A
87  * - Truncate file A
88  * - fsync()
89  *
90  * The fast commit space at the end of above operations would look like this:
91  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
92  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
93  *
94  * Replay code should thus check for all the valid tails in the FC area.
95  *
96  * Fast Commit Replay Idempotence
97  * ------------------------------
98  *
99  * Fast commits tags are idempotent in nature provided the recovery code follows
100  * certain rules. The guiding principle that the commit path follows while
101  * committing is that it stores the result of a particular operation instead of
102  * storing the procedure.
103  *
104  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
105  * was associated with inode 10. During fast commit, instead of storing this
106  * operation as a procedure "rename a to b", we store the resulting file system
107  * state as a "series" of outcomes:
108  *
109  * - Link dirent b to inode 10
110  * - Unlink dirent a
111  * - Inode <10> with valid refcount
112  *
113  * Now when recovery code runs, it needs to "enforce" this state on the file
114  * system. This is what guarantees idempotence of fast commit replay.
115  *
116  * Let's take an example of a procedure that is not idempotent and see how fast
117  * commits make it idempotent. Consider following sequence of operations:
118  *
119  *     rm A;    mv B A;    read A
120  *  (x)     (y)        (z)
121  *
122  * (x), (y) and (z) are the points at which we can crash. If we store this
123  * sequence of operations as is then the replay is not idempotent. Let's say
124  * while in replay, we crash at (z). During the second replay, file A (which was
125  * actually created as a result of "mv B A" operation) would get deleted. Thus,
126  * file named A would be absent when we try to read A. So, this sequence of
127  * operations is not idempotent. However, as mentioned above, instead of storing
128  * the procedure fast commits store the outcome of each procedure. Thus the fast
129  * commit log for above procedure would be as follows:
130  *
131  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
132  * inode 11 before the replay)
133  *
134  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
135  * (w)          (x)                    (y)          (z)
136  *
137  * If we crash at (z), we will have file A linked to inode 11. During the second
138  * replay, we will remove file A (inode 11). But we will create it back and make
139  * it point to inode 11. We won't find B, so we'll just skip that step. At this
140  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
141  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
142  * similarly. Thus, by converting a non-idempotent procedure into a series of
143  * idempotent outcomes, fast commits ensured idempotence during the replay.
144  *
145  * TODOs
146  * -----
147  *
148  * 0) Fast commit replay path hardening: Fast commit replay code should use
149  *    journal handles to make sure all the updates it does during the replay
150  *    path are atomic. With that if we crash during fast commit replay, after
151  *    trying to do recovery again, we will find a file system where fast commit
152  *    area is invalid (because new full commit would be found). In order to deal
153  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
154  *    superblock state is persisted before starting the replay, so that after
155  *    the crash, fast commit recovery code can look at that flag and perform
156  *    fast commit recovery even if that area is invalidated by later full
157  *    commits.
158  *
159  * 1) Fast commit's commit path locks the entire file system during fast
160  *    commit. This has significant performance penalty. Instead of that, we
161  *    should use ext4_fc_start/stop_update functions to start inode level
162  *    updates from ext4_journal_start/stop. Once we do that we can drop file
163  *    system locking during commit path.
164  *
165  * 2) Handle more ineligible cases.
166  */
167 
168 #include <trace/events/ext4.h>
169 static struct kmem_cache *ext4_fc_dentry_cachep;
170 
171 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
172 {
173 	BUFFER_TRACE(bh, "");
174 	if (uptodate) {
175 		ext4_debug("%s: Block %lld up-to-date",
176 			   __func__, bh->b_blocknr);
177 		set_buffer_uptodate(bh);
178 	} else {
179 		ext4_debug("%s: Block %lld not up-to-date",
180 			   __func__, bh->b_blocknr);
181 		clear_buffer_uptodate(bh);
182 	}
183 
184 	unlock_buffer(bh);
185 }
186 
187 static inline void ext4_fc_reset_inode(struct inode *inode)
188 {
189 	struct ext4_inode_info *ei = EXT4_I(inode);
190 
191 	ei->i_fc_lblk_start = 0;
192 	ei->i_fc_lblk_len = 0;
193 }
194 
195 void ext4_fc_init_inode(struct inode *inode)
196 {
197 	struct ext4_inode_info *ei = EXT4_I(inode);
198 
199 	ext4_fc_reset_inode(inode);
200 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
201 	INIT_LIST_HEAD(&ei->i_fc_list);
202 	INIT_LIST_HEAD(&ei->i_fc_dilist);
203 	init_waitqueue_head(&ei->i_fc_wait);
204 	atomic_set(&ei->i_fc_updates, 0);
205 }
206 
207 /* This function must be called with sbi->s_fc_lock held. */
208 static void ext4_fc_wait_committing_inode(struct inode *inode)
209 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
210 {
211 	wait_queue_head_t *wq;
212 	struct ext4_inode_info *ei = EXT4_I(inode);
213 
214 #if (BITS_PER_LONG < 64)
215 	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
216 			EXT4_STATE_FC_COMMITTING);
217 	wq = bit_waitqueue(&ei->i_state_flags,
218 				EXT4_STATE_FC_COMMITTING);
219 #else
220 	DEFINE_WAIT_BIT(wait, &ei->i_flags,
221 			EXT4_STATE_FC_COMMITTING);
222 	wq = bit_waitqueue(&ei->i_flags,
223 				EXT4_STATE_FC_COMMITTING);
224 #endif
225 	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
226 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
227 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
228 	schedule();
229 	finish_wait(wq, &wait.wq_entry);
230 }
231 
232 static bool ext4_fc_disabled(struct super_block *sb)
233 {
234 	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
235 		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
236 }
237 
238 /*
239  * Inform Ext4's fast about start of an inode update
240  *
241  * This function is called by the high level call VFS callbacks before
242  * performing any inode update. This function blocks if there's an ongoing
243  * fast commit on the inode in question.
244  */
245 void ext4_fc_start_update(struct inode *inode)
246 {
247 	struct ext4_inode_info *ei = EXT4_I(inode);
248 
249 	if (ext4_fc_disabled(inode->i_sb))
250 		return;
251 
252 restart:
253 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
254 	if (list_empty(&ei->i_fc_list))
255 		goto out;
256 
257 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
258 		ext4_fc_wait_committing_inode(inode);
259 		goto restart;
260 	}
261 out:
262 	atomic_inc(&ei->i_fc_updates);
263 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
264 }
265 
266 /*
267  * Stop inode update and wake up waiting fast commits if any.
268  */
269 void ext4_fc_stop_update(struct inode *inode)
270 {
271 	struct ext4_inode_info *ei = EXT4_I(inode);
272 
273 	if (ext4_fc_disabled(inode->i_sb))
274 		return;
275 
276 	if (atomic_dec_and_test(&ei->i_fc_updates))
277 		wake_up_all(&ei->i_fc_wait);
278 }
279 
280 /*
281  * Remove inode from fast commit list. If the inode is being committed
282  * we wait until inode commit is done.
283  */
284 void ext4_fc_del(struct inode *inode)
285 {
286 	struct ext4_inode_info *ei = EXT4_I(inode);
287 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
288 	struct ext4_fc_dentry_update *fc_dentry;
289 
290 	if (ext4_fc_disabled(inode->i_sb))
291 		return;
292 
293 restart:
294 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
295 	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
296 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
297 		return;
298 	}
299 
300 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
301 		ext4_fc_wait_committing_inode(inode);
302 		goto restart;
303 	}
304 
305 	if (!list_empty(&ei->i_fc_list))
306 		list_del_init(&ei->i_fc_list);
307 
308 	/*
309 	 * Since this inode is getting removed, let's also remove all FC
310 	 * dentry create references, since it is not needed to log it anyways.
311 	 */
312 	if (list_empty(&ei->i_fc_dilist)) {
313 		spin_unlock(&sbi->s_fc_lock);
314 		return;
315 	}
316 
317 	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
318 	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
319 	list_del_init(&fc_dentry->fcd_list);
320 	list_del_init(&fc_dentry->fcd_dilist);
321 
322 	WARN_ON(!list_empty(&ei->i_fc_dilist));
323 	spin_unlock(&sbi->s_fc_lock);
324 
325 	if (fc_dentry->fcd_name.name &&
326 		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
327 		kfree(fc_dentry->fcd_name.name);
328 	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
329 
330 	return;
331 }
332 
333 /*
334  * Mark file system as fast commit ineligible, and record latest
335  * ineligible transaction tid. This means until the recorded
336  * transaction, commit operation would result in a full jbd2 commit.
337  */
338 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
339 {
340 	struct ext4_sb_info *sbi = EXT4_SB(sb);
341 	tid_t tid;
342 
343 	if (ext4_fc_disabled(sb))
344 		return;
345 
346 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
347 	if (handle && !IS_ERR(handle))
348 		tid = handle->h_transaction->t_tid;
349 	else {
350 		read_lock(&sbi->s_journal->j_state_lock);
351 		tid = sbi->s_journal->j_running_transaction ?
352 				sbi->s_journal->j_running_transaction->t_tid : 0;
353 		read_unlock(&sbi->s_journal->j_state_lock);
354 	}
355 	spin_lock(&sbi->s_fc_lock);
356 	if (sbi->s_fc_ineligible_tid < tid)
357 		sbi->s_fc_ineligible_tid = tid;
358 	spin_unlock(&sbi->s_fc_lock);
359 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
360 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
361 }
362 
363 /*
364  * Generic fast commit tracking function. If this is the first time this we are
365  * called after a full commit, we initialize fast commit fields and then call
366  * __fc_track_fn() with update = 0. If we have already been called after a full
367  * commit, we pass update = 1. Based on that, the track function can determine
368  * if it needs to track a field for the first time or if it needs to just
369  * update the previously tracked value.
370  *
371  * If enqueue is set, this function enqueues the inode in fast commit list.
372  */
373 static int ext4_fc_track_template(
374 	handle_t *handle, struct inode *inode,
375 	int (*__fc_track_fn)(struct inode *, void *, bool),
376 	void *args, int enqueue)
377 {
378 	bool update = false;
379 	struct ext4_inode_info *ei = EXT4_I(inode);
380 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
381 	tid_t tid = 0;
382 	int ret;
383 
384 	tid = handle->h_transaction->t_tid;
385 	mutex_lock(&ei->i_fc_lock);
386 	if (tid == ei->i_sync_tid) {
387 		update = true;
388 	} else {
389 		ext4_fc_reset_inode(inode);
390 		ei->i_sync_tid = tid;
391 	}
392 	ret = __fc_track_fn(inode, args, update);
393 	mutex_unlock(&ei->i_fc_lock);
394 
395 	if (!enqueue)
396 		return ret;
397 
398 	spin_lock(&sbi->s_fc_lock);
399 	if (list_empty(&EXT4_I(inode)->i_fc_list))
400 		list_add_tail(&EXT4_I(inode)->i_fc_list,
401 				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
402 				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
403 				&sbi->s_fc_q[FC_Q_STAGING] :
404 				&sbi->s_fc_q[FC_Q_MAIN]);
405 	spin_unlock(&sbi->s_fc_lock);
406 
407 	return ret;
408 }
409 
410 struct __track_dentry_update_args {
411 	struct dentry *dentry;
412 	int op;
413 };
414 
415 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
416 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
417 {
418 	struct ext4_fc_dentry_update *node;
419 	struct ext4_inode_info *ei = EXT4_I(inode);
420 	struct __track_dentry_update_args *dentry_update =
421 		(struct __track_dentry_update_args *)arg;
422 	struct dentry *dentry = dentry_update->dentry;
423 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
424 
425 	mutex_unlock(&ei->i_fc_lock);
426 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
427 	if (!node) {
428 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
429 		mutex_lock(&ei->i_fc_lock);
430 		return -ENOMEM;
431 	}
432 
433 	node->fcd_op = dentry_update->op;
434 	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
435 	node->fcd_ino = inode->i_ino;
436 	if (dentry->d_name.len > DNAME_INLINE_LEN) {
437 		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
438 		if (!node->fcd_name.name) {
439 			kmem_cache_free(ext4_fc_dentry_cachep, node);
440 			ext4_fc_mark_ineligible(inode->i_sb,
441 				EXT4_FC_REASON_NOMEM, NULL);
442 			mutex_lock(&ei->i_fc_lock);
443 			return -ENOMEM;
444 		}
445 		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
446 			dentry->d_name.len);
447 	} else {
448 		memcpy(node->fcd_iname, dentry->d_name.name,
449 			dentry->d_name.len);
450 		node->fcd_name.name = node->fcd_iname;
451 	}
452 	node->fcd_name.len = dentry->d_name.len;
453 	INIT_LIST_HEAD(&node->fcd_dilist);
454 	spin_lock(&sbi->s_fc_lock);
455 	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
456 		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
457 		list_add_tail(&node->fcd_list,
458 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
459 	else
460 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
461 
462 	/*
463 	 * This helps us keep a track of all fc_dentry updates which is part of
464 	 * this ext4 inode. So in case the inode is getting unlinked, before
465 	 * even we get a chance to fsync, we could remove all fc_dentry
466 	 * references while evicting the inode in ext4_fc_del().
467 	 * Also with this, we don't need to loop over all the inodes in
468 	 * sbi->s_fc_q to get the corresponding inode in
469 	 * ext4_fc_commit_dentry_updates().
470 	 */
471 	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
472 		WARN_ON(!list_empty(&ei->i_fc_dilist));
473 		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
474 	}
475 	spin_unlock(&sbi->s_fc_lock);
476 	mutex_lock(&ei->i_fc_lock);
477 
478 	return 0;
479 }
480 
481 void __ext4_fc_track_unlink(handle_t *handle,
482 		struct inode *inode, struct dentry *dentry)
483 {
484 	struct __track_dentry_update_args args;
485 	int ret;
486 
487 	args.dentry = dentry;
488 	args.op = EXT4_FC_TAG_UNLINK;
489 
490 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
491 					(void *)&args, 0);
492 	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
493 }
494 
495 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
496 {
497 	struct inode *inode = d_inode(dentry);
498 
499 	if (ext4_fc_disabled(inode->i_sb))
500 		return;
501 
502 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
503 		return;
504 
505 	__ext4_fc_track_unlink(handle, inode, dentry);
506 }
507 
508 void __ext4_fc_track_link(handle_t *handle,
509 	struct inode *inode, struct dentry *dentry)
510 {
511 	struct __track_dentry_update_args args;
512 	int ret;
513 
514 	args.dentry = dentry;
515 	args.op = EXT4_FC_TAG_LINK;
516 
517 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
518 					(void *)&args, 0);
519 	trace_ext4_fc_track_link(handle, inode, dentry, ret);
520 }
521 
522 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
523 {
524 	struct inode *inode = d_inode(dentry);
525 
526 	if (ext4_fc_disabled(inode->i_sb))
527 		return;
528 
529 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
530 		return;
531 
532 	__ext4_fc_track_link(handle, inode, dentry);
533 }
534 
535 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
536 			  struct dentry *dentry)
537 {
538 	struct __track_dentry_update_args args;
539 	int ret;
540 
541 	args.dentry = dentry;
542 	args.op = EXT4_FC_TAG_CREAT;
543 
544 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
545 					(void *)&args, 0);
546 	trace_ext4_fc_track_create(handle, inode, dentry, ret);
547 }
548 
549 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
550 {
551 	struct inode *inode = d_inode(dentry);
552 
553 	if (ext4_fc_disabled(inode->i_sb))
554 		return;
555 
556 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
557 		return;
558 
559 	__ext4_fc_track_create(handle, inode, dentry);
560 }
561 
562 /* __track_fn for inode tracking */
563 static int __track_inode(struct inode *inode, void *arg, bool update)
564 {
565 	if (update)
566 		return -EEXIST;
567 
568 	EXT4_I(inode)->i_fc_lblk_len = 0;
569 
570 	return 0;
571 }
572 
573 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
574 {
575 	int ret;
576 
577 	if (S_ISDIR(inode->i_mode))
578 		return;
579 
580 	if (ext4_fc_disabled(inode->i_sb))
581 		return;
582 
583 	if (ext4_should_journal_data(inode)) {
584 		ext4_fc_mark_ineligible(inode->i_sb,
585 					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
586 		return;
587 	}
588 
589 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
590 		return;
591 
592 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
593 	trace_ext4_fc_track_inode(handle, inode, ret);
594 }
595 
596 struct __track_range_args {
597 	ext4_lblk_t start, end;
598 };
599 
600 /* __track_fn for tracking data updates */
601 static int __track_range(struct inode *inode, void *arg, bool update)
602 {
603 	struct ext4_inode_info *ei = EXT4_I(inode);
604 	ext4_lblk_t oldstart;
605 	struct __track_range_args *__arg =
606 		(struct __track_range_args *)arg;
607 
608 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
609 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
610 		return -ECANCELED;
611 	}
612 
613 	oldstart = ei->i_fc_lblk_start;
614 
615 	if (update && ei->i_fc_lblk_len > 0) {
616 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
617 		ei->i_fc_lblk_len =
618 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
619 				ei->i_fc_lblk_start + 1;
620 	} else {
621 		ei->i_fc_lblk_start = __arg->start;
622 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
623 	}
624 
625 	return 0;
626 }
627 
628 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
629 			 ext4_lblk_t end)
630 {
631 	struct __track_range_args args;
632 	int ret;
633 
634 	if (S_ISDIR(inode->i_mode))
635 		return;
636 
637 	if (ext4_fc_disabled(inode->i_sb))
638 		return;
639 
640 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
641 		return;
642 
643 	args.start = start;
644 	args.end = end;
645 
646 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
647 
648 	trace_ext4_fc_track_range(handle, inode, start, end, ret);
649 }
650 
651 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
652 {
653 	blk_opf_t write_flags = REQ_SYNC;
654 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
655 
656 	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
657 	if (test_opt(sb, BARRIER) && is_tail)
658 		write_flags |= REQ_FUA | REQ_PREFLUSH;
659 	lock_buffer(bh);
660 	set_buffer_dirty(bh);
661 	set_buffer_uptodate(bh);
662 	bh->b_end_io = ext4_end_buffer_io_sync;
663 	submit_bh(REQ_OP_WRITE | write_flags, bh);
664 	EXT4_SB(sb)->s_fc_bh = NULL;
665 }
666 
667 /* Ext4 commit path routines */
668 
669 /* memzero and update CRC */
670 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
671 				u32 *crc)
672 {
673 	void *ret;
674 
675 	ret = memset(dst, 0, len);
676 	if (crc)
677 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
678 	return ret;
679 }
680 
681 /*
682  * Allocate len bytes on a fast commit buffer.
683  *
684  * During the commit time this function is used to manage fast commit
685  * block space. We don't split a fast commit log onto different
686  * blocks. So this function makes sure that if there's not enough space
687  * on the current block, the remaining space in the current block is
688  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
689  * new block is from jbd2 and CRC is updated to reflect the padding
690  * we added.
691  */
692 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
693 {
694 	struct ext4_fc_tl *tl;
695 	struct ext4_sb_info *sbi = EXT4_SB(sb);
696 	struct buffer_head *bh;
697 	int bsize = sbi->s_journal->j_blocksize;
698 	int ret, off = sbi->s_fc_bytes % bsize;
699 	int pad_len;
700 
701 	/*
702 	 * After allocating len, we should have space at least for a 0 byte
703 	 * padding.
704 	 */
705 	if (len + sizeof(struct ext4_fc_tl) > bsize)
706 		return NULL;
707 
708 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
709 		/*
710 		 * Only allocate from current buffer if we have enough space for
711 		 * this request AND we have space to add a zero byte padding.
712 		 */
713 		if (!sbi->s_fc_bh) {
714 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
715 			if (ret)
716 				return NULL;
717 			sbi->s_fc_bh = bh;
718 		}
719 		sbi->s_fc_bytes += len;
720 		return sbi->s_fc_bh->b_data + off;
721 	}
722 	/* Need to add PAD tag */
723 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
724 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
725 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
726 	tl->fc_len = cpu_to_le16(pad_len);
727 	if (crc)
728 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
729 	if (pad_len > 0)
730 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
731 	ext4_fc_submit_bh(sb, false);
732 
733 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
734 	if (ret)
735 		return NULL;
736 	sbi->s_fc_bh = bh;
737 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
738 	return sbi->s_fc_bh->b_data;
739 }
740 
741 /* memcpy to fc reserved space and update CRC */
742 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
743 				int len, u32 *crc)
744 {
745 	if (crc)
746 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
747 	return memcpy(dst, src, len);
748 }
749 
750 /*
751  * Complete a fast commit by writing tail tag.
752  *
753  * Writing tail tag marks the end of a fast commit. In order to guarantee
754  * atomicity, after writing tail tag, even if there's space remaining
755  * in the block, next commit shouldn't use it. That's why tail tag
756  * has the length as that of the remaining space on the block.
757  */
758 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
759 {
760 	struct ext4_sb_info *sbi = EXT4_SB(sb);
761 	struct ext4_fc_tl tl;
762 	struct ext4_fc_tail tail;
763 	int off, bsize = sbi->s_journal->j_blocksize;
764 	u8 *dst;
765 
766 	/*
767 	 * ext4_fc_reserve_space takes care of allocating an extra block if
768 	 * there's no enough space on this block for accommodating this tail.
769 	 */
770 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
771 	if (!dst)
772 		return -ENOSPC;
773 
774 	off = sbi->s_fc_bytes % bsize;
775 
776 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
777 	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
778 	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
779 
780 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
781 	dst += sizeof(tl);
782 	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
783 	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
784 	dst += sizeof(tail.fc_tid);
785 	tail.fc_crc = cpu_to_le32(crc);
786 	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
787 
788 	ext4_fc_submit_bh(sb, true);
789 
790 	return 0;
791 }
792 
793 /*
794  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
795  * Returns false if there's not enough space.
796  */
797 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
798 			   u32 *crc)
799 {
800 	struct ext4_fc_tl tl;
801 	u8 *dst;
802 
803 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
804 	if (!dst)
805 		return false;
806 
807 	tl.fc_tag = cpu_to_le16(tag);
808 	tl.fc_len = cpu_to_le16(len);
809 
810 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
811 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
812 
813 	return true;
814 }
815 
816 /* Same as above, but adds dentry tlv. */
817 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
818 				   struct ext4_fc_dentry_update *fc_dentry)
819 {
820 	struct ext4_fc_dentry_info fcd;
821 	struct ext4_fc_tl tl;
822 	int dlen = fc_dentry->fcd_name.len;
823 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
824 					crc);
825 
826 	if (!dst)
827 		return false;
828 
829 	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
830 	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
831 	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
832 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
833 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
834 	dst += sizeof(tl);
835 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
836 	dst += sizeof(fcd);
837 	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
838 
839 	return true;
840 }
841 
842 /*
843  * Writes inode in the fast commit space under TLV with tag @tag.
844  * Returns 0 on success, error on failure.
845  */
846 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
847 {
848 	struct ext4_inode_info *ei = EXT4_I(inode);
849 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
850 	int ret;
851 	struct ext4_iloc iloc;
852 	struct ext4_fc_inode fc_inode;
853 	struct ext4_fc_tl tl;
854 	u8 *dst;
855 
856 	ret = ext4_get_inode_loc(inode, &iloc);
857 	if (ret)
858 		return ret;
859 
860 	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
861 		inode_len = EXT4_INODE_SIZE(inode->i_sb);
862 	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
863 		inode_len += ei->i_extra_isize;
864 
865 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
866 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
867 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
868 
869 	ret = -ECANCELED;
870 	dst = ext4_fc_reserve_space(inode->i_sb,
871 			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
872 	if (!dst)
873 		goto err;
874 
875 	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
876 		goto err;
877 	dst += sizeof(tl);
878 	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
879 		goto err;
880 	dst += sizeof(fc_inode);
881 	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
882 					inode_len, crc))
883 		goto err;
884 	ret = 0;
885 err:
886 	brelse(iloc.bh);
887 	return ret;
888 }
889 
890 /*
891  * Writes updated data ranges for the inode in question. Updates CRC.
892  * Returns 0 on success, error otherwise.
893  */
894 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
895 {
896 	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
897 	struct ext4_inode_info *ei = EXT4_I(inode);
898 	struct ext4_map_blocks map;
899 	struct ext4_fc_add_range fc_ext;
900 	struct ext4_fc_del_range lrange;
901 	struct ext4_extent *ex;
902 	int ret;
903 
904 	mutex_lock(&ei->i_fc_lock);
905 	if (ei->i_fc_lblk_len == 0) {
906 		mutex_unlock(&ei->i_fc_lock);
907 		return 0;
908 	}
909 	old_blk_size = ei->i_fc_lblk_start;
910 	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
911 	ei->i_fc_lblk_len = 0;
912 	mutex_unlock(&ei->i_fc_lock);
913 
914 	cur_lblk_off = old_blk_size;
915 	ext4_debug("will try writing %d to %d for inode %ld\n",
916 		   cur_lblk_off, new_blk_size, inode->i_ino);
917 
918 	while (cur_lblk_off <= new_blk_size) {
919 		map.m_lblk = cur_lblk_off;
920 		map.m_len = new_blk_size - cur_lblk_off + 1;
921 		ret = ext4_map_blocks(NULL, inode, &map, 0);
922 		if (ret < 0)
923 			return -ECANCELED;
924 
925 		if (map.m_len == 0) {
926 			cur_lblk_off++;
927 			continue;
928 		}
929 
930 		if (ret == 0) {
931 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
932 			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
933 			lrange.fc_len = cpu_to_le32(map.m_len);
934 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
935 					    sizeof(lrange), (u8 *)&lrange, crc))
936 				return -ENOSPC;
937 		} else {
938 			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
939 				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
940 
941 			/* Limit the number of blocks in one extent */
942 			map.m_len = min(max, map.m_len);
943 
944 			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
945 			ex = (struct ext4_extent *)&fc_ext.fc_ex;
946 			ex->ee_block = cpu_to_le32(map.m_lblk);
947 			ex->ee_len = cpu_to_le16(map.m_len);
948 			ext4_ext_store_pblock(ex, map.m_pblk);
949 			if (map.m_flags & EXT4_MAP_UNWRITTEN)
950 				ext4_ext_mark_unwritten(ex);
951 			else
952 				ext4_ext_mark_initialized(ex);
953 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
954 					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
955 				return -ENOSPC;
956 		}
957 
958 		cur_lblk_off += map.m_len;
959 	}
960 
961 	return 0;
962 }
963 
964 
965 /* Submit data for all the fast commit inodes */
966 static int ext4_fc_submit_inode_data_all(journal_t *journal)
967 {
968 	struct super_block *sb = journal->j_private;
969 	struct ext4_sb_info *sbi = EXT4_SB(sb);
970 	struct ext4_inode_info *ei;
971 	int ret = 0;
972 
973 	spin_lock(&sbi->s_fc_lock);
974 	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
975 		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
976 		while (atomic_read(&ei->i_fc_updates)) {
977 			DEFINE_WAIT(wait);
978 
979 			prepare_to_wait(&ei->i_fc_wait, &wait,
980 						TASK_UNINTERRUPTIBLE);
981 			if (atomic_read(&ei->i_fc_updates)) {
982 				spin_unlock(&sbi->s_fc_lock);
983 				schedule();
984 				spin_lock(&sbi->s_fc_lock);
985 			}
986 			finish_wait(&ei->i_fc_wait, &wait);
987 		}
988 		spin_unlock(&sbi->s_fc_lock);
989 		ret = jbd2_submit_inode_data(ei->jinode);
990 		if (ret)
991 			return ret;
992 		spin_lock(&sbi->s_fc_lock);
993 	}
994 	spin_unlock(&sbi->s_fc_lock);
995 
996 	return ret;
997 }
998 
999 /* Wait for completion of data for all the fast commit inodes */
1000 static int ext4_fc_wait_inode_data_all(journal_t *journal)
1001 {
1002 	struct super_block *sb = journal->j_private;
1003 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1004 	struct ext4_inode_info *pos, *n;
1005 	int ret = 0;
1006 
1007 	spin_lock(&sbi->s_fc_lock);
1008 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1009 		if (!ext4_test_inode_state(&pos->vfs_inode,
1010 					   EXT4_STATE_FC_COMMITTING))
1011 			continue;
1012 		spin_unlock(&sbi->s_fc_lock);
1013 
1014 		ret = jbd2_wait_inode_data(journal, pos->jinode);
1015 		if (ret)
1016 			return ret;
1017 		spin_lock(&sbi->s_fc_lock);
1018 	}
1019 	spin_unlock(&sbi->s_fc_lock);
1020 
1021 	return 0;
1022 }
1023 
1024 /* Commit all the directory entry updates */
1025 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
1026 __acquires(&sbi->s_fc_lock)
1027 __releases(&sbi->s_fc_lock)
1028 {
1029 	struct super_block *sb = journal->j_private;
1030 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1031 	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
1032 	struct inode *inode;
1033 	struct ext4_inode_info *ei;
1034 	int ret;
1035 
1036 	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1037 		return 0;
1038 	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1039 				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1040 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1041 			spin_unlock(&sbi->s_fc_lock);
1042 			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1043 				ret = -ENOSPC;
1044 				goto lock_and_exit;
1045 			}
1046 			spin_lock(&sbi->s_fc_lock);
1047 			continue;
1048 		}
1049 		/*
1050 		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1051 		 * corresponding inode pointer
1052 		 */
1053 		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
1054 		ei = list_first_entry(&fc_dentry->fcd_dilist,
1055 				struct ext4_inode_info, i_fc_dilist);
1056 		inode = &ei->vfs_inode;
1057 		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1058 
1059 		spin_unlock(&sbi->s_fc_lock);
1060 
1061 		/*
1062 		 * We first write the inode and then the create dirent. This
1063 		 * allows the recovery code to create an unnamed inode first
1064 		 * and then link it to a directory entry. This allows us
1065 		 * to use namei.c routines almost as is and simplifies
1066 		 * the recovery code.
1067 		 */
1068 		ret = ext4_fc_write_inode(inode, crc);
1069 		if (ret)
1070 			goto lock_and_exit;
1071 
1072 		ret = ext4_fc_write_inode_data(inode, crc);
1073 		if (ret)
1074 			goto lock_and_exit;
1075 
1076 		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1077 			ret = -ENOSPC;
1078 			goto lock_and_exit;
1079 		}
1080 
1081 		spin_lock(&sbi->s_fc_lock);
1082 	}
1083 	return 0;
1084 lock_and_exit:
1085 	spin_lock(&sbi->s_fc_lock);
1086 	return ret;
1087 }
1088 
1089 static int ext4_fc_perform_commit(journal_t *journal)
1090 {
1091 	struct super_block *sb = journal->j_private;
1092 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1093 	struct ext4_inode_info *iter;
1094 	struct ext4_fc_head head;
1095 	struct inode *inode;
1096 	struct blk_plug plug;
1097 	int ret = 0;
1098 	u32 crc = 0;
1099 
1100 	ret = ext4_fc_submit_inode_data_all(journal);
1101 	if (ret)
1102 		return ret;
1103 
1104 	ret = ext4_fc_wait_inode_data_all(journal);
1105 	if (ret)
1106 		return ret;
1107 
1108 	/*
1109 	 * If file system device is different from journal device, issue a cache
1110 	 * flush before we start writing fast commit blocks.
1111 	 */
1112 	if (journal->j_fs_dev != journal->j_dev)
1113 		blkdev_issue_flush(journal->j_fs_dev);
1114 
1115 	blk_start_plug(&plug);
1116 	if (sbi->s_fc_bytes == 0) {
1117 		/*
1118 		 * Add a head tag only if this is the first fast commit
1119 		 * in this TID.
1120 		 */
1121 		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1122 		head.fc_tid = cpu_to_le32(
1123 			sbi->s_journal->j_running_transaction->t_tid);
1124 		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1125 			(u8 *)&head, &crc)) {
1126 			ret = -ENOSPC;
1127 			goto out;
1128 		}
1129 	}
1130 
1131 	spin_lock(&sbi->s_fc_lock);
1132 	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1133 	if (ret) {
1134 		spin_unlock(&sbi->s_fc_lock);
1135 		goto out;
1136 	}
1137 
1138 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1139 		inode = &iter->vfs_inode;
1140 		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1141 			continue;
1142 
1143 		spin_unlock(&sbi->s_fc_lock);
1144 		ret = ext4_fc_write_inode_data(inode, &crc);
1145 		if (ret)
1146 			goto out;
1147 		ret = ext4_fc_write_inode(inode, &crc);
1148 		if (ret)
1149 			goto out;
1150 		spin_lock(&sbi->s_fc_lock);
1151 	}
1152 	spin_unlock(&sbi->s_fc_lock);
1153 
1154 	ret = ext4_fc_write_tail(sb, crc);
1155 
1156 out:
1157 	blk_finish_plug(&plug);
1158 	return ret;
1159 }
1160 
1161 static void ext4_fc_update_stats(struct super_block *sb, int status,
1162 				 u64 commit_time, int nblks, tid_t commit_tid)
1163 {
1164 	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1165 
1166 	ext4_debug("Fast commit ended with status = %d for tid %u",
1167 			status, commit_tid);
1168 	if (status == EXT4_FC_STATUS_OK) {
1169 		stats->fc_num_commits++;
1170 		stats->fc_numblks += nblks;
1171 		if (likely(stats->s_fc_avg_commit_time))
1172 			stats->s_fc_avg_commit_time =
1173 				(commit_time +
1174 				 stats->s_fc_avg_commit_time * 3) / 4;
1175 		else
1176 			stats->s_fc_avg_commit_time = commit_time;
1177 	} else if (status == EXT4_FC_STATUS_FAILED ||
1178 		   status == EXT4_FC_STATUS_INELIGIBLE) {
1179 		if (status == EXT4_FC_STATUS_FAILED)
1180 			stats->fc_failed_commits++;
1181 		stats->fc_ineligible_commits++;
1182 	} else {
1183 		stats->fc_skipped_commits++;
1184 	}
1185 	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1186 }
1187 
1188 /*
1189  * The main commit entry point. Performs a fast commit for transaction
1190  * commit_tid if needed. If it's not possible to perform a fast commit
1191  * due to various reasons, we fall back to full commit. Returns 0
1192  * on success, error otherwise.
1193  */
1194 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1195 {
1196 	struct super_block *sb = journal->j_private;
1197 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1198 	int nblks = 0, ret, bsize = journal->j_blocksize;
1199 	int subtid = atomic_read(&sbi->s_fc_subtid);
1200 	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1201 	ktime_t start_time, commit_time;
1202 
1203 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1204 		return jbd2_complete_transaction(journal, commit_tid);
1205 
1206 	trace_ext4_fc_commit_start(sb, commit_tid);
1207 
1208 	start_time = ktime_get();
1209 
1210 restart_fc:
1211 	ret = jbd2_fc_begin_commit(journal, commit_tid);
1212 	if (ret == -EALREADY) {
1213 		/* There was an ongoing commit, check if we need to restart */
1214 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1215 			commit_tid > journal->j_commit_sequence)
1216 			goto restart_fc;
1217 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1218 				commit_tid);
1219 		return 0;
1220 	} else if (ret) {
1221 		/*
1222 		 * Commit couldn't start. Just update stats and perform a
1223 		 * full commit.
1224 		 */
1225 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1226 				commit_tid);
1227 		return jbd2_complete_transaction(journal, commit_tid);
1228 	}
1229 
1230 	/*
1231 	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
1232 	 * if we are fast commit ineligible.
1233 	 */
1234 	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1235 		status = EXT4_FC_STATUS_INELIGIBLE;
1236 		goto fallback;
1237 	}
1238 
1239 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1240 	ret = ext4_fc_perform_commit(journal);
1241 	if (ret < 0) {
1242 		status = EXT4_FC_STATUS_FAILED;
1243 		goto fallback;
1244 	}
1245 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1246 	ret = jbd2_fc_wait_bufs(journal, nblks);
1247 	if (ret < 0) {
1248 		status = EXT4_FC_STATUS_FAILED;
1249 		goto fallback;
1250 	}
1251 	atomic_inc(&sbi->s_fc_subtid);
1252 	ret = jbd2_fc_end_commit(journal);
1253 	/*
1254 	 * weight the commit time higher than the average time so we
1255 	 * don't react too strongly to vast changes in the commit time
1256 	 */
1257 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1258 	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1259 	return ret;
1260 
1261 fallback:
1262 	ret = jbd2_fc_end_commit_fallback(journal);
1263 	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1264 	return ret;
1265 }
1266 
1267 /*
1268  * Fast commit cleanup routine. This is called after every fast commit and
1269  * full commit. full is true if we are called after a full commit.
1270  */
1271 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1272 {
1273 	struct super_block *sb = journal->j_private;
1274 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1275 	struct ext4_inode_info *iter, *iter_n;
1276 	struct ext4_fc_dentry_update *fc_dentry;
1277 
1278 	if (full && sbi->s_fc_bh)
1279 		sbi->s_fc_bh = NULL;
1280 
1281 	trace_ext4_fc_cleanup(journal, full, tid);
1282 	jbd2_fc_release_bufs(journal);
1283 
1284 	spin_lock(&sbi->s_fc_lock);
1285 	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1286 				 i_fc_list) {
1287 		list_del_init(&iter->i_fc_list);
1288 		ext4_clear_inode_state(&iter->vfs_inode,
1289 				       EXT4_STATE_FC_COMMITTING);
1290 		if (iter->i_sync_tid <= tid)
1291 			ext4_fc_reset_inode(&iter->vfs_inode);
1292 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1293 		smp_mb();
1294 #if (BITS_PER_LONG < 64)
1295 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1296 #else
1297 		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1298 #endif
1299 	}
1300 
1301 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1302 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1303 					     struct ext4_fc_dentry_update,
1304 					     fcd_list);
1305 		list_del_init(&fc_dentry->fcd_list);
1306 		list_del_init(&fc_dentry->fcd_dilist);
1307 		spin_unlock(&sbi->s_fc_lock);
1308 
1309 		if (fc_dentry->fcd_name.name &&
1310 			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1311 			kfree(fc_dentry->fcd_name.name);
1312 		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1313 		spin_lock(&sbi->s_fc_lock);
1314 	}
1315 
1316 	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1317 				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1318 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1319 				&sbi->s_fc_q[FC_Q_MAIN]);
1320 
1321 	if (tid >= sbi->s_fc_ineligible_tid) {
1322 		sbi->s_fc_ineligible_tid = 0;
1323 		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1324 	}
1325 
1326 	if (full)
1327 		sbi->s_fc_bytes = 0;
1328 	spin_unlock(&sbi->s_fc_lock);
1329 	trace_ext4_fc_stats(sb);
1330 }
1331 
1332 /* Ext4 Replay Path Routines */
1333 
/*
 * Helper struct for dentry replay routines. Holds the fields of a dentry
 * tag in CPU byte order, as filled in by tl_to_darg().
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of @dname in bytes */
	int ino;		/* inode number the dentry refers to */
	int inode_len;
	char *dname;		/* name bytes, pointing into the tag value */
};
1339 
1340 static inline void tl_to_darg(struct dentry_info_args *darg,
1341 			      struct  ext4_fc_tl *tl, u8 *val)
1342 {
1343 	struct ext4_fc_dentry_info fcd;
1344 
1345 	memcpy(&fcd, val, sizeof(fcd));
1346 
1347 	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1348 	darg->ino = le32_to_cpu(fcd.fc_ino);
1349 	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1350 	darg->dname_len = le16_to_cpu(tl->fc_len) -
1351 		sizeof(struct ext4_fc_dentry_info);
1352 }
1353 
1354 /* Unlink replay function */
1355 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1356 				 u8 *val)
1357 {
1358 	struct inode *inode, *old_parent;
1359 	struct qstr entry;
1360 	struct dentry_info_args darg;
1361 	int ret = 0;
1362 
1363 	tl_to_darg(&darg, tl, val);
1364 
1365 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1366 			darg.parent_ino, darg.dname_len);
1367 
1368 	entry.name = darg.dname;
1369 	entry.len = darg.dname_len;
1370 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1371 
1372 	if (IS_ERR(inode)) {
1373 		ext4_debug("Inode %d not found", darg.ino);
1374 		return 0;
1375 	}
1376 
1377 	old_parent = ext4_iget(sb, darg.parent_ino,
1378 				EXT4_IGET_NORMAL);
1379 	if (IS_ERR(old_parent)) {
1380 		ext4_debug("Dir with inode %d not found", darg.parent_ino);
1381 		iput(inode);
1382 		return 0;
1383 	}
1384 
1385 	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1386 	/* -ENOENT ok coz it might not exist anymore. */
1387 	if (ret == -ENOENT)
1388 		ret = 0;
1389 	iput(old_parent);
1390 	iput(inode);
1391 	return ret;
1392 }
1393 
1394 static int ext4_fc_replay_link_internal(struct super_block *sb,
1395 				struct dentry_info_args *darg,
1396 				struct inode *inode)
1397 {
1398 	struct inode *dir = NULL;
1399 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1400 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1401 	int ret = 0;
1402 
1403 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1404 	if (IS_ERR(dir)) {
1405 		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1406 		dir = NULL;
1407 		goto out;
1408 	}
1409 
1410 	dentry_dir = d_obtain_alias(dir);
1411 	if (IS_ERR(dentry_dir)) {
1412 		ext4_debug("Failed to obtain dentry");
1413 		dentry_dir = NULL;
1414 		goto out;
1415 	}
1416 
1417 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1418 	if (!dentry_inode) {
1419 		ext4_debug("Inode dentry not created.");
1420 		ret = -ENOMEM;
1421 		goto out;
1422 	}
1423 
1424 	ret = __ext4_link(dir, inode, dentry_inode);
1425 	/*
1426 	 * It's possible that link already existed since data blocks
1427 	 * for the dir in question got persisted before we crashed OR
1428 	 * we replayed this tag and crashed before the entire replay
1429 	 * could complete.
1430 	 */
1431 	if (ret && ret != -EEXIST) {
1432 		ext4_debug("Failed to link\n");
1433 		goto out;
1434 	}
1435 
1436 	ret = 0;
1437 out:
1438 	if (dentry_dir) {
1439 		d_drop(dentry_dir);
1440 		dput(dentry_dir);
1441 	} else if (dir) {
1442 		iput(dir);
1443 	}
1444 	if (dentry_inode) {
1445 		d_drop(dentry_inode);
1446 		dput(dentry_inode);
1447 	}
1448 
1449 	return ret;
1450 }
1451 
1452 /* Link replay function */
1453 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1454 			       u8 *val)
1455 {
1456 	struct inode *inode;
1457 	struct dentry_info_args darg;
1458 	int ret = 0;
1459 
1460 	tl_to_darg(&darg, tl, val);
1461 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1462 			darg.parent_ino, darg.dname_len);
1463 
1464 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1465 	if (IS_ERR(inode)) {
1466 		ext4_debug("Inode not found.");
1467 		return 0;
1468 	}
1469 
1470 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1471 	iput(inode);
1472 	return ret;
1473 }
1474 
1475 /*
1476  * Record all the modified inodes during replay. We use this later to setup
1477  * block bitmaps correctly.
1478  */
1479 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1480 {
1481 	struct ext4_fc_replay_state *state;
1482 	int i;
1483 
1484 	state = &EXT4_SB(sb)->s_fc_replay_state;
1485 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1486 		if (state->fc_modified_inodes[i] == ino)
1487 			return 0;
1488 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1489 		int *fc_modified_inodes;
1490 
1491 		fc_modified_inodes = krealloc(state->fc_modified_inodes,
1492 				sizeof(int) * (state->fc_modified_inodes_size +
1493 				EXT4_FC_REPLAY_REALLOC_INCREMENT),
1494 				GFP_KERNEL);
1495 		if (!fc_modified_inodes)
1496 			return -ENOMEM;
1497 		state->fc_modified_inodes = fc_modified_inodes;
1498 		state->fc_modified_inodes_size +=
1499 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1500 	}
1501 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1502 	return 0;
1503 }
1504 
1505 /*
1506  * Inode replay function
1507  */
1508 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1509 				u8 *val)
1510 {
1511 	struct ext4_fc_inode fc_inode;
1512 	struct ext4_inode *raw_inode;
1513 	struct ext4_inode *raw_fc_inode;
1514 	struct inode *inode = NULL;
1515 	struct ext4_iloc iloc;
1516 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1517 	struct ext4_extent_header *eh;
1518 
1519 	memcpy(&fc_inode, val, sizeof(fc_inode));
1520 
1521 	ino = le32_to_cpu(fc_inode.fc_ino);
1522 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1523 
1524 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1525 	if (!IS_ERR(inode)) {
1526 		ext4_ext_clear_bb(inode);
1527 		iput(inode);
1528 	}
1529 	inode = NULL;
1530 
1531 	ret = ext4_fc_record_modified_inode(sb, ino);
1532 	if (ret)
1533 		goto out;
1534 
1535 	raw_fc_inode = (struct ext4_inode *)
1536 		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1537 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1538 	if (ret)
1539 		goto out;
1540 
1541 	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1542 	raw_inode = ext4_raw_inode(&iloc);
1543 
1544 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1545 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1546 		inode_len - offsetof(struct ext4_inode, i_generation));
1547 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1548 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1549 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1550 			memset(eh, 0, sizeof(*eh));
1551 			eh->eh_magic = EXT4_EXT_MAGIC;
1552 			eh->eh_max = cpu_to_le16(
1553 				(sizeof(raw_inode->i_block) -
1554 				 sizeof(struct ext4_extent_header))
1555 				 / sizeof(struct ext4_extent));
1556 		}
1557 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1558 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1559 			sizeof(raw_inode->i_block));
1560 	}
1561 
1562 	/* Immediately update the inode on disk. */
1563 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1564 	if (ret)
1565 		goto out;
1566 	ret = sync_dirty_buffer(iloc.bh);
1567 	if (ret)
1568 		goto out;
1569 	ret = ext4_mark_inode_used(sb, ino);
1570 	if (ret)
1571 		goto out;
1572 
1573 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1574 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1575 	if (IS_ERR(inode)) {
1576 		ext4_debug("Inode not found.");
1577 		return -EFSCORRUPTED;
1578 	}
1579 
1580 	/*
1581 	 * Our allocator could have made different decisions than before
1582 	 * crashing. This should be fixed but until then, we calculate
1583 	 * the number of blocks the inode.
1584 	 */
1585 	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1586 		ext4_ext_replay_set_iblocks(inode);
1587 
1588 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1589 	ext4_reset_inode_seed(inode);
1590 
1591 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1592 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1593 	sync_dirty_buffer(iloc.bh);
1594 	brelse(iloc.bh);
1595 out:
1596 	iput(inode);
1597 	if (!ret)
1598 		blkdev_issue_flush(sb->s_bdev);
1599 
1600 	return 0;
1601 }
1602 
1603 /*
1604  * Dentry create replay function.
1605  *
1606  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1607  * inode for which we are trying to create a dentry here, should already have
1608  * been replayed before we start here.
1609  */
1610 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1611 				 u8 *val)
1612 {
1613 	int ret = 0;
1614 	struct inode *inode = NULL;
1615 	struct inode *dir = NULL;
1616 	struct dentry_info_args darg;
1617 
1618 	tl_to_darg(&darg, tl, val);
1619 
1620 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1621 			darg.parent_ino, darg.dname_len);
1622 
1623 	/* This takes care of update group descriptor and other metadata */
1624 	ret = ext4_mark_inode_used(sb, darg.ino);
1625 	if (ret)
1626 		goto out;
1627 
1628 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1629 	if (IS_ERR(inode)) {
1630 		ext4_debug("inode %d not found.", darg.ino);
1631 		inode = NULL;
1632 		ret = -EINVAL;
1633 		goto out;
1634 	}
1635 
1636 	if (S_ISDIR(inode->i_mode)) {
1637 		/*
1638 		 * If we are creating a directory, we need to make sure that the
1639 		 * dot and dot dot dirents are setup properly.
1640 		 */
1641 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1642 		if (IS_ERR(dir)) {
1643 			ext4_debug("Dir %d not found.", darg.ino);
1644 			goto out;
1645 		}
1646 		ret = ext4_init_new_dir(NULL, dir, inode);
1647 		iput(dir);
1648 		if (ret) {
1649 			ret = 0;
1650 			goto out;
1651 		}
1652 	}
1653 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1654 	if (ret)
1655 		goto out;
1656 	set_nlink(inode, 1);
1657 	ext4_mark_inode_dirty(NULL, inode);
1658 out:
1659 	iput(inode);
1660 	return ret;
1661 }
1662 
1663 /*
1664  * Record physical disk regions which are in use as per fast commit area,
1665  * and used by inodes during replay phase. Our simple replay phase
1666  * allocator excludes these regions from allocation.
1667  */
1668 int ext4_fc_record_regions(struct super_block *sb, int ino,
1669 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1670 {
1671 	struct ext4_fc_replay_state *state;
1672 	struct ext4_fc_alloc_region *region;
1673 
1674 	state = &EXT4_SB(sb)->s_fc_replay_state;
1675 	/*
1676 	 * during replay phase, the fc_regions_valid may not same as
1677 	 * fc_regions_used, update it when do new additions.
1678 	 */
1679 	if (replay && state->fc_regions_used != state->fc_regions_valid)
1680 		state->fc_regions_used = state->fc_regions_valid;
1681 	if (state->fc_regions_used == state->fc_regions_size) {
1682 		struct ext4_fc_alloc_region *fc_regions;
1683 
1684 		fc_regions = krealloc(state->fc_regions,
1685 				      sizeof(struct ext4_fc_alloc_region) *
1686 				      (state->fc_regions_size +
1687 				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
1688 				      GFP_KERNEL);
1689 		if (!fc_regions)
1690 			return -ENOMEM;
1691 		state->fc_regions_size +=
1692 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1693 		state->fc_regions = fc_regions;
1694 	}
1695 	region = &state->fc_regions[state->fc_regions_used++];
1696 	region->ino = ino;
1697 	region->lblk = lblk;
1698 	region->pblk = pblk;
1699 	region->len = len;
1700 
1701 	if (replay)
1702 		state->fc_regions_valid++;
1703 
1704 	return 0;
1705 }
1706 
1707 /* Replay add range tag */
1708 static int ext4_fc_replay_add_range(struct super_block *sb,
1709 				    struct ext4_fc_tl *tl, u8 *val)
1710 {
1711 	struct ext4_fc_add_range fc_add_ex;
1712 	struct ext4_extent newex, *ex;
1713 	struct inode *inode;
1714 	ext4_lblk_t start, cur;
1715 	int remaining, len;
1716 	ext4_fsblk_t start_pblk;
1717 	struct ext4_map_blocks map;
1718 	struct ext4_ext_path *path = NULL;
1719 	int ret;
1720 
1721 	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1722 	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1723 
1724 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1725 		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1726 		ext4_ext_get_actual_len(ex));
1727 
1728 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1729 	if (IS_ERR(inode)) {
1730 		ext4_debug("Inode not found.");
1731 		return 0;
1732 	}
1733 
1734 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1735 	if (ret)
1736 		goto out;
1737 
1738 	start = le32_to_cpu(ex->ee_block);
1739 	start_pblk = ext4_ext_pblock(ex);
1740 	len = ext4_ext_get_actual_len(ex);
1741 
1742 	cur = start;
1743 	remaining = len;
1744 	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1745 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1746 		  inode->i_ino);
1747 
1748 	while (remaining > 0) {
1749 		map.m_lblk = cur;
1750 		map.m_len = remaining;
1751 		map.m_pblk = 0;
1752 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1753 
1754 		if (ret < 0)
1755 			goto out;
1756 
1757 		if (ret == 0) {
1758 			/* Range is not mapped */
1759 			path = ext4_find_extent(inode, cur, NULL, 0);
1760 			if (IS_ERR(path))
1761 				goto out;
1762 			memset(&newex, 0, sizeof(newex));
1763 			newex.ee_block = cpu_to_le32(cur);
1764 			ext4_ext_store_pblock(
1765 				&newex, start_pblk + cur - start);
1766 			newex.ee_len = cpu_to_le16(map.m_len);
1767 			if (ext4_ext_is_unwritten(ex))
1768 				ext4_ext_mark_unwritten(&newex);
1769 			down_write(&EXT4_I(inode)->i_data_sem);
1770 			ret = ext4_ext_insert_extent(
1771 				NULL, inode, &path, &newex, 0);
1772 			up_write((&EXT4_I(inode)->i_data_sem));
1773 			ext4_ext_drop_refs(path);
1774 			kfree(path);
1775 			if (ret)
1776 				goto out;
1777 			goto next;
1778 		}
1779 
1780 		if (start_pblk + cur - start != map.m_pblk) {
1781 			/*
1782 			 * Logical to physical mapping changed. This can happen
1783 			 * if this range was removed and then reallocated to
1784 			 * map to new physical blocks during a fast commit.
1785 			 */
1786 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1787 					ext4_ext_is_unwritten(ex),
1788 					start_pblk + cur - start);
1789 			if (ret)
1790 				goto out;
1791 			/*
1792 			 * Mark the old blocks as free since they aren't used
1793 			 * anymore. We maintain an array of all the modified
1794 			 * inodes. In case these blocks are still used at either
1795 			 * a different logical range in the same inode or in
1796 			 * some different inode, we will mark them as allocated
1797 			 * at the end of the FC replay using our array of
1798 			 * modified inodes.
1799 			 */
1800 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1801 			goto next;
1802 		}
1803 
1804 		/* Range is mapped and needs a state change */
1805 		ext4_debug("Converting from %ld to %d %lld",
1806 				map.m_flags & EXT4_MAP_UNWRITTEN,
1807 			ext4_ext_is_unwritten(ex), map.m_pblk);
1808 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1809 					ext4_ext_is_unwritten(ex), map.m_pblk);
1810 		if (ret)
1811 			goto out;
1812 		/*
1813 		 * We may have split the extent tree while toggling the state.
1814 		 * Try to shrink the extent tree now.
1815 		 */
1816 		ext4_ext_replay_shrink_inode(inode, start + len);
1817 next:
1818 		cur += map.m_len;
1819 		remaining -= map.m_len;
1820 	}
1821 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1822 					sb->s_blocksize_bits);
1823 out:
1824 	iput(inode);
1825 	return 0;
1826 }
1827 
1828 /* Replay DEL_RANGE tag */
1829 static int
1830 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1831 			 u8 *val)
1832 {
1833 	struct inode *inode;
1834 	struct ext4_fc_del_range lrange;
1835 	struct ext4_map_blocks map;
1836 	ext4_lblk_t cur, remaining;
1837 	int ret;
1838 
1839 	memcpy(&lrange, val, sizeof(lrange));
1840 	cur = le32_to_cpu(lrange.fc_lblk);
1841 	remaining = le32_to_cpu(lrange.fc_len);
1842 
1843 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1844 		le32_to_cpu(lrange.fc_ino), cur, remaining);
1845 
1846 	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1847 	if (IS_ERR(inode)) {
1848 		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1849 		return 0;
1850 	}
1851 
1852 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1853 	if (ret)
1854 		goto out;
1855 
1856 	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1857 			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1858 			le32_to_cpu(lrange.fc_len));
1859 	while (remaining > 0) {
1860 		map.m_lblk = cur;
1861 		map.m_len = remaining;
1862 
1863 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1864 		if (ret < 0)
1865 			goto out;
1866 		if (ret > 0) {
1867 			remaining -= ret;
1868 			cur += ret;
1869 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1870 		} else {
1871 			remaining -= map.m_len;
1872 			cur += map.m_len;
1873 		}
1874 	}
1875 
1876 	down_write(&EXT4_I(inode)->i_data_sem);
1877 	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1878 				le32_to_cpu(lrange.fc_lblk) +
1879 				le32_to_cpu(lrange.fc_len) - 1);
1880 	up_write(&EXT4_I(inode)->i_data_sem);
1881 	if (ret)
1882 		goto out;
1883 	ext4_ext_replay_shrink_inode(inode,
1884 		i_size_read(inode) >> sb->s_blocksize_bits);
1885 	ext4_mark_inode_dirty(NULL, inode);
1886 out:
1887 	iput(inode);
1888 	return 0;
1889 }
1890 
1891 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1892 {
1893 	struct ext4_fc_replay_state *state;
1894 	struct inode *inode;
1895 	struct ext4_ext_path *path = NULL;
1896 	struct ext4_map_blocks map;
1897 	int i, ret, j;
1898 	ext4_lblk_t cur, end;
1899 
1900 	state = &EXT4_SB(sb)->s_fc_replay_state;
1901 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1902 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1903 			EXT4_IGET_NORMAL);
1904 		if (IS_ERR(inode)) {
1905 			ext4_debug("Inode %d not found.",
1906 				state->fc_modified_inodes[i]);
1907 			continue;
1908 		}
1909 		cur = 0;
1910 		end = EXT_MAX_BLOCKS;
1911 		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1912 			iput(inode);
1913 			continue;
1914 		}
1915 		while (cur < end) {
1916 			map.m_lblk = cur;
1917 			map.m_len = end - cur;
1918 
1919 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1920 			if (ret < 0)
1921 				break;
1922 
1923 			if (ret > 0) {
1924 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1925 				if (!IS_ERR(path)) {
1926 					for (j = 0; j < path->p_depth; j++)
1927 						ext4_mb_mark_bb(inode->i_sb,
1928 							path[j].p_block, 1, 1);
1929 					ext4_ext_drop_refs(path);
1930 					kfree(path);
1931 				}
1932 				cur += ret;
1933 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1934 							map.m_len, 1);
1935 			} else {
1936 				cur = cur + (map.m_len ? map.m_len : 1);
1937 			}
1938 		}
1939 		iput(inode);
1940 	}
1941 }
1942 
1943 /*
1944  * Check if block is in excluded regions for block allocation. The simple
1945  * allocator that runs during replay phase is calls this function to see
1946  * if it is okay to use a block.
1947  */
1948 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1949 {
1950 	int i;
1951 	struct ext4_fc_replay_state *state;
1952 
1953 	state = &EXT4_SB(sb)->s_fc_replay_state;
1954 	for (i = 0; i < state->fc_regions_valid; i++) {
1955 		if (state->fc_regions[i].ino == 0 ||
1956 			state->fc_regions[i].len == 0)
1957 			continue;
1958 		if (in_range(blk, state->fc_regions[i].pblk,
1959 					state->fc_regions[i].len))
1960 			return true;
1961 	}
1962 	return false;
1963 }
1964 
1965 /* Cleanup function called after replay */
1966 void ext4_fc_replay_cleanup(struct super_block *sb)
1967 {
1968 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1969 
1970 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1971 	kfree(sbi->s_fc_replay_state.fc_regions);
1972 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1973 }
1974 
1975 /*
1976  * Recovery Scan phase handler
1977  *
1978  * This function is called during the scan phase and is responsible
1979  * for doing following things:
1980  * - Make sure the fast commit area has valid tags for replay
1981  * - Count number of tags that need to be replayed by the replay handler
1982  * - Verify CRC
1983  * - Create a list of excluded blocks for allocation during replay phase
1984  *
1985  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1986  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1987  * to indicate that scan has finished and JBD2 can now start replay phase.
1988  * It returns a negative error to indicate that there was an error. At the end
1989  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1990  * to indicate the number of tags that need to replayed during the replay phase.
1991  */
1992 static int ext4_fc_replay_scan(journal_t *journal,
1993 				struct buffer_head *bh, int off,
1994 				tid_t expected_tid)
1995 {
1996 	struct super_block *sb = journal->j_private;
1997 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1998 	struct ext4_fc_replay_state *state;
1999 	int ret = JBD2_FC_REPLAY_CONTINUE;
2000 	struct ext4_fc_add_range ext;
2001 	struct ext4_fc_tl tl;
2002 	struct ext4_fc_tail tail;
2003 	__u8 *start, *end, *cur, *val;
2004 	struct ext4_fc_head head;
2005 	struct ext4_extent *ex;
2006 
2007 	state = &sbi->s_fc_replay_state;
2008 
2009 	start = (u8 *)bh->b_data;
2010 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2011 
2012 	if (state->fc_replay_expected_off == 0) {
2013 		state->fc_cur_tag = 0;
2014 		state->fc_replay_num_tags = 0;
2015 		state->fc_crc = 0;
2016 		state->fc_regions = NULL;
2017 		state->fc_regions_valid = state->fc_regions_used =
2018 			state->fc_regions_size = 0;
2019 		/* Check if we can stop early */
2020 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2021 			!= EXT4_FC_TAG_HEAD)
2022 			return 0;
2023 	}
2024 
2025 	if (off != state->fc_replay_expected_off) {
2026 		ret = -EFSCORRUPTED;
2027 		goto out_err;
2028 	}
2029 
2030 	state->fc_replay_expected_off++;
2031 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2032 		memcpy(&tl, cur, sizeof(tl));
2033 		val = cur + sizeof(tl);
2034 		ext4_debug("Scan phase, tag:%s, blk %lld\n",
2035 			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
2036 		switch (le16_to_cpu(tl.fc_tag)) {
2037 		case EXT4_FC_TAG_ADD_RANGE:
2038 			memcpy(&ext, val, sizeof(ext));
2039 			ex = (struct ext4_extent *)&ext.fc_ex;
2040 			ret = ext4_fc_record_regions(sb,
2041 				le32_to_cpu(ext.fc_ino),
2042 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2043 				ext4_ext_get_actual_len(ex), 0);
2044 			if (ret < 0)
2045 				break;
2046 			ret = JBD2_FC_REPLAY_CONTINUE;
2047 			fallthrough;
2048 		case EXT4_FC_TAG_DEL_RANGE:
2049 		case EXT4_FC_TAG_LINK:
2050 		case EXT4_FC_TAG_UNLINK:
2051 		case EXT4_FC_TAG_CREAT:
2052 		case EXT4_FC_TAG_INODE:
2053 		case EXT4_FC_TAG_PAD:
2054 			state->fc_cur_tag++;
2055 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2056 					sizeof(tl) + le16_to_cpu(tl.fc_len));
2057 			break;
2058 		case EXT4_FC_TAG_TAIL:
2059 			state->fc_cur_tag++;
2060 			memcpy(&tail, val, sizeof(tail));
2061 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2062 						sizeof(tl) +
2063 						offsetof(struct ext4_fc_tail,
2064 						fc_crc));
2065 			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2066 				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2067 				state->fc_replay_num_tags = state->fc_cur_tag;
2068 				state->fc_regions_valid =
2069 					state->fc_regions_used;
2070 			} else {
2071 				ret = state->fc_replay_num_tags ?
2072 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2073 			}
2074 			state->fc_crc = 0;
2075 			break;
2076 		case EXT4_FC_TAG_HEAD:
2077 			memcpy(&head, val, sizeof(head));
2078 			if (le32_to_cpu(head.fc_features) &
2079 				~EXT4_FC_SUPPORTED_FEATURES) {
2080 				ret = -EOPNOTSUPP;
2081 				break;
2082 			}
2083 			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2084 				ret = JBD2_FC_REPLAY_STOP;
2085 				break;
2086 			}
2087 			state->fc_cur_tag++;
2088 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2089 					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2090 			break;
2091 		default:
2092 			ret = state->fc_replay_num_tags ?
2093 				JBD2_FC_REPLAY_STOP : -ECANCELED;
2094 		}
2095 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2096 			break;
2097 	}
2098 
2099 out_err:
2100 	trace_ext4_fc_replay_scan(sb, ret, off);
2101 	return ret;
2102 }
2103 
2104 /*
2105  * Main recovery path entry point.
2106  * The meaning of return codes is similar as above.
2107  */
2108 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2109 				enum passtype pass, int off, tid_t expected_tid)
2110 {
2111 	struct super_block *sb = journal->j_private;
2112 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2113 	struct ext4_fc_tl tl;
2114 	__u8 *start, *end, *cur, *val;
2115 	int ret = JBD2_FC_REPLAY_CONTINUE;
2116 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2117 	struct ext4_fc_tail tail;
2118 
2119 	if (pass == PASS_SCAN) {
2120 		state->fc_current_pass = PASS_SCAN;
2121 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2122 	}
2123 
2124 	if (state->fc_current_pass != pass) {
2125 		state->fc_current_pass = pass;
2126 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2127 	}
2128 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2129 		ext4_debug("Replay stops\n");
2130 		ext4_fc_set_bitmaps_and_counters(sb);
2131 		return 0;
2132 	}
2133 
2134 #ifdef CONFIG_EXT4_DEBUG
2135 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2136 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2137 		return JBD2_FC_REPLAY_STOP;
2138 	}
2139 #endif
2140 
2141 	start = (u8 *)bh->b_data;
2142 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2143 
2144 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2145 		memcpy(&tl, cur, sizeof(tl));
2146 		val = cur + sizeof(tl);
2147 
2148 		if (state->fc_replay_num_tags == 0) {
2149 			ret = JBD2_FC_REPLAY_STOP;
2150 			ext4_fc_set_bitmaps_and_counters(sb);
2151 			break;
2152 		}
2153 		ext4_debug("Replay phase, tag:%s\n",
2154 				tag2str(le16_to_cpu(tl.fc_tag)));
2155 		state->fc_replay_num_tags--;
2156 		switch (le16_to_cpu(tl.fc_tag)) {
2157 		case EXT4_FC_TAG_LINK:
2158 			ret = ext4_fc_replay_link(sb, &tl, val);
2159 			break;
2160 		case EXT4_FC_TAG_UNLINK:
2161 			ret = ext4_fc_replay_unlink(sb, &tl, val);
2162 			break;
2163 		case EXT4_FC_TAG_ADD_RANGE:
2164 			ret = ext4_fc_replay_add_range(sb, &tl, val);
2165 			break;
2166 		case EXT4_FC_TAG_CREAT:
2167 			ret = ext4_fc_replay_create(sb, &tl, val);
2168 			break;
2169 		case EXT4_FC_TAG_DEL_RANGE:
2170 			ret = ext4_fc_replay_del_range(sb, &tl, val);
2171 			break;
2172 		case EXT4_FC_TAG_INODE:
2173 			ret = ext4_fc_replay_inode(sb, &tl, val);
2174 			break;
2175 		case EXT4_FC_TAG_PAD:
2176 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2177 					     le16_to_cpu(tl.fc_len), 0);
2178 			break;
2179 		case EXT4_FC_TAG_TAIL:
2180 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2181 					     le16_to_cpu(tl.fc_len), 0);
2182 			memcpy(&tail, val, sizeof(tail));
2183 			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2184 			break;
2185 		case EXT4_FC_TAG_HEAD:
2186 			break;
2187 		default:
2188 			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2189 					     le16_to_cpu(tl.fc_len), 0);
2190 			ret = -ECANCELED;
2191 			break;
2192 		}
2193 		if (ret < 0)
2194 			break;
2195 		ret = JBD2_FC_REPLAY_CONTINUE;
2196 	}
2197 	return ret;
2198 }
2199 
2200 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2201 {
2202 	/*
2203 	 * We set replay callback even if fast commit disabled because we may
2204 	 * could still have fast commit blocks that need to be replayed even if
2205 	 * fast commit has now been turned off.
2206 	 */
2207 	journal->j_fc_replay_callback = ext4_fc_replay;
2208 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2209 		return;
2210 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2211 }
2212 
/*
 * Human-readable names for the fast commit ineligibility reasons.
 *
 * ext4_fc_info_show() indexes this table with every value from 0 up to
 * EXT4_FC_REASON_MAX - 1, so it must hold one string per reason, in the
 * same order as the reason codes.
 *
 * Both the strings and the pointer array are const: the table is never
 * modified at run time.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2225 
2226 int ext4_fc_info_show(struct seq_file *seq, void *v)
2227 {
2228 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2229 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2230 	int i;
2231 
2232 	if (v != SEQ_START_TOKEN)
2233 		return 0;
2234 
2235 	seq_printf(seq,
2236 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2237 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2238 		   stats->fc_numblks,
2239 		   div_u64(stats->s_fc_avg_commit_time, 1000));
2240 	seq_puts(seq, "Ineligible reasons:\n");
2241 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2242 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2243 			stats->fc_ineligible_reason_count[i]);
2244 
2245 	return 0;
2246 }
2247 
2248 int __init ext4_fc_init_dentry_cache(void)
2249 {
2250 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2251 					   SLAB_RECLAIM_ACCOUNT);
2252 
2253 	if (ext4_fc_dentry_cachep == NULL)
2254 		return -ENOMEM;
2255 
2256 	return 0;
2257 }
2258 
2259 void ext4_fc_destroy_dentry_cache(void)
2260 {
2261 	kmem_cache_destroy(ext4_fc_dentry_cachep);
2262 }
2263